In [1]:
# DB Connection
import sqlite3

# Casual
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# For Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# For Random Forest
from sklearn.ensemble import RandomForestClassifier

# NLTK
import nltk
from nltk.corpus import stopwords
import re

# Train - Test Split
from sklearn.model_selection import train_test_split

# Testing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Dump Models
import pickle

In [2]:
#################### SZÖVEGELEMZÉS (KATEGÓRIA PREDIKTÁLÁS, ÉRZELEM OSZTÁLYOZÁS) ####################

In [20]:
# Database connection

# Open through a SQLite URI with mode=rw: a plain path makes sqlite3 create a
# new, empty database file when the path is wrong, and the mistake then only
# shows up later as a "no such table" error. With mode=rw a missing file
# raises sqlite3.OperationalError right here instead.
connection = sqlite3.connect("file:../data/podcasts.db?mode=rw", uri=True)

In [21]:
#################### 1. KATEGÓRIA PREDIKCIÓ - TELJES ####################

In [22]:
# Load data: every review text joined with the category rows that share its
# podcast_id, then split into two parallel Python lists.

full_category_predict = pd.read_sql(
    sql="select reviews.content, categories.category from reviews, categories where reviews.podcast_id = categories.podcast_id",
    con=connection,
)
full_category_predict_content = full_category_predict['content'].to_numpy().tolist()
full_category_predict_category = full_category_predict['category'].to_numpy().tolist()

In [23]:
# Train / test split: 5% of the (content, category) pairs are held out,
# using a fixed random_state.

X_train, X_test, y_train, y_test = train_test_split(
    full_category_predict_content,
    full_category_predict_category,
    random_state=0,
    test_size=0.05,
)

In [24]:
%%time
# Tanulás

fullcategorypredict = make_pipeline(TfidfVectorizer(), MultinomialNB())
fullcategorypredict.fit(X_train, y_train)

Wall time: 48.8 s


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [29]:
# Prediction

def predict_category_full(s):
    """Predict the category for a single piece of text.

    ``s`` is wrapped in a one-element list and passed to the fitted
    ``fullcategorypredict`` pipeline. The result of ``predict`` is returned
    unchanged, i.e. a one-element sequence holding the label rather than a
    bare string.
    """
    return fullcategorypredict.predict([s])

sample_prediction = predict_category_full('bank system, valuta, credit card')
print(sample_prediction)

['business']


In [31]:
# Evaluation on the held-out split

predictions = fullcategorypredict.predict(X_test)

print(confusion_matrix(y_test, predictions))
# Several categories are never predicted, so their precision is 0/0.
# zero_division=0 reports those cells as 0.0 (the value that was already
# printed) without emitting UndefinedMetricWarning into the cell output.
# The 0.00 rows in the report remain the signal that those classes are missed.
print(classification_report(y_test, predictions, zero_division=0))
print(accuracy_score(y_test, predictions))

[[ 223    0    0    0    6    0    0  262    0    0    0 1133   38    0
     0    0    4   17   13 2559    1    0    0    0   10  177]
 [  11    0    0    0    0    0    0   63    0    0    0   37    2    0
     0    0    0    1    0  196    0    0    0    0    3    0]
 [  10    0    0    0    0    0    0   36    0    0    0   54    5    0
     0    0    0    0    0  189    0    0    0    0    0    0]
 [  26    0    0    0    0    0    0   17    0    0    0  123    3    0
     0    0    0    0    0  314    0    0    0    0    0    1]
 [  60    0    0    0    0    0    0   24    0    0    0  303    1    0
     0    0    0    2    0  579    0    0    0    0    0   18]
 [  53    0    0    0    0    0    0   63    0    0    0  108    6    0
     0    0    0    2    3  244    0    0    0    0    4   33]
 [   0    0    0    0    0    0    0    1    0    0    0    3    0    0
     0    0    0    0   12   26    0    1    0    0    0    0]
 [   9    0    0    0    0    0    0 3043    0    0    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                   precision    recall  f1-score   support

                             arts       0.42      0.05      0.09      4443
                      arts-design       0.00      0.00      0.00       313
              arts-fashion-beauty       0.00      0.00      0.00       294
                        arts-food       0.00      0.00      0.00       484
             arts-performing-arts       0.00      0.00      0.00       987
                 arts-visual-arts       0.00      0.00      0.00       516
                         buddhism       0.00      0.00      0.00        43
                         business       0.41      0.49      0.45      6267
                 business-careers       0.33      0.00      0.00      1793
               business-investing       0.33      0.00      0.00      1265
                     christianity       0.07      0.00      0.00      2629
                           comedy       0.41      0.63      0.50     11432
                        

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Save the fitted category pipeline

# Use a context manager so the file handle is closed (and the pickle flushed
# to disk) as soon as the dump finishes, instead of leaving an open handle
# behind for the garbage collector.
with open('../models/full-category-predict.pkl', 'wb') as model_file:
    pickle.dump(fullcategorypredict, model_file)

In [23]:
#################### 2. KATEGÓRIA PREDIKCIÓ - RÉSZLEGES ####################

In [25]:
# Load data: review text joined with the category rows of the same podcast.

partial_category_query = "select reviews.content, categories.category from reviews, categories where reviews.podcast_id = categories.podcast_id"
partial_category_predict = pd.read_sql(partial_category_query, connection)

In [26]:
# Selection: keep only the rows whose category is one of the chosen four.

categories = [
    'technology',
    'arts-food',
    'hinduism',
    'spirituality',
]

in_selected_categories = partial_category_predict['category'].isin(categories)
partial_category_predict = partial_category_predict[in_selected_categories]

# Parallel lists: content[i] is the review text labelled category[i].
content = partial_category_predict['content'].to_numpy().tolist()
category = partial_category_predict['category'].to_numpy().tolist()

In [27]:
%%time
# Tanulás

partialcategorypredict = make_pipeline(TfidfVectorizer(), MultinomialNB())
partialcategorypredict.fit(content, category)

Wall time: 2.35 s


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [29]:
# Prediction

def predict_category_partial(s):
    """Predict one of the selected categories for a single piece of text.

    ``s`` is wrapped in a one-element list and passed to the fitted
    ``partialcategorypredict`` pipeline. The result of ``predict`` is
    returned unchanged, i.e. a one-element sequence holding the label.
    """
    return partialcategorypredict.predict([s])

sample_prediction = predict_category_partial('hamburger, tasty, cheese, milk')
print(sample_prediction)

['arts-food']


In [30]:
# Mentés

# Web alkalmazás logika

In [31]:
#################### 3. ÉRZELEM OSZTÁLYOZÁS ####################

# https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/

In [5]:
# Load data: each review's text plus a binary sentiment label derived from
# its rating (rating < 4 -> 0, rating > 3 -> 1).

sentiment_query = """
    select
        content,
        case when rating < 4 then 0
            when rating > 3 then 1
        end as sentiment
    from reviews
"""
sentiment_data = pd.read_sql(sentiment_query, connection)

# Class balance, shown as the cell output.
sentiment_data['sentiment'].value_counts()

1    1034569
0     113645
Name: sentiment, dtype: int64

In [6]:
# Data reduction: keep at most the first 110000 rows of each class so the
# two labels are roughly balanced, positives stacked before negatives.

positive_mask = sentiment_data['sentiment'].eq(1)
negative_mask = sentiment_data['sentiment'].eq(0)

sentiment_data_positive = sentiment_data.loc[positive_mask].head(110000)
sentiment_data_negative = sentiment_data.loc[negative_mask].head(110000)

frames = [sentiment_data_positive, sentiment_data_negative]

# ignore_index=True renumbers the combined rows from 0.
sentiment = pd.concat(frames, ignore_index=True)
features = sentiment['content']
labels = sentiment['sentiment']

In [7]:
# Cleaning

def _clean_review(text):
    """Normalise one review before TF-IDF vectorisation.

    Steps, in order: replace non-word characters with spaces, drop
    single-letter words, collapse whitespace runs, strip a leading ``b``
    token, and lowercase the result.

    ``text`` is converted with ``str()`` first, so non-string values are
    cleaned through their string form. Returns the cleaned string.
    """
    # Replace every non-word character with a space.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Remove single letters that are surrounded by whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Remove a single letter at the very start of the text. The caret has to
    # be an unescaped anchor: the earlier pattern r'\^[a-zA-Z]\s+' searched
    # for a literal '^', which the \W substitution above has already turned
    # into a space, so this step never matched anything.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Strip a leading 'b' token.
    cleaned = re.sub(r'^b\s+', '', cleaned)

    return cleaned.lower()


# Iterate over the values directly instead of indexing by position.
processed_features = [_clean_review(text) for text in features]

In [9]:
%%time
# TF - IDF

vectorizer = TfidfVectorizer (max_features=200, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()

Wall time: 6.78 s


In [13]:
# Save the fitted TF-IDF vectorizer

# Use a context manager so the file handle is closed (and the pickle flushed
# to disk) as soon as the dump finishes, instead of leaving an open handle
# behind for the garbage collector.
with open('../models/tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [10]:
# Train / test split: 5% of the vectorised reviews are held out,
# using a fixed random_state.

X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    random_state=0,
    test_size=0.05,
)

In [11]:
%%time
# Tanulás

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

Wall time: 5min 47s


RandomForestClassifier(n_estimators=200, random_state=0)

In [12]:
# Evaluation on the held-out split

predictions = text_classifier.predict(X_test)

# Print the confusion matrix, the per-class report and the accuracy, in
# that order; each metric takes (true labels, predicted labels).
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[4558  863]
 [1148 4431]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      5421
           1       0.84      0.79      0.82      5579

    accuracy                           0.82     11000
   macro avg       0.82      0.82      0.82     11000
weighted avg       0.82      0.82      0.82     11000

0.8171818181818182


In [14]:
# Save the fitted sentiment classifier

# Use a context manager so the file handle is closed (and the pickle flushed
# to disk) as soon as the dump finishes, instead of leaving an open handle
# behind for the garbage collector.
with open('../models/text-classifier.pkl', 'wb') as classifier_file:
    pickle.dump(text_classifier, classifier_file)