In [1]:
# DB Connection
import sqlite3

# Casual
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# For Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# For Random Forest
from sklearn.ensemble import RandomForestClassifier

# NLTK
import nltk

from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
import re

# Wordcloud
from wordcloud import WordCloud,STOPWORDS

# Train - Test Split
from sklearn.model_selection import train_test_split

# Testing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Dump Models
import pickle

In [2]:
#################### SZÖVEGELEMZÉS (KATEGÓRIA PREDIKTÁLÁS, ÉRZELEM OSZTÁLYOZÁS) ####################

In [3]:
# Adatbázis

# Path of the SQLite database file that every query in this notebook reads from.
DB_PATH = "podcasts.db"

# One shared connection, reused by all `pd.read_sql` calls below.
connection = sqlite3.connect(DB_PATH)

In [8]:
#################### 1. KATEGÓRIA PREDIKCIÓ - TELJES ####################

In [9]:
# Adat betöltés

# Pair each review text with the category rows of the same podcast.
# A review is returned once for every category row that shares its podcast_id.
category_query = "select reviews.content, categories.category from reviews, categories where reviews.podcast_id = categories.podcast_id"
full_cat_pred = pd.read_sql(category_query, connection)

# Plain Python lists: review texts (inputs) and their category labels (targets).
full_cat_pred_content = full_cat_pred['content'].tolist()
full_cat_pred_category = full_cat_pred['category'].tolist()

In [10]:
# Tanulás

# Two-step text classifier: TF-IDF features feeding a multinomial Naive Bayes model.
text_vectorizer = TfidfVectorizer()
category_classifier = MultinomialNB()
fullcatpred = make_pipeline(text_vectorizer, category_classifier)

# Train on every (review, category) pair loaded above.
# Kept as the cell's last expression so the fitted pipeline is displayed.
fullcatpred.fit(full_cat_pred_content, full_cat_pred_category)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [11]:
# Prediktálás

def predict_category_full(s):
    """Predict a category for one text with the `fullcatpred` pipeline.

    Parameters
    ----------
    s : str
        The text to classify. It is wrapped in a one-element list because
        the pipeline's `predict` is called on a collection of documents.

    Returns
    -------
    The result of `fullcatpred.predict([s])`, i.e. the prediction for that
    single document.
    """
    return fullcatpred.predict([s])


# Quick manual check with a finance-flavoured sample text.
sample_text = 'bank system, valuta, credit card'
print(predict_category_full(sample_text))

['business']


In [12]:
# Mentés

# Persist the trained full-category pipeline to disk.
# The `with` block closes the file handle once the dump finishes; the
# previous bare `open(...)` call left the handle open.
with open('fullcatpred.pkl', 'wb') as model_file:
    pickle.dump(fullcatpred, model_file)

In [13]:
#################### 2. KATEGÓRIA PREDIKCIÓ - RÉSZLEGES ####################

In [14]:
# Adat betöltés

# Same review/category join as the full model uses; the rows are narrowed to
# a handful of categories in the next cell.
partial_category_query = "select reviews.content, categories.category from reviews, categories where reviews.podcast_id = categories.podcast_id"
par_cat_pred = pd.read_sql(partial_category_query, connection)

In [15]:
# Szelektálás

# Categories the partial model is restricted to.
categories = [
    'technology',
    'arts-food',
    'hinduism',
    'spirituality',
]


def decision_maker(subject, categories):
    """Report whether `subject` equals at least one entry of `categories`.

    Parameters
    ----------
    subject
        The value to look for (a category name in this notebook).
    categories
        Iterable of allowed values, each compared to `subject` with `==`.

    Returns
    -------
    bool
        True if any entry equals `subject`, otherwise False (including when
        `categories` is empty).
    """
    return any(subject == candidate for candidate in categories)

# Keep only the reviews whose category is one of the selected `categories`.
# One vectorised membership test replaces the earlier per-row Python loop,
# which used chained indexing (`df['col'][ind]`) and therefore performed
# several Series lookups for every single row. Row order is unchanged.
selected_reviews = par_cat_pred[par_cat_pred['category'].isin(categories)]

# Plain Python lists: review texts (inputs) and their category labels (targets).
par_cat_pred_content = selected_reviews['content'].tolist()
par_cat_pred_category = selected_reviews['category'].tolist()

In [16]:
# Tanulás

# Same architecture as the full model (TF-IDF + multinomial Naive Bayes),
# trained only on the reviews of the selected categories.
partial_vectorizer = TfidfVectorizer()
partial_classifier = MultinomialNB()
parcatpred = make_pipeline(partial_vectorizer, partial_classifier)

# Kept as the cell's last expression so the fitted pipeline is displayed.
parcatpred.fit(par_cat_pred_content, par_cat_pred_category)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [17]:
# Prediktálás

def predict_category_par(s):
    """Predict a category for one text with the `parcatpred` pipeline.

    Parameters
    ----------
    s : str
        The text to classify. It is wrapped in a one-element list because
        the pipeline's `predict` is called on a collection of documents.

    Returns
    -------
    The result of `parcatpred.predict([s])`, i.e. the prediction for that
    single document.
    """
    return parcatpred.predict([s])


# Quick manual check with a food-flavoured sample text.
sample_text = 'hamburger, tasty, cheese, milk'
print(predict_category_par(sample_text))

['arts-food']


In [18]:
# Mentés

# Web alkalmazás logika

In [19]:
#################### 3. ÉRZELEM OSZTÁLYOZÁS ####################
# https://stackabuse.com/python-for-nlp-sentiment-analysis-with-scikit-learn/

In [4]:
# Adat betöltés

# Binary label derived from the rating: rating < 5 -> 0, rating > 4 -> 1.
# The CASE has no ELSE branch, so a row matching neither condition gets NULL.
sentiment_query = """
    select
        content,
        case when rating < 5 then 0
            when rating > 4 then 1
        end as sentiment
    from reviews
"""
sentiment_data = pd.read_sql(sentiment_query, connection)

# Class balance check; kept as the last expression so the counts are displayed.
sentiment_data['sentiment'].value_counts()

1    996894
0    151320
Name: sentiment, dtype: int64

In [5]:
# Adat csökkentés

# Upper bound on the number of rows kept per sentiment class.
ROWS_PER_CLASS = 150000

is_positive = sentiment_data['sentiment'] == 1
is_negative = sentiment_data['sentiment'] == 0

# The first ROWS_PER_CLASS rows of each class in query order (not a random sample).
sentiment_data_positive = sentiment_data.loc[is_positive].iloc[:ROWS_PER_CLASS]
sentiment_data_negative = sentiment_data.loc[is_negative].iloc[:ROWS_PER_CLASS]

frames = [sentiment_data_positive, sentiment_data_negative]

# Stack positives on top of negatives and renumber the rows from 0.
sentiment = pd.concat(frames, ignore_index=True)
features = sentiment['content']
labels = sentiment['sentiment']

In [6]:
# Tisztítás

def clean_review(text):
    """Normalise one review into lower-case text made of word characters and single spaces.

    Parameters
    ----------
    text : object
        Raw review; it is passed through ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Drop single letters that stand alone between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop a single letter at the very start of the text. The pattern has to
    # use a bare `^` anchor: the previous `\^` matched a literal caret, which
    # can no longer exist after the substitution above, so this step never
    # fired. With the anchor fixed it also covers a stray leading 'b', which
    # used to be handled by a separate `^b\s+` substitution.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse every run of whitespace into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    return cleaned.lower()


# Clean every review, keeping the order of `features`.
processed_features = [clean_review(review) for review in features]

In [7]:
# TF - IDF

# Vectoriser settings, gathered in one place so they are easy to tune.
tfidf_settings = {
    'max_features': 200,
    'min_df': 7,
    'max_df': 0.8,
    'stop_words': stopwords.words('english'),
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary from the cleaned reviews, then convert the result to a
# dense array. Note that `processed_features` is rebound from the list of
# cleaned strings to that array here.
tfidf_matrix = vectorizer.fit_transform(processed_features)
processed_features = tfidf_matrix.toarray()

In [8]:
# TF - IDF Mentés

# Persist the fitted TF-IDF vectoriser to disk.
# The `with` block closes the file handle once the dump finishes; the
# previous bare `open(...)` call left the handle open.
with open('tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [9]:
# Train - Test Split

# Hold out 5% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    processed_features,
    labels,
    test_size=0.05,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Tanulás

# Random forest hyper-parameters; the fixed seed makes training repeatable.
forest_settings = dict(n_estimators=200, random_state=0)
text_classifier = RandomForestClassifier(**forest_settings)

# Kept as the cell's last expression so the fitted estimator is displayed.
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [11]:
# Model Mentés

# Persist the trained sentiment classifier to disk.
# The `with` block closes the file handle once the dump finishes; the
# previous bare `open(...)` call left the handle open.
with open('textclassifier.pkl', 'wb') as classifier_file:
    pickle.dump(text_classifier, classifier_file)

In [12]:
# Tesztelés

# Score the held-out rows.
predictions = text_classifier.predict(X_test)

# Print, in this order: confusion matrix, per-class report, overall accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[5822 1625]
 [1730 5823]]
              precision    recall  f1-score   support

           0       0.77      0.78      0.78      7447
           1       0.78      0.77      0.78      7553

    accuracy                           0.78     15000
   macro avg       0.78      0.78      0.78     15000
weighted avg       0.78      0.78      0.78     15000

0.7763333333333333
