In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from joblib import load, dump 
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

ps = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

In [2]:
def clean_review(sentence):
    no_punc = [c for c in sentence if c not in string.punctuation]
    no_punc = ''.join(no_punc)
    no_stopwords = [w.lower() for w in no_punc.split() if (w not in stopwords_set) and (len(re.search('^\s*[0-9]*', w)[0]) == 0)]    
    stemmed_words = [ps.stem(w) for w in no_stopwords]
    return stemmed_words

def clean_review_len(sentence):
    return len(clean_review(sentence))

In [3]:
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(s):
    vs = analyzer.polarity_scores(s)
    if vs['compound'] >= 0.05:
        return 2
    elif vs['compound'] <= -0.05:
        return 0
    else:
        return 1
def get_strict_sentiment(s):
    vs = analyzer.polarity_scores(s)
    if vs['compound'] >= 0.0:
        return 2
    else:
        return 0

In [4]:
def create_word_freq_table(input_df):
    bow_transformer = CountVectorizer(analyzer=clean_review).fit(input_df['review_body'])
    transformed_input = bow_transformer.transform(input_df['review_body'])
    count_vect_df = pd.DataFrame(transformed_input.todense(), columns=bow_transformer.get_feature_names())
    return count_vect_df

In [5]:
json_reviews_by_sent = load('json_review_dataframe_by_sent')
json_reviews_by_sent['sentiment'] = json_reviews_by_sent['review_body'].apply(get_sentiment)
json_word_freq_table = create_word_freq_table(json_reviews_by_sent)
json_word_freq_table['_sentiment'] = json_reviews_by_sent['sentiment']
json_word_freq_table['_classification'] = json_reviews_by_sent['classification']

In [6]:
# json_reviews_by_sent['clean_review_body'] = json_reviews_by_sent['review_body'].apply(clean_review)
# json_reviews_by_sent['review_body'] = json_reviews_by_sent['review_body'].apply(fix_tiktok)
# json_reviews_by_sent = load('json_review_dataframe')
print('Num informative: ', len(json_reviews_by_sent[(json_reviews_by_sent['classification'] == 'informative')]))
print('Num non-informative: ', len(json_reviews_by_sent[(json_reviews_by_sent['classification'] == 'non-informative')]))
print('Num vague: ', len(json_reviews_by_sent[(json_reviews_by_sent['classification'] == 'vague')]))
# perform whatever manipulations you want on the json_reviews_by_sent dataframe
non_informative_df = json_reviews_by_sent[(json_reviews_by_sent['classification'] == 'non-informative')]
informative_df = json_reviews_by_sent[(json_reviews_by_sent['classification'] == 'informative')]
classification_df = informative_df.append(non_informative_df[:361], ignore_index=True)

# drop irrelevant columns
classification_df.drop(['review_num', 'application', 'length'], axis=1, inplace=True)


# With words_freq_table
non_informative_df = json_word_freq_table[(json_word_freq_table['_classification'] == 'non-informative')]
informative_df = json_word_freq_table[(json_word_freq_table['_classification'] == 'informative')]
classification_df = informative_df.append(non_informative_df[:361], ignore_index=True)
# classification_df = classification_df.append(json_word_freq_table[(json_word_freq_table['_classification'] == 'vague')], ignore_index=True)

# trying 3 categories
# classification_df = json_word_freq_table[:1000]

Num informative:  815
Num non-informative:  1267
Num vague:  131


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from mlxtend.preprocessing import DenseTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# msg_train, msg_test, label_train, label_test = train_test_split(classification_df.drop('classification', axis=1), classification_df['classification'], test_size=0.3, random_state=42)
msg_train, msg_test, label_train, label_test = train_test_split(classification_df.drop('_classification', axis=1), classification_df['_classification'], test_size=0.3, random_state=42)
param_grid = {'classifier__C': [0.1,1, 10, 100, 1000], 'classifier__gamma': [1,0.1,0.01,0.001,0.0001], 'classifier__kernel': ['rbf']} 
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [12]:
mnb = MultinomialNB()
mnb.fit(msg_train, label_train)
predictions1 = mnb.predict(msg_test)
print('Multinomial')
print(classification_report(predictions1, label_test))

rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(msg_train, label_train)
predictions2 = rfc.predict(msg_test)
print('Random Forest Classifier')
print(classification_report(predictions2, label_test))

# grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=0, cv=10)
# grid.fit(msg_train, label_train)
# predictions3 = grid.predict(msg_test)
# print('Support Vector Machine')
# print(grid.best_params_)
# print(grid.best_estimator_)
# print(classification_report(predictions3, label_test))


svm = SVC(C = 10, gamma= 0.01)
svm.fit(msg_train, label_train)
predictions3 = svm.predict(msg_test)
print('SVM')
print(classification_report(predictions3, label_test))

bnb = BernoulliNB()
bnb.fit(msg_train, label_train)
predictions4 = bnb.predict(msg_test)
print('Bernoulli')
print(classification_report(predictions4, label_test))

Multinomial
                 precision    recall  f1-score   support

    informative       0.94      0.81      0.87       289
non-informative       0.48      0.78      0.60        64

       accuracy                           0.81       353
      macro avg       0.71      0.80      0.73       353
   weighted avg       0.86      0.81      0.82       353

Random Forest Classifier
                 precision    recall  f1-score   support

    informative       0.88      0.93      0.90       235
non-informative       0.84      0.74      0.78       118

       accuracy                           0.86       353
      macro avg       0.86      0.83      0.84       353
   weighted avg       0.86      0.86      0.86       353

SVM
                 precision    recall  f1-score   support

    informative       0.94      0.94      0.94       250
non-informative       0.85      0.85      0.85       103

       accuracy                           0.91       353
      macro avg       0.89      0.90   

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
scores1 = cross_val_score(mnb, msg_train, label_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores1.mean(), scores1.std() * 2))
scores2 = cross_val_score(rfc, msg_train, label_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))
scores3 = cross_val_score(svm, msg_train, label_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores3.mean(), scores3.std() * 2))
scores4 = cross_val_score(bnb, msg_train, label_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores4.mean(), scores4.std() * 2))

Accuracy: 0.78 (+/- 0.06)
Accuracy: 0.86 (+/- 0.07)
Accuracy: 0.90 (+/- 0.07)
Accuracy: 0.81 (+/- 0.06)


In [50]:
predictions1 = mnb.predict(msg_test)
predictions2 = rfc.predict(msg_test)
predictions3 = svm.predict(msg_test)
predictions4 = bnb.predict(msg_test)

NameError: name 'rfc' is not defined

In [51]:
from sklearn.metrics import classification_report
print('Multinomial')
print(classification_report(predictions1, label_test))
print('RFC')
print(classification_report(predictions2, label_test))
print('SVM')
print(classification_report(predictions3, label_test))
print('Bernoulli')
print(classification_report(predictions4, label_test))

Multinomial
                 precision    recall  f1-score   support

    informative       0.94      0.81      0.87       289
non-informative       0.48      0.78      0.60        64

       accuracy                           0.81       353
      macro avg       0.71      0.80      0.73       353
   weighted avg       0.86      0.81      0.82       353

RFC


NameError: name 'predictions2' is not defined

In [52]:
best_model = load('best_model')
pred = best_model.predict(json_word_freq_table[1001:].drop('_classification', axis=1).fillna(0))
for i in range(len(pred)):
    if json_reviews_by_sent['classification'][1001+i] == None:
        json_reviews_by_sent['classification'][1001+i] = pred[i]

NameError: name 'svm' is not defined