In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('final.csv')
dfu = pd.read_csv('final_upweigting.csv')

In [3]:

df_x = df['Review']
df_y = df['Label']
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)
cv_u = CountVectorizer()
cv_b = CountVectorizer(ngram_range=(2, 2))
tv = TfidfVectorizer()

unigram_train = cv_u.fit_transform(x_train)
unigram_test = cv_u.transform(x_test)
print("Number of features - unigram: {}".format(len(cv_u.get_feature_names())))

bigram_train = cv_b.fit_transform(x_train)
bigram_test = cv_b.transform(x_test)
print("Number of features - bigram: {}".format(len(cv_b.get_feature_names())))

tfidf_train = tv.fit_transform(x_train)
tfidf_test = tv.transform(x_test)
print("Number of features - tfidf: {}".format(len(tv.get_feature_names())))

Number of features - unigram: 21325
Number of features - bigram: 251583
Number of features - tfidf: 21325


In [4]:
print('Multinomial naive bayes - Count vectorizer(unigram):')
for dataset in [df,dfu]:
    mnb = MultinomialNB()
    mnb.fit(unigram_train,y_train)
    y_pred = mnb.predict(unigram_test)
    if not dataset.equals(df):
        print('Dataset with upweighting:')
    else:
        print('Dataset without upweighting:')
    print(classification_report(y_test, y_pred))

Multinomial naive bayes - Count vectorizer(unigram):
Dataset without upweighting:
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1536
           1       0.96      0.96      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.83      0.83      0.83     13879
weighted avg       0.93      0.93      0.93     13879

Dataset with upweighting:
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1536
           1       0.96      0.96      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.83      0.83      0.83     13879
weighted avg       0.93      0.93      0.93     13879



In [5]:
print('Multinomial naive bayes - Count vectorizer(bigram):')
for dataset in [df,dfu]:
    mnb = MultinomialNB()
    mnb.fit(bigram_train,y_train)
    y_pred = mnb.predict(bigram_test)
    if not dataset.equals(df):
        print('Dataset with upweighting:')
    else:
        print('Dataset without upweighting:')
    print(classification_report(y_test, y_pred))

Multinomial naive bayes - Count vectorizer(bigram):
Dataset without upweighting:
              precision    recall  f1-score   support

           0       0.82      0.38      0.52      1536
           1       0.93      0.99      0.96     12343

   micro avg       0.92      0.92      0.92     13879
   macro avg       0.87      0.69      0.74     13879
weighted avg       0.92      0.92      0.91     13879

Dataset with upweighting:
              precision    recall  f1-score   support

           0       0.82      0.38      0.52      1536
           1       0.93      0.99      0.96     12343

   micro avg       0.92      0.92      0.92     13879
   macro avg       0.87      0.69      0.74     13879
weighted avg       0.92      0.92      0.91     13879



In [6]:
print('Multinomial naive bayes - tfidf vectorizer(bigram):')
for dataset in [df,dfu]:
    mnb = MultinomialNB()
    mnb.fit(tfidf_train,y_train)
    y_pred = mnb.predict(tfidf_test)
    if not dataset.equals(df):
        print('Dataset with upweighting:')
    else:
        print('Dataset without upweighting:')
    print(classification_report(y_test, y_pred))

Multinomial naive bayes - tfidf vectorizer(bigram):
Dataset without upweighting:
              precision    recall  f1-score   support

           0       0.91      0.17      0.29      1536
           1       0.91      1.00      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.91      0.59      0.62     13879
weighted avg       0.91      0.91      0.88     13879

Dataset with upweighting:
              precision    recall  f1-score   support

           0       0.91      0.17      0.29      1536
           1       0.91      1.00      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.91      0.59      0.62     13879
weighted avg       0.91      0.91      0.88     13879

