In [13]:
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the two review datasets in a fixed order: the baseline file first, then
# the upweighted variant. The evaluation loop further down reads the 'Review'
# and 'Label' columns from each frame.
# NOTE(review): 'final_upweigting.csv' is kept exactly as originally spelled --
# confirm it matches the file name on disk before "correcting" it.
df, dfu = (
    pd.read_csv(csv_path)
    for csv_path in ('final.csv', 'final_upweigting.csv')
)

In [8]:
def mnb(x_train,y_train,x_test,y_test):
    """Fit a multinomial naive Bayes model and print its classification report.

    A ``MultinomialNB`` with default hyper-parameters is trained on
    ``(x_train, y_train)``, used to predict labels for ``x_test``, and the
    resulting ``classification_report`` against ``y_test`` is printed.

    Parameters
    ----------
    x_train, y_train
        Training feature matrix and matching labels, passed to ``fit``.
    x_test, y_test
        Held-out feature matrix passed to ``predict`` and the true labels the
        predictions are compared with.

    Returns
    -------
    None
        The report is only printed; nothing is returned.
    """
    # Use distinct local names so the function's own name is not shadowed.
    classifier = MultinomialNB().fit(x_train, y_train)
    predicted_labels = classifier.predict(x_test)
    print(classification_report(y_test, predicted_labels))

In [14]:
# Evaluate multinomial naive Bayes on both datasets with three text
# vectorizations (unigram counts, bigram counts, tf-idf).
#
# Each frame is paired with its heading explicitly. Choosing the heading via
# `dataset.equals(dfu)` compared every cell of the frame on each iteration and
# would label BOTH runs "with upweighting" if the two frames ever had equal
# content.
for heading, dataset in [
    ('<h1>Dataset without upweighting:</h1>', df),
    ('<h1>Dataset with upweighting:</h1>', dfu),
]:
    display(Markdown(heading))

    df_x = dataset['Review']
    df_y = dataset['Label']
    # Fixed random_state keeps the 80/20 split reproducible between runs.
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)
    cv_u = CountVectorizer()
    cv_b = CountVectorizer(ngram_range=(2, 2))
    tv = TfidfVectorizer()

    # Vectorizers are fitted on the training split only and then applied to the
    # test split, so the vocabulary never sees held-out text.
    # Feature counts are read from `vocabulary_` (one entry per feature):
    # `get_feature_names()` was removed in scikit-learn 1.2, whereas
    # `vocabulary_` is available in both old and new releases.
    unigram_train = cv_u.fit_transform(x_train)
    unigram_test = cv_u.transform(x_test)
    print("Number of features - unigram: {}".format(len(cv_u.vocabulary_)))

    bigram_train = cv_b.fit_transform(x_train)
    bigram_test = cv_b.transform(x_test)
    print("Number of features - bigram: {}".format(len(cv_b.vocabulary_)))

    tfidf_train = tv.fit_transform(x_train)
    tfidf_test = tv.transform(x_test)
    print("Number of features - tfidf: {}".format(len(tv.vocabulary_)))

    display(Markdown('<h2>Multinomial bayes(unigram vectorization):</h2>'))
    mnb(unigram_train,y_train,unigram_test,y_test)

    display(Markdown('<h2>Multinomial bayes(bigram vectorization):</h2>'))
    mnb(bigram_train,y_train,bigram_test,y_test)

    display(Markdown('<h2>Multinomial bayes(tfidf vectorization):</h2>'))
    mnb(tfidf_train,y_train,tfidf_test,y_test)

<h1>Dataset without upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 251583
Number of features - tfidf: 21325


<h2>Multinomial bayes(unigram vectorization):</h2>

              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1536
           1       0.96      0.96      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.83      0.83      0.83     13879
weighted avg       0.93      0.93      0.93     13879



<h2>Multinomial bayes(bigram vectorization):</h2>

              precision    recall  f1-score   support

           0       0.82      0.38      0.52      1536
           1       0.93      0.99      0.96     12343

   micro avg       0.92      0.92      0.92     13879
   macro avg       0.87      0.69      0.74     13879
weighted avg       0.92      0.92      0.91     13879



<h2>Multinomial bayes(tfidf vectorization):</h2>

              precision    recall  f1-score   support

           0       0.91      0.17      0.29      1536
           1       0.91      1.00      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.91      0.59      0.62     13879
weighted avg       0.91      0.91      0.88     13879



<h1>Dataset with upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 259453
Number of features - tfidf: 21325


<h2>Multinomial bayes(unigram vectorization):</h2>

              precision    recall  f1-score   support

           0       0.67      0.75      0.71      1536
           1       0.97      0.95      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.82      0.85      0.83     13879
weighted avg       0.94      0.93      0.93     13879



<h2>Multinomial bayes(bigram vectorization):</h2>

              precision    recall  f1-score   support

           0       0.79      0.49      0.61      1536
           1       0.94      0.98      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.87      0.74      0.78     13879
weighted avg       0.92      0.93      0.92     13879



<h2>Multinomial bayes(tfidf vectorization):</h2>

              precision    recall  f1-score   support

           0       0.92      0.21      0.34      1536
           1       0.91      1.00      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.91      0.60      0.64     13879
weighted avg       0.91      0.91      0.88     13879

