In [1]:
## Python in build modules:
import os
import re
from collections import Counter
import time

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Models (sklearn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import make_pipeline

In [2]:
from functions import print_report, check_model, vectorize_X

In [3]:
## Importing the data

path = 'data/train/'

count = 0
labels = []
contents = []

for label in ['neg','pos']:
    filenames = os.listdir(path + label)
    for filename in filenames:
        count += 1
        with open(os.path.join(path, label, filename), 'r') as f:
            labels.append(1 if label == 'pos' else 0) # 1 is positve 0 is negative
            contents.append(f.read())
print(count)
            
data = pd.DataFrame({
    'contents' : contents,
    'labels': labels,

})

data = data.sample(frac=1, random_state=42).reset_index(drop=True) # This code will shuffle the data (just in case!)

25000


In [4]:
X = data.contents
y = data.labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
train_df = pd.concat([X_train, y_train],axis=1)

In [10]:
mnb = MultinomialNB()
cv = CountVectorizer(stop_words={'english'}, ngram_range=(2,3), min_df=2, max_df=0.4)


for cv ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(cv, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 584,934 
The amount of time it took to vectorize was: 13.381627999999996

Multinomial Naive Bayes
Confusion Matrix
[[3704  433]
 [ 470 3643]]


Classification Report
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      4137
           1       0.89      0.89      0.89      4113

    accuracy                           0.89      8250
   macro avg       0.89      0.89      0.89      8250
weighted avg       0.89      0.89      0.89      8250



Other Metrics:
Pression Score: 0.8937684003925417
Accuracy Score: 0.8905454545454545
Recall Score: 0.8857281789448092
f1 Score 0.8897301257784832
_________________________________________________


In [43]:
probability = clf.predict_proba(X_test_vectorized)
probability = pd.Series([round(i[0],5) for i in probability])
prediction = pd.Series(clf.predict(X_test_vectorized))

In [35]:
test_df = pd.DataFrame(np.concatenate([X_test, y_test]).reshape(2,len(X_test))).T
test_df.columns = ['content', 'true_labels']

In [48]:
df1 = pd.concat([test_df, prediction, probability], axis=1)
df1.columns = ["content","true_label","prediction","probability"]

In [54]:
true_pos  = df1[(df1.true_label == 1) & (df1.prediction == 1)]
true_neg  = df1[(df1.true_label == 0) & (df1.prediction == 0)]
false_pos = df1[(df1.true_label == 0) & (df1.prediction == 1)]
false_neg = df1[(df1.true_label == 1) & (df1.prediction == 0)]

In [88]:
len(true_pos[true_pos.probability > .10]) / 3300

0.026060606060606062

In [90]:
true_neg[true_neg.probability < .9]

Unnamed: 0,content,true_label,prediction,probability
178,"If I write a review about a movie, maybe it wi...",0,0,0.56674
569,Joseph L. Mankiewicz's Sleuth didn't need a re...,0,0,0.64162
570,...is the only way to describe this movie abou...,0,0,0.85570
598,Disappointing and irritating. The screenwriter...,0,0,0.72820
783,"Remade today, this film would be a very creepy...",0,0,0.55275
...,...,...,...,...
7637,This is just short of a full blown gore fest b...,0,0,0.71064
7673,"I usually comment only on movies that I like, ...",0,0,0.89838
7730,"Falsely accused, skirt-chasing chums John Wayn...",0,0,0.88185
7801,"Is this the ""worse"" Star Trek TOS episode? May...",0,0,0.80032


In [93]:
false_pos[false_pos.probability > .10]

Unnamed: 0,content,true_label,prediction,probability
115,This film concerns the story of Eddy as mentio...,0,1,0.18239
287,Another movie to suffer without an adventure t...,0,1,0.4288
614,Anyone that has see Tammuz's Child Eaters know...,0,1,0.26542
617,What's with Indonesian musical movies? Never h...,0,1,0.44515
633,I am at a distinct disadvantage here. I have n...,0,1,0.26773
798,You may consider a couple of facts in the disc...,0,1,0.30678
847,I'm both amused and disgusted by the people wh...,0,1,0.20214
939,"The Honey, I Shrunk the Kids franchise was a h...",0,1,0.21388
944,"Maybe you shouldn't compare, but Wild Style an...",0,1,0.42348
1179,This movie has not aged well. Maybe it's just ...,0,1,0.29736


In [97]:
false_pos.loc[847].content

'I\'m both amused and disgusted by the people who claim that this movie is so accurate about Vietnam, and WERE NEVER THERE. This movie is about as true about the whole Vietnam war as the Rodney King beating is true about ALL police officers. Yes, bad things do (and did) happen, but in general the people there are just like you and me. They have morals, they are not killing machines, they do not all do drugs. Atrocities were the exception in Vietnam, not the rule. They happened far more infrequently than the "hype" would lead you believe. Oliver Stone has a knack for making movies that show the Vietnam war as this brutal bloodbath, but are based as much in reality as Star Wars. If you honestly believe the stereotypes of Vietnam, do yourself a favour and learn the truth. Fact: the Viet Cong and NVA did far worse things to the South Vietnamese than ANY soldier in the US Armed Forces ever did. Fact: the soldiers in World War II treated the enemy far worse in general than the soldiers in Vi