In [2]:
## Python in build modules:
import os
import re

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Models (sklearn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [3]:
path = 'data/train/'

count = 0
labels = []
contents = []

for label in ['neg','pos']:
    filenames = os.listdir(path + label)
    for filename in filenames:
        count += 1
        with open(os.path.join(path, label, filename), 'r') as f:
            labels.append(1 if label == 'pos' else 0)
            contents.append(f.read())
print(count)
            
data = pd.DataFrame({
    'contents' : contents,
    'labels': labels,

})

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

25000


#### Just to be clear negative is 0 and positive is 1

In [4]:
data.labels.value_counts()

0    12500
1    12500
Name: labels, dtype: int64

Ok, the data is **completely balanced**.

 ###### At this moment we are going to see 5 examples of each column

In [5]:
for label in data.labels.unique():
    print("negative" if label ==  0 else "positive")
    print("\n")
    for content in data.contents[data.labels == label].sample(5):
        print(content)
        print("\n")

negative


...is the only way to describe this movie about subjects that should be surefire: scandal, sex, celebrity, power. Kirsten Dunst grins her way through her role as silent movie star Marion Davies like she thinks she's in "Legally Blonde." The guy who plays William Randolph Hearst overacts to the point where you want to reach into the screen and slap him. Eddie Izzard is pretty good, except that he's playing Charlie Chaplin, and is about, oh, 125 lbs too heavy for the part? Hard to believe this hamfisted, uneven wreck was directed by Peter Bogdanovich, but then again, he hasn't made a watchable movie in, what? 30 years? Sometimes, there's just no coming back.


This foolish, implausible tale is redeemed only by the opening scene in which a hard-boiled police detective delivers some nearly-audible lines confirming our greatest fears: He is dead. Perhaps the film would have been saved had the director forgone the dazzling star power of A. Martinez in favor of this sadly-anonymous

In [6]:
def filter(x):
    return ("Great" in x) or ("great" in x)

In [7]:
using_great = data[data.contents.apply(filter)].labels.value_counts()

In [8]:
len(data)

25000

In [9]:
len(data)/2

12500.0

In [10]:
using_great[1]

4584

In [11]:
p1 = 4585 / 12500
p2 = 2297 / 12500
print(p1, p2)

0.3668 0.18376


In [12]:
print("\n")





We can see that just looking for the word "great we can find" 36% of all the data with the positive reviews. How ever this comes at the expense of finding also getting 18% of numbers that are negative. We can at basic level see that some obvious words can have an impact; but it is not that simble.

In [13]:
def print_report(y_true, prediction):
    print('Confusion Matrix')
    print(confusion_matrix(y_true, prediction))
    print("\n")
    print('Classification Report')
    print(classification_report(y_true, prediction))
    print("\n")
    print("Other Metrics")
    print(f'Pression Score: {precision_score(y_true, prediction)}')
    print(f'Accuracy Score: {accuracy_score(y_true, prediction)}')
    print(f'Recall Score: {recall_score(y_true, prediction)}')
    print(f'f1 Score {f1_score(y_true, prediction)}')

In [14]:
X = data.contents
y = data.labels

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)

(16750,)
(8250,)


#### Model number 1

In [18]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

For some reason if I add the amount of ngram_range to for example 1,3 then I have a problem; I am going to test different stuff afterwards. 

In [19]:
mnb = MultinomialNB()

def check_model(clf, X_test, y_test):
    prediction = clf.predict(X_test)
    print_report(y_test, prediction)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3612  525]
 [ 674 3439]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.85      8250
   macro avg       0.86      0.85      0.85      8250
weighted avg       0.86      0.85      0.85      8250



Other Metrics
Pression Score: 0.8675580221997982
Accuracy Score: 0.8546666666666667
Recall Score: 0.836129345976173
f1 Score 0.8515537947257644
_________________________________________________


#### Model number 2

In [20]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [21]:
mnb = MultinomialNB()

def check_model(clf, X_test, y_test):
    prediction = clf.predict(X_test)
    print_report(y_test, prediction)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3663  474]
 [ 545 3568]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4137
           1       0.88      0.87      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.8827313211281543
Accuracy Score: 0.8764848484848485
Recall Score: 0.8674933138828106
f1 Score 0.8750459840588596
_________________________________________________
