In [114]:
## Python in build modules:
import os
import re
from collections import Counter

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Models (sklearn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [None]:
def print_report(y_true, prediction):
    print('Confusion Matrix')
    print(confusion_matrix(y_true, prediction))
    print("\n")
    print('Classification Report')
    print(classification_report(y_true, prediction))
    print("\n")
    print("Other Metrics")
    print(f'Pression Score: {precision_score(y_true, prediction)}')
    print(f'Accuracy Score: {accuracy_score(y_true, prediction)}')
    print(f'Recall Score: {recall_score(y_true, prediction)}')
    print(f'f1 Score {f1_score(y_true, prediction)}')
    
def check_model(clf, X_test, y_test):
    prediction = clf.predict(X_test)
    print_report(y_test, prediction)

### 1) Exploratory Analysis

In [3]:
path = 'data/train/'

count = 0
labels = []
contents = []

for label in ['neg','pos']:
    filenames = os.listdir(path + label)
    for filename in filenames:
        count += 1
        with open(os.path.join(path, label, filename), 'r') as f:
            labels.append(1 if label == 'pos' else 0) # 1 is positve 0 is negative
            contents.append(f.read())
print(count)
            
data = pd.DataFrame({
    'contents' : contents,
    'labels': labels,

})

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

25000


#### Just to be clear negative is 0 and positive is 1

In [4]:
data.labels.value_counts()

0    12500
1    12500
Name: labels, dtype: int64

Ok, the data is **completely balanced**.

 ###### At this moment we are going to see 5 examples of each column

In [5]:
for label in data.labels.unique():
    print("negative" if label ==  0 else "positive")
    print("\n")
    for content in data.contents[data.labels == label].sample(5):
        print(content)
        print("\n")

negative


The plot was quite interesting, with the Russian revolution background. I also enjoyed seeing Budapest as the movie was partly filmed there. Sadly, there was zero chemistry between the 2 main characters so it was hard to believe the love story between them. The love scenes were forced and mechanical. Jordan kirckland was really stiff, almost icy. Rob Stewart was quite charming and boyish, so she looked like his older sister rather than his girlfriend. The ending, when we finally figure out who was actually after them, was quite weak and made no sense whatsoever. I think I would have enjoyed this movie a lot better if the relationship between the 2 actors was slightly more credible.


Well our standards have gone into the toilet. The direction was poor, the acting was mediocre and the writing was amateurish. And those are the good points. Hopefully there won't be a sequel. Otherwise, I might have to leave the country.


This film is a travesty, and isn't fit to keep company w

###### From now on I am going to be using the training data to explore the data so I can get some conclusions.

In [14]:
X = data.contents
y = data.labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [102]:
train_df.labels.value_counts() # Just checking the proportions haven't change much

1    8387
0    8363
Name: labels, dtype: int64

###### Mean of length

In [111]:
mean_length_full = np.round(X_train.map(len).mean(),2)
train_df = pd.concat([X_train, y_train], axis=1)
mean_lenght_pos = train_df[train_df.labels == 1].contents.map(len).mean()
mean_length_neg = train_df[train_df.labels == 0].contents.map(len).mean()
ratio_pos =  mean_lenght_pos / mean_length_full
ratio_neg =  mean_length_neg / mean_length_full

print(f'Mean length of all reviews: {mean_length_full}')
print(f'Mean length in positive reviews: {mean_lenght_pos}')
print(f'Mean length in negative reviews: {mean_length_neg}')
print(f'Ratio positive: {ratio_pos}')
print(f'Ration negative: {ratio_neg}')

Mean length of all reviews: 1328.28
Mean length in positive reviews: 1352.3945391677596
Mean length in negative reviews: 1304.1033122085375
Ratio positive: 1.0181547107294844
Ration negative: 0.9817985004731966


We can run an **A/B test** here, but there is an indication that the **length** of the commentary has **nothing to do** with the idea if it is positive or negative.

###### Most common word for each label

In [227]:
cv = CountVectorizer(stop_words={'english'})
list_of_words = cv.fit_transform(X_train).toarray()
sum_of_words = pd.Series(list_of_words.sum(axis=0))
sum_of_words = sum_of_words.sort_values(ascending=False)[:20]

words = []
numbers = []

for num in sum_of_words.index:
    word = (list(cv.vocabulary_.keys())[list(cv.vocabulary_.values()).index(num)])
    numbers.append(sum_of_words.loc[num])
    words.append(word)
    
pd.DataFrame({
    "words" : words,
    "counts" : numbers,
})

Unnamed: 0,word,number
0,the,226374
1,and,110239
2,of,97692
3,to,91327
4,is,72318
5,br,68495
6,it,65011
7,in,62845
8,this,50974
9,that,49116


### 2) Running the first models

#### Model number 1

In [16]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

For some reason if I add the amount of ngram_range to for example 1,3 then I have a problem; I am going to test different stuff afterwards. 

In [17]:
mnb = MultinomialNB()

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3612  525]
 [ 674 3439]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.85      8250
   macro avg       0.86      0.85      0.85      8250
weighted avg       0.86      0.85      0.85      8250



Other Metrics
Pression Score: 0.8675580221997982
Accuracy Score: 0.8546666666666667
Recall Score: 0.836129345976173
f1 Score 0.8515537947257644
_________________________________________________


#### Model number 2

In [20]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [21]:
mnb = MultinomialNB()

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3663  474]
 [ 545 3568]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4137
           1       0.88      0.87      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.8827313211281543
Accuracy Score: 0.8764848484848485
Recall Score: 0.8674933138828106
f1 Score 0.8750459840588596
_________________________________________________


### Checking different count vectorizers

In [31]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
X_train_vectorized = count_vectorizer.fit_transform(X_train)

In [32]:
first_result = X_train_vectorized.shape[1]

In [36]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [37]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3682  455]
 [ 573 3540]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4137
           1       0.89      0.86      0.87      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.886107634543179
Accuracy Score: 0.8753939393939394
Recall Score: 0.8606856309263311
f1 Score 0.8732116428219042
_________________________________________________


In [50]:
for i in range(4,11,1):
    count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False, max_df=i/10)
    X_train_vectorized = count_vectorizer.fit_transform(X_train)
    print(f'This is the shape for {i/10}')
    print(X_train_vectorized.shape)

This is the shape for 0.4
(16750, 3220267)
This is the shape for 0.5
(16750, 3220271)
This is the shape for 0.6
(16750, 3220274)
This is the shape for 0.7
(16750, 3220275)
This is the shape for 0.8
(16750, 3220276)
This is the shape for 0.9
(16750, 3220276)
This is the shape for 1.0
(16750, 3220276)


In [53]:
for i in [2,5,12,20,30]:
    count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False, min_df=i)
    X_train_vectorized = count_vectorizer.fit_transform(X_train)
    print(f'This is the shape for {i}')
    print(X_train_vectorized.shape)

This is the shape for 2
(16750, 279654)
This is the shape for 5
(16750, 67094)
This is the shape for 12
(16750, 25427)
This is the shape for 20
(16750, 15013)
This is the shape for 30
(16750, 9999)


In [54]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False, min_df=2)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [55]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3658  479]
 [ 543 3570]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.88      0.88      4137
           1       0.88      0.87      0.87      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.8816991849839466
Accuracy Score: 0.8761212121212121
Recall Score: 0.8679795769511306
f1 Score 0.874785591766724
_________________________________________________


In [56]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False, min_df=10)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [57]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3582  555]
 [ 576 3537]]


Classification Report
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      4137
           1       0.86      0.86      0.86      4113

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250



Other Metrics
Pression Score: 0.8643695014662757
Accuracy Score: 0.862909090909091
Recall Score: 0.8599562363238512
f1 Score 0.8621572212065813
_________________________________________________


In [58]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False, max_df=0.4)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [59]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3661  476]
 [ 528 3585]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.88      0.88      4137
           1       0.88      0.87      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.8827874907658212
Accuracy Score: 0.8783030303030303
Recall Score: 0.8716265499635303
f1 Score 0.8771715194519206
_________________________________________________


In [66]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=False, max_df=0.4, min_df=2)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [67]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3648  489]
 [ 524 3589]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.88      0.88      4137
           1       0.88      0.87      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.8800882785679255
Accuracy Score: 0.8772121212121212
Recall Score: 0.8725990761001702
f1 Score 0.8763276767183494
_________________________________________________


In [76]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=True, max_df=0.3)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [77]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3642  495]
 [ 506 3607]]


Classification Report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4137
           1       0.88      0.88      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics
Pression Score: 0.879327157484154
Accuracy Score: 0.8786666666666667
Recall Score: 0.8769754437150499
f1 Score 0.8781497261107729
_________________________________________________


In [72]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=True, max_features=12000)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [73]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[3514  623]
 [ 595 3518]]


Classification Report
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      4137
           1       0.85      0.86      0.85      4113

    accuracy                           0.85      8250
   macro avg       0.85      0.85      0.85      8250
weighted avg       0.85      0.85      0.85      8250



Other Metrics
Pression Score: 0.8495532480077276
Accuracy Score: 0.8523636363636363
Recall Score: 0.8553367371748116
f1 Score 0.8524351829416041
_________________________________________________


In [78]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3), lowercase=True, max_features=3)
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized  = count_vectorizer.transform(X_test)

In [79]:
mnb = MultinomialNB(alpha=1)

for clf, clf_name in [(mnb, 'Multinomial Naive Bayes')]:
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

Multinomial Naive Bayes
Confusion Matrix
[[1964 2173]
 [1622 2491]]


Classification Report
              precision    recall  f1-score   support

           0       0.55      0.47      0.51      4137
           1       0.53      0.61      0.57      4113

    accuracy                           0.54      8250
   macro avg       0.54      0.54      0.54      8250
weighted avg       0.54      0.54      0.54      8250



Other Metrics
Pression Score: 0.5340909090909091
Accuracy Score: 0.54
Recall Score: 0.6056406515925116
f1 Score 0.5676199156887319
_________________________________________________
