In [2]:
## Python built-in modules:
import os
import re
import csv
import sys

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Models (sklearn)
from sklearn.model_selection import train_test_split

## Other libraries
import joblib


## My own functions
sys.path.append("/Users/sebastianvier/Documents/code/movie_reviews")
from functions import print_report, check_model, vectorize_X
sys.path.append("/Users/sebastianvier/Documents/code/movie_reviews/SVM_models")

In [2]:
# Read the raw review file into memory, one list entry per line
# (readlines() keeps the trailing newline on each entry).
with open("data/amazon_review.txt", "r") as review_file:
    amazon_lines = review_file.readlines()

In [3]:
def spliter(x):
    """Split ``x`` at its first ":" and return the resulting list.

    The list has two items (the text before and after the colon) when a
    colon is present, and a single item otherwise.
    """
    return x.split(":", 1)


# The first 10 characters of every line carry the class marker; map it to
# the integer target stored in the ``labels`` column.
label_by_prefix = {"__label__1": 0, "__label__2": 1}

title = []
comment = []
label = []
n_skipped = 0
for line in amazon_lines:
    target = label_by_prefix.get(line[:10])
    # Offset 11 skips the 10-character marker plus the one separator
    # character that follows it.
    full_info = spliter(line[11:])
    if target is None or len(full_info) < 2:
        # Appending only part of a record would leave the three lists with
        # different lengths (and break the DataFrame below), so a malformed
        # line is dropped as a whole.
        n_skipped += 1
        continue
    title.append(full_info[0])
    comment.append(full_info[1])
    label.append(target)

# Only report when something was actually dropped, so clean input produces
# no extra output.
if n_skipped:
    print("Skipped {} malformed line(s)".format(n_skipped))

amazon_rev = pd.DataFrame({
    "title"   : title,
    "contents" : comment,
    "labels"   : label,
})

In [4]:
# Directory holding the persisted artifacts. Further down, each clf_* object
# is used through .predict() and each cv_* object through .transform().
# NOTE(review): the spelling "exctracted_data/" is kept verbatim because it
# has to match the directory name on disk.
path = "exctracted_data/"

# NOTE(review): joblib.load unpickles the file, which can run arbitrary code;
# only load artifacts from a trusted source.
clf_mnb_1, cv_mnb_1 = joblib.load(path + "clf_mnb_1.sav"), joblib.load(path + "cv_mnb_1.sav")
clf_svm_1, cv_svm_1 = joblib.load(path + "clf_svm_1.sav"), joblib.load(path + "cv_svm_1.sav")
clf_svm_2, cv_svm_2 = joblib.load(path + "clf_svm_2.sav"), joblib.load(path + "cv_svm_2.sav")

In [6]:
# First evaluation batch: the first 8000 rows of the review frame.
amz_8k1 = amazon_rev.iloc[:8000]
amz_8k1_X, amz_8k1_y = amz_8k1["contents"], amz_8k1["labels"]

In [12]:
# Peek at the review text stored under index label 10.
amz_8k1_X.loc[10]

" A complete waste of time. Typographical errors, poor grammar, and a totally pathetic plot add up to absolutely nothing. I'm embarrassed for this author and very disappointed I actually paid for this book.\n"

In [10]:
def calculate_scores(X,y,cv,clf):
    """Score ``clf`` on ``X`` against the true labels ``y``.

    ``X`` is passed through ``cv.transform``, the result is fed to
    ``clf.predict``, and the predictions are handed to ``print_report``
    together with ``y``. Nothing is returned; the report is whatever
    ``print_report`` emits.
    """
    # Vectorize and predict in one step, then delegate the reporting.
    predicted = clf.predict(cv.transform(X))
    print_report(y, predicted)

In [11]:
## First try, model 1: clf_mnb_1 with its cv_mnb_1 vectorizer on the current amz_8k1 batch
calculate_scores(X=amz_8k1_X, y=amz_8k1_y, cv=cv_mnb_1, clf=clf_mnb_1)

Confusion Matrix
[[3488  609]
 [ 789 3114]]


Classification Report
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      4097
           1       0.84      0.80      0.82      3903

    accuracy                           0.83      8000
   macro avg       0.83      0.82      0.82      8000
weighted avg       0.83      0.83      0.83      8000



Other Metrics:
Pression Score: 0.8364222401289283
Accuracy Score: 0.82525
Recall Score: 0.797847809377402
f1 Score 0.8166797797010228


In [14]:
## First try, model 2: clf_svm_1 with its cv_svm_1 vectorizer on the current amz_8k1 batch
calculate_scores(X=amz_8k1_X, y=amz_8k1_y, cv=cv_svm_1, clf=clf_svm_1)

Confusion Matrix
[[3138  959]
 [ 773 3130]]


Classification Report
              precision    recall  f1-score   support

           0       0.80      0.77      0.78      4097
           1       0.77      0.80      0.78      3903

    accuracy                           0.78      8000
   macro avg       0.78      0.78      0.78      8000
weighted avg       0.78      0.78      0.78      8000



Other Metrics:
Pression Score: 0.7654683296649547
Accuracy Score: 0.7835
Recall Score: 0.8019472200871125
f1 Score 0.7832832832832833


In [15]:
## First try, model 3: clf_svm_2 with its cv_svm_2 vectorizer on the current amz_8k1 batch
calculate_scores(X=amz_8k1_X, y=amz_8k1_y, cv=cv_svm_2, clf=clf_svm_2)

Confusion Matrix
[[3220  877]
 [ 545 3358]]


Classification Report
              precision    recall  f1-score   support

           0       0.86      0.79      0.82      4097
           1       0.79      0.86      0.83      3903

    accuracy                           0.82      8000
   macro avg       0.82      0.82      0.82      8000
weighted avg       0.82      0.82      0.82      8000



Other Metrics:
Pression Score: 0.7929161747343566
Accuracy Score: 0.82225
Recall Score: 0.8603638227004868
f1 Score 0.8252641926763332


In [17]:
# Second evaluation batch: rows 8000 up to (not including) 16000.
# The amz_8k1* names are reused here, so from this cell on they no longer
# refer to the first batch.
amz_8k1 = amazon_rev.iloc[8000:16000]
amz_8k1_X, amz_8k1_y = amz_8k1["contents"], amz_8k1["labels"]

In [19]:
## Second try, model 1: clf_mnb_1 with its cv_mnb_1 vectorizer on the current amz_8k1 batch
calculate_scores(X=amz_8k1_X, y=amz_8k1_y, cv=cv_mnb_1, clf=clf_mnb_1)

Confusion Matrix
[[3238  647]
 [ 846 3269]]


Classification Report
              precision    recall  f1-score   support

           0       0.79      0.83      0.81      3885
           1       0.83      0.79      0.81      4115

    accuracy                           0.81      8000
   macro avg       0.81      0.81      0.81      8000
weighted avg       0.81      0.81      0.81      8000



Other Metrics:
Pression Score: 0.8347803881511746
Accuracy Score: 0.813375
Recall Score: 0.7944106925880924
f1 Score 0.8140953804009464


In [20]:
## Second try, model 2: clf_svm_1 with its cv_svm_1 vectorizer on the current amz_8k1 batch
calculate_scores(X=amz_8k1_X, y=amz_8k1_y, cv=cv_svm_1, clf=clf_svm_1)

Confusion Matrix
[[2889  996]
 [ 744 3371]]


Classification Report
              precision    recall  f1-score   support

           0       0.80      0.74      0.77      3885
           1       0.77      0.82      0.79      4115

    accuracy                           0.78      8000
   macro avg       0.78      0.78      0.78      8000
weighted avg       0.78      0.78      0.78      8000



Other Metrics:
Pression Score: 0.7719258071902908
Accuracy Score: 0.7825
Recall Score: 0.8191980558930742
f1 Score 0.7948597029002595


In [18]:
## Second try, model 3: clf_svm_2 with its cv_svm_2 vectorizer on the current amz_8k1 batch
calculate_scores(X=amz_8k1_X, y=amz_8k1_y, cv=cv_svm_2, clf=clf_svm_2)

Confusion Matrix
[[2961  924]
 [ 606 3509]]


Classification Report
              precision    recall  f1-score   support

           0       0.83      0.76      0.79      3885
           1       0.79      0.85      0.82      4115

    accuracy                           0.81      8000
   macro avg       0.81      0.81      0.81      8000
weighted avg       0.81      0.81      0.81      8000



Other Metrics:
Pression Score: 0.7915632754342432
Accuracy Score: 0.80875
Recall Score: 0.85273390036452
f1 Score 0.8210107627515209


In [29]:
def pseudo_folds(X, y, cv, clf, n_samples=20000, fold_size=4000):
    """Print the mean and standard deviation of F1 over consecutive chunks.

    The first ``n_samples`` items of ``X``/``y`` are cut into consecutive
    chunks of ``fold_size`` items. Each chunk is transformed with ``cv``,
    predicted with ``clf`` and scored with ``f1_score``. The model is only
    applied, never refitted, so these are not cross-validation folds.

    Parameters
    ----------
    X, y : sliceable sequences of equal length (samples and true labels).
    cv : object with a ``transform`` method applied to each chunk of ``X``.
    clf : object with a ``predict`` method applied to the transformed chunk.
    n_samples : int, default 20000
        Upper bound on how many leading samples are used. It is clamped to
        ``len(X)`` so that shorter inputs never produce empty chunks.
    fold_size : int, default 4000
        Number of samples per chunk; the last chunk may be shorter.

    Returns
    -------
    None. The mean and the standard deviation of the per-chunk F1 scores
    are printed, one per line.

    Raises
    ------
    ValueError
        If ``fold_size`` is not positive or there is nothing to score.
    """
    if fold_size <= 0:
        raise ValueError("fold_size must be a positive integer")

    # Never walk past the data that is actually available: an empty chunk
    # cannot be scored.
    stop = min(n_samples, len(X))

    scores = []
    for start in range(0, stop, fold_size):
        end = min(start + fold_size, stop)
        X_split = X[start:end]
        y_split = y[start:end]

        ## Vectorize
        X_vectorized = cv.transform(X_split)

        ## Predict
        prediction = clf.predict(X_vectorized)

        ## Get scores
        scores.append(f1_score(y_split, prediction))

    if not scores:
        raise ValueError("no samples to score")

    scores = np.array(scores)
    print(scores.mean())
    print(scores.std())

In [25]:
# Full review texts and their labels, used by the pseudo-fold runs.
X, y = amazon_rev["contents"], amazon_rev["labels"]

In [30]:
# Pseudo folds, model 1: clf_mnb_1 with its cv_mnb_1 vectorizer
pseudo_folds(X=X, y=y, cv=cv_mnb_1, clf=clf_mnb_1)

0.8175788579191037
0.013124354411699552


In [31]:
# Pseudo folds, model 2: clf_svm_1 with its cv_svm_1 vectorizer
pseudo_folds(X=X, y=y, cv=cv_svm_1, clf=clf_svm_1)

0.793489635655307
0.01498045688262852


In [32]:
# Pseudo folds, model 3: clf_svm_2 with its cv_svm_2 vectorizer
pseudo_folds(X=X, y=y, cv=cv_svm_2, clf=clf_svm_2)

0.8279218329538967
0.015305042756373984
