In [1]:
## Python standard-library modules:
import os
import re
from collections import Counter
import time
import sys

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Models (sklearn)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## My own functions
sys.path.append("/Users/sebastianvier/Documents/code/movie_reviews")
from functions import print_report, check_model, vectorize_X
sys.path.append("/Users/sebastianvier/Documents/code/movie_reviews/SVM_models")

In [2]:
## Importing the data
# Reads every review file under <path>/neg and <path>/pos into memory, builds a
# two-column DataFrame (text + binary label), shuffles it with a fixed seed and
# exposes the text as X and the labels as y.

path = '../data/train/'

count = 0
labels = []
contents = []

for label in ['neg', 'pos']:
    label_dir = os.path.join(path, label)
    # os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
    # fixes the row order so the seeded shuffle below (and the later seeded split)
    # selects the same rows on every machine.
    for filename in sorted(os.listdir(label_dir)):
        count += 1
        # Explicit encoding so decoding does not depend on the platform default.
        # NOTE(review): assumes the review files are UTF-8 — confirm against the dataset.
        with open(os.path.join(label_dir, filename), 'r', encoding='utf-8') as f:
            contents.append(f.read())
        labels.append(1 if label == 'pos' else 0)  # 1 is positive, 0 is negative
print(count)  # total number of files read

data = pd.DataFrame({
    'contents': contents,
    'labels': labels,
})

# Shuffle all rows with a fixed seed and renumber the index from 0.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.contents
y = data.labels

25000


In [3]:
# Hold out 33% of the shuffled reviews for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [16]:
# Unigram TF-IDF features; min_df=2 drops terms seen in fewer than two training reviews.
cv = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=2,
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)

In [17]:
# Report how many feature columns the vectorizer produced.
# NOTE(review): the recorded output for this cell (80486) equals the output of the
# later (1,3)-gram, min_df=10 cell, and the execution counts are out of order — it
# looks stale; re-run on a fresh kernel to confirm.
n_features = X_train_vectorized.shape[1]
print(n_features)

80486


In [15]:
# Perform classification with SVM, kernel=linear
# Fits a linear-kernel SVC on the first n_train rows of the vectorized training
# split, predicts the whole test split, and prints timings plus the project's
# print_report output.

n_train = 15000  # number of training rows used for this run

t0 = time.time()
clf = SVC(kernel='linear', verbose=True)
# .iloc makes the label slice explicitly positional, matching the row slice of
# the feature matrix regardless of the shuffled index y_train carries.
clf.fit(X_train_vectorized[:n_train], y_train.iloc[:n_train])
t1 = time.time()
print(t1-t0)  # seconds spent fitting
prediction = clf.predict(X_test_vectorized)
t2 = time.time()
print(t2-t0)  # cumulative seconds since t0 (fit + predict)
print_report(y_test,prediction)

[LibSVM]114.98307919502258
162.10718512535095
Confusion Matrix
[[3602  535]
 [ 417 3696]]


Classification Report
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      4137
           1       0.87      0.90      0.89      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics:
Pression Score: 0.8735523516899079
Accuracy Score: 0.8846060606060606
Recall Score: 0.8986141502552881
f1 Score 0.8859060402684564


In [4]:
# TF-IDF over uni- to tri-grams; min_df / max_df trim rare and very common terms.
cv = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=10,
    max_df=.5,
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)

# Number of feature columns produced.
print(X_train_vectorized.shape[1])

80486


In [5]:
# Perform classification with SVM, kernel=linear
# Fits a linear-kernel SVC on the first n_train rows of the vectorized training
# split, predicts the whole test split, and prints timings plus the project's
# print_report output.

n_train = 2000  # number of training rows used for this run

t0 = time.time()
clf = SVC(kernel='linear', verbose=True)
# .iloc makes the label slice explicitly positional, matching the row slice of
# the feature matrix regardless of the shuffled index y_train carries.
clf.fit(X_train_vectorized[:n_train], y_train.iloc[:n_train])
t1 = time.time()
print(t1-t0)  # seconds spent fitting
prediction = clf.predict(X_test_vectorized)
t2 = time.time()
print(t2-t0)  # cumulative seconds since t0 (fit + predict)
print_report(y_test,prediction)

[LibSVM]4.8187761306762695
23.178221225738525
Confusion Matrix
[[3392  745]
 [ 419 3694]]


Classification Report
              precision    recall  f1-score   support

           0       0.89      0.82      0.85      4137
           1       0.83      0.90      0.86      4113

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250



Other Metrics:
Pression Score: 0.8321694075242172
Accuracy Score: 0.858909090909091
Recall Score: 0.8981278871869681
f1 Score 0.8638914873713751


In [6]:
# Perform classification with SVM, kernel=linear
# Fits a linear-kernel SVC on the first n_train rows of the vectorized training
# split, predicts the whole test split, and prints timings plus the project's
# print_report output.

n_train = 4000  # number of training rows used for this run

t0 = time.time()
clf = SVC(kernel='linear', verbose=True)
# .iloc makes the label slice explicitly positional, matching the row slice of
# the feature matrix regardless of the shuffled index y_train carries.
clf.fit(X_train_vectorized[:n_train], y_train.iloc[:n_train])
t1 = time.time()
print(t1-t0)  # seconds spent fitting
prediction = clf.predict(X_test_vectorized)
t2 = time.time()
print(t2-t0)  # cumulative seconds since t0 (fit + predict)
print_report(y_test,prediction)

[LibSVM]18.393959760665894
51.24034595489502
Confusion Matrix
[[3514  623]
 [ 392 3721]]


Classification Report
              precision    recall  f1-score   support

           0       0.90      0.85      0.87      4137
           1       0.86      0.90      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics:
Pression Score: 0.8565837937384899
Accuracy Score: 0.876969696969697
Recall Score: 0.9046924386092876
f1 Score 0.8799810807614994


In [7]:
# Perform classification with SVM, kernel=linear
# Fits a linear-kernel SVC on the first n_train rows of the vectorized training
# split, predicts the whole test split, and prints timings plus the project's
# print_report output.

n_train = 6000  # number of training rows used for this run

t0 = time.time()
clf = SVC(kernel='linear', verbose=True)
# .iloc makes the label slice explicitly positional, matching the row slice of
# the feature matrix regardless of the shuffled index y_train carries.
clf.fit(X_train_vectorized[:n_train], y_train.iloc[:n_train])
t1 = time.time()
print(t1-t0)  # seconds spent fitting
prediction = clf.predict(X_test_vectorized)
t2 = time.time()
print(t2-t0)  # cumulative seconds since t0 (fit + predict)
print_report(y_test,prediction)

[LibSVM]38.80088686943054
83.2681930065155
Confusion Matrix
[[3563  574]
 [ 367 3746]]


Classification Report
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      4137
           1       0.87      0.91      0.89      4113

    accuracy                           0.89      8250
   macro avg       0.89      0.89      0.89      8250
weighted avg       0.89      0.89      0.89      8250



Other Metrics:
Pression Score: 0.8671296296296296
Accuracy Score: 0.8859393939393939
Recall Score: 0.9107707269632871
f1 Score 0.8884145618403889


In [10]:
# TF-IDF over uni- to tri-grams; min_df / max_df trim rare and very common terms.
cv = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=25,
    max_df=.4,
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)

# Number of feature columns produced.
print(X_train_vectorized.shape[1])

30230


In [13]:
# Perform classification with SVM, kernel=linear
# Fits a linear-kernel SVC on the first n_train rows of the vectorized training
# split, predicts the whole test split, and prints timings plus the project's
# print_report output.

n_train = 15000  # number of training rows used for this run

t0 = time.time()
clf = SVC(kernel='linear', verbose=True)
# .iloc makes the label slice explicitly positional, matching the row slice of
# the feature matrix regardless of the shuffled index y_train carries.
clf.fit(X_train_vectorized[:n_train], y_train.iloc[:n_train])
t1 = time.time()
print(t1-t0)  # seconds spent fitting
prediction = clf.predict(X_test_vectorized)
t2 = time.time()
print(t2-t0)  # cumulative seconds since t0 (fit + predict)
print_report(y_test,prediction)

[LibSVM]237.36030101776123
306.2224600315094
Confusion Matrix
[[3644  493]
 [ 401 3712]]


Classification Report
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4137
           1       0.88      0.90      0.89      4113

    accuracy                           0.89      8250
   macro avg       0.89      0.89      0.89      8250
weighted avg       0.89      0.89      0.89      8250



Other Metrics:
Pression Score: 0.8827586206896552
Accuracy Score: 0.8916363636363637
Recall Score: 0.9025042548018478
f1 Score 0.8925222409232988


In [15]:
# TF-IDF over uni- to tri-grams; min_df / max_df trim rare and very common terms.
cv = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=20,
    max_df=.4,
)

# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)

# Number of feature columns produced.
print(X_train_vectorized.shape[1])

38455


In [16]:
# Perform classification with SVM, kernel=linear
# Fits a linear-kernel SVC on the first n_train rows of the vectorized training
# split, predicts the whole test split, and prints timings plus the project's
# print_report output.

n_train = 17000  # number of training rows used for this run

t0 = time.time()
clf = SVC(kernel='linear', verbose=True)
# .iloc makes the label slice explicitly positional, matching the row slice of
# the feature matrix regardless of the shuffled index y_train carries.
clf.fit(X_train_vectorized[:n_train], y_train.iloc[:n_train])
t1 = time.time()
print(t1-t0)  # seconds spent fitting
prediction = clf.predict(X_test_vectorized)
t2 = time.time()
print(t2-t0)  # cumulative seconds since t0 (fit + predict)
print_report(y_test,prediction)

[LibSVM]363.1141290664673
438.5624990463257
Confusion Matrix
[[3687  450]
 [ 395 3718]]


Classification Report
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      4137
           1       0.89      0.90      0.90      4113

    accuracy                           0.90      8250
   macro avg       0.90      0.90      0.90      8250
weighted avg       0.90      0.90      0.90      8250



Other Metrics:
Pression Score: 0.8920345489443378
Accuracy Score: 0.8975757575757576
Recall Score: 0.9039630440068077
f1 Score 0.8979591836734694
