In [None]:
## Python Libraries
import os
import re
from collections import Counter
import time
import sys

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Spacy modules and libraries
import spacy 

## Sklearn modules
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score

## nltk
from nltk.stem import WordNetLemmatizer

## scipy
from scipy.sparse import hstack


## My own functions
sys.path.append("/Users/sebastianvier/Documents/code/movie_reviews")
from functions import print_report, check_model, vectorize_X
sys.path.append("/Users/sebastianvier/Documents/code/movie_reviews/SVM_models")


def lematizer(x):
    return " ".join([token.lemma_ for token in model(x)])

In [2]:
## Importing the data

path = 'data/train/'

count = 0
labels = []
contents = []

for label in ['neg','pos']:
    filenames = os.listdir(path + label)
    for filename in filenames:
        count += 1
        with open(os.path.join(path, label, filename), 'r') as f:
            labels.append(1 if label == 'pos' else 0) # 1 is positve 0 is negative
            contents.append(f.read())
print(count)
            
data = pd.DataFrame({
    'contents' : contents,
    'labels': labels,

})

data = data.sample(frac=1, random_state=42).reset_index(drop=True) # This code will shuffle the data (just in case!)

25000


In [3]:
X = data.contents
y = data.labels

In [8]:
type(X)

pandas.core.series.Series

In [None]:
## Spacy lematizer

In [6]:
model = spacy.load("en_core_web_sm", exclude=['ner', 'parser', 'tok2vec'])

In [22]:
X_lematized_1 = X[:100].map(lematizer)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Comparing models

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 63,386 
The amount of time it took to vectorize was: 2.823738999999989

Multinomial Naive Bayes
Confusion Matrix
[[3612  525]
 [ 674 3439]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.85      8250
   macro avg       0.86      0.85      0.85      8250
weighted avg       0.86      0.85      0.85      8250



Other Metrics:
Pression Score: 0.8675580221997982
Accuracy Score: 0.8546666666666667
Recall Score: 0.836129345976173
f1 Score 0.8515537947257644
_________________________________________________


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_lematized, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 63,260 
The amount of time it took to vectorize was: 2.7196720000000028

Multinomial Naive Bayes
Confusion Matrix
[[3620  517]
 [ 668 3445]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250



Other Metrics:
Pression Score: 0.8695103483089349
Accuracy Score: 0.8563636363636363
Recall Score: 0.837588135181133
f1 Score 0.8532507739938081
_________________________________________________


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,3))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 2,991,762 
The amount of time it took to vectorize was: 13.421976

Multinomial Naive Bayes
Confusion Matrix
[[3663  474]
 [ 545 3568]]


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4137
           1       0.88      0.87      0.88      4113

    accuracy                           0.88      8250
   macro avg       0.88      0.88      0.88      8250
weighted avg       0.88      0.88      0.88      8250



Other Metrics:
Pression Score: 0.8827313211281543
Accuracy Score: 0.8764848484848485
Recall Score: 0.8674933138828106
f1 Score 0.8750459840588596
_________________________________________________


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_lematized, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 63,260 
The amount of time it took to vectorize was: 2.8266649999999913

Multinomial Naive Bayes
Confusion Matrix
[[3620  517]
 [ 668 3445]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250



Other Metrics:
Pression Score: 0.8695103483089349
Accuracy Score: 0.8563636363636363
Recall Score: 0.837588135181133
f1 Score 0.8532507739938081
_________________________________________________


In [16]:
model = spacy.load("en_core_web_md", exclude=['ner', 'parser', 'tok2vec'])

In [30]:
X_lematized_2 = X[:100].map(lematizer)

In [31]:
sum(X_lematized_1 == X_lematized_2

100

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_lematized, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 63,260 
The amount of time it took to vectorize was: 2.745069000000001

Multinomial Naive Bayes
Confusion Matrix
[[3620  517]
 [ 668 3445]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250



Other Metrics:
Pression Score: 0.8695103483089349
Accuracy Score: 0.8563636363636363
Recall Score: 0.837588135181133
f1 Score 0.8532507739938081
_________________________________________________


In [41]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,1))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_lematized, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   3.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s


[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
84.35
0.643


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.8s finished


In [42]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,2))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_lematized, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   9.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.3s remaining:    0.0s


[CV] END .................................................... total time=   9.3s
[CV] END .................................................... total time=   9.3s
[CV] END .................................................... total time=   9.2s
[CV] END .................................................... total time=   9.4s
87.63
0.423


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   46.8s finished


In [43]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,3))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_lematized, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  23.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.5s remaining:    0.0s


[CV] END .................................................... total time=  24.0s
[CV] END .................................................... total time=  23.7s
[CV] END .................................................... total time=  24.4s
[CV] END .................................................... total time=  23.8s
88.65
0.36


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished


In [44]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,3), min_df=2, max_df=.5)),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_lematized, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  15.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.9s remaining:    0.0s


[CV] END .................................................... total time=  15.7s
[CV] END .................................................... total time=  15.5s
[CV] END .................................................... total time=  15.9s
[CV] END .................................................... total time=  15.3s
88.72
0.264


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [45]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,3), min_df=3, max_df=.5)),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_lematized, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  15.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.0s remaining:    0.0s


[CV] END .................................................... total time=  14.6s
[CV] END .................................................... total time=  14.8s
[CV] END .................................................... total time=  14.7s
[CV] END .................................................... total time=  15.6s
88.41
0.331


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [55]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sebastianvier/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [49]:
lemmatizer = WordNetLemmatizer()

In [100]:
def w_lemmatizer(x):
    return lematizer(x)

In [101]:
X_lematized = X.map(w_lemmatizer)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X_lematized, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 63,260 
The amount of time it took to vectorize was: 2.7692349999999806

Multinomial Naive Bayes
Confusion Matrix
[[3620  517]
 [ 668 3445]]


Classification Report
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4137
           1       0.87      0.84      0.85      4113

    accuracy                           0.86      8250
   macro avg       0.86      0.86      0.86      8250
weighted avg       0.86      0.86      0.86      8250



Other Metrics:
Pression Score: 0.8695103483089349
Accuracy Score: 0.8563636363636363
Recall Score: 0.837588135181133
f1 Score 0.8532507739938081
_________________________________________________


In [102]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,1))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_lematized, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   3.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s


[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
84.35
0.643


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.7s finished


In [150]:
X_train, X_test, y_train, y_test = train_test_split(X_pos, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

The amount of features in the Vectorized X train is: 16 
The amount of time it took to vectorize was: 3.5451789999997345

Multinomial Naive Bayes
Confusion Matrix
[[2791 1346]
 [1991 2122]]


Classification Report
              precision    recall  f1-score   support

           0       0.58      0.67      0.63      4137
           1       0.61      0.52      0.56      4113

    accuracy                           0.60      8250
   macro avg       0.60      0.60      0.59      8250
weighted avg       0.60      0.60      0.59      8250



Other Metrics:
Pression Score: 0.6118800461361015
Accuracy Score: 0.5955151515151516
Recall Score: 0.5159251154874788
f1 Score 0.5598206041419338
_________________________________________________


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_pos, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = TfidfVectorizer(ngram_range=(4,7))

for vec ,clf, clf_name in [(cv , mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('The amount of features in the Vectorized X train is: {:,} '.format(X_train_vectorized.shape[1]))
    print(f'The amount of time it took to vectorize was: {time.process_time() - start}\n')
    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

NameError: name 'X_pos' is not defined

In [28]:
def get_pos(x):
    return " ".join([token.pos_ for token in model(x)])

model = spacy.load("en_core_web_sm")

In [42]:
from tqdm import tqdm

my_values = tqdm(X)
X_pos = list(zip(map(get_pos, my_values)))

100%|██████████| 25000/25000 [14:07<00:00, 29.48it/s]


In [None]:
X_pos.to_csv("X_pos.csv", index=False)

In [45]:
X_pos_1 = list(map(lambda x: x[0],X_pos))

In [None]:
X_train_pos, X_test_pos, y_train, y_test = train_test_split(X_pos_1, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

mnb = MultinomialNB()
cv = CountVectorizer(stop_words='english', ngram_range=(1,3), min_df=2, max_df=.5)
cv_pos= CountVectorizer(ngram_range=(5,11))

for vec, vec_pos , clf, clf_name in [(cv ,cv_pos, mnb, 'Multinomial Naive Bayes')]:
    
    # Vectorization
    start = time.process_time()
    X_train_vectorized, X_test_vectorized = vectorize_X(vec, X_train, X_test)
    print('vec_1 complete')
    X_train_vectorized_pos, X_test_vectorized_pos = vectorize_X(vec_pos, X_train_pos, X_test_pos)
    X_train_vectorized = hstack((X_train_vectorized, X_train_vectorized_pos))
    X_test_vectorized  = hstack((X_test_vectorized, X_test_vectorized_pos))
    
    print('vectorization complete')

    
    # Running the model
    print(clf_name)
    clf.fit(X_train_vectorized, y_train)
    check_model(clf, X_test_vectorized, y_test)
    print("_________________________________________________")

vec_1 complete
