In [5]:
## Python in build modules:
import os
import re
from collections import Counter
import time

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Model management (sklearn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


## My own functions
from functions import print_report, check_model, vectorize_X

In [2]:
## Importing the data

path = 'data/train/'

count = 0
labels = []
contents = []

for label in ['neg','pos']:
    filenames = os.listdir(path + label)
    for filename in filenames:
        count += 1
        with open(os.path.join(path, label, filename), 'r') as f:
            labels.append(1 if label == 'pos' else 0) # 1 is positve 0 is negative
            contents.append(f.read())
print(count)
            
data = pd.DataFrame({
    'contents' : contents,
    'labels': labels,

})

data = data.sample(frac=1, random_state=42).reset_index(drop=True) # This code will shuffle the data (just in case!)
X = data.contents
y = data.labels

25000


In [11]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,1))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
84.4
0.575


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished


In [12]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,2))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

scorer = make_scorer(f1_score)
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100, 2))
print(round(result.std()  *  100 ,3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   9.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s


[CV] END .................................................... total time=   9.1s
[CV] END .................................................... total time=   9.2s
[CV] END .................................................... total time=   9.5s
[CV] END .................................................... total time=   9.3s
87.66
0.461


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   46.4s finished


In [13]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,3))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

scorer = make_scorer(f1_score)
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100, 2))
print(round(result.std()  *  100 ,3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  21.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.4s remaining:    0.0s


[CV] END .................................................... total time=  21.4s
[CV] END .................................................... total time=  21.5s
[CV] END .................................................... total time=  21.6s
[CV] END .................................................... total time=  21.8s
88.71
0.387


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.8min finished


In [14]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,4))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

scorer = make_scorer(f1_score)
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100, 2))
print(round(result.std()  *  100 ,3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 1.0min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[CV] END .................................................... total time=  57.5s
[CV] END .................................................... total time=  45.5s
[CV] END .................................................... total time=  45.7s
[CV] END .................................................... total time=  44.9s
89.1
0.365


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.3min finished


In [16]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,4))),
        ("clf", MultinomialNB()),
    ]
)
cv = 3

scorer = make_scorer(f1_score)
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

print(round(result.mean() * 100, 2))
print(round(result.std()  *  100 ,3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  43.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   43.2s remaining:    0.0s


[CV] END .................................................... total time=  49.5s
[CV] END .................................................... total time=  44.6s
88.85
0.184


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min finished


In [20]:
means = []
stds = []

for i in range(2,6):
    pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,2), min_df=i)),
        ("clf", MultinomialNB()),
    ]
    )

    scorer = make_scorer(f1_score)
    result = cross_val_score(pipeline, X, y, cv=5, scoring=scorer, verbose=2)

    means.append(round(result.mean() * 100, 2))
    stds.append(round(result.std()  *  100 ,3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   8.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s


[CV] END .................................................... total time=   7.8s
[CV] END .................................................... total time=   7.8s
[CV] END .................................................... total time=   7.8s
[CV] END .................................................... total time=   7.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   39.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   7.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


[CV] END .................................................... total time=   7.4s
[CV] END .................................................... total time=   7.5s
[CV] END .................................................... total time=   7.4s
[CV] END .................................................... total time=   7.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   7.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.5s
[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   7.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.4s
[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   36.9s finished


In [None]:
pd.DataFrame({
    "mean" : means,
    "std " : stds
 }
)

In [22]:
means = []
stds = []

for i in range(2,6):
    pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,1), min_df=i)),
        ("clf", MultinomialNB()),
    ]
    )

    scorer = make_scorer(f1_score)
    result = cross_val_score(pipeline, X, y, cv=5, scoring=scorer, verbose=2)

    means.append(round(result.mean() * 100, 2))
    stds.append(round(result.std()  *  100 ,3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.2s finished


In [23]:
pd.DataFrame({
    "mean" : means,
    "std " : stds
 }
)

Unnamed: 0,mean,std
0,84.23,0.677
1,84.09,0.657
2,84.01,0.672
3,83.97,0.73


In [52]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,3))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

scorer = make_scorer(f1_score)
# start = time.process_time()
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, )
run_time = round(((time.process_time() - start) / cv), 3)

print(f'The amount of time it took to vectorize was: {run_time} seconds \n')
print(result.mean() * 100)
print(result.std()  * 100)

The amount of time it took to vectorize was: 129.9 seconds 

88.73328784345573
0.28208716867469424


In [53]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1,3), min_df=2, max_df=.4)),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

scorer = make_scorer(f1_score)
# start = time.process_time()
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer)
run_time = round(((time.process_time() - start) / cv), 3)

print(f'The amount of time it took to vectorize was: {run_time} seconds \n')
print(result.mean() * 100)
print(result.std()  * 100)

The amount of time it took to vectorize was: 145.805 seconds 

89.02272775095715
0.2569787908172416
