In [5]:
## Python built-in modules:
import os
import re
from collections import Counter
import time

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Model management (sklearn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


## My own functions
from functions import print_report, check_model, vectorize_X

In [2]:
## Importing the data
# Reads every review file under data/train/neg and data/train/pos into memory,
# builds a two-column DataFrame (text, label), shuffles it with a fixed seed and
# exposes the feature series `X` and target series `y` used by later cells.

path = 'data/train/'

labels = []
contents = []

# (sub-directory name, numeric label): 1 is positive, 0 is negative
for label, label_id in (('neg', 0), ('pos', 1)):
    label_dir = os.path.join(path, label)
    # os.listdir returns entries in arbitrary, platform-dependent order; sorting
    # fixes the row order so the seeded shuffle below gives the same result on
    # every machine.
    for filename in sorted(os.listdir(label_dir)):
        # Explicit encoding so decoding does not depend on the OS locale default.
        # NOTE(review): assumes the review files are UTF-8 -- confirm for this dataset.
        with open(os.path.join(label_dir, filename), 'r', encoding='utf-8') as f:
            contents.append(f.read())
        labels.append(label_id)

# Number of documents read across both classes.
count = len(contents)
print(count)

data = pd.DataFrame({
    'contents': contents,
    'labels': labels,
})

# Shuffle the rows (files were read class by class) and renumber the index.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.contents
y = data.labels

25000


In [11]:
# Baseline: unigram bag-of-words counts fed to a multinomial Naive Bayes
# classifier, evaluated with 5-fold cross-validation on the F1 score.
cv = 5
scorer = make_scorer(f1_score)

pipeline = Pipeline(steps=[
    ("vect", CountVectorizer(ngram_range=(1, 1))),
    ("clf", MultinomialNB()),
])

result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the per-fold scores, as percentages.
print(round(100 * result.mean(), 2), round(100 * result.std(), 3), sep="\n")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
84.4
0.575


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished


In [12]:
# Unigrams + bigrams fed to multinomial Naive Bayes, 5-fold cross-validated on F1.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 2))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

# Built once; the cell previously assigned the identical scorer twice in a row.
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the per-fold scores, as percentages.
print(round(result.mean() * 100, 2))
print(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   9.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s


[CV] END .................................................... total time=   9.1s
[CV] END .................................................... total time=   9.2s
[CV] END .................................................... total time=   9.5s
[CV] END .................................................... total time=   9.3s
87.66
0.461


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   46.4s finished


In [13]:
# Unigrams through trigrams fed to multinomial Naive Bayes, 5-fold cross-validated on F1.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 3))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

# Built once; the cell previously assigned the identical scorer twice in a row.
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the per-fold scores, as percentages.
print(round(result.mean() * 100, 2))
print(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  21.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   21.4s remaining:    0.0s


[CV] END .................................................... total time=  21.4s
[CV] END .................................................... total time=  21.5s
[CV] END .................................................... total time=  21.6s
[CV] END .................................................... total time=  21.8s
88.71
0.387


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.8min finished


In [14]:
# Unigrams through 4-grams fed to multinomial Naive Bayes, 5-fold cross-validated on F1.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 4))),
        ("clf", MultinomialNB()),
    ]
)
cv = 5

# Built once; the cell previously assigned the identical scorer twice in a row.
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the per-fold scores, as percentages.
print(round(result.mean() * 100, 2))
print(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 1.0min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[CV] END .................................................... total time=  57.5s
[CV] END .................................................... total time=  45.5s
[CV] END .................................................... total time=  45.7s
[CV] END .................................................... total time=  44.9s
89.1
0.365


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.3min finished


In [16]:
# Same 1- to 4-gram model as the 5-fold run, re-evaluated with only 3 folds.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 4))),
        ("clf", MultinomialNB()),
    ]
)
cv = 3

# Built once; the cell previously assigned the identical scorer twice in a row.
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the per-fold scores, as percentages.
print(round(result.mean() * 100, 2))
print(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  43.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   43.2s remaining:    0.0s


[CV] END .................................................... total time=  49.5s
[CV] END .................................................... total time=  44.6s
88.85
0.184


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min finished


In [20]:
# Sweep the vectorizer's min_df threshold (2..5) for unigram+bigram features and
# record the 5-fold F1 mean / standard deviation (in percent) for each setting.
means = []
stds = []

# The scorer does not depend on the swept value, so build it once outside the loop.
scorer = make_scorer(f1_score)

for min_df in range(2, 6):
    pipeline = Pipeline(
        [
            ("vect", CountVectorizer(ngram_range=(1, 2), min_df=min_df)),
            ("clf", MultinomialNB()),
        ]
    )
    result = cross_val_score(pipeline, X, y, cv=5, scoring=scorer, verbose=2)

    # One entry per min_df value, in sweep order.
    means.append(round(result.mean() * 100, 2))
    stds.append(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   8.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s remaining:    0.0s


[CV] END .................................................... total time=   7.8s
[CV] END .................................................... total time=   7.8s
[CV] END .................................................... total time=   7.8s
[CV] END .................................................... total time=   7.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   39.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   7.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


[CV] END .................................................... total time=   7.4s
[CV] END .................................................... total time=   7.5s
[CV] END .................................................... total time=   7.4s
[CV] END .................................................... total time=   7.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   7.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.5s
[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   7.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.4s
[CV] END .................................................... total time=   7.3s
[CV] END .................................................... total time=   7.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   36.9s finished


In [None]:
# Tabulate the sweep results collected in `means` / `stds` (one row per setting).
pd.DataFrame({"mean": means, "std ": stds})

In [22]:
# Sweep the vectorizer's min_df threshold (2..5) for unigram-only features and
# record the 5-fold F1 mean / standard deviation (in percent) for each setting.
means = []
stds = []

# The scorer does not depend on the swept value, so build it once outside the loop.
scorer = make_scorer(f1_score)

for min_df in range(2, 6):
    pipeline = Pipeline(
        [
            ("vect", CountVectorizer(ngram_range=(1, 1), min_df=min_df)),
            ("clf", MultinomialNB()),
        ]
    )
    result = cross_val_score(pipeline, X, y, cv=5, scoring=scorer, verbose=2)

    # One entry per min_df value, in sweep order.
    means.append(round(result.mean() * 100, 2))
    stds.append(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s


[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=   2.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] END .................................................... total time=   2.8s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s
[CV] END .................................................... total time=   2.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.2s finished


In [36]:
# Sweep the vectorizer's max_df threshold from 1.0 down to 0.4 in steps of 0.1
# for 1- to 3-gram features and record the 5-fold F1 mean / standard deviation
# (in percent) for each setting.
means = []
stds = []

# The scorer does not depend on the swept value, so build it once outside the loop.
scorer = make_scorer(f1_score)

# Iterate over integer percentages and divide, so each step is exactly i/100.
for max_df_pct in range(100, 39, -10):
    pipeline = Pipeline(
        [
            ("vect", CountVectorizer(ngram_range=(1, 3), max_df=max_df_pct / 100)),
            ("clf", MultinomialNB()),
        ]
    )
    result = cross_val_score(pipeline, X, y, cv=5, scoring=scorer, verbose=2)

    # One entry per max_df value, in sweep order (1.0 first, 0.4 last).
    means.append(round(result.mean() * 100, 2))
    stds.append(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  23.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.2s remaining:    0.0s


[CV] END .................................................... total time=  24.4s
[CV] END .................................................... total time=  24.2s
[CV] END .................................................... total time=  24.4s
[CV] END .................................................... total time=  23.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  22.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.8s remaining:    0.0s


[CV] END .................................................... total time=  23.6s
[CV] END .................................................... total time=  23.9s
[CV] END .................................................... total time=  23.5s
[CV] END .................................................... total time=  23.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  22.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.6s remaining:    0.0s


[CV] END .................................................... total time=  22.6s
[CV] END .................................................... total time=  23.0s
[CV] END .................................................... total time=  23.5s
[CV] END .................................................... total time=  42.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  23.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.3s remaining:    0.0s


[CV] END .................................................... total time=  23.9s
[CV] END .................................................... total time=  23.1s
[CV] END .................................................... total time=  23.6s
[CV] END .................................................... total time=  23.5s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  22.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.7s remaining:    0.0s


[CV] END .................................................... total time=  23.6s
[CV] END .................................................... total time=  23.5s
[CV] END .................................................... total time= 1.2min
[CV] END .................................................... total time=  23.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  22.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.3s remaining:    0.0s


[CV] END .................................................... total time=  22.5s
[CV] END .................................................... total time=  23.1s
[CV] END .................................................... total time=  24.4s
[CV] END .................................................... total time=  25.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  22.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.8s remaining:    0.0s


[CV] END .................................................... total time=  25.5s
[CV] END .................................................... total time=  24.4s
[CV] END .................................................... total time=  24.4s
[CV] END .................................................... total time=  24.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished


In [38]:
# Sweep a wider set of min_df thresholds for 1- to 3-gram features and record
# the 5-fold F1 mean / standard deviation (in percent) for each setting.
means = []
stds = []

# The scorer does not depend on the swept value, so build it once outside the loop.
scorer = make_scorer(f1_score)

for min_df in [1, 2, 3, 5, 8, 12]:
    pipeline = Pipeline(
        [
            ("vect", CountVectorizer(ngram_range=(1, 3), min_df=min_df)),
            ("clf", MultinomialNB()),
        ]
    )
    result = cross_val_score(pipeline, X, y, cv=5, scoring=scorer, verbose=2)

    # One entry per min_df value, in sweep order.
    means.append(round(result.mean() * 100, 2))
    stds.append(round(result.std() * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  24.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.9s remaining:    0.0s


[CV] END .................................................... total time=  23.6s
[CV] END .................................................... total time=  23.4s
[CV] END .................................................... total time=  23.6s
[CV] END .................................................... total time=  23.3s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  15.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.7s remaining:    0.0s


[CV] END .................................................... total time=  15.2s
[CV] END .................................................... total time=  15.2s
[CV] END .................................................... total time=  15.3s
[CV] END .................................................... total time=  15.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  14.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.8s remaining:    0.0s


[CV] END .................................................... total time=  14.6s
[CV] END .................................................... total time=  14.5s
[CV] END .................................................... total time=  14.6s
[CV] END .................................................... total time=  14.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  14.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.1s remaining:    0.0s


[CV] END .................................................... total time=  14.2s
[CV] END .................................................... total time=  14.3s
[CV] END .................................................... total time=  14.2s
[CV] END .................................................... total time=  14.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  14.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.0s remaining:    0.0s


[CV] END .................................................... total time=  13.9s
[CV] END .................................................... total time=  13.9s
[CV] END .................................................... total time=  14.1s
[CV] END .................................................... total time=  13.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  13.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s remaining:    0.0s


[CV] END .................................................... total time=  13.8s
[CV] END .................................................... total time=  13.8s
[CV] END .................................................... total time=  13.8s
[CV] END .................................................... total time=  13.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


In [39]:
# Tabulate the sweep results collected in `means` / `stds` (one row per setting).
pd.DataFrame({"mean": means, "std ": stds})

Unnamed: 0,mean,std
0,88.71,0.387
1,88.58,0.236
2,88.26,0.32
3,87.96,0.273
4,87.63,0.31
5,87.29,0.309


In [42]:
# 1- to 3-gram counts with both pruning thresholds applied (min_df=2, max_df=0.5),
# evaluated with 5-fold cross-validation on the F1 score.
cv = 5
scorer = make_scorer(f1_score)

pipeline = Pipeline(steps=[
    ("vect", CountVectorizer(ngram_range=(1, 3), min_df=2, max_df=.5)),
    ("clf", MultinomialNB()),
])

result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Unrounded mean and standard deviation of the per-fold scores, as percentages.
print(result.mean() * 100, result.std() * 100, sep="\n")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  15.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.5s remaining:    0.0s


[CV] END .................................................... total time=  15.6s
[CV] END .................................................... total time=  15.2s
[CV] END .................................................... total time=  15.2s
[CV] END .................................................... total time=  15.3s
88.76145169308067
0.2325336511514508


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [47]:
# Shuffle the rows again with a different seed and rebuild X / y from the result.
# Note that this rebinds `data`, so re-running the cell shuffles the already
# shuffled frame.
data = (
    data
    .sample(frac=1, random_state=70)
    .reset_index(drop=True)
)
X, y = data.contents, data.labels

In [48]:
# Same model as the previous evaluation (1- to 3-grams, min_df=2, max_df=0.5),
# re-run on the current X / y to see how the 5-fold F1 moves.
cv = 5
scorer = make_scorer(f1_score)

pipeline = Pipeline(steps=[
    ("vect", CountVectorizer(ngram_range=(1, 3), min_df=2, max_df=.5)),
    ("clf", MultinomialNB()),
])

result = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer, verbose=2)

# Unrounded mean and standard deviation of the per-fold scores, as percentages.
print(result.mean() * 100, result.std() * 100, sep="\n")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  15.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.3s remaining:    0.0s


[CV] END .................................................... total time=  15.2s
[CV] END .................................................... total time=  15.3s
[CV] END .................................................... total time=  15.4s
[CV] END .................................................... total time=  16.2s
88.8369593976378
0.22677420607655754


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


#### N-grams removed by the `min_df` and `max_df` thresholds

In [60]:
# Hold out 33% of the rows as a test set; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [62]:
# Fit one vectorizer per pruning rule so that the n-grams each rule discards can
# be inspected separately through the fitted vectorizer's `stop_words_` attribute.
# NOTE: `cv` is rebound here to a CountVectorizer; earlier cells used the same
# name for the number of cross-validation folds.

# Rule 1: min_df=2 -- n-grams dropped for being too rare in the training split.
cv = CountVectorizer(ngram_range=(1,3), min_df=2)
X_train_vectorized = cv.fit_transform(X_train)
min_df_words = list(cv.stop_words_)

# Rule 2: max_df=.5 -- n-grams dropped for being too common in the training split.
cv = CountVectorizer(ngram_range=(1,3), max_df=.5)
X_train_vectorized = cv.fit_transform(X_train)
max_df_words = list(cv.stop_words_)

# Train the classifier on the max_df-pruned counts. This used to call
# `clf.fit(...)`, but no `clf` variable is assigned in the visible cells, so the
# cell raised NameError on a fresh kernel; `mnb` is the estimator created here.
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)

In [78]:
def length_words(x):
    """Return how many whitespace-separated tokens the string ``x`` contains."""
    tokens = x.split()
    return len(tokens)

In [79]:
# One row per n-gram discarded by the min_df rule, with its length in words.
df1 = pd.DataFrame({
    "n_gram": min_df_words,
    "words": [length_words(n_gram) for n_gram in min_df_words],
})

In [80]:
# One row per n-gram discarded by the max_df rule, with its length in words.
df2 = pd.DataFrame({
    "n_gram": max_df_words,
    "words": [length_words(n_gram) for n_gram in max_df_words],
})

In [83]:
# How many of the min_df-pruned n-grams have 1, 2 or 3 words.
df1["words"].value_counts()

3    2261129
2     789528
1      25476
Name: words, dtype: int64

In [105]:
# Show one single-word term pruned by max_df. The draw is seeded so the same
# row is displayed on every run (the unseeded call picked a different row each time).
df2[df2.words == 1].sample(random_state=42)

Unnamed: 0,n_gram,words
0,that,1


In [84]:
# How many of the max_df-pruned n-grams have 1 or 2 words.
df2["words"].value_counts()

1    26
2     3
Name: words, dtype: int64