In [45]:
import numpy as np
import pandas as pd
import time

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import helpers

In [4]:
from tempfile import mkdtemp
from shutil import rmtree

In [5]:
from nltk.corpus import stopwords

In [6]:
# First import naive bayes and create a pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss

In [83]:
from sklearn.preprocessing import LabelEncoder

In [8]:
import scipy.stats as ss

In [10]:
from dask_searchcv import GridSearchCV as DaskGridSearchCV

In [11]:
# Results Dict
results = {}

### Load IMDB (Original Dataset)

In [12]:
import importlib

In [13]:
importlib.reload(helpers)

<module 'helpers' from 'C:\\Users\\sankalpg\\Desktop\\Learning\\notebooks\\imdbExercise\\helpers.py'>

In [14]:
path = '../kaggleData/imdb-review-dataset/imdb_master.csv'
x_train, y_train, x_test, y_test = helpers.load_imdb(path)

Loading IMDB dataset...
Original dataframe shape:  (100000, 4)
Original DF columns: Index(['type', 'review', 'label', 'file'], dtype='object')
Original Train/Test split: train    75000
test     25000
Name: type, dtype: int64
Dropping the file column...
Dropping the unlabeled rows and splitting into train/test...
X_train, y_train, shapes: (25000,) (25000,)
y_train counts: neg    12500
pos    12500
Name: label, dtype: int64
X_test, y_test, shapes: (25000,) (25000,)
y_test counts: neg    12500
pos    12500
Name: label, dtype: int64


In [15]:
print(type(x_train))
print(type(y_train))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [16]:
y_train[0:10]

25000    neg
25001    neg
25002    neg
25003    neg
25004    neg
25005    neg
25006    neg
25007    neg
25008    neg
25009    neg
Name: label, dtype: object

## Naive Bayes

### CountVectorizer - Naive Bayes

In [17]:
countPipe1 = Pipeline([
    ('countVect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [18]:
params = {'countVect__binary':[False, True],
          'countVect__ngram_range':[(1,1), (1,3)],
          'countVect__stop_words':[None, stopwords.words('english')]
         }

#### Doing SkLearn gridsearch with CountVectorizer and Naive Bayes

In [48]:
st = time.time()
cvSklearn = GridSearchCV(countPipe1, param_grid = params, n_jobs=-1, scoring='accuracy', verbose=5)
cvSklearn.fit(x_train, y_train)
end = time.time()

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  22 out of  24 | elapsed:  3.9min remaining:   21.4s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.5min finished


In [49]:
print(end-st)

317.9818708896637


In [50]:
helpers.print_gridSearch_report(cvSklearn)

Best parameters set found on development set:

{'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}

Grid scores on development set:

0.856 (+/-0.011) for {'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}
0.841 (+/-0.005) for {'countVect__binary': False, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}
0.832 (+/-0.009) for {'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': True}
0.808 (+/-0.010) for {'countVect__binary': False, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': True}
0.807 (+/-0.011) for {'countVect__binary': True, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': True}
0.802 (+/-0.013) for {'countVect__binary': True, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': None}
0.774 (+/-0.007) for {'countVect__binary': False, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': True}
0.771 (+/-0.009) for {'co

### TfidfVectorizer - Naive Bayes

In [51]:
tfidfPipe1 = Pipeline([
    ('tfidfVect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [52]:
params = {'tfidfVect__max_df':ss.uniform(0.1, 0.9),
          'tfidfVect__binary':[False, True],
          'tfidfVect__ngram_range':[(1,1), (1,3)],
          'tfidfVect__stop_words':[None, stopwords.words('english')]
         }

In [None]:
tfidf_cv = RandomizedSearchCV(tfidfPipe1, param_dist = params, scoring='accuracy', verbose=5)

In [145]:
st = time.time()
tfidf_cv.fit(x_train, y_train)
end=time.time()

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   29.7s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed: 10.6min
[Parallel(n_jobs=8)]: Done  72 out of  72 | elapsed: 14.6min remaining:    0.0s
[Parallel(n_jobs=8)]: Done  72 out of  72 | elapsed: 14.6min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfVect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'tfidfVect__max_df': [0.2, 0.5, 1.0], 'tfidfVect__binary': [False, True], 'tfidfVect__ngram_range': [(1, 1), (1, 3)], 'tfidfVect__stop_words': [None, ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', '...shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]]},
       pre_dispatch='2*n_jobs', refit=True, return_

In [146]:
helpers.print_gridSearch_report(tfidf_cv)

Best parameters set found on development set:

{'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}

Grid scores on development set:

0.868 (+/-0.008) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.866 (+/-0.010) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.865 (+/-0.011) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 1.0, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.856 (+/-0.005) for {'tfidfVect__binary': False, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': True}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5, 'tfidfVect__ngram_range'

In [54]:
print(end-st)

317.9818708896637


### Optimize on the ngram range and min_df as well

In [55]:
tfidfPipe2 = Pipeline([
    ('tfidfVect', TfidfVectorizer(max_df=0.2, binary=True)),
    ('clf', MultinomialNB()),
])

In [56]:
params2 = {'tfidfVect__ngram_range':[(1,3), (1,5), (1,7)]}

In [59]:
st = time.time()
tfidf_cv = GridSearchCV(tfidfPipe2, param_grid = params2, n_jobs=-1, scoring='accuracy', verbose=5)
tfidf_cv.fit(x_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  3.3min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  5.1min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  8.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfVect', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'tfidfVect__ngram_range': [(1, 3), (1, 5), (1, 7)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=5)

In [60]:
print(time.time()-st)

647.6303422451019


In [61]:
helpers.print_gridSearch_report(tfidf_cv)

Best parameters set found on development set:

{'tfidfVect__ngram_range': (1, 5)}

Grid scores on development set:

0.870 (+/-0.009) for {'tfidfVect__ngram_range': (1, 5)}
0.870 (+/-0.009) for {'tfidfVect__ngram_range': (1, 7)}
0.868 (+/-0.008) for {'tfidfVect__ngram_range': (1, 3)}



## Logistic Regression

### CountVectorizer

In [63]:
logPipe1 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(solver='sag')),
])


params = {'vectorizer__max_df': [ 0.2, 0.5, 1.0],
          'vectorizer__binary':[False, True],
          'vectorizer__ngram_range':[(1,1), (1,3)],
          'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]
         }

In [64]:
log_cv = RandomizedSearchCV(logPipe1, param_distributions = params, n_iter=20, n_jobs=-1, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 33.6min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'vectorizer__max_df': [0.2, 0.5, 1.0], 'vectorizer__binary': [False, True], 'vectorizer__ngram_range': [(1, 1), (1, 3)], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=5)

In [65]:
helpers.print_gridSearch_report(log_cv)

Best parameters set found on development set:

{'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 100}

Grid scores on development set:

0.882 (+/-0.007) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 100}
0.882 (+/-0.006) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 100}
0.881 (+/-0.005) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 0.1}
0.881 (+/-0.006) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': True, 'clf__C': 0.01}
0.876 (+/-0.007) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 0.01}
0.875 (+/-0.007) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 0.01}
0.874 (+/-0.007) for {'vectorizer__ngram_range':

### TFidf Vectorizer

In [73]:
logPipe2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='sag')),
])


params = {'vectorizer__max_df': ss.uniform(0.1, 1.0),
          'vectorizer__binary':[True],
          'vectorizer__ngram_range':[(1,3)],
          'clf__C': [10, 100, 1000]
         }

In [74]:
log_cv = RandomizedSearchCV(logPipe2, param_distributions = params, n_iter=20, n_jobs=-1, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 62.4min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'vectorizer__max_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F7EDE90DD8>, 'vectorizer__binary': [True], 'vectorizer__ngram_range': [(1, 3)], 'clf__C': [10, 100, 1000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=5)

In [75]:
helpers.print_gridSearch_report(log_cv)

Best parameters set found on development set:

{'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.2645730704175282, 'vectorizer__ngram_range': (1, 3)}

Grid scores on development set:

0.888 (+/-0.009) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.2645730704175282, 'vectorizer__ngram_range': (1, 3)}
0.888 (+/-0.006) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.43095227191752072, 'vectorizer__ngram_range': (1, 3)}
0.887 (+/-0.006) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.46507593054654572, 'vectorizer__ngram_range': (1, 3)}
0.887 (+/-0.005) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.85052342685369797, 'vectorizer__ngram_range': (1, 3)}
0.887 (+/-0.005) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.48210991720845253, 'vectorizer__ngram_range': (1, 3)}
0.887 (+/-0.005) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__

## Run on Test Set

In [92]:
logPipe3 = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3), max_df=0.5, binary=True)),
    ('clf', LogisticRegression(C=100)),
])

In [95]:
logPipe3.fit(x_train, LabelEncoder().fit_transform(y_train))

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [98]:
print ("Accuracy: %f" % logPipe3.score(x_test, LabelEncoder().fit_transform(y_test)))

Accuracy: 0.907280
