In [1]:
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

In [53]:
import helpers

In [3]:
from nltk.corpus import stopwords

In [4]:
# First import naive bayes and create a pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [6]:
# Results dict -- presumably meant to collect per-model scores for a later
# comparison. NOTE(review): none of the visible cells writes to it; confirm
# whether it is still needed.
results = {}

### Load IMDB (Original Dataset)

In [7]:
# The master CSV has to be decoded as cp1252 (Windows Western Europe);
# the first CSV column is used as the row index.
df = pd.read_csv(
    '../kaggleData/imdb-review-dataset/imdb_master.csv',
    encoding='cp1252',
    index_col=0,
)

In [8]:
df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [9]:
df.shape

(100000, 4)

In [10]:
df['type'].unique()

array(['test', 'train'], dtype=object)

In [11]:
print(df[df['type']=='train'].shape, df[df['type']=='test'].shape)

(75000, 4) (25000, 4)


In [12]:
# Drop the source-file name column; the cells below only use 'type', 'review'
# and 'label'. errors='ignore' keeps this cell idempotent: `df` is rebound
# here, so re-running the cell would otherwise raise KeyError because the
# column is already gone.
df = df.drop(columns='file', errors='ignore')

In [13]:
df.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


### Divide into train, test
Only using rows with neg/pos labels

In [14]:
# Labelled training rows only: rows whose label is 'unsup' are excluded.
# The row mask is built once and reused for both the text and the labels.
_train_rows = (df['type'] == 'train') & (df['label'] != 'unsup')
x_train = df.loc[_train_rows, 'review']
y_train = df.loc[_train_rows, 'label']

In [15]:
print(x_train.shape)
print(y_train.shape)

(25000,)
(25000,)


In [16]:
y_train.describe()

count     25000
unique        2
top         pos
freq      12500
Name: label, dtype: object

In [17]:
# Labelled test rows only: rows whose label is 'unsup' are excluded.
# The row mask is built once and reused for both the text and the labels.
_test_rows = (df['type'] == 'test') & (df['label'] != 'unsup')
x_test = df.loc[_test_rows, 'review']
y_test = df.loc[_test_rows, 'label']

In [18]:
print(x_test.shape, y_test.shape)

(25000,) (25000,)


In [19]:
# Check the class balance of the test labels (y_train was already checked
# right after the train split).
y_test.describe()

count     25000
unique        2
top         pos
freq      12500
Name: label, dtype: object

In [20]:
# Here we build a train function that will automatically split the train set into train and validation
# so that our test set is separate
 
def train_test_accuracy(classifier, X, y, test_size=0.20, random_state=33):
    """Fit ``classifier`` on part of (X, y) and print its score on the rest.

    The data passed in is split into a fitting part and a validation part
    with ``train_test_split``, so a separately held test set is never touched
    by this function.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator (or pipeline) to train. It is fitted in place.
    X, y : sequences of equal length
        Samples and their labels.
    test_size : float, default 0.20
        Forwarded to ``train_test_split`` as the validation share.
    random_state : int, default 33
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    The same ``classifier`` object, after fitting.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    classifier.fit(X_fit, y_fit)
    # Score on the validation part only; the value is printed, not returned.
    print("Accuracy: %s" % classifier.score(X_val, y_val))
    return classifier

## Naive Bayes

### CountVectorizer - Naive Bayes

In [40]:
# Two-step pipeline: token counts from CountVectorizer feed a multinomial
# Naive Bayes classifier. The step names are the prefixes used in the
# parameter grid ('countVect__...').
countPipe1 = Pipeline(steps=[
    ('countVect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [50]:
# Grid for the count vectorizer: raw vs. binary counts, unigrams vs. 1-3 grams,
# and no stop-word filtering vs. NLTK's English stop-word list.
params = dict(
    countVect__binary=[False, True],
    countVect__ngram_range=[(1, 1), (1, 3)],
    countVect__stop_words=[None, stopwords.words('english')],
)

In [51]:
# Exhaustive search over `params`, scored by accuracy, on 4 worker processes.
# `cv` is not passed, so the library's default fold count applies (3 folds in
# the run recorded below).
cv = GridSearchCV(
    countPipe1,
    param_grid=params,
    scoring='accuracy',
    n_jobs=4,
    verbose=5,
)

In [52]:
cv.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done  22 out of  24 | elapsed:  4.0min remaining:   21.7s
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:  4.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countVect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...nizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'countVect__binary': [False, True], 'countVect__ngram_range': [(1, 1), (1, 3)], 'countVect__stop_words': [None, ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'hims...shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='ac

In [54]:
helpers.print_gridSearch_report(cv)

Best parameters set found on development set:

{'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}

Grid scores on development set:

0.771 (+/-0.009) for {'countVect__binary': False, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': None}
0.774 (+/-0.007) for {'countVect__binary': False, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for'

### TfidfVectorizer - Naive Bayes

In [56]:
# Same classifier as the count pipeline, but fed TF-IDF weighted features.
# The step names are the prefixes used in the parameter grid ('tfidfVect__...').
tfidfPipe1 = Pipeline(steps=[
    ('tfidfVect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [57]:
# Grid for the TF-IDF vectorizer: three document-frequency ceilings, raw vs.
# binary term counts, unigrams vs. 1-3 grams, and no stop-word filtering vs.
# NLTK's English stop-word list (3 * 2 * 2 * 2 = 24 candidates).
params = dict(
    tfidfVect__max_df=[0.2, 0.5, 1.0],
    tfidfVect__binary=[False, True],
    tfidfVect__ngram_range=[(1, 1), (1, 3)],
    tfidfVect__stop_words=[None, stopwords.words('english')],
)

In [58]:
# Exhaustive search over the TF-IDF grid, scored by accuracy, on 8 worker
# processes; `cv` is left at the library default.
tfidf_cv = GridSearchCV(
    tfidfPipe1,
    param_grid=params,
    scoring='accuracy',
    n_jobs=8,
    verbose=5,
)

In [59]:
tfidf_cv.fit(x_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   29.0s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed: 11.5min
[Parallel(n_jobs=8)]: Done  72 out of  72 | elapsed: 15.5min remaining:    0.0s
[Parallel(n_jobs=8)]: Done  72 out of  72 | elapsed: 15.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfVect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'tfidfVect__max_df': [0.2, 0.5, 1.0], 'tfidfVect__binary': [False, True], 'tfidfVect__ngram_range': [(1, 1), (1, 3)], 'tfidfVect__stop_words': [None, ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', '...shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]]},
       pre_dispatch='2*n_jobs', refit=True, return_

In [85]:
import importlib

In [86]:
importlib.reload(helpers)

<module 'helpers' from 'C:\\Users\\sankalpg\\Desktop\\Learning\\notebooks\\imdbExercise\\helpers.py'>

In [75]:
helpers.print_gridSearch_report(tfidf_cv)

Best parameters set found on development set:

{'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}

Grid scores on development set:

0.868 (+/-0.008) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.866 (+/-0.010) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.865 (+/-0.011) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 1.0, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.856 (+/-0.005) for {'tfidfVect__binary': False, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': True}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5, 'tfidfVect__ngram_range'

### Optimize further on the ngram range (min_df is not tuned yet)

In [78]:
# TF-IDF + Naive Bayes with max_df and binary fixed to the values the previous
# grid search reported as best (max_df=0.2, binary=True).
tfidfPipe2 = Pipeline(steps=[
    ('tfidfVect', TfidfVectorizer(binary=True, max_df=0.2)),
    ('clf', MultinomialNB()),
])

In [83]:
params2 = {'tfidfVect__ngram_range':[(1,3), (1,5), (1,7)]}

In [84]:
# Second TF-IDF search: only the n-gram range varies (see params2).
# Note that this rebinds `tfidf_cv`, replacing the earlier search object.
tfidf_cv = GridSearchCV(
    tfidfPipe2,
    param_grid=params2,
    scoring='accuracy',
    n_jobs=8,
    verbose=5,
)
tfidf_cv.fit(x_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=8)]: Done   2 out of   9 | elapsed:  3.7min remaining: 12.8min
[Parallel(n_jobs=8)]: Done   4 out of   9 | elapsed:  8.8min remaining: 11.0min
[Parallel(n_jobs=8)]: Done   6 out of   9 | elapsed: 12.2min remaining:  6.1min
[Parallel(n_jobs=8)]: Done   9 out of   9 | elapsed: 14.7min finished


Best parameters set found on development set:

{'tfidfVect__ngram_range': (1, 5)}

Grid scores on development set:



IndexError: list index out of range

In [87]:
helpers.print_gridSearch_report(tfidf_cv)

Best parameters set found on development set:

{'tfidfVect__ngram_range': (1, 5)}

Grid scores on development set:

0.870 (+/-0.009) for {'tfidfVect__ngram_range': (1, 5)}
0.870 (+/-0.009) for {'tfidfVect__ngram_range': (1, 7)}
0.868 (+/-0.008) for {'tfidfVect__ngram_range': (1, 3)}



## Logistic Regression

In [89]:
# Token counts fed into logistic regression. 'sag' is a stochastic solver, so
# its seed is pinned to make repeated grid-search runs comparable.
logPipe1 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(solver='sag', random_state=33)),
])


# Grid for the logistic-regression pipeline: vectorizer document-frequency
# ceiling, raw vs. binary counts, n-gram range, and the inverse regularization
# strength C (3 * 2 * 3 * 6 = 108 candidates).
params = {'vectorizer__max_df': [ 0.2, 0.5, 1.0],
          'vectorizer__binary':[False, True],
          'vectorizer__ngram_range':[(1,1), (1,3), (1,5)],
          'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]
         }

In [None]:
# Grid-search the logistic-regression pipeline over its own grid. The search
# must be built from logPipe1/params: tfidfPipe2/params2 belong to the Naive
# Bayes TF-IDF section and would only repeat that search.
log_cv = GridSearchCV(logPipe1, param_grid=params, n_jobs=4, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

In [None]:
helpers.print_gridSearch_report(log_cv)