In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import helpers

In [4]:
from tempfile import mkdtemp
from shutil import rmtree

In [5]:
from nltk.corpus import stopwords

In [6]:
# First import naive bayes and create a pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
import scipy.stats as ss

In [10]:
from dask_searchcv import GridSearchCV as DaskGridSearchCV

In [11]:
# Results Dict
results = {}

### Load IMDB (Original Dataset)

In [12]:
import importlib

In [13]:
importlib.reload(helpers)

<module 'helpers' from 'C:\\Users\\sankalpg\\Desktop\\Learning\\notebooks\\imdbExercise\\helpers.py'>

In [14]:
imdb = pd.read_csv('../kaggleData/imdb-review-dataset/imdb_master.csv', encoding='cp1252')

In [15]:
np.unique(imdb['label'])

array(['neg', 'pos', 'unsup'], dtype=object)

In [16]:
path = '../kaggleData/imdb-review-dataset/imdb_master.csv'
x_train, y_train, x_test, y_test = helpers.load_imdb(path)

Loading IMDB dataset...
Original dataframe shape:  (100000, 4)
Original DF columns: Index(['type', 'review', 'label', 'file'], dtype='object')
Original Train/Test split: train    75000
test     25000
Name: type, dtype: int64
Dropping the file column...
Dropping the unlabeled rows and splitting into train/test...
X_train, y_train, shapes: (25000,) (25000,)
y_train counts: neg    12500
pos    12500
Name: label, dtype: int64
X_test, y_test, shapes: (25000,) (25000,)
y_test counts: neg    12500
pos    12500
Name: label, dtype: int64


In [17]:
y_train[0:5]

25000    neg
25001    neg
25002    neg
25003    neg
25004    neg
Name: label, dtype: object

In [18]:
print(type(x_train))
print(type(y_train))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [19]:
y_train[0:10]

25000    neg
25001    neg
25002    neg
25003    neg
25004    neg
25005    neg
25006    neg
25007    neg
25008    neg
25009    neg
Name: label, dtype: object

## Naive Bayes

### CountVectorizer - Naive Bayes

In [20]:
countPipe1 = Pipeline([
    ('countVect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [21]:
params = {'countVect__binary':[False, True],
          'countVect__ngram_range':[(1,1), (1,3)],
          'countVect__stop_words':[None, stopwords.words('english')]
         }

#### Doing SkLearn gridsearch with CountVectorizer and Naive Bayes

In [22]:
st = time.time()
cvSklearn = GridSearchCV(countPipe1, param_grid = params, n_jobs=-1, scoring='accuracy', verbose=5)
cvSklearn.fit(x_train, y_train)
end = time.time()

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  22 out of  24 | elapsed:  3.9min remaining:   21.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  4.4min finished


In [23]:
print(end-st)

312.0788550376892


In [24]:
helpers.print_gridSearch_report(cvSklearn)

Best parameters set found on development set:

{'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}

Grid scores on development set:

0.856 (+/-0.011) for {'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}
0.841 (+/-0.005) for {'countVect__binary': False, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}
0.832 (+/-0.009) for {'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': True}
0.808 (+/-0.010) for {'countVect__binary': False, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': True}
0.807 (+/-0.011) for {'countVect__binary': True, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': True}
0.802 (+/-0.013) for {'countVect__binary': True, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': None}
0.774 (+/-0.007) for {'countVect__binary': False, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': True}
0.771 (+/-0.009) for {'co

### TfidfVectorizer - Naive Bayes

In [20]:
tfidfPipe1 = Pipeline([
    ('tfidfVect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [21]:
params = {'tfidfVect__max_df':ss.uniform(0.1, 0.9),
          'tfidfVect__binary':[False, True],
          'tfidfVect__ngram_range':[(1,1), (1,3)],
          'tfidfVect__stop_words':[None, stopwords.words('english')]
         }

In [22]:
tfidf_cv = RandomizedSearchCV(tfidfPipe1, param_distributions=params, scoring='accuracy', verbose=5)

In [29]:
st = time.time()
tfidf_cv.fit(x_train, y_train)
end=time.time()

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] tfidfVect__binary=False, tfidfVect__max_df=0.979635393257, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here'

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


[CV]  tfidfVect__binary=False, tfidfVect__max_df=0.979635393257, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bot

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.5s remaining:    0.0s


[CV]  tfidfVect__binary=False, tfidfVect__max_df=0.979635393257, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bot

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   22.5s remaining:    0.0s


[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.202895603155, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=None, score=0.8660907127429806, total=  41.3s
[CV] tfidfVect__binary=True, tfidfVect__max_df=0.202895603155, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=None 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.202895603155, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=None, score=0.8736501079913607, total=  40.1s
[CV] tfidfVect__binary=True, tfidfVect__max_df=0.202895603155, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=None 
[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.202895603155, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=None, score=0.864978396543447, total=  42.2s
[CV] tfidfVect__binary=False, tfidfVect__max_df=0.652834114179, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=None 
[CV]  tfidfVect__binary=False, tfidfVect__max_df=0.652834114179, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=None, score=0.7814974802015838, total=   4.3s
[CV] tfidfVect__binary=False, tfidfVect__max_df=0.652834114179, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=None 
[CV]  tfidfVect__binary=False, tfidfVect__max_df=0.652834114179, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=None, score=0.79433645308

[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.687431245163, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both

[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.542037165616, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both

[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.858371194255, tfidfVect__ngram_range=(1, 1), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both

[CV]  tfidfVect__binary=True, tfidfVect__max_df=0.184670419885, tfidfVect__ngram_range=(1, 3), tfidfVect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 14.8min finished


In [30]:
helpers.print_gridSearch_report(tfidf_cv)

Best parameters set found on development set:

{'tfidfVect__binary': True, 'tfidfVect__max_df': 0.20289560315486427, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}

Grid scores on development set:

0.868 (+/-0.008) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.20289560315486427, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.866 (+/-0.010) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5917076441350233, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.854 (+/-0.002) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.18467041988492347, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': True}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.68743124516318488, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': True}
0.852 (+/-0.009) for {'tfidfVect__binary': False, 'tfidfVect__max_df': 0.72209521972936352, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.81

In [31]:
print(end-st)

972.0011417865753


### Optimize on the ngram range

In [23]:
tfidfPipe2 = Pipeline([
    ('tfidfVect', TfidfVectorizer(max_df=0.2, binary=True)),
    ('clf', MultinomialNB()),
])

In [24]:
params2 = {'tfidfVect__ngram_range':[(1,3), (1,5), (1,7)]}

In [None]:
st = time.time()
tfidf_cv = GridSearchCV(tfidfPipe2, param_grid = params2, n_jobs=-1, scoring='accuracy', verbose=5)
tfidf_cv.fit(x_train, y_train)

In [None]:
print(time.time()-st)

In [None]:
helpers.print_gridSearch_report(tfidf_cv)

## Logistic Regression

### CountVectorizer - with Logistic Regression

In [25]:
logPipe1 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(solver='sag')),
])


params = {'vectorizer__max_df': [ 0.2, 0.5, 1.0],
          'vectorizer__binary':[False, True],
          'vectorizer__ngram_range':[(1,1), (1,3)],
          'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]
         }

In [26]:
log_cv = RandomizedSearchCV(logPipe1, param_distributions = params, n_iter=20, n_jobs=-1, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 34.2min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'vectorizer__max_df': [0.2, 0.5, 1.0], 'vectorizer__binary': [False, True], 'vectorizer__ngram_range': [(1, 1), (1, 3)], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=5)

In [27]:
helpers.print_gridSearch_report(log_cv)

Best parameters set found on development set:

{'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': True, 'clf__C': 10}

Grid scores on development set:

0.884 (+/-0.010) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': True, 'clf__C': 10}
0.882 (+/-0.008) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 10}
0.882 (+/-0.005) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 100}
0.881 (+/-0.005) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 0.1}
0.877 (+/-0.009) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': False, 'clf__C': 10}
0.877 (+/-0.008) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': False, 'clf__C': 0.1}
0.874 (+/-0.011) for {'vectorizer__ngram_range': (1, 

### TFidf Vectorizer - with Logistic Regression

In [28]:
logPipe2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='sag')),
])


params = {'vectorizer__max_df': ss.uniform(0.1, 1.0),
          'vectorizer__binary':[True],
          'vectorizer__ngram_range':[(1,3)],
          'clf__C': [10, 100, 1000]
         }

In [29]:
log_cv = RandomizedSearchCV(logPipe2, param_distributions = params, n_iter=20, n_jobs=-1, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 47.1min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'vectorizer__max_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000173081F1BE0>, 'vectorizer__binary': [True], 'vectorizer__ngram_range': [(1, 3)], 'clf__C': [10, 100, 1000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=5)

In [30]:
helpers.print_gridSearch_report(log_cv)

Best parameters set found on development set:

{'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.41802665473703027, 'vectorizer__ngram_range': (1, 3)}

Grid scores on development set:

0.888 (+/-0.006) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.41802665473703027, 'vectorizer__ngram_range': (1, 3)}
0.888 (+/-0.008) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.16796439395052945, 'vectorizer__ngram_range': (1, 3)}
0.888 (+/-0.006) for {'clf__C': 10, 'vectorizer__binary': True, 'vectorizer__max_df': 0.11137347561435898, 'vectorizer__ngram_range': (1, 3)}
0.888 (+/-0.006) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.82096212515932554, 'vectorizer__ngram_range': (1, 3)}
0.887 (+/-0.007) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__max_df': 0.32631884103352826, 'vectorizer__ngram_range': (1, 3)}
0.887 (+/-0.006) for {'clf__C': 10, 'vectorizer__binary': True, 'vectorizer__ma

In [None]:
logPipe2

# Random Forests

### TFIDF with RF

In [23]:
rfPipe1 = Pipeline([
    ('vectorizer', TfidfVectorizer(binary=True, ngram_range=(1,3), max_df=0.4)),
    ('clf', RandomForestClassifier(n_estimators=300)),
])


# params = {'vectorizer__max_df': [0.5],
#           'clf__n_estimators': [100],
#           'clf__max_features': ['auto'],
#          }

In [24]:
# rf_cv = GridSearchCV(rfPipe1, param_grid=params, n_jobs=-1, scoring='accuracy', verbose=5)
# rf_cv.fit(x_train, y_train)
rfPipe1.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.4, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tru...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [25]:
print ("Accuracy: %f" % rfPipe1.score(x_test, y_test))

Accuracy: 0.863760


## Run on Test Set

In [31]:
logPipe3 = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3), max_df=0.5, binary=True)),
    ('clf', LogisticRegression(C=100)),
])

In [32]:
logPipe3.fit(x_train, LabelEncoder().fit_transform(y_train))

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [33]:
print ("Accuracy: %f" % logPipe3.score(x_test, LabelEncoder().fit_transform(y_test)))

Accuracy: 0.907280


In [37]:
print("Log loss: %f" % log_loss(LabelEncoder().fit_transform(y_test), logPipe3.predict_proba(x_test)))

Log loss: 0.230938
