In [1]:
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import helpers

In [4]:
from tempfile import mkdtemp
from shutil import rmtree

In [5]:
from nltk.corpus import stopwords

In [6]:
# First import naive bayes and create a pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss

In [8]:
import scipy.stats as ss

In [9]:
# Results Dict
results = {}

### Load IMDB (Original Dataset)

In [10]:
import importlib

In [11]:
importlib.reload(helpers)

<module 'helpers' from 'C:\\Users\\sankalpg\\Desktop\\Learning\\notebooks\\imdbExercise\\helpers.py'>

In [12]:
path = '../kaggleData/imdb-review-dataset/imdb_master.csv'
x_train, y_train, x_test, y_test = helpers.load_imdb(path)

Loading IMDB dataset...
Original dataframe shape:  (100000, 4)
Original DF columns: Index(['type', 'review', 'label', 'file'], dtype='object')
Original Train/Test split: train    75000
test     25000
Name: type, dtype: int64
Dropping the file column...
Dropping the unlabeled rows and splitting into train/test...
X_train, y_train, shapes: (25000,) (25000,)
y_train counts: pos    12500
neg    12500
Name: label, dtype: int64
X_test, y_test, shapes: (25000,) (25000,)
y_test counts: pos    12500
neg    12500
Name: label, dtype: int64


In [13]:
print(type(x_train))
print(type(y_train))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [14]:
y_train[0:10]

25000    neg
25001    neg
25002    neg
25003    neg
25004    neg
25005    neg
25006    neg
25007    neg
25008    neg
25009    neg
Name: label, dtype: object

## Naive Bayes

### CountVectorizer - Naive Bayes

In [15]:
countPipe1 = Pipeline([
    ('countVect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [16]:
params = {'countVect__binary':[False, True],
          'countVect__ngram_range':[(1,1), (1,3)],
          'countVect__stop_words':[None, stopwords.words('english')]
         }

In [17]:
cv = GridSearchCV(countPipe1, param_grid = params, n_jobs=4, scoring='accuracy', verbose=5)

In [18]:
cv.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  22 out of  24 | elapsed:  4.0min remaining:   21.8s
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:  4.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countVect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...nizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'countVect__binary': [False, True], 'countVect__ngram_range': [(1, 1), (1, 3)], 'countVect__stop_words': [None, ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'hims...shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='ac

In [19]:
helpers.print_gridSearch_report(cv)

Best parameters set found on development set:

{'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}

Grid scores on development set:

0.856 (+/-0.011) for {'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}
0.841 (+/-0.005) for {'countVect__binary': False, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': None}
0.832 (+/-0.009) for {'countVect__binary': True, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': True}
0.808 (+/-0.010) for {'countVect__binary': False, 'countVect__ngram_range': (1, 3), 'countVect__stop_words': True}
0.807 (+/-0.011) for {'countVect__binary': True, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': True}
0.802 (+/-0.013) for {'countVect__binary': True, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': None}
0.774 (+/-0.007) for {'countVect__binary': False, 'countVect__ngram_range': (1, 1), 'countVect__stop_words': True}
0.771 (+/-0.009) for {'co

### Predict with CountVect and NB on validation set

In [61]:
countPipe1 = Pipeline([
    ('countVect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [66]:
X_traintrain, X_Val, y_traintrain, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=33)
logPipe3.fit(X_traintrain, y_traintrain)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [None]:
print ("Accuracy: %s" % classifier.score(X_test, y_test))
print("Log Loss: " % log_loss(y_pred=classifier.predict(X_test), y_true=y_test))

### TfidfVectorizer - Naive Bayes

In [142]:
tfidfPipe1 = Pipeline([
    ('tfidfVect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [143]:
params = {'tfidfVect__max_df': [ 0.2, 0.5, 1.0],
          'tfidfVect__binary':[False, True],
          'tfidfVect__ngram_range':[(1,1), (1,3)],
          'tfidfVect__stop_words':[None, stopwords.words('english')]
         }

In [144]:
tfidf_cv = GridSearchCV(tfidfPipe1, param_grid = params, n_jobs=8, scoring='accuracy', verbose=5)

In [145]:
tfidf_cv.fit(x_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   29.7s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed: 10.6min
[Parallel(n_jobs=8)]: Done  72 out of  72 | elapsed: 14.6min remaining:    0.0s
[Parallel(n_jobs=8)]: Done  72 out of  72 | elapsed: 14.6min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfVect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'tfidfVect__max_df': [0.2, 0.5, 1.0], 'tfidfVect__binary': [False, True], 'tfidfVect__ngram_range': [(1, 1), (1, 3)], 'tfidfVect__stop_words': [None, ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', '...shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]]},
       pre_dispatch='2*n_jobs', refit=True, return_

In [146]:
helpers.print_gridSearch_report(tfidf_cv)

Best parameters set found on development set:

{'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}

Grid scores on development set:

0.868 (+/-0.008) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.866 (+/-0.010) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.865 (+/-0.011) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 1.0, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.856 (+/-0.005) for {'tfidfVect__binary': False, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': None}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.2, 'tfidfVect__ngram_range': (1, 3), 'tfidfVect__stop_words': True}
0.854 (+/-0.004) for {'tfidfVect__binary': True, 'tfidfVect__max_df': 0.5, 'tfidfVect__ngram_range'

### Optimize on the ngram range and min_df as well

In [147]:
tfidfPipe2 = Pipeline([
    ('tfidfVect', TfidfVectorizer(max_df=0.2, binary=True)),
    ('clf', MultinomialNB()),
])

In [148]:
params2 = {'tfidfVect__ngram_range':[(1,3), (1,5), (1,7)]}

In [None]:
tfidf_cv = RandomizedSearchCV(tfidfPipe2, param_grid = params2, n_jobs=-1, scoring='accuracy', verbose=5)
tfidf_cv.fit(x_train, y_train)

In [None]:
helpers.print_gridSearch_report(tfidf_cv)

## Logistic Regression

### CountVectorizer

In [None]:
logPipe1 = Pipeline([
    ('vectorizer', CountVectorizer()),`
    ('clf', LogisticRegression(solver='sag')),
])


params = {'vectorizer__max_df': [ 0.2, 0.5, 1.0],
          'vectorizer__binary':[False, True],
          'vectorizer__ngram_range':[(1,1), (1,3)],
          'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]
         }

In [25]:
log_cv = RandomizedSearchCV(logPipe1, param_distributions = params, n_iter=20, n_jobs=-1, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 33.4min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'vectorizer__max_df': [0.2, 0.5, 1.0], 'vectorizer__binary': [False, True], 'vectorizer__ngram_range': [(1, 1), (1, 3)], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=5)

In [26]:
helpers.print_gridSearch_report(log_cv)

Best parameters set found on development set:

{'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 100}

Grid scores on development set:

0.883 (+/-0.008) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 100}
0.882 (+/-0.006) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 10}
0.881 (+/-0.006) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 100}
0.877 (+/-0.008) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': False, 'clf__C': 1}
0.877 (+/-0.008) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': False, 'clf__C': 100}
0.873 (+/-0.012) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': False, 'clf__C': 1}
0.873 (+/-0.011) for {'vectorizer__ngram_range': (1, 

### Run on Validation Set

In [61]:
logPipe3 = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3), max_df=0.5, binary=True)),
    ('clf', LogisticRegression(C=100)),
])

In [66]:
X_traintrain, X_Val, y_traintrain, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=33)
logPipe3.fit(X_traintrain, y_traintrain)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [None]:
print ("Accuracy: %s" % classifier.score(X_test, y_test))
print("Log Loss: " % log_loss(y_pred=classifier.predict(X_test), y_true=y_test))

### TFidf Vectorizer

In [28]:
logPipe2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='sag')),
])


params = {'vectorizer__max_df': [ 0.2, 0.5, 1.0],
          'vectorizer__binary':[False, True],
          'vectorizer__ngram_range':[(1,1), (1,3)],
          'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]
         }

In [29]:
log_cv = RandomizedSearchCV(logPipe2, param_distributions = params, n_iter=20, n_jobs=-1, scoring='accuracy', verbose=5)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 24.7min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'vectorizer__max_df': [0.2, 0.5, 1.0], 'vectorizer__binary': [False, True], 'vectorizer__ngram_range': [(1, 1), (1, 3)], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=5)

In [30]:
helpers.print_gridSearch_report(log_cv, score='log_loss')

Best parameters set found on development set:

{'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 100}

Grid scores on development set:

0.887 (+/-0.005) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': True, 'clf__C': 100}
0.886 (+/-0.005) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': True, 'clf__C': 10}
0.884 (+/-0.005) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 1.0, 'vectorizer__binary': True, 'clf__C': 10}
0.878 (+/-0.002) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': True, 'clf__C': 1}
0.876 (+/-0.007) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.2, 'vectorizer__binary': False, 'clf__C': 10}
0.876 (+/-0.007) for {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_df': 0.5, 'vectorizer__binary': False, 'clf__C': 10}
0.870 (+/-0.011) for {'vectorizer__ngram_range': (1, 1)

### Optimize ngrams a bit more

In [40]:
logPipe3 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])


params = {'vectorizer__max_df': ss.uniform(0.1, 0.9),
          'vectorizer__binary':[True],
          'vectorizer__ngram_range':[(1,3), (1,5)],
          'clf__C': [100, 1000]
         }

In [44]:
log_cv = RandomizedSearchCV(logPipe3, param_distributions=params, n_iter=10, n_jobs=-1, scoring='neg_log_loss', verbose=100)
log_cv.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(16666,), dtype=int32).
Pickling array (shape=(8334,), dtype=int32).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(16666,), dtype=int32).
Pickling array (shape=(8334,), dtype=int32).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 18.0min
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(16668,), dtype=int32).
Pickling array (shape=(8332,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 18.0min
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,), dtype=object).
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(16666,), dtype=int32).
Pickling array (shape=(8334,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 19.8min
Pickling array (shape=(25000,), dtype=int64).
Pickling array (shape=(25000,)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'vectorizer__max_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002CB214E96D8>, 'vectorizer__binary': [True], 'vectorizer__ngram_range': [(1, 3), (1, 5)], 'clf__C': [100, 1000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_log_loss', verbose=100)

In [45]:
helpers.print_gridSearch_report(log_cv)

Best parameters set found on development set:

{'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.6775468592953785, 'vectorizer__ngram_range': (1, 3)}

Grid scores on development set:

-0.269 (+/-0.013) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.6775468592953785, 'vectorizer__ngram_range': (1, 3)}
-0.280 (+/-0.006) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.41501032289852635, 'vectorizer__ngram_range': (1, 5)}
-0.284 (+/-0.003) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.614813958993365, 'vectorizer__ngram_range': (1, 5)}
-0.284 (+/-0.003) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.71537138175446779, 'vectorizer__ngram_range': (1, 5)}
-0.285 (+/-0.003) for {'clf__C': 100, 'vectorizer__binary': True, 'vectorizer__max_df': 0.85923999601435741, 'vectorizer__ngram_range': (1, 5)}
-0.289 (+/-0.018) for {'clf__C': 1000, 'vectorizer__binary': True, 'vectorizer__m

In [47]:
from sklearn.preprocessing import OneHotEncoder

In [48]:
print(sklearn.__version__)

0.19.1


## Run on Validation Set

In [61]:
logPipe3 = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3), max_df=0.5, binary=True)),
    ('clf', LogisticRegression(C=100)),
])

In [66]:
X_traintrain, X_Val, y_traintrain, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=33)
logPipe3.fit(X_traintrain, y_traintrain)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [None]:
print ("Accuracy: %s" % classifier.score(X_test, y_test))
print("Log Loss: " % log_loss(y_pred=classifier.predict(X_test), y_true=y_test))