<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Read-the-data" data-toc-modified-id="Read-the-data-0.0.0.1"><span class="toc-item-num">0.0.0.1&nbsp;&nbsp;</span>Read the data</a></span></li></ul></li></ul></li></ul></li><li><span><a href="#Predict" data-toc-modified-id="Predict-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Predict</a></span></li><li><span><a href="#Build-a-second-pipe-line" data-toc-modified-id="Build-a-second-pipe-line-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build a second pipe-line</a></span></li><li><span><a href="#Try-with-a-random-search-grid" data-toc-modified-id="Try-with-a-random-search-grid-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Try with a random search grid</a></span></li></ul></div>

In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import  pprint
import  subprocess 
import sys 
sys.path.append('../')
import pandas as pd

#### Read the data

In [8]:
# Load the three CSV files from the working directory, in the same order as before.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

Encode the author labels as integers, then hold out a stratified 10% validation split

In [12]:
# Turn the author column into integer class codes.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train["author"].values)

# Hold out 10% of the rows for validation; the split is shuffled, stratified
# on the encoded labels, and seeded so it can be reproduced.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train["text"].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [15]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like of true targets. Either a 1-D sequence of integer
        class indices (one per sample) or a 2-D one-hot matrix with one row per
        sample and one column per class.
    :param predicted: Array-like with one row per sample and one predicted
        probability per class.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before taking
        the logarithm so that ``log(0)`` never occurs.
    :return: The negative log-likelihood averaged over the samples.
    """
    # Accept plain lists / tuples as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Convert class indices to a one-hot matrix if that was not done already.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        one_hot[np.arange(n_samples), actual.astype(int)] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    # Only the log-probability of the true class survives the element-wise product.
    log_likelihood = np.sum(actual * np.log(clipped))
    return -(1.0 / n_rows) * log_likelihood

In [20]:
# Bag-of-words counts over word 1- to 3-grams with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is learned from the training and validation text together
# (no labels are involved); each split is then encoded on its own.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

In [24]:
# Baseline: Multinomial Naive Bayes trained on the n-gram count features.
clf = MultinomialNB().fit(xtrain_ctv, ytrain)

# Class probabilities for the held-out split, scored with the custom log loss.
predictions = clf.predict_proba(xvalid_ctv)
valid_logloss = multiclass_logloss(yvalid, predictions)

print("logloss: %0.3f " % valid_logloss)

logloss: 0.485 


In [61]:
# Word-level TF-IDF over 1- to 3-grams; terms appearing in fewer than 3 documents
# are dropped (min_df=3). The on/off flags are passed as real booleans because
# recent scikit-learn releases validate parameter types and reject the integer 1
# for use_idf / smooth_idf.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=False,
            stop_words='english')

# The vocabulary and IDF weights are learned from the training AND validation text
# (labels are not used), so the validation score is measured on text the
# vectorizer has already seen.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [62]:
# Sanity check: dimensions of the TF-IDF matrix built from xtrain.
xtrain_tfv.shape

(17621, 15102)

In [63]:
# Same Multinomial Naive Bayes baseline, this time trained on the TF-IDF features.
clf = MultinomialNB().fit(xtrain_tfv, ytrain)

# Class probabilities for the held-out split, scored with the custom log loss.
predictions = clf.predict_proba(xvalid_tfv)
valid_logloss = multiclass_logloss(yvalid, predictions)

print("logloss: %0.3f " % valid_logloss)

logloss: 0.579 


In [64]:
# Wrap the custom log loss as a scorer that receives predicted probabilities.
# greater_is_better=False makes scikit-learn negate the loss, so the scores
# reported below are negative and values closer to 0 are better.
# NOTE(review): needs_proba is itself deprecated in newer scikit-learn in favour of
# response_method="predict_proba" - confirm against the installed version.
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

nb_model = MultinomialNB()

# Create the pipeline (a single step, so grid keys are prefixed with 'nb__')
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid: smoothing strength of the Naive Bayes model
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model.
# `iid` is intentionally not passed: it was deprecated and later removed from
# GridSearchCV, so supplying it raises a TypeError on current releases.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] nb__alpha=0.001 .................................................
[CV] nb__alpha=0.001 .................................................
[CV] nb__alpha=0.01 ..................................................
[CV] nb__alpha=0.01 ..................................................
[CV] ....... nb__alpha=0.001, score=-0.6197727138113207, total=   0.0s
[CV] ....... nb__alpha=0.001, score=-0.6405281207644529, total=   0.1s
[CV] nb__alpha=0.1 ...................................................
[CV] ........ nb__alpha=0.01, score=-0.5107077210899634, total=   0.0s
[CV] nb__alpha=0.1 ...................................................
[CV] ........ nb__alpha=0.01, score=-0.5227750429194576, total=   0.0s
[CV] nb__alpha=1 .....................................................
[CV] nb__alpha=1 .....................................................
[CV] ........ nb__alpha=0.1, score=-0.48977643250586556, total=   0.1s
[CV] ........ nb_

[Parallel(n_jobs=-1)]: Batch computation too fast (0.1216s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.3s remaining:    0.2s


[CV] nb__alpha=100 ...................................................
[CV] .......... nb__alpha=10, score=-0.9507075585052208, total=   0.0s
[CV] .......... nb__alpha=100, score=-1.067347892749627, total=   0.0s
Best score: -0.493
Best parameters set:
	nb__alpha: 0.1


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.4s finished


In [29]:
# Attribute lookup only: this displays the method object and does not run a prediction.
model.predict_proba

<function sklearn.model_selection._search.BaseSearchCV.predict_proba>

#  Predict

In [34]:
# Encode the test text with the already-fitted `tfv` vectorizer (transform only, no refit),
# then display the resulting matrix dimensions.
xtest = test.text.values
xtest_tfv =  tfv.transform(xtest) 
xtest_tfv.shape

(8392, 15102)

In [40]:
# Preview the per-class probabilities predicted for the test rows (one column per class).
model.predict_proba(xtest_tfv)

array([[0.16204179, 0.05630826, 0.78164995],
       [0.8852836 , 0.08006336, 0.03465305],
       [0.3409618 , 0.64702109, 0.01201712],
       ...,
       [0.82935608, 0.08435844, 0.08628548],
       [0.18011817, 0.02492834, 0.79495349],
       [0.10903673, 0.88732942, 0.00363384]])

In [42]:
# Build the submission frame: one row per test id, one probability column per author.
# predict_proba is evaluated once and sliced, instead of being re-run for every column.
# NOTE(review): the EAP/HPL/MWS column order assumes it matches lbl_enc.classes_ - confirm.
test_proba = model.predict_proba(xtest_tfv)

df = pd.DataFrame(columns=["id", "EAP", "HPL", "MWS"])
df["id"] = test['id']
df["EAP"] = test_proba[:, 0]
df["HPL"] = test_proba[:, 1]
df["MWS"] = test_proba[:, 2]
df.to_csv('./prediction.csv', index=False)

In [43]:
# Show the first rows of the submission frame that was written to ./prediction.csv.
df.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.162042,0.056308,0.78165
1,id24541,0.885284,0.080063,0.034653
2,id00134,0.340962,0.647021,0.012017
3,id27757,0.493556,0.501627,0.004817
4,id04081,0.83457,0.110247,0.055183


# Build a second pipe-line 

In [74]:
# Second pipeline: vectorizer and classifier are tuned together, so the search is
# fed raw text and the TF-IDF step is refit inside every cross-validation fold.
# The on/off flags are real booleans because recent scikit-learn releases validate
# parameter types and reject the integer 1 for use_idf / smooth_idf.
tfv = TfidfVectorizer(min_df=1, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            use_idf=True, smooth_idf=True, sublinear_tf=False, stop_words='english')

mb = MultinomialNB()

clf = pipeline.Pipeline([('tfv', tfv), ('mb', mb)])

# Grid keys use the '<step name>__<parameter>' convention of the pipeline above.
param_grid = {'tfv__ngram_range' : [(1, 1), (1, 2), (2, 4)],
              'mb__alpha': [0.1, 0.5,  1.0]}

# `iid` is intentionally not passed: it was deprecated and later removed from
# GridSearchCV, so supplying it raises a TypeError on current releases.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain, ytrain)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV] mb__alpha=0.1, tfv__ngram_range=(1, 1) ..........................
[CV] mb__alpha=0.1, tfv__ngram_range=(1, 1) ..........................
[CV] mb__alpha=0.1, tfv__ngram_range=(1, 2) ..........................
[CV] mb__alpha=0.1, tfv__ngram_range=(1, 2) ..........................
[CV]  mb__alpha=0.1, tfv__ngram_range=(1, 1), score=-0.5039381043430531, total=   1.2s
[CV]  mb__alpha=0.1, tfv__ngram_range=(1, 1), score=-0.49443189410997285, total=   1.3s
[CV] mb__alpha=0.1, tfv__ngram_range=(2, 4) ..........................
[CV] mb__alpha=0.1, tfv__ngram_range=(2, 4) ..........................
[CV]  mb__alpha=0.1, tfv__ngram_range=(1, 2), score=-0.49960986214232195, total=   2.2s
[CV] mb__alpha=1.0, tfv__ngram_range=(1, 1) ..........................
[CV]  mb__alpha=0.1, tfv__ngram_range=(1, 2), score=-0.4914413902153785, total=   2.5s
[CV] mb__alpha=1.0, tfv__ngram_range=(1, 1) ..........................
[CV]  mb__alpha=1.0, t

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.2s


[CV]  mb__alpha=1.0, tfv__ngram_range=(1, 1), score=-0.6698978911845287, total=   1.6s
[CV] mb__alpha=1.0, tfv__ngram_range=(1, 2) ..........................
[CV]  mb__alpha=0.1, tfv__ngram_range=(2, 4), score=-0.8952759027046554, total=   6.6s
[CV] mb__alpha=1.0, tfv__ngram_range=(2, 4) ..........................
[CV]  mb__alpha=0.1, tfv__ngram_range=(2, 4), score=-0.8958904100442969, total=   6.7s
[CV] mb__alpha=1.0, tfv__ngram_range=(2, 4) ..........................
[CV]  mb__alpha=1.0, tfv__ngram_range=(1, 2), score=-0.7212198315655546, total=   3.8s
[CV] mb__alpha=10, tfv__ngram_range=(1, 1) ...........................
[CV]  mb__alpha=1.0, tfv__ngram_range=(1, 2), score=-0.7256043724888017, total=   3.6s
[CV] mb__alpha=10, tfv__ngram_range=(1, 1) ...........................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.4s


[CV]  mb__alpha=10, tfv__ngram_range=(1, 1), score=-0.9462171882438013, total=   1.7s
[CV] mb__alpha=10, tfv__ngram_range=(1, 2) ...........................
[CV]  mb__alpha=10, tfv__ngram_range=(1, 1), score=-0.9474584088889313, total=   1.7s
[CV] mb__alpha=10, tfv__ngram_range=(1, 2) ...........................
[CV]  mb__alpha=1.0, tfv__ngram_range=(2, 4), score=-1.0255528425476406, total=   7.4s
[CV] mb__alpha=10, tfv__ngram_range=(2, 4) ...........................


[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:   19.8s remaining:    7.6s


[CV]  mb__alpha=1.0, tfv__ngram_range=(2, 4), score=-1.0250757021574313, total=   7.4s
[CV] mb__alpha=10, tfv__ngram_range=(2, 4) ...........................
[CV]  mb__alpha=10, tfv__ngram_range=(1, 2), score=-0.9878162701596147, total=   4.8s
[CV]  mb__alpha=10, tfv__ngram_range=(1, 2), score=-0.9886457723455158, total=   4.8s


[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:   20.3s remaining:    4.1s


[CV]  mb__alpha=10, tfv__ngram_range=(2, 4), score=-1.0790228807512339, total=   3.9s
[CV]  mb__alpha=10, tfv__ngram_range=(2, 4), score=-1.0790759629192066, total=   4.1s


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   24.9s finished


Best score: -0.496
Best parameters set:
	mb__alpha: 0.1
	tfv__ngram_range: (1, 2)


# Try with a random search grid

In [77]:
from sklearn.model_selection import RandomizedSearchCV
import scipy as sp


In [82]:
# scipy's uniform(loc, scale) is uniform on [loc, loc + scale], i.e. [0.01, 0.06] here.
# NOTE(review): the search cell that follows builds its own distribution and does not
# reference this variable.
uniform = sp.stats.uniform(0.01, 0.05)

In [85]:
# Same text -> TF-IDF -> Naive Bayes pipeline as before, tuned by random sampling.
# The on/off flags are real booleans because recent scikit-learn releases validate
# parameter types and reject the integer 1 for use_idf / smooth_idf.
tfv = TfidfVectorizer(min_df=1, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            use_idf=True, smooth_idf=True, sublinear_tf=False, stop_words='english')

mb = MultinomialNB()

clf = pipeline.Pipeline([('tfv', tfv), ('mb', mb)])

# mb__alpha is drawn from uniform(loc=0.01, scale=0.05), i.e. the interval [0.01, 0.06];
# the n-gram range is picked from the listed options.
param_dis = {'tfv__ngram_range' : [(1, 1), (1, 2), (2, 4)],
             'mb__alpha': sp.stats.uniform(0.01, 0.05)}

# `iid` is intentionally not passed: it was deprecated and later removed from the
# search classes, so supplying it raises a TypeError on current releases.
# random_state pins the sampled candidates so a re-run reproduces the same search.
model = RandomizedSearchCV(estimator=clf, param_distributions=param_dis, n_iter=10,
                           scoring=mll_scorer, verbose=10, n_jobs=-1, refit=True,
                           cv=2, random_state=42)

# Fit Randomized Search Model
model.fit(xtrain, ytrain)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
# Report the parameters searched in THIS cell (param_dis), not the grid left over
# from the earlier grid-search cell.
for param_name in sorted(param_dis.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] mb__alpha=0.03000015827808987, tfv__ngram_range=(2, 4) ..........
[CV] mb__alpha=0.03000015827808987, tfv__ngram_range=(2, 4) ..........
[CV] mb__alpha=0.049367244385197545, tfv__ngram_range=(2, 4) .........
[CV] mb__alpha=0.049367244385197545, tfv__ngram_range=(2, 4) .........
[CV]  mb__alpha=0.03000015827808987, tfv__ngram_range=(2, 4), score=-0.883524073696141, total=   3.9s
[CV] mb__alpha=0.011329767222323702, tfv__ngram_range=(1, 1) .........
[CV]  mb__alpha=0.03000015827808987, tfv__ngram_range=(2, 4), score=-0.8805139842775999, total=   4.0s
[CV]  mb__alpha=0.049367244385197545, tfv__ngram_range=(2, 4), score=-0.8792038660007394, total=   4.0s
[CV]  mb__alpha=0.049367244385197545, tfv__ngram_range=(2, 4), score=-0.8781179408115126, total=   4.0s
[CV] mb__alpha=0.05314623116577253, tfv__ngram_range=(1, 1) ..........
[CV] mb__alpha=0.011329767222323702, tfv__ngram_range=(1, 1) .........
[CV] mb__alpha=0.053146231165

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.3s


[CV]  mb__alpha=0.05314623116577253, tfv__ngram_range=(1, 1), score=-0.4849256202369818, total=   1.7s
[CV]  mb__alpha=0.05314623116577253, tfv__ngram_range=(1, 1), score=-0.49655724232496307, total=   1.7s
[CV] mb__alpha=0.03388584381222716, tfv__ngram_range=(2, 4) ..........
[CV] mb__alpha=0.03388584381222716, tfv__ngram_range=(2, 4) ..........
[CV]  mb__alpha=0.04560163909446411, tfv__ngram_range=(1, 1), score=-0.48454300214409984, total=   1.8s
[CV] mb__alpha=0.03979681042368726, tfv__ngram_range=(1, 2) ..........
[CV]  mb__alpha=0.04560163909446411, tfv__ngram_range=(1, 1), score=-0.49675318125849804, total=   1.8s
[CV] mb__alpha=0.03979681042368726, tfv__ngram_range=(1, 2) ..........


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.2s


[CV]  mb__alpha=0.03979681042368726, tfv__ngram_range=(1, 2), score=-0.46667318310819883, total=   4.6s
[CV]  mb__alpha=0.03979681042368726, tfv__ngram_range=(1, 2), score=-0.4769108297148564, total=   4.6s
[CV] mb__alpha=0.03804031560695873, tfv__ngram_range=(1, 2) ..........
[CV] mb__alpha=0.03804031560695873, tfv__ngram_range=(1, 2) ..........
[CV]  mb__alpha=0.03388584381222716, tfv__ngram_range=(2, 4), score=-0.8812651929645912, total=   7.3s
[CV] mb__alpha=0.05388553728745226, tfv__ngram_range=(1, 1) ..........
[CV]  mb__alpha=0.03388584381222716, tfv__ngram_range=(2, 4), score=-0.8787744167247004, total=   7.5s
[CV] mb__alpha=0.05388553728745226, tfv__ngram_range=(1, 1) ..........
[CV]  mb__alpha=0.05388553728745226, tfv__ngram_range=(1, 1), score=-0.4849944230543829, total=   1.9s
[CV] mb__alpha=0.05064167389848534, tfv__ngram_range=(1, 1) ..........
[CV]  mb__alpha=0.05388553728745226, tfv__ngram_range=(1, 1), score=-0.49657499015717393, total=   1.9s
[CV] mb__alpha=0.05064167

[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:   20.8s remaining:    5.2s


[CV]  mb__alpha=0.03804031560695873, tfv__ngram_range=(1, 2), score=-0.4663578975060131, total=   4.1s
[CV]  mb__alpha=0.03804031560695873, tfv__ngram_range=(1, 2), score=-0.47672830606530714, total=   4.1s
[CV]  mb__alpha=0.05064167389848534, tfv__ngram_range=(1, 1), score=-0.484730315457137, total=   1.7s
[CV]  mb__alpha=0.05064167389848534, tfv__ngram_range=(1, 1), score=-0.496541848313532, total=   1.6s


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   22.9s finished


Best score: -0.472
Best parameters set:
	mb__alpha: 0.03804031560695873
	tfv__ngram_range: (1, 2)


In [86]:
import spacy


ModuleNotFoundError: No module named 'spacy'