<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Read-the-data" data-toc-modified-id="Read-the-data-0.0.0.1"><span class="toc-item-num">0.0.0.1&nbsp;&nbsp;</span>Read the data</a></span></li></ul></li></ul></li></ul></li><li><span><a href="#Predict" data-toc-modified-id="Predict-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Predict</a></span></li></ul></div>

In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import  pprint
import  subprocess 
import sys 
sys.path.append('../')
import pandas as pd

#### Read the data

In [8]:
# Load the training rows, the test rows and the sample submission file from the
# current working directory.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

Encode the author labels as numbers, then hold out a stratified 10% of the rows as a validation set.

In [12]:
# Turn the author column into numeric class labels; the fitted encoder is kept
# so the mapping stays available to later cells.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train['author'].values)

# Hold out 10% of the rows for validation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
xtrain, xvalid, ytrain, yvalid = model_selection.train_test_split(
    train['text'].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [15]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like with the actual target classes. A 1-D input is
        treated as integer class indices (column positions in ``predicted``)
        and is one-hot encoded; any other input is used as-is as the binary
        indicator matrix.
    :param predicted: 2-D array-like with class predictions, one probability
        per class (columns) for each sample (rows).
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before taking
        the logarithm so exact 0 or 1 values do not yield infinities.
    :return: Negative sum of ``actual * log(predicted)`` divided by the number
        of rows.
    """
    # Accept lists / Series as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Convert 'actual' to a binary indicator matrix if it's not already:
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Vectorised equivalent of setting one_hot[i, actual[i]] = 1 per row.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -(1.0 / rows) * np.sum(actual * np.log(clipped))

In [20]:
# Count features over word 1- to 3-grams, with the built-in English stop-word
# list applied.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is fitted on the training and validation texts together (no
# labels are involved); each part is then transformed separately.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

In [24]:
# Baseline: multinomial Naive Bayes trained on the count features and scored
# with log loss on the held-out validation split.
clf = MultinomialNB().fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.485 


In [25]:
# TF-IDF features over word 1- to 3-grams. Terms appearing in fewer than 3
# documents are dropped (min_df=3). The three flags are passed as real
# booleans: newer scikit-learn releases validate parameter types and no longer
# accept the integer 1 for a boolean option.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

# Fitting TF-IDF on the training and validation texts together (no labels are
# used). NOTE(review): the document-frequency statistics therefore include the
# validation rows; confirm this is intended.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [27]:
# Same Naive Bayes baseline, this time trained on the TF-IDF features and
# scored with log loss on the held-out validation split.
clf = MultinomialNB().fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.578 


In [37]:
def mll_scorer(estimator, X, y):
    """Scorer for the grid search: negated multi-class log loss.

    Written as a plain ``scorer(estimator, X, y)`` callable so it does not
    depend on ``make_scorer`` keyword arguments, which differ between
    scikit-learn releases. The loss is negated because the search maximises
    the score (the same effect as ``greater_is_better=False``).
    """
    return -multiclass_logloss(y, estimator.predict_proba(X))


nb_model = MultinomialNB()

# Create the pipeline (single step, so the tuned parameter is 'nb__alpha')
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid: smoothing strength of the Naive Bayes model
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model.
# The former `iid` argument is omitted: it was removed from GridSearchCV and
# passing it raises a TypeError on current scikit-learn.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # the full data could be used here; only xtrain is used.
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] nb__alpha=0.001 .................................................
[CV] nb__alpha=0.001 .................................................
[CV] nb__alpha=0.01 ..................................................
[CV] nb__alpha=0.01 ..................................................
[CV] ....... nb__alpha=0.001, score=-0.6414540936460588, total=   0.0s
[CV] nb__alpha=0.1 ...................................................
[CV] ....... nb__alpha=0.001, score=-0.6204702511359115, total=   0.1s
[CV] ........ nb__alpha=0.01, score=-0.5229890118962334, total=   0.1s
[CV] nb__alpha=0.1 ...................................................
[CV] ........ nb__alpha=0.01, score=-0.5107778556718855, total=   0.1s
[CV] nb__alpha=1 .....................................................
[CV] nb__alpha=1 .....................................................
[CV] ......... nb__alpha=0.1, score=-0.4891909737902392, total=   0.0s
[CV] nb__alpha=10

[Parallel(n_jobs=-1)]: Batch computation too fast (0.0872s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    0.2s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.3s remaining:    0.2s


[CV] .......... nb__alpha=10, score=-0.9496648029421535, total=   0.0s
[CV] nb__alpha=10 ....................................................
[CV] ........... nb__alpha=10, score=-0.950588400704189, total=   0.0s
[CV] nb__alpha=100 ...................................................
[CV] .......... nb__alpha=100, score=-1.067264698247982, total=   0.0s
[CV] nb__alpha=100 ...................................................
[CV] ......... nb__alpha=100, score=-1.0673577682186433, total=   0.0s
Best score: -0.492
Best parameters set:
	nb__alpha: 0.1


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.4s finished


In [29]:
# Inspection only: the attribute is referenced without being called, so this
# cell just displays the method object and produces no predictions.
model.predict_proba

<function sklearn.model_selection._search.BaseSearchCV.predict_proba>

#  Predict

In [34]:
# Turn the test texts into TF-IDF features using the `tfv` vectoriser, then
# display the shape of the resulting matrix.
xtest = test['text'].values
xtest_tfv = tfv.transform(xtest)
xtest_tfv.shape

(8392, 15102)

In [40]:
# Preview the class probabilities the fitted grid-search model assigns to the
# test features (one row per test text, one column per class).
model.predict_proba(xtest_tfv)

array([[0.16204179, 0.05630826, 0.78164995],
       [0.8852836 , 0.08006336, 0.03465305],
       [0.3409618 , 0.64702109, 0.01201712],
       ...,
       [0.82935608, 0.08435844, 0.08628548],
       [0.18011817, 0.02492834, 0.79495349],
       [0.10903673, 0.88732942, 0.00363384]])

In [42]:
# Run inference once and reuse the result; the original called predict_proba
# three times over the whole test matrix, once per output column.
test_proba = model.predict_proba(xtest_tfv)

# Submission frame: the row id followed by one probability column per author.
# NOTE(review): assumes probability columns 0, 1, 2 correspond to EAP, HPL, MWS
# in that order -- verify against lbl_enc.classes_.
df = pd.DataFrame({
    "id": test["id"].values,
    "EAP": test_proba[:, 0],
    "HPL": test_proba[:, 1],
    "MWS": test_proba[:, 2],
})
df.to_csv('./prediction.csv', index=False)

In [43]:
# Show the first rows of the submission frame as a quick visual check.
df.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.162042,0.056308,0.78165
1,id24541,0.885284,0.080063,0.034653
2,id00134,0.340962,0.647021,0.012017
3,id27757,0.493556,0.501627,0.004817
4,id04081,0.83457,0.110247,0.055183
