In [1]:
import pandas as pd
import numpy as np
import gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

In [2]:
train = pd.read_csv('train.csv')
print(len(train))
train[0:10]

19579


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [3]:
test = pd.read_csv('test.csv')
print(len(test))
test[0:10]

8392


Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...
5,id27337,"""The thick and peculiar mist, or smoke, which ..."
6,id24265,"That which is not matter, is not at all unless..."
7,id25917,I sought for repose although I did not hope fo...
8,id04951,"Upon the fourth day of the assassination, a pa..."
9,id14549,"""The tone metaphysical is also a good one."


In [4]:
EAP = train[train['author']=='EAP'].reset_index(drop=True)
print("Size of Edgar Allan Poe dataset = {}".format(len(EAP)))
print("% of Edgar Allan Poe dataset = {0:.03f}".format(len(EAP)/len(train)))
EAP[0:10]

Size of Edgar Allan Poe dataset = 7900
% of Edgar Allan Poe dataset = 0.403


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id11008,"In his left hand was a gold snuff box, from wh...",EAP
2,id09674,"The astronomer, perhaps, at this point, took r...",EAP
3,id13515,The surcingle hung in ribands from my body.,EAP
4,id19322,I knew that you could not say to yourself 'ste...,EAP
5,id16607,"Here we barricaded ourselves, and, for the pre...",EAP
6,id17189,But a glance will show the fallacy of this idea.,EAP
7,id08441,"To these speeches they gave, of course, their ...",EAP
8,id14862,I even went so far as to speak of a slightly h...,EAP
9,id11411,Now the net work was not permanently fastened ...,EAP


In [5]:
HPL = train[train['author']=='HPL'].reset_index(drop=True)
print("Size of HP Lovercraft dataset = {}".format(len(HPL)))
print("% of HP Lovercraft dataset = {0:.03f}".format(len(HPL)/len(train)))
HPL[0:10]

Size of HP Lovercraft dataset = 5635
% of HP Lovercraft dataset = 0.288


Unnamed: 0,id,text,author
0,id17569,It never once occurred to me that the fumbling...,HPL
1,id12958,"Finding nothing else, not even gold, the Super...",HPL
2,id19764,Herbert West needed fresh bodies because his l...,HPL
3,id18886,The farm like grounds extended back very deepl...,HPL
4,id20836,"His facial aspect, too, was remarkable for its...",HPL
5,id08075,"It was not that the sounds were hideous, for t...",HPL
6,id27907,"Our compasses, depth gauges, and other delicat...",HPL
7,id08121,This the young warriors took back with them to...,HPL
8,id11733,Even now They talked in Their tombs.,HPL
9,id03205,Sheehan especially did they ply with inquiries...,HPL


In [6]:
MWS = train[train['author']=='MWS'].reset_index(drop=True)
print("Size of Mary Shelley dataset = {}".format(len(MWS)))
print("% of Marry Shelley dataset = {0:.03f}".format(len(MWS)/len(train)))
MWS[0:10]

Size of Mary Shelley dataset = 6044
% of Marry Shelley dataset = 0.309


Unnamed: 0,id,text,author
0,id27763,How lovely is spring As we looked from Windsor...,MWS
1,id22965,"A youth passed in solitude, my best years spen...",MWS
2,id00912,I confess that neither the structure of langua...,MWS
3,id16737,He shall find that I can feel my injuries; he ...,MWS
4,id12799,"He had escaped me, and I must commence a destr...",MWS
5,id13117,Her native sprightliness needed no undue excit...,MWS
6,id00764,"I was rich and young, and had a guardian appoi...",MWS
7,id00683,"We could make out little by the dim light, but...",MWS
8,id05258,"His soul overflowed with ardent affections, an...",MWS
9,id20751,"The visits of Merrival to Windsor, before freq...",MWS


In [7]:
label_enconder = preprocessing.LabelEncoder()
label_enconder.fit(train['author'])
train['label_encoded'] = label_enconder.transform(train['author'])
train[0:20]

Unnamed: 0,id,text,author,label_encoded
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1
5,id22965,"A youth passed in solitude, my best years spen...",MWS,2
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP,0
7,id13515,The surcingle hung in ribands from my body.,EAP,0
8,id19322,I knew that you could not say to yourself 'ste...,EAP,0
9,id00912,I confess that neither the structure of langua...,MWS,2


## Pre-processing text

In [8]:
def lower_case(text):
    return text.lower()

In [76]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rsilveira79/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
def transformText(text):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    
    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

In [78]:
train['text_processed']=train['text'].apply(lambda x: transformText(x))
train

Unnamed: 0,id,text,author,label_encoded,text_processed
0,id26305,"This process, however, afforded me no means of...",EAP,0,process howev afford mean ascertain dimens dun...
1,id17569,It never once occurred to me that the fumbling...,HPL,1,never occur fumbl might mere mistak
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0,left hand gold snuff box which caper hill cut ...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2,love spring look windsor terrac sixteen fertil...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1,find noth els even gold superintend abandon at...
5,id22965,"A youth passed in solitude, my best years spen...",MWS,2,youth pass solitud best year spent gentl femin...
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP,0,astronom perhap point took refug suggest non l...
7,id13515,The surcingle hung in ribands from my body.,EAP,0,surcingl hung riband bodi
8,id19322,I knew that you could not say to yourself 'ste...,EAP,0,knew could sai stereotomi without brought thin...
9,id00912,I confess that neither the structure of langua...,MWS,2,confess neither structur languag code govern p...


## Train_test split

In [79]:
x_train, x_test, y_train, y_test = train_test_split(train['text_processed'], train['label_encoded'], test_size = 0.2, random_state = 4)
true_label = np.array(y_test)
print("#" * 20 + " Some stats " + "#"*20)
print("Dataset training: {} uterances".format(x_train.shape[0]))
print("Dataset testing: {} uterances".format(x_test.shape[0]))
print("Different classes: {}".format(len(y_train.unique())))

#################### Some stats ####################
Dataset training: 15663 uterances
Dataset testing: 3916 uterances
Different classes: 3


In [80]:
features = TfidfVectorizer(
                        max_df = 0.5,
                        stop_words = 'english')

In [81]:
x_train_features = features.fit_transform(x_train)
x_test_features = features.transform(x_test)

In [82]:
model_sgd = SGDClassifier(penalty = 'l2', loss = 'log', class_weight = 'balanced')
model_sgd.fit(x_train_features,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [83]:
preds_sgd = model_sgd.predict(x_test_features)
print("Current Accuracy: {0:.3f}".format(accuracy_score(preds_sgd,true_label)))

Current Accuracy: 0.805


## Making a better classifier with Grid Search

Best parameters set:  
clf__penalty: 'l2'  
tfidf__norm: 'l2'  
tfidf__use_idf: True  
vect__max_df: 0.9  
vect__max_features: None  
vect__ngram_range: (1, 2)  
vect__stop_words: None  

In [86]:
features_2 = TfidfVectorizer(max_df = 0.9, ngram_range = (1,2), norm = 'l2')

In [88]:
x_train_features_2 = features_2.fit_transform(x_train)
x_test_features_2 = features_2.transform(x_test)

In [96]:
model_sgd_2 = LogisticRegression(penalty = 'l2')
model_sgd_2.fit(x_train_features_2,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [97]:
preds_sgd_2 = model_sgd_2.predict(x_test_features_2)
print("Current Accuracy: {0:.3f}".format(accuracy_score(preds_sgd_2,true_label)))

Current Accuracy: 0.805


In [98]:
my_sub_2 = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
my_sub_2=my_sub_2[['id', 'EAP','HPL', 'MWS']]
my_sub_2

Unnamed: 0,id,EAP,HPL,MWS


In [99]:
for i in range(len(test)):
    reply=get_txt_proba_response(transformText(test['text'][i]),features_2, model_sgd_2)
    my_sub_2.loc[i] = [test['id'][i], reply['EAP'], reply['HPL'], reply['MWS']]

In [100]:
my_sub_2.to_csv('roberto_3.csv',index=False)

### Making predictions

In [16]:
def get_txt_proba_response(msg, vectors, model):
    msg_vec = vectors.transform([msg])
#    print(msg_vec)
    pred_prob=model.predict_proba(msg_vec)
    pd_unsorted = pd.DataFrame(
        {'label_encode': model.classes_,
         'label_decode': label_enconder.inverse_transform(model.classes_),
         'pred_proba':  pred_prob[0]})
    
    probas = {
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[0][0]:
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[0][2],
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[1][0]:
        pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[1][2],
         pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[2][0]:
         pd_unsorted.sort_values(['pred_proba'], ascending=False,axis=0).iloc[2][2]}
    
    return probas

In [17]:
msg = (test['text'][0]).lower()
print(msg)
msg_vec = features.transform([msg])
print(msg_vec)

still, as i urged our leaving ireland with such inquietude and impatience, my father thought it best to yield.
  (0, 22613)	0.363206185203
  (0, 21447)	0.324597787876
  (0, 20104)	0.221377709078
  (0, 11589)	0.31261071479
  (0, 10923)	0.408377277829
  (0, 10585)	0.427050560563
  (0, 10076)	0.357781807416
  (0, 7599)	0.246656627889
  (0, 1941)	0.2792713697


In [18]:
get_txt_proba_response(msg,features, model_sgd)

{'EAP': 0.30619150628334907,
 'HPL': 0.15566281022546408,
 'MWS': 0.53814568349118674}

In [19]:
for msg in range(0,10):
    text = test['text'][msg]
    print('Sentence {} - {}'.format(msg, text))
    print('Prediction = {} \n'.format(get_txt_proba_response(text,features, model_sgd)))

Sentence 0 - Still, as I urged our leaving Ireland with such inquietude and impatience, my father thought it best to yield.
Prediction = {'MWS': 0.53814568349118674, 'EAP': 0.30619150628334907, 'HPL': 0.15566281022546408} 

Sentence 1 - If a fire wanted fanning, it could readily be fanned with a newspaper, and as the government grew weaker, I have no doubt that leather and iron acquired durability in proportion, for, in a very short time, there was not a pair of bellows in all Rotterdam that ever stood in need of a stitch or required the assistance of a hammer.
Prediction = {'EAP': 0.59990522533387403, 'HPL': 0.27629484381700581, 'MWS': 0.12379993084911999} 

Sentence 2 - And when they had broken down the frail door they found only this: two cleanly picked human skeletons on the earthen floor, and a number of singular beetles crawling in the shadowy corners.
Prediction = {'HPL': 0.46957481032034409, 'EAP': 0.41351314399610051, 'MWS': 0.11691204568355555} 

Sentence 3 - While I was thin

## Generating submissions

In [20]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission[0:10]

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698
5,id27337,0.403494,0.287808,0.308698
6,id24265,0.403494,0.287808,0.308698
7,id25917,0.403494,0.287808,0.308698
8,id04951,0.403494,0.287808,0.308698
9,id14549,0.403494,0.287808,0.308698


In [21]:
my_sub = pd.DataFrame(columns={'id', 'EAP','HPL', 'MWS'})
my_sub=my_sub[['id', 'EAP','HPL', 'MWS']]
my_sub

Unnamed: 0,id,EAP,HPL,MWS


In [23]:
for i in range(len(test)):
    reply=get_txt_proba_response(test['text'][i].lower(),features, model_sgd)
    my_sub.loc[i] = [test['id'][i], reply['EAP'], reply['HPL'], reply['MWS']]

In [25]:
my_sub.to_csv('roberto_2.csv',index=False)

## Testing other models

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time

In [27]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

In [None]:
CountVectorizer()

In [35]:
parameters = {
   # 'vect__min_df': (0.1,0.2,0.3, 0.5),
    'vect__max_df': (0.1, 0.5, 0.75, 0.9, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams,
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
   # 'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l1','l2', 'elasticnet'),
   # 'clf__n_iter': (10, 50, 80, 200),
   # 'clf__class_weight': (None, 'balanced')
}

In [36]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.1, 0.5, 0.75, 0.9, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'vect__stop_words': (None, 'english'), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__penalty': ('l1', 'l2', 'elasticnet')}
Fitting 3 folds for each of 1440 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 19.5min finished


done in 1173.555s

Best score: 0.815
Best parameters set:
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.9
	vect__max_features: None
	vect__ngram_range: (1, 2)
	vect__stop_words: None


In [59]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[0]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2
    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [60]:
len(true_label)

3916

In [61]:
len(preds_sgd)

3916

In [63]:
multiclass_logloss(true_label, preds_sgd)

10.081159708729144