# Spooky Author Identification - Identify authors from their writings

### Loading Data Sets

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
# Read the train and test splits from the local input folder, then report
# how many rows each one holds.
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
for split_label, split_frame in (("training", train), ("test", test)):
    print("{} set size: {}".format(split_label, split_frame.shape[0]))

training set size: 19579
test set size: 8392


In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


We build a simple tokenizer function that removes stop words and punctuation characters. The Gensim algorithms work on tokenized sentences rather than on arbitrary documents, so this step has to be done explicitly; the `scikit-learn` vectorizers do it automatically.

In [6]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def tokenize(docs):
    """Split each document into word tokens, dropping stop words and punctuation.

    Parameters
    ----------
    docs : iterable of str
        Raw texts, one string per document.

    Returns
    -------
    list of list of str
        One token list per input document, in the same order. Tokens that are
        English stop words or single punctuation characters are removed.
    """
    # A set gives constant-time membership tests; the exclusion list used to be
    # a plain list that was scanned linearly for every single token.
    excluded = set(stopwords.words("english")) | set(punctuation)
    # NOTE(review): matching is case-sensitive, so a capitalised token is only
    # dropped if the stop-word list contains it in that exact casing.
    return [
        [token for token in word_tokenize(doc) if token not in excluded]
        for doc in docs
    ]

In [7]:
# Tokenize every training text; these token lists are the Word2Vec training input.
sentences = tokenize(train.text)

### Generate features and build models

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

#### Models without word embeddings 
* multinomial NB 
* bernoulli NB 
* SVC 

In [11]:
# All three baselines share the same TF-IDF front end: uni- to tri-grams that
# appear in at least five documents. Each pipeline gets its own vectorizer
# instance so they can be fitted independently.

# Multinomial naive Bayes
multi_nb = Pipeline(steps=[
    ("tfidf_vectorizer", TfidfVectorizer(ngram_range=(1, 3), min_df=5)),
    ("mutlinomial nb", MultinomialNB()),
])

# Bernoulli naive Bayes
bernoulli_nb = Pipeline(steps=[
    ("tfidf_vectorizer", TfidfVectorizer(ngram_range=(1, 3), min_df=5)),
    ("bernoulli nb", BernoulliNB()),
])

# Linear-kernel SVM; probability=True enables predict_proba
svc = Pipeline(steps=[
    ("tfidf_vectorizer", TfidfVectorizer(ngram_range=(1, 3), min_df=5)),
    ("linear svc", SVC(probability=True, kernel="linear")),
])

#### Models with word embeddings 
* Extra Trees Classifier
* Random Forest Classifier
<br>

Word2vec features replace the sparse word-count representation of each document with only a few hundred dense features.
<br>
The following is an implementation of an embedding vectorizer. Given a mapping from words to vectors, it vectorizes each text by taking the IDF-weighted mean of the vectors of the individual words it contains.

In [None]:
from gensim.models.word2vec import Word2Vec
# Train word vectors of size 300 on the tokenized training sentences, using a
# context window of 5 and skipping words that occur fewer than 5 times.
# NOTE(review): `size=` and `init_sims` look like the pre-4.0 gensim API -
# confirm the installed gensim version before re-running.
model = Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)
model.init_sims(replace=True) # normalise the vectors in place and drop the originals to free memory

In [None]:
# model.wv is the KeyedVectors object that holds the trained vocabulary and
# its vectors. Pair each vocabulary word with its vector (both sequences are
# in the same order) to get a plain word -> vector dictionary.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [None]:
class EmbeddingVectorizer(object):
    """Turn documents into dense vectors by averaging weighted word embeddings.

    Each document becomes the mean of ``word2vec[w] * word2weight[w]`` over the
    tokens ``w`` that have an embedding. ``word2weight`` holds the IDF weights
    learnt in ``fit``. Documents without any known token map to a zero vector.

    Parameters
    ----------
    word2vec : mapping of str -> 1-D array
        Word embeddings; must support ``in`` and item lookup.
    tokenizer : callable, optional
        Function turning a raw string into a list of tokens. Only used when a
        document is passed as a string. Defaults to whitespace splitting;
        pass the tokenizer the embeddings were trained with for best coverage.
    """

    def __init__(self, word2vec, tokenizer=None):
        self.word2vec = word2vec
        self.word2weight = None
        self.tokenizer = tokenizer
        # Take the embedding width from the vectors themselves so the zero
        # vector used for unknown-only documents always has a matching shape.
        # 300 remains the fallback when the mapping is empty or not dict-like.
        first_vector = None
        if hasattr(word2vec, "values"):
            first_vector = next(iter(word2vec.values()), None)
        self.dim = 300 if first_vector is None else len(first_vector)

    def _tokens(self, doc):
        """Return the token sequence for one document (string or token list)."""
        if isinstance(doc, str):
            # Iterating a raw string would yield single characters, not words.
            return self.tokenizer(doc) if self.tokenizer is not None else doc.split()
        return doc

    def fit(self, X, y=None):
        """Learn one IDF weight per term from the documents in ``X``.

        ``X`` may contain raw strings or token sequences; token sequences are
        joined with spaces before being handed to ``TfidfVectorizer``. ``y`` is
        accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        texts = [doc if isinstance(doc, str) else " ".join(doc) for doc in X]
        vect = TfidfVectorizer(min_df=5, ngram_range=(1,3))
        vect.fit(texts)
        # Terms never seen during fit are treated as maximally rare.
        max_idf = max(vect.idf_)
        # NOTE(review): the weights are keyed by the vectorizer's vocabulary,
        # while transform() looks tokens up exactly as given - confirm that
        # the casing of both agrees.
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, vect.idf_[i]) for w, i in vect.vocabulary_.items()]
        )

        return self

    def transform(self, X):
        """Return an array with one ``dim``-sized row per document in ``X``.

        Documents may be raw strings or token sequences. Tokens without an
        embedding are skipped; a document with no known token yields zeros.
        """
        rows = []
        for doc in X:
            weighted = [
                self.word2vec[w] * self.word2weight[w]
                for w in self._tokens(doc) if w in self.word2vec
            ]
            rows.append(np.mean(weighted or [np.zeros(self.dim)], axis=0))
        return np.array(rows)
        

In [None]:
# Tree ensembles on top of the averaged word-embedding features. Each pipeline
# owns a separate vectorizer instance built from the same w2v mapping.
etree = Pipeline(steps=[
    ("word2vec vectorier", EmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=100)),
])

random_forest = Pipeline(steps=[
    ("word2vec vectorier", EmbeddingVectorizer(w2v)),
    ("random forest", RandomForestClassifier(n_estimators=100)),
])

Cross Validation Scores

In [None]:
from sklearn.model_selection import cross_val_score
from tabulate import tabulate

# Candidate pipelines to score. Entries that are commented out are skipped,
# so only the linear SVC is evaluated in this run.
models = [
    #("multi_nb", multi_nb),
    #("bernoulli_nb", bernoulli_nb),
    ("svc", svc),
    #("etree", etree),
    #("random_forest", random_forest)
]

# For each pipeline: mean of the per-fold scores from 5-fold cross-validation
# on the raw training texts. Kept as a comprehension so the loop variable
# `model` does not overwrite the notebook-level Word2Vec `model`.
scores =  [(name, cross_val_score(model, train.text, train.author, cv=5).mean())
                for name, model in models]

print(tabulate(scores, floatfmt=".4f", headers=("model", "score")))

In [13]:
# Fit the svc pipeline on the full training set:
# train.text (X_train) and train.author (y_train)
svc.fit(train.text, train.author)

# Predict per-class probabilities for the test texts
pred = svc.predict_proba(test.text)

### Hyperparameter Tuning

In [35]:
# Check out the parameter names the svc pipeline exposes; these are the keys
# a GridSearchCV parameter grid can refer to.
svc.get_params().keys()


dict_keys(['memory', 'steps', 'tfidf_vectorizer', 'linear svc', 'tfidf_vectorizer__analyzer', 'tfidf_vectorizer__binary', 'tfidf_vectorizer__decode_error', 'tfidf_vectorizer__dtype', 'tfidf_vectorizer__encoding', 'tfidf_vectorizer__input', 'tfidf_vectorizer__lowercase', 'tfidf_vectorizer__max_df', 'tfidf_vectorizer__max_features', 'tfidf_vectorizer__min_df', 'tfidf_vectorizer__ngram_range', 'tfidf_vectorizer__norm', 'tfidf_vectorizer__preprocessor', 'tfidf_vectorizer__smooth_idf', 'tfidf_vectorizer__stop_words', 'tfidf_vectorizer__strip_accents', 'tfidf_vectorizer__sublinear_tf', 'tfidf_vectorizer__token_pattern', 'tfidf_vectorizer__tokenizer', 'tfidf_vectorizer__use_idf', 'tfidf_vectorizer__vocabulary', 'linear svc__C', 'linear svc__cache_size', 'linear svc__class_weight', 'linear svc__coef0', 'linear svc__decision_function_shape', 'linear svc__degree', 'linear svc__gamma', 'linear svc__kernel', 'linear svc__max_iter', 'linear svc__probability', 'linear svc__random_state', 'linear svc

In [36]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVC's C parameter. The key is the pipeline step
# name ("linear svc"), a double underscore, then the parameter name.
pg = {'linear svc__C':[0.1, 1, 10, 100]} # parameter_grid

# Search over the grid using 5-fold cross-validation
grid = GridSearchCV(svc, param_grid=pg, cv=5)

grid.fit(train.text, train.author)

# Show the best parameter combination found
print(grid.best_params_)

# grid.best_score_

{'linear svc__C': 1}


A single word is represented as a vector of 300 numbers. To use the word2vec model to generate features for an ML algorithm,
each document has to be converted into a feature vector. Here a document is represented as the average vector of all the words it contains.

In [37]:
def save_to_csv(pred, idx):
    """Write predicted author probabilities to ``./output/output<idx>.csv``.

    Parameters
    ----------
    pred : array-like with three columns
        Per-row probabilities, labelled as the 'EAP', 'HPL' and 'MWS' columns.
    idx : any
        Suffix appended to the output file name via ``str``.

    The ids come from the notebook-level ``test`` frame and are placed in
    front of the probability columns.
    """
    probabilities = pd.DataFrame(pred, columns=['EAP','HPL','MWS'])
    id_column = pd.Series(test.id)
    submission = pd.concat([id_column, probabilities], axis=1)
    destination = "./output/output" + str(idx) + ".csv"
    submission.to_csv(destination, sep=",", index=False)

In [None]:
# Write the current predictions to ./output/output4.csv
save_to_csv(pred, 4)

In [15]:
# Wrap the predicted probabilities in a frame with one column per author.
# NOTE(review): assumes the columns of `pred` are ordered EAP, HPL, MWS - confirm against svc.classes_.
results_df = pd.DataFrame(pred, columns=['EAP','HPL','MWS'])

In [16]:
# Put the test ids next to the probability columns and write the result as CSV
merged_df = pd.concat([pd.Series(test.id), results_df], axis=1)
merged_df.to_csv("./output/output3.csv", sep=",",index=False)