In [1]:
# Imports
import pandas as pd
import sys
import glob
import errno
import csv
import numpy as np
from nltk.corpus import stopwords
import re
import nltk.data
import nltk
import os
from collections import OrderedDict
from subprocess import check_call
from shutil import copyfile
from sklearn.metrics import log_loss
%matplotlib inline
import matplotlib.pyplot as plt
import mpld3
mpld3.enable_notebook()
import seaborn as sns
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


In [52]:
# Read data
train = "../data/train.csv"
test = "../data/test.csv"
wv = "../../../../glove.6B/glove.6B.50d.txt"
X_train = pd.read_csv( train, header=0,delimiter="," )
X_test = pd.read_csv( test, header=0,delimiter="," )

word_vecs = {}
with open(wv) as f:
    for line in f:
       vals = line.split()
       word_vecs[vals[0]] = np.array(vals[1::],dtype=float)
authors = ['EAP','MWS','HPL']

Y_train = LabelEncoder().fit_transform(X_train['author'])

In [3]:
# Clean data
def getWordVectors(X_train,X_test,word_vecs):
    X_train['word_vectors'] = [ [ word_vecs[word] for word in sentence if word in word_vecs] for sentence in X_train['text']]
    X_test['word_vectors'] = [ [ word_vecs[word] for word in sentence if word in word_vecs] for sentence in X_test['text']] 
    return X_train,X_test

def getSentenceVectors(X_train,X_test):
    X_train['sentence_vectors'] =[np.mean(sentence,axis = 0) for sentence in X_train['word_vectors']]
    X_test['sentence_vectors'] =[np.mean(sentence,axis = 0) for sentence in X_test['word_vectors']] 
    return X_train,X_test

def clean(X_train,X_test):
    X_train['words'] = [re.sub("[^a-zA-Z]"," ", data).lower().split() for data in X_train['text']]
    X_test['words'] = [re.sub("[^a-zA-Z]"," ", data).lower().split() for data in X_test['text']]
    return X_train,X_test
X_train,X_test = clean(X_train,X_test)
X_train,X_test = getWordVectors(X_train,X_test,word_vecs)
X_train,X_test = getSentenceVectors(X_train,X_test)
#X_train.head()

In [37]:
# Feature Engineering
# Punctuation
punctuations = [{"id":1,"p":"[;:]"},{"id":2,"p":"[,.]"},{"id":3,"p":"[?]"},{"id":4,"p":"[\']"},{"id":5,"p":"[\"]"},{"id":6,"p":"[;:,.?\'\"]"}]
for p in punctuations:
    punctuation = p["p"]
    _train =  [ sentence.split() for sentence in X_train['text'] ]
    X_train['punc_'+str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))])*100.0/len(sentence) for sentence in _train]    

    _test =  [ sentence.split() for sentence in X_test['text'] ]
    X_test['punc_'+str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))])*100.0/len(sentence) for sentence in _test]    



In [38]:
# Feature Engineering
# Stop Words
_dist_train = [x for x in X_train['words']]
X_train['stop_word'] = [len([word for word in sentence if word in stopwords.words('english')])*100.0/len(sentence) for sentence in _dist_train]

_dist_test = [x for x in X_test['words']]
X_test['stop_word'] = [len([word for word in sentence if word in stopwords.words('english')])*100.0/len(sentence) for sentence in _dist_test]    

In [39]:
# Feature Engineering
# tfidf - words - nb+svd
def tfidfWords(X_train,X_test):
    tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
    full_tfidf = tfidf_vec.fit_transform(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_tfidf = tfidf_vec.transform(X_train['text'].values.tolist())
    test_tfidf = tfidf_vec.transform(X_test['text'].values.tolist())
    return train_tfidf,test_tfidf,full_tfidf
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def runSVD(full_tfidf,train_tfidf,test_tfidf):   
    n_comp = 20
    svd_obj = TruncatedSVD(n_components=n_comp)
    svd_obj.fit(full_tfidf)
    train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
    test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))

    train_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
    test_svd.columns = ['svd_word_'+str(i) for i in range(n_comp)]
    return train_svd,test_svd

def do_tfidf_MNB(X_train,X_test,Y_train):
    train_tfidf,test_tfidf,full_tfidf = tfidfWords(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test

def do_tfidf_SVD(X_train,X_test,Y_train):
    train_tfidf,test_tfidf,full_tfidf = tfidfWords(X_train,X_test)
    train_svd,test_svd = runSVD(full_tfidf,train_tfidf,test_tfidf)
    return train_svd,test_svd

pred_train,pred_test = do_tfidf_MNB(X_train,X_test,Y_train)
X_train["tfidf_words_nb_eap"] = pred_train[:,0]
X_train["tfidf_words_nb_hpl"] = pred_train[:,1]
X_train["tfidf_words_nb_mws"] = pred_train[:,2]
X_test["tfidf_words_nb_eap"] = pred_test[:,0]
X_test["tfidf_words_nb_hpl"] = pred_test[:,1]
X_test["tfidf_words_nb_mws"] = pred_test[:,2]

# pred_train,pred_test = do_tfidf_SVD(X_train,X_test,Y_train)
# print pred_train
# # X_train["tfidf_words_nb_eap"] = pred_train[:,0]
# # X_train["tfidf_words_nb_hpl"] = pred_train[:,1]
# # X_train["tfidf_words_nb_mws"] = pred_train[:,2]
# # X_test["tfidf_words_nb_eap"] = pred_test[:,0]
# # X_test["tfidf_words_nb_hpl"] = pred_test[:,1]
# # X_test["tfidf_words_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 0.84221619836128525)


In [40]:
# Feature Engineering
# tfidf - chars - nb+svd
def tfidfWords(X_train,X_test):
    tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,5),analyzer='char')
    full_tfidf = tfidf_vec.fit_transform(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_tfidf = tfidf_vec.transform(X_train['text'].values.tolist())
    test_tfidf = tfidf_vec.transform(X_test['text'].values.tolist())
    return train_tfidf,test_tfidf
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def do(X_train,X_test,Y_train):
    train_tfidf,test_tfidf = tfidfWords(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test
pred_train,pred_test = do(X_train,X_test,Y_train)
X_train["tfidf_chars_nb_eap"] = pred_train[:,0]
X_train["tfidf_chars_nb_hpl"] = pred_train[:,1]
X_train["tfidf_chars_nb_mws"] = pred_train[:,2]
X_test["tfidf_chars_nb_eap"] = pred_test[:,0]
X_test["tfidf_chars_nb_hpl"] = pred_test[:,1]
X_test["tfidf_chars_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 0.7904152589474216)


In [41]:
# Feature Engineering
# count - words - nb
def countWords(X_train,X_test):
    count_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
    count_vec.fit(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_count = count_vec.transform(X_train['text'].values.tolist())
    test_count = count_vec.transform(X_test['text'].values.tolist())
    return train_count,test_count
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def do_count_MNB(X_train,X_test,Y_train):
    train_count,test_count=countWords(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_count[dev_index], train_count[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_count)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test

pred_train,pred_test = do_count_MNB(X_train,X_test,Y_train)
X_train["count_words_nb_eap"] = pred_train[:,0]
X_train["count_words_nb_hpl"] = pred_train[:,1]
X_train["count_words_nb_mws"] = pred_train[:,2]
X_test["count_words_nb_eap"] = pred_test[:,0]
X_test["count_words_nb_hpl"] = pred_test[:,1]
X_test["count_words_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 0.45091841616567468)


In [42]:
# Feature Engineering
# count - chars - nb
def countChars(X_train,X_test):
    count_vec = CountVectorizer(ngram_range=(1,7),analyzer='char')
    count_vec.fit(X_train['text'].values.tolist() + X_test['text'].values.tolist())
    train_count = count_vec.transform(X_train['text'].values.tolist())
    test_count = count_vec.transform(X_test['text'].values.tolist())
    return train_count,test_count
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

def do_count_chars_MNB(X_train,X_test,Y_train):
    train_count,test_count=countChars(X_train,X_test)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([X_train.shape[0], 3])
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = train_count[dev_index], train_count[val_index]
        dev_y, val_y = Y_train[dev_index], Y_train[val_index]
        pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_count)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("Mean cv score : ", np.mean(cv_scores))
    pred_full_test = pred_full_test / 5.
    return pred_train,pred_full_test

pred_train,pred_test = do_count_chars_MNB(X_train,X_test,Y_train)
X_train["count_chars_nb_eap"] = pred_train[:,0]
X_train["count_chars_nb_hpl"] = pred_train[:,1]
X_train["count_chars_nb_mws"] = pred_train[:,2]
X_test["count_chars_nb_eap"] = pred_test[:,0]
X_test["count_chars_nb_hpl"] = pred_test[:,1]
X_test["count_chars_nb_mws"] = pred_test[:,2]

('Mean cv score : ', 3.750763922681903)


In [105]:
# Word Embeddings
# this function creates a normalized vector for the whole sentence
#X_train['sentence_vectors'][0]

#foo.columns = ['svd_char_'+str(i) for i in range(n_comp)]
X_train = pd.concat([X_train,pd.DataFrame(X_train['sentence_vectors'].tolist())],axis=1)
X_test = pd.concat([X_test,pd.DataFrame(X_test['sentence_vectors'].tolist())],axis=1)

# xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
# xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]
# xtrain_glove = np.array(xtrain_glove)
# xvalid_glove = np.array(xvalid_glove)

In [60]:
# load the GloVe vectors in a dictionary:

def loadWordVecs():
    embeddings_index = {}
    f = open(wv)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index

def sent2vec(embeddings_index,s): # this function creates a normalized vector for the whole sentence
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(50)
    return v / np.sqrt((v ** 2).sum())

def doGlove(x_train,x_test):
    embeddings_index = loadWordVecs()
    # create sentence vectors using the above function for training and validation set
    xtrain_glove = [sent2vec(embeddings_index,x) for x in tqdm(x_train)]
    xtest_glove = [sent2vec(embeddings_index,x) for x in tqdm(x_test)]
    xtrain_glove = np.array(xtrain_glove)
    xtest_glove = np.array(xtest_glove)
    return xtrain_glove,xtest_glove

glove_vecs_train,glove_vecs_test = doGlove(X_train['text'],X_test['text'])
X_train[['sent_vec_'+str(i) for i in range(50)]] = pd.DataFrame(glove_vecs_train.tolist())
X_test[['sent_vec_'+str(i) for i in range(50)]] = pd.DataFrame(glove_vecs_test.tolist())




  0%|          | 0/19579 [00:00<?, ?it/s][A
  0%|          | 16/19579 [00:00<02:12, 147.18it/s][A

Found 400000 word vectors.



  0%|          | 27/19579 [00:00<02:37, 123.78it/s][A
  0%|          | 42/19579 [00:00<02:30, 129.93it/s][A
  0%|          | 59/19579 [00:00<02:20, 139.13it/s][A
  0%|          | 72/19579 [00:00<02:22, 136.52it/s][A
  0%|          | 89/19579 [00:00<02:17, 141.29it/s][A
  1%|          | 105/19579 [00:00<02:15, 143.44it/s][A
  1%|          | 124/19579 [00:00<02:10, 148.54it/s][A
  1%|          | 140/19579 [00:00<02:11, 147.53it/s][A
  1%|          | 157/19579 [00:01<02:09, 149.59it/s][A
  1%|          | 177/19579 [00:01<02:06, 153.17it/s][A
  1%|          | 194/19579 [00:01<02:06, 153.01it/s][A
  1%|          | 211/19579 [00:01<02:07, 151.43it/s][A
  1%|          | 227/19579 [00:01<02:09, 148.94it/s][A
  1%|▏         | 248/19579 [00:01<02:06, 152.45it/s][A
Exception in thread Thread-15:
Traceback (most recent call last):
  File "/Users/16521/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/Users/16521/anaconda2/lib/python2.7/sit

 13%|█▎        | 2589/19579 [00:15<01:39, 170.09it/s][A
 13%|█▎        | 2608/19579 [00:15<01:39, 170.04it/s][A
 13%|█▎        | 2626/19579 [00:15<01:39, 169.69it/s][A
 14%|█▎        | 2646/19579 [00:15<01:39, 169.85it/s][A
 14%|█▎        | 2666/19579 [00:15<01:39, 170.03it/s][A
 14%|█▎        | 2684/19579 [00:15<01:39, 169.98it/s][A
 14%|█▍        | 2705/19579 [00:15<01:39, 170.21it/s][A
 14%|█▍        | 2724/19579 [00:16<01:39, 169.78it/s][A
 14%|█▍        | 2747/19579 [00:16<01:38, 170.13it/s][A
 14%|█▍        | 2766/19579 [00:16<01:38, 169.99it/s][A
 14%|█▍        | 2784/19579 [00:16<01:38, 170.01it/s][A
 14%|█▍        | 2802/19579 [00:16<01:38, 170.05it/s][A
 14%|█▍        | 2821/19579 [00:16<01:38, 170.15it/s][A
 15%|█▍        | 2844/19579 [00:16<01:38, 170.40it/s][A
 15%|█▍        | 2863/19579 [00:16<01:38, 170.40it/s][A
 15%|█▍        | 2882/19579 [00:16<01:37, 170.44it/s][A
 15%|█▍        | 2901/19579 [00:17<01:37, 170.54it/s][A
 15%|█▍        | 2920/19579 [00

 26%|██▌       | 5057/19579 [00:31<01:29, 162.27it/s][A
 26%|██▌       | 5074/19579 [00:31<01:29, 162.11it/s][A
 26%|██▌       | 5090/19579 [00:31<01:29, 162.08it/s][A
 26%|██▌       | 5106/19579 [00:31<01:29, 161.99it/s][A
 26%|██▌       | 5121/19579 [00:31<01:29, 161.78it/s][A
 26%|██▌       | 5139/19579 [00:31<01:29, 161.75it/s][A
 26%|██▋       | 5154/19579 [00:31<01:29, 161.68it/s][A
 26%|██▋       | 5169/19579 [00:31<01:29, 161.54it/s][A
 26%|██▋       | 5183/19579 [00:32<01:29, 161.45it/s][A
 27%|██▋       | 5202/19579 [00:32<01:29, 161.51it/s][A
 27%|██▋       | 5217/19579 [00:32<01:28, 161.39it/s][A
 27%|██▋       | 5233/19579 [00:32<01:28, 161.35it/s][A
 27%|██▋       | 5248/19579 [00:32<01:28, 161.31it/s][A
 27%|██▋       | 5263/19579 [00:32<01:28, 161.26it/s][A
 27%|██▋       | 5279/19579 [00:32<01:28, 161.23it/s][A
 27%|██▋       | 5294/19579 [00:32<01:28, 161.18it/s][A
 27%|██▋       | 5309/19579 [00:32<01:28, 161.11it/s][A
 27%|██▋       | 5326/19579 [00

 39%|███▉      | 7620/19579 [00:47<01:13, 162.12it/s][A
 39%|███▉      | 7638/19579 [00:47<01:13, 162.15it/s][A
 39%|███▉      | 7656/19579 [00:47<01:13, 162.18it/s][A
 39%|███▉      | 7676/19579 [00:47<01:13, 162.26it/s][A
 39%|███▉      | 7699/19579 [00:47<01:13, 162.39it/s][A
 39%|███▉      | 7719/19579 [00:47<01:13, 162.42it/s][A
 40%|███▉      | 7738/19579 [00:47<01:12, 162.37it/s][A
 40%|███▉      | 7756/19579 [00:47<01:12, 162.37it/s][A
 40%|███▉      | 7774/19579 [00:47<01:12, 162.36it/s][A
 40%|███▉      | 7791/19579 [00:47<01:12, 162.37it/s][A
 40%|███▉      | 7808/19579 [00:48<01:12, 162.34it/s][A
 40%|███▉      | 7825/19579 [00:48<01:12, 162.32it/s][A
 40%|████      | 7841/19579 [00:48<01:12, 162.27it/s][A
 40%|████      | 7862/19579 [00:48<01:12, 162.37it/s][A
 40%|████      | 7880/19579 [00:48<01:12, 162.39it/s][A
 40%|████      | 7899/19579 [00:48<01:11, 162.44it/s][A
 40%|████      | 7917/19579 [00:48<01:11, 162.40it/s][A
 41%|████      | 7934/19579 [00

 53%|█████▎    | 10359/19579 [01:02<00:55, 164.67it/s][A
 53%|█████▎    | 10376/19579 [01:03<00:55, 164.67it/s][A
 53%|█████▎    | 10392/19579 [01:03<00:55, 164.65it/s][A
 53%|█████▎    | 10410/19579 [01:03<00:55, 164.66it/s][A
 53%|█████▎    | 10429/19579 [01:03<00:55, 164.68it/s][A
 53%|█████▎    | 10446/19579 [01:03<00:55, 164.68it/s][A
 53%|█████▎    | 10467/19579 [01:03<00:55, 164.73it/s][A
 54%|█████▎    | 10485/19579 [01:03<00:55, 164.69it/s][A
 54%|█████▎    | 10506/19579 [01:03<00:55, 164.75it/s][A
 54%|█████▍    | 10529/19579 [01:03<00:54, 164.84it/s][A
 54%|█████▍    | 10549/19579 [01:03<00:54, 164.88it/s][A
 54%|█████▍    | 10569/19579 [01:04<00:54, 164.93it/s][A
 54%|█████▍    | 10589/19579 [01:04<00:54, 164.98it/s][A
 54%|█████▍    | 10609/19579 [01:04<00:54, 164.98it/s][A
 54%|█████▍    | 10628/19579 [01:04<00:54, 164.98it/s][A
 54%|█████▍    | 10646/19579 [01:04<00:54, 164.94it/s][A
 54%|█████▍    | 10663/19579 [01:04<00:54, 164.89it/s][A
 55%|█████▍   

 67%|██████▋   | 13041/19579 [01:18<00:39, 166.40it/s][A
 67%|██████▋   | 13058/19579 [01:18<00:39, 166.37it/s][A
 67%|██████▋   | 13076/19579 [01:18<00:39, 166.38it/s][A
 67%|██████▋   | 13093/19579 [01:18<00:38, 166.36it/s][A
 67%|██████▋   | 13114/19579 [01:18<00:38, 166.40it/s][A
 67%|██████▋   | 13132/19579 [01:18<00:38, 166.38it/s][A
 67%|██████▋   | 13149/19579 [01:19<00:38, 166.34it/s][A
 67%|██████▋   | 13165/19579 [01:19<00:38, 166.32it/s][A
 67%|██████▋   | 13181/19579 [01:19<00:38, 166.29it/s][A
 67%|██████▋   | 13199/19579 [01:19<00:38, 166.31it/s][A
 68%|██████▊   | 13218/19579 [01:19<00:38, 166.33it/s][A
 68%|██████▊   | 13236/19579 [01:19<00:38, 166.34it/s][A
 68%|██████▊   | 13254/19579 [01:19<00:38, 166.31it/s][A
 68%|██████▊   | 13278/19579 [01:19<00:37, 166.40it/s][A
 68%|██████▊   | 13297/19579 [01:19<00:37, 166.41it/s][A
 68%|██████▊   | 13316/19579 [01:20<00:37, 166.41it/s][A
 68%|██████▊   | 13337/19579 [01:20<00:37, 166.45it/s][A
 68%|██████▊  

 80%|███████▉  | 15620/19579 [01:33<00:23, 166.53it/s][A
 80%|███████▉  | 15639/19579 [01:33<00:23, 166.56it/s][A
 80%|███████▉  | 15658/19579 [01:34<00:23, 166.56it/s][A
 80%|████████  | 15676/19579 [01:34<00:23, 166.56it/s][A
 80%|████████  | 15694/19579 [01:34<00:23, 166.54it/s][A
 80%|████████  | 15711/19579 [01:34<00:23, 166.51it/s][A
 80%|████████  | 15728/19579 [01:34<00:23, 166.51it/s][A
 80%|████████  | 15745/19579 [01:34<00:23, 166.49it/s][A
 81%|████████  | 15762/19579 [01:34<00:22, 166.49it/s][A
 81%|████████  | 15778/19579 [01:34<00:22, 166.46it/s][A
 81%|████████  | 15797/19579 [01:34<00:22, 166.48it/s][A
 81%|████████  | 15814/19579 [01:34<00:22, 166.47it/s][A
 81%|████████  | 15831/19579 [01:35<00:22, 166.47it/s][A
 81%|████████  | 15848/19579 [01:35<00:22, 166.47it/s][A
 81%|████████  | 15866/19579 [01:35<00:22, 166.48it/s][A
 81%|████████  | 15885/19579 [01:35<00:22, 166.50it/s][A
 81%|████████  | 15906/19579 [01:35<00:22, 166.53it/s][A
 81%|████████▏

 93%|█████████▎| 18216/19579 [01:49<00:08, 166.70it/s][A
 93%|█████████▎| 18233/19579 [01:49<00:08, 166.68it/s][A
 93%|█████████▎| 18249/19579 [01:49<00:07, 166.66it/s][A
 93%|█████████▎| 18270/19579 [01:49<00:07, 166.70it/s][A
 93%|█████████▎| 18290/19579 [01:49<00:07, 166.72it/s][A
 94%|█████████▎| 18310/19579 [01:49<00:07, 166.75it/s][A
 94%|█████████▎| 18329/19579 [01:49<00:07, 166.77it/s][A
 94%|█████████▎| 18348/19579 [01:50<00:07, 166.78it/s][A
 94%|█████████▍| 18368/19579 [01:50<00:07, 166.81it/s][A
 94%|█████████▍| 18387/19579 [01:50<00:07, 166.80it/s][A
 94%|█████████▍| 18405/19579 [01:50<00:07, 166.78it/s][A
 94%|█████████▍| 18425/19579 [01:50<00:06, 166.81it/s][A
 94%|█████████▍| 18443/19579 [01:50<00:06, 166.82it/s][A
 94%|█████████▍| 18461/19579 [01:50<00:06, 166.78it/s][A
 94%|█████████▍| 18478/19579 [01:50<00:06, 166.76it/s][A
 94%|█████████▍| 18495/19579 [01:50<00:06, 166.75it/s][A
 95%|█████████▍| 18511/19579 [01:51<00:06, 166.71it/s][A
 95%|█████████

In [107]:
# LSTM

def doAddNN(X_train,X_test,pred_train,pred_test):
    X_train["nn_eap"] = pred_train[:,0]
    X_train["nn_hpl"] = pred_train[:,1]
    X_train["nn_mws"] = pred_train[:,2]
    X_test["nn_eap"] = pred_test[:,0]
    X_test["nn_hpl"] = pred_test[:,1]
    X_test["nn_mws"] = pred_test[:,2]
    return X_train,X_test

def initNN():
    # create a simple 3 layer sequential neural net
    model = Sequential()

    model.add(Dense(1024, input_dim=50, activation='relu'))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())

    model.add(Dense(3))
    model.add(Activation('softmax'))

    # compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def doNN():
    #glove_vecs_train,glove_vecs_test = doGlove(X_train['text'],X_test['text'])
    # scale the data before any neural net:
    embeddings_index = loadWordVecs()
    scl = preprocessing.StandardScaler()
    xtrain_glove_scl = scl.fit_transform(glove_vecs_train)
    xtest_glove_scl = scl.transform(glove_vecs_test)

    # we need to binarize the labels for the neural net
    ytrain_enc = np_utils.to_categorical(Y_train)
    #yvalid_enc = np_utils.to_categorical(yvalid)



    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([xtrain_glove_scl.shape[0], 3])
    for dev_index, val_index in kf.split(xtrain_glove_scl):
        dev_X, val_X = xtrain_glove_scl[dev_index], xtrain_glove_scl[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        model = initNN()
        model.fit(dev_X, y=dev_y, batch_size=100,epochs=10, verbose=1,validation_data=(val_X, val_y))
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(xtest_glove_scl)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y

    

    # using keras tokenizer here
#     token = text.Tokenizer(num_words=None)
#     max_len = 70

#     token.fit_on_texts(list(X_train['text']) + list(X_test['text']))
#     xtrain_seq = token.texts_to_sequences(X_train['text'])
#     xtest_seq = token.texts_to_sequences(X_test['text'])

#     # zero pad the sequences
#     xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
#     xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)

#     word_index = token.word_index

#     # create an embedding matrix for the words we have in the dataset
#     embedding_matrix = np.zeros((len(word_index) + 1, 50))
#     for word, i in tqdm(word_index.items()):
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector

#     # A simple LSTM with glove embeddings and two dense layers
#     model = Sequential()
#     model.add(Embedding(len(word_index) + 1,
#                          50,
#                          weights=[embedding_matrix],
#                          input_length=max_len,
#                          trainable=False))
#     model.add(Dropout(0.3))
#     model.add(LSTM(32, dropout=0.3, recurrent_dropout=0.3))

#     model.add(Dense(1024, activation='relu'))
#     model.add(Dropout(0.8))

#     model.add(Dense(1024, activation='relu'))
#     model.add(Dropout(0.8))

#     model.add(Dense(3))
#     model.add(Activation('softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='adam')

#     model.fit(xtrain_pad, y=ytrain_enc, batch_size=64, epochs=5, verbose=1)#, validation_data=(xvalid_pad, yvalid_enc))
    return doAddNN(X_train,X_test,pred_train,pred_test)
X_train,X_test = doNN()

Found 400000 word vectors.
Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 15664 samples, validate on 3915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


(19579, 50)

In [108]:
# Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 3
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(X_train,X_test,Y_train):
    drop_columns=["id","text","words","word_vectors","sentence_vectors"]
    x_train = X_train.drop(drop_columns+['author'],axis=1)
    x_test = X_test.drop(drop_columns,axis=1)
    y_train = Y_train
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([x_train.shape[0], 3])
    for dev_index, val_index in kf.split(x_train):
        dev_X, val_X = x_train.loc[dev_index], x_train.loc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("cv scores : ", cv_scores)
    return pred_full_test/5
result = do(X_train,X_test,Y_train)

[0]	train-mlogloss:1.00288	test-mlogloss:1.00257
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[20]	train-mlogloss:0.411797	test-mlogloss:0.417844
[40]	train-mlogloss:0.340403	test-mlogloss:0.353986
[60]	train-mlogloss:0.31472	test-mlogloss:0.336772
[80]	train-mlogloss:0.299552	test-mlogloss:0.329917
[100]	train-mlogloss:0.286775	test-mlogloss:0.32556
[120]	train-mlogloss:0.276183	test-mlogloss:0.323955
[140]	train-mlogloss:0.266659	test-mlogloss:0.321831
[160]	train-mlogloss:0.257829	test-mlogloss:0.321086
[180]	train-mlogloss:0.249448	test-mlogloss:0.320623
[200]	train-mlogloss:0.242359	test-mlogloss:0.320272
[220]	train-mlogloss:0.235114	test-mlogloss:0.319675
[240]	train-mlogloss:0.228824	test-mlogloss:0.319564
[260]	train-mlogloss:0.222932	test-mlogloss:0.319111
[280]	train-mlogloss:0.216638	test-mlogloss:0.318577
[300]	train-mlogloss:0.210651	test-mlogloss:0.317977
[320]	train

In [109]:
# Write Results

def writeResult(result,test):
    # count number of files
    path, dirs, files = os.walk("../results").next()
    file_count = len(files)/2+1

    # Write the test results
    data=OrderedDict()
    data["id"]=test["id"] 
    data["EAP"]=result[0]#["EAP"]
    data["HPL"]=result[1]#["HPL"]	
    data["MWS"]=result[2]#["MWS"]
    output = pd.DataFrame(data=data)
    filename = "../results/result"+str(file_count)+".csv"
    output.to_csv( filename, index=False )
    filename = "../results/result"+str(file_count)+"compr.csv"
    output.to_csv( filename, index=False )
    check_call(['gzip', filename])

writeResult(result.T,X_test)

Unnamed: 0,id,text,author,words,word_vectors,sentence_vectors,punc_1,punc_2,punc_3,punc_4,punc_5,punc_6
0,id26305,"This process, however, afforded me no means of...",EAP,"[this, process, however, afforded, me, no, mea...","[[-0.22701, 0.70329, 0.96125, 0.93479, 0.7205,...","[-0.102682611702, 0.843608723404, 0.6941906914...",4.878049,12.195122,0.000000,0.000000,0.000000,17.073171
1,id17569,It never once occurred to me that the fumbling...,HPL,"[it, never, once, occurred, to, me, that, the,...","[[-0.37915, 0.61848, 0.9593, 0.90403, 0.36806,...","[-0.071579877193, 0.818990877193, 0.7487534035...",0.000000,7.142857,0.000000,0.000000,0.000000,7.142857
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[in, his, left, hand, was, a, gold, snuff, box...","[[-0.27004, 1.1144, 1.0493, 0.57924, 0.78968, ...","[-0.137265402439, 0.810834207317, 0.6727634268...",0.000000,13.888889,0.000000,0.000000,0.000000,13.888889
3,id27763,How lovely is spring As we looked from Windsor...,MWS,"[how, lovely, is, spring, as, we, looked, from...","[[-0.043861, 1.3183, -0.03715, 0.85478, 0.1221...","[-0.0491552662722, 0.823676804734, 0.681447236...",0.000000,11.764706,0.000000,0.000000,0.000000,11.764706
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[finding, nothing, else, not, even, gold, the,...","[[0.11891, 0.15255, -0.082073, -0.74144, 0.759...","[-0.0283006027397, 0.804272054795, 0.685995212...",3.703704,11.111111,0.000000,0.000000,0.000000,14.814815
5,id22965,"A youth passed in solitude, my best years spen...",MWS,"[a, youth, passed, in, solitude, my, best, yea...","[[0.11723, 1.0841, -0.053105, 1.5335, -0.14481...","[-0.0410986325459, 0.79631328084, 0.6697033727...",1.204819,6.024096,0.000000,0.000000,0.000000,7.228916
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP,"[the, astronomer, perhaps, at, this, point, to...","[[-0.22701, 0.70329, 0.96125, 0.93479, 0.7205,...","[-0.0840731495327, 0.857992523364, 0.664399906...",4.761905,19.047619,0.000000,0.000000,0.000000,23.809524
7,id13515,The surcingle hung in ribands from my body.,EAP,"[the, surcingle, hung, in, ribands, from, my, ...","[[-0.22701, 0.70329, 0.96125, 0.93479, 0.7205,...","[-0.141073485714, 0.853343142857, 0.6402297428...",0.000000,12.500000,0.000000,0.000000,0.000000,12.500000
8,id19322,I knew that you could not say to yourself 'ste...,EAP,"[i, knew, that, you, could, not, say, to, your...","[[-0.01397, 0.9522, 1.3895, 0.31898, 0.7499, 0...","[-0.0970629771574, 0.842538477157, 0.648277215...",1.136364,9.090909,0.000000,1.136364,0.000000,11.363636
9,id00912,I confess that neither the structure of langua...,MWS,"[i, confess, that, neither, the, structure, of...","[[-0.48882, 0.48228, 0.45726, 0.89723, 0.84066...","[-0.102452322314, 0.832763305785, 0.6967663719...",0.000000,13.043478,0.000000,0.000000,0.000000,13.043478


Index([u'id', u'text', u'author', u'words', u'word_vectors',
       u'sentence_vectors', u'sent_vec_0', u'sent_vec_1', u'sent_vec_2',
       u'sent_vec_3', u'sent_vec_4', u'sent_vec_5', u'sent_vec_6',
       u'sent_vec_7', u'sent_vec_8', u'sent_vec_9', u'sent_vec_10',
       u'sent_vec_11', u'sent_vec_12', u'sent_vec_13', u'sent_vec_14',
       u'sent_vec_15', u'sent_vec_16', u'sent_vec_17', u'sent_vec_18',
       u'sent_vec_19', u'sent_vec_20', u'sent_vec_21', u'sent_vec_22',
       u'sent_vec_23', u'sent_vec_24', u'sent_vec_25', u'sent_vec_26',
       u'sent_vec_27', u'sent_vec_28', u'sent_vec_29', u'sent_vec_30',
       u'sent_vec_31', u'sent_vec_32', u'sent_vec_33', u'sent_vec_34',
       u'sent_vec_35', u'sent_vec_36', u'sent_vec_37', u'sent_vec_38',
       u'sent_vec_39', u'sent_vec_40', u'sent_vec_41', u'sent_vec_42',
       u'sent_vec_43', u'sent_vec_44', u'sent_vec_45', u'sent_vec_46',
       u'sent_vec_47', u'sent_vec_48', u'sent_vec_49', u'nn_eap', u'nn_hpl',
       u'nn_mws