In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

In [2]:
# Assemble the stacked training table: hand-crafted text features, naive-Bayes
# features/scores and the two neural-network score files, then attach the
# integer-encoded author label and wrap everything in an xgboost DMatrix.
SPOOKY_DIR = '../../data/spooky-author/'
DATA_DIR = SPOOKY_DIR + 'data/'


def _load_net_score(path, label_col, prob_cols, prob_name):
    """Read one network score file and reduce it to three columns.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain 'id', `label_col` and `prob_cols`.
    label_col : str
        Column kept as-is (presumably the network's predicted class -- verify).
    prob_cols : list of str
        Columns whose row-wise maximum is stored under `prob_name`.
    prob_name : str
        Name of the new row-wise-maximum column.

    Returns
    -------
    pandas.DataFrame
        Columns ['id', label_col, prob_name].
    """
    scores = pd.read_csv(path)
    # ndarray.max propagates NaN exactly like np.max over the stacked columns.
    scores[prob_name] = scores[prob_cols].values.max(axis=1)
    return scores[['id', label_col, prob_name]]


text_feats = pd.read_csv(DATA_DIR + 'train_text_feats.csv')
nb_score = pd.read_csv(DATA_DIR + 'train_nb_score.csv').drop(['id', 'author'], axis=1)
nb_feats = pd.read_csv(DATA_DIR + 'train_nb_feats.csv')
nn_score = _load_net_score(DATA_DIR + 'train_nn_score.csv', 'keras', ['k1', 'k2', 'k3'], 'nn_prob')
lstm_score = _load_net_score(DATA_DIR + 'train_lstm_score.csv', 'lstm', ['l1', 'l2', 'l3'], 'lstm_prob')

# NOTE(review): the two .join calls align on the row index, not on 'id', so they
# assume all three files share the same row order -- confirm against the files.
train_data = (
    text_feats
    .join(nb_feats)
    .join(nb_score)
    .merge(nn_score, on='id')
    .merge(lstm_score, on='id')
)
# 1.0 where both networks report the same value, 0.0 otherwise.
train_data['agree'] = (train_data['keras'] == train_data['lstm']).astype(float)

dependent = pd.read_csv(SPOOKY_DIR + 'download/train.csv', usecols=['id', 'author'])
mapper = {'EAP': 0, 'HPL': 1, 'MWS': 2}
dependent['author'] = dependent['author'].map(mapper)
# Series.map(dict) yields NaN for unknown keys; fail loudly instead of training on NaN labels.
if dependent['author'].isnull().any():
    raise ValueError('unexpected author label in train.csv; expected one of %s' % sorted(mapper))
train_data = dependent.merge(train_data, on='id').drop('id', axis=1)
print('data shapes:', train_data.shape)

# Select the features by name rather than by position so the label can never
# leak into the feature matrix if the column order changes.
train_matrix = xgb.DMatrix(data=train_data.drop('author', axis=1), label=train_data['author'])

data shapes: (19579, 57)


In [3]:
# Booster configuration shared by the cross-validation run and the final fit:
# a 3-class soft-probability tree model evaluated with multi-class log loss.
booster = {
    'booster': 'gbtree',
    'nthread': 7,
    # tree shape
    'max_depth': 4,
    'min_child_weight': 1,
    # row / column sampling
    'subsample': 0.75,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.9,
    # L2 / L1 regularisation
    'lambda': 2.0,
    'alpha': 1.0,
    # learning task
    'objective': 'multi:softprob',
    'eval_metric': ['mlogloss'],
    'num_class': 3,
    'seed': 2017,
}

In [4]:
# Keyword arguments for xgb.cv: 5-fold shuffled CV, up to 2000 rounds with
# early stopping after 50 rounds without improvement.
params = {}
# A constant learning rate is simply `eta`; setting it on a copy of the shared
# booster dict replaces the per-round reset_learning_rate callback (deprecated
# and later removed from xgboost) and leaves `booster` itself untouched.
params['params'] = dict(booster, eta=0.05)
params['dtrain'] = train_matrix
params['num_boost_round'] = 2000
# KFold.split returns a one-shot generator. Materialise it so the fold indices
# survive being iterated more than once (inside xgb.cv, or when the xgb.cv cell
# is re-run without re-running this one).
params['folds'] = list(KFold(n_splits=5, random_state=2017, shuffle=True).split(train_data))
params['early_stopping_rounds'] = 50
params['verbose_eval'] = 100
params['show_stdv'] = False

In [5]:
# Run cross-validation with the settings collected in `params`.
# NOTE(review): xgb.cv returns the per-round evaluation history rather than a
# trained Booster; the name `model` is rebound by the xgb.train call further down.
model = xgb.cv(**params)

[0]	train-mlogloss:1.04743	test-mlogloss:1.0481
[100]	train-mlogloss:0.293741	test-mlogloss:0.332912
[200]	train-mlogloss:0.242156	test-mlogloss:0.315832
[300]	train-mlogloss:0.207261	test-mlogloss:0.311394
[400]	train-mlogloss:0.178985	test-mlogloss:0.310673


In [6]:
# Fit the final booster on the full training matrix.
params = {}
# Constant learning rate expressed as `eta` on a copy of the shared booster
# dict, instead of the deprecated reset_learning_rate callback.
params['params'] = dict(booster, eta=0.05)
params['dtrain'] = train_matrix
# Fixed round count -- presumably chosen from the cross-validation log, where
# the test loss flattens around 400 rounds (TODO confirm).
params['num_boost_round'] = 500
# NOTE(review): no `evals` are passed, so verbose_eval has nothing to print.
params['verbose_eval'] = 200
model = xgb.train(**params)

In [None]:
# Persist the trained booster to disk next to the other derived data files.
model.save_model('../../data/spooky-author/data/xgb_model')

In [8]:
# Twenty features with the highest f-score (highest score first).
sorted(
    model.get_fscore().items(),
    key=lambda name_score: name_score[1],
    reverse=True,
)[:20]

[('2_y', 1064),
 ('1_x', 849),
 ('2_x', 783),
 ('0_x', 685),
 ('1_y', 676),
 ('svd_char_3', 635),
 ('0_y', 630),
 ('svd_char_1', 552),
 ('ratio_punct', 524),
 ('svd_char_5', 514),
 ('svd_char_6', 490),
 ('svd_char_2', 481),
 ('0', 479),
 ('svd_wrd_7', 469),
 ('1', 455),
 ('svd_wrd_8', 449),
 ('svd_char_9', 447),
 ('svd_char_8', 437),
 ('svd_char_7', 432),
 ('svd_wrd_5', 409)]

In [9]:
# Ten features with the lowest f-score (lowest score first; ascending order is
# the default for sorted).
sorted(
    model.get_fscore().items(),
    key=lambda name_score: name_score[1],
)[:10]

[('agree', 22),
 ('keras', 32),
 ('lstm', 40),
 ('count_pronoun', 56),
 ('count_det', 61),
 ('count_stopword', 75),
 ('count_word', 79),
 ('count_noun', 88),
 ('count_adj', 100),
 ('count_word_unique', 119)]