In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

In [2]:
# --- Load the pre-computed feature tables --------------------------------
text_feats = pd.read_csv('../../data/spooky-author/data/train_text_feats.csv')
nb_feats = pd.read_csv('../../data/spooky-author/data/train_nb_feats.csv')
# 'id' and 'author' are dropped here so they do not collide in the index join below.
nb_score = pd.read_csv('../../data/spooky-author/data/train_nb_score.csv').drop(['id', 'author'], axis=1)
# Only the 'keras' score column (plus the join key) is kept from this file.
nn_score = pd.read_csv('../../data/spooky-author/data/train_nn_score.csv')[['id', 'keras']]

# --- Load the target and encode the author label as an integer class -----
mapper = {'EAP': 0, 'HPL': 1, 'MWS': 2}
dependent = pd.read_csv('../../data/spooky-author/download/train.csv', usecols=['id', 'author'])
dependent['author'] = dependent['author'].map(mapper.__getitem__)  # KeyError on an unknown label

# --- Assemble one training frame ------------------------------------------
# The two .join() calls align on the row index (assumes the three feature
# files share the same row order - TODO confirm); the neural-net score and the
# target are matched on 'id' instead.
train_data = (
    text_feats
    .join(nb_feats)
    .join(nb_score)
    .merge(nn_score, on='id')
)
train_data = dependent.merge(train_data, on='id').drop('id', axis=1)
print('data shapes:', train_data.shape)

# Features are taken positionally as every column after the first; this relies
# on 'author' being the first column of train_data.
train_matrix = xgb.DMatrix(
    data=train_data.iloc[:, 1:],
    label=train_data['author'],
)

data shapes: (19579, 53)


In [3]:
# Booster hyper-parameters, passed as `params` to both xgb.cv and xgb.train
# in the cells that follow.
booster = {
    'booster': 'gbtree',
    'nthread': 7,
    # Tree shape / split constraints.
    'max_depth': 3,
    'min_child_weight': 1,
    # Row and column sampling.
    'subsample': 0.8,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.9,
    # Regularisation terms.
    'lambda': 1.0,
    'alpha': 0.0,
    # Three-class probability output, scored with multi-class log loss.
    'objective': 'multi:softprob',
    'eval_metric': ['mlogloss'],
    'num_class': 3,
    'seed': 2017,
}

In [4]:
# Keyword arguments for the xgb.cv call in the next cell.
params = {}
params['params'] = booster
params['dtrain'] = train_matrix
params['num_boost_round'] = 2000
# KFold.split() returns a one-shot generator. xgb.cv documents `folds` as a
# KFold/StratifiedKFold instance or a list of (train_idx, test_idx) pairs, and a
# generator is empty on any second pass (inside xgboost, or when the CV cell is
# re-run without rebuilding `params`). Materialise the splits into a list so the
# five shuffled folds are actually the ones used.
# NOTE(review): any CV log recorded with the bare generator should be refreshed
# by re-running the CV cell.
params['folds'] = list(
    KFold(n_splits=5, random_state=2017, shuffle=True).split(train_data)
)
params['early_stopping_rounds'] = 50
params['verbose_eval'] = 100
params['show_stdv'] = False
# One learning-rate entry per boosting round (constant 0.05); the list length
# must match num_boost_round above.
params['callbacks'] = [
    xgb.callback.reset_learning_rate([0.05] * params['num_boost_round'])
]

In [5]:
# Run cross-validation with the keyword arguments assembled in `params`.
# Despite the name, `model` here holds what xgb.cv returns (the evaluation
# history), not a trained Booster; the name is rebound by the later xgb.train call.
model = xgb.cv(**params)

[0]	train-mlogloss:1.04826	test-mlogloss:1.0486
[100]	train-mlogloss:0.321079	test-mlogloss:0.342846
[200]	train-mlogloss:0.278196	test-mlogloss:0.321653
[300]	train-mlogloss:0.252204	test-mlogloss:0.314913
[400]	train-mlogloss:0.231661	test-mlogloss:0.31162
[500]	train-mlogloss:0.214224	test-mlogloss:0.310236
[600]	train-mlogloss:0.198933	test-mlogloss:0.309644
[700]	train-mlogloss:0.184865	test-mlogloss:0.309103


In [6]:
# Final fit on the full training matrix with the same booster settings.
# 700 rounds - presumably chosen from the cross-validation log above, whose
# last reported round is 700. The learning-rate list must have one entry per
# boosting round, hence the matching length.
params = {
    'params': booster,
    'dtrain': train_matrix,
    'num_boost_round': 700,
    'verbose_eval': 200,
    'callbacks': [xgb.callback.reset_learning_rate([0.05] * 700)],
}
model = xgb.train(**params)

In [7]:
# Persist the booster returned by xgb.train to disk.
model.save_model('../../data/spooky-author/data/xgb_model')