In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from transformers.dummies_encoder import DummiesEncoder
from transformers.item_selector import ItemSelector
from transformers.morphology_extractor import MorphologyExtractor
from transformers.dict_class_transformer import DictClassTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.datasets import dump_svmlight_file
from transformers.multi_label_encoder import MultiLabelEncoder
from transformers.reshape_2d import Reshape2D
from transformers.pandas_union import PandasUnion
from transformers.pandas_shift import PandasShift
from transformers.string_splitter import StringSplitter
from loading import load_train
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
import seaborn as sns

%matplotlib inline

# Prepare data

In [2]:
# Load the 'before' and 'class' columns and keep a reproducible
# 1000-row random sample (fixed random_state) for the experiments below.
df = (
    load_train(['before', 'class'])
    .sample(n=1000, random_state=2017)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 4656679 to 7180668
Data columns (total 2 columns):
class     1000 non-null object
before    1000 non-null object
dtypes: object(2)
memory usage: 23.4+ KB


In [3]:
def _char_steps(width):
    """Return fresh pipeline steps that apply ``StringSplitter(width)``."""
    return [('split', StringSplitter(width))]


def _ctx_steps():
    """Return fresh pipeline steps: ``MorphologyExtractor`` then ``DummiesEncoder``."""
    return [('extract', MorphologyExtractor()), ('one_hot', DummiesEncoder())]


def _shifted(offset, steps):
    """Build a Pipeline that applies ``PandasShift(offset)`` before ``steps``."""
    return Pipeline([('shift', PandasShift(offset))] + steps)


# Feature branches, listed in the order they are handed to PandasUnion:
# the current row, the row shifted by +1 ("prev") and the row shifted by -1 ("next").
feature_branches = [
    ('char', Pipeline(_char_steps(10))),
    ('ctx', Pipeline(_ctx_steps())),
    ('char_prev', _shifted(1, _char_steps(5))),
    ('ctx_prev', _shifted(1, _ctx_steps())),
    ('char_next', _shifted(-1, _char_steps(5))),
    ('ctx_next', _shifted(-1, _ctx_steps())),
]

pipeline = Pipeline([
    ('select', ItemSelector('before')),
    ('features', PandasUnion(feature_branches)),
])

# Fit the feature pipeline on everything except the target column.
x_data = pipeline.fit_transform(df.drop('class', axis=1))
print(x_data.shape)

# pd.factorize returns (integer codes, unique values): the codes become the
# target vector and the uniques are kept to map codes back to class names.
y_data, labels = pd.factorize(df['class'])
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=2017)

MorphologyExtractor transform:   0%|          | 0/1000 [00:00<?, ?it/s]

MorphologyExtractor transform: 100%|██████████| 1000/1000 [00:00<00:00, 14272.85it/s]


MorphologyExtractor transform:   0%|          | 0/1000 [00:00<?, ?it/s]

MorphologyExtractor transform: 100%|██████████| 1000/1000 [00:00<00:00, 13133.22it/s]


MorphologyExtractor transform:   0%|          | 0/1000 [00:00<?, ?it/s]

MorphologyExtractor transform: 100%|██████████| 1000/1000 [00:00<00:00, 11696.00it/s]


MorphologyExtractor transform:   0%|          | 0/1000 [00:00<?, ?it/s]

MorphologyExtractor transform: 100%|██████████| 1000/1000 [00:00<00:00, 41965.70it/s]




MorphologyExtractor transform:   0%|          | 0/1000 [00:00<?, ?it/s]

MorphologyExtractor transform: 100%|██████████| 1000/1000 [00:00<00:00, 26243.27it/s]




MorphologyExtractor transform:   0%|          | 0/1000 [00:00<?, ?it/s]

MorphologyExtractor transform: 100%|██████████| 1000/1000 [00:00<00:00, 34716.17it/s]




(1000, 155)


In [69]:
# Persist both splits in svmlight format so XGBoost can load them as DMatrix
# files in the next cell.
# The output directory is created first: on a fresh checkout 'models/' may not
# exist and dump_svmlight_file would fail when opening the target file.
os.makedirs('models', exist_ok=True)
dump_svmlight_file(x_train, y_train, 'models/class.txt.train')
dump_svmlight_file(x_test, y_test, 'models/class.txt.test')

# Search number of estimators by early stopping

In [4]:
# Load the svmlight dumps written above; the '#<name>.cache' suffix is kept
# from the original URIs.
dtrain = xgb.DMatrix('models/class.txt.train#dtrain.cache')
# dtest must be defined here: the training cell further down uses it both in
# the evaluation watchlist and for the final test-error computation.
dtest = xgb.DMatrix('models/class.txt.test#dtest.cache')

In [13]:
# Labels are integer codes, and XGBoost's multi-class objectives require every
# label to lie in [0, num_class). Counting the distinct labels present in the
# training split undercounts whenever some code is absent from it (e.g. a rare
# class that landed only in the test split), so derive the class count from
# the largest label instead.
num_classes = int(dtrain.get_label().max()) + 1
num_classes

199

In [7]:
# Upper bound on boosting rounds; early stopping decides how many are kept.
# These are arguments of xgb.cv, not booster parameters, so they live outside
# the parameter dict handed to the booster.
NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 100

# Booster parameters for the multi-class model.
xgb_param = {
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'num_class': num_classes,
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 2017}

# 10-fold stratified CV on the multi-class error rate. With early stopping the
# returned frame is truncated at the best iteration, so its row count is the
# selected number of boosting rounds.
cvresult = xgb.cv(xgb_param, dtrain, num_boost_round=NUM_BOOST_ROUND,
                  stratified=True, nfold=10,
                  metrics=['merror'], early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose_eval=True)
print('Best num_boost_round value', cvresult.shape[0])

# Plot mean +/- std of train and test error per boosting round on a dedicated
# figure, with a legend so the two curves can be told apart.
fig, ax = pyplot.subplots()
ax.errorbar(cvresult.index, cvresult['train-merror-mean'],
            yerr=cvresult['train-merror-std'], ecolor='r', label='train')
ax.errorbar(cvresult.index, cvresult['test-merror-mean'],
            yerr=cvresult['test-merror-std'], ecolor='g', label='test')
ax.set_title("XGBoost num_boost_round vs MError")
ax.set_xlabel('num_boost_round')
ax.set_ylabel('MError')
ax.legend()
fig.savefig('num_boost_round.png')

KeyboardInterrupt: 

In [None]:
# xgb.train applies early stopping to the LAST entry of `evals`, so the
# held-out set must come last; otherwise stopping is driven by training error.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
param = {'objective': 'multi:softmax',
         'eta': 0.3,
         'max_depth': 5,
         'silent': 1,
         'nthread': -1,
         'num_class': len(labels),
         'eval_metric': 'merror',
         'seed': 2017}
# Train for at most the number of rounds selected by the CV cell above.
model = xgb.train(param, dtrain, num_boost_round=cvresult.shape[0], evals=watchlist,
                  early_stopping_rounds=100, verbose_eval=True)

# This prediction is made on the training matrix and scored against y_train,
# so it is reported as a training error (it is not a validation score).
predicted_val = model.predict(dtrain)
print(f'pipeline train error {1.0-accuracy_score(y_train, predicted_val)}', flush=True)
predicted = model.predict(dtest)
print(f'pipeline test error {1.0-accuracy_score(y_test, predicted)}', flush=True)

feature_names = list(x_data.columns)


def _to_column_name(feature_id):
    """Map an XGBoost default feature id ('f<idx>') to the x_data column name.

    The DMatrix is loaded from an svmlight file, which carries no feature
    names, so importance keys are expected to be positional ids such as 'f12'.
    Anything that does not match that shape is returned unchanged.
    """
    position = str(feature_id)[1:]
    if (str(feature_id).startswith('f') and position.isdigit()
            and int(position) < len(feature_names)):
        return feature_names[int(position)]
    return feature_id


# matplotlib's pyplot is imported in this notebook under the name `pyplot`.
pyplot.rcParams['font.size'] = 8
feat_imp = (pd.Series(model.get_fscore())
            .rename(index=_to_column_name)
            .sort_values(ascending=False))
print(feat_imp.index)
# Columns that never received an importance score.
print(x_data.columns[~x_data.columns.isin(feat_imp.index)])
feat_imp.plot(kind='bar', title='Feature Importances')
pyplot.ylabel('Feature Importance Score')
pyplot.tight_layout()
pyplot.show()

# CV search

In [19]:
# Grid-search the number of trees of a default XGBClassifier using 3-fold
# stratified cross-validation scored by accuracy.
n_estimators = range(100, 1000, 100)
param_grid = {'n_estimators': n_estimators}
model = xgb.XGBClassifier()
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2017)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring="accuracy",
                           n_jobs=-1, cv=kfold, verbose=10)
grid_result = grid_search.fit(x_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Pull the per-candidate mean score, score std and parameter dict in one go.
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Accuracy (mean +/- std across folds) for each candidate n_estimators.
pyplot.errorbar(n_estimators, means, yerr=stds)
pyplot.title("XGBoost n_estimators vs Accuracy")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Accuracy')
pyplot.savefig('n_estimators.png')

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[CV] n_estimators=100 ................................................


[CV] n_estimators=100 ................................................


[CV] n_estimators=100 ................................................


[CV] n_estimators=200 ................................................


KeyboardInterrupt: 

In [None]:
# Grid-search the tree depth of a default XGBClassifier using 3-fold
# stratified cross-validation scored by accuracy.
# The classifier is referenced through the `xgb` module, which is the only
# name this notebook imports for XGBoost.
max_depth = range(1, 11, 2)
print(max_depth)
param_grid = dict(max_depth=max_depth)
model = xgb.XGBClassifier()
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2017)
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold, verbose=10)
grid_result = grid_search.fit(x_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Plot on a fresh figure and save under its own file name so the
# n_estimators plot produced by the previous search is not overwritten.
pyplot.figure()
pyplot.errorbar(list(max_depth), means, yerr=stds)
pyplot.title("XGBoost max_depth vs Accuracy")
pyplot.xlabel('max_depth')
pyplot.ylabel('Accuracy')
pyplot.savefig('max_depth.png')

MorphologyExtractor transform:   0%|          | 0/100000 [00:00<?, ?it/s]

MorphologyExtractor transform:   1%|▏         | 1318/100000 [00:00<00:07, 13173.10it/s]

MorphologyExtractor transform:   2%|▏         | 2363/100000 [00:00<00:08, 11658.19it/s]

MorphologyExtractor transform:   4%|▍         | 3767/100000 [00:00<00:07, 12443.39it/s]

MorphologyExtractor transform:   5%|▍         | 4596/100000 [00:00<00:10, 9444.87it/s] 

MorphologyExtractor transform:   6%|▌         | 5531/100000 [00:00<00:10, 9423.64it/s]

MorphologyExtractor transform:   7%|▋         | 6557/100000 [00:00<00:09, 9545.32it/s]

MorphologyExtractor transform:   8%|▊         | 7533/100000 [00:00<00:09, 9571.73it/s]

MorphologyExtractor transform:   8%|▊         | 8424/100000 [00:00<00:09, 9367.94it/s]

MorphologyExtractor transform:   9%|▉         | 9286/100000 [00:01<00:09, 9271.78it/s]

MorphologyExtractor transform:  11%|█         | 11076/100000 [00:01<00:08, 10054.81it/s]

MorphologyExtractor transform:  12%|█▏        | 12212/100000 [00:01<00:08, 10076.64it/s]

MorphologyExtractor transform:  14%|█▍        | 14355/100000 [00:01<00:07, 10941.18it/s]

MorphologyExtractor transform:  16%|█▋        | 16348/100000 [00:01<00:07, 11577.78it/s]

MorphologyExtractor transform:  18%|█▊        | 18485/100000 [00:01<00:06, 12225.25it/s]

MorphologyExtractor transform:  21%|██        | 20686/100000 [00:01<00:06, 12832.04it/s]

MorphologyExtractor transform:  23%|██▎       | 22875/100000 [00:01<00:05, 13361.03it/s]

MorphologyExtractor transform:  25%|██▌       | 25180/100000 [00:01<00:05, 13895.63it/s]

MorphologyExtractor transform:  27%|██▋       | 27444/100000 [00:01<00:05, 14352.26it/s]

MorphologyExtractor transform:  30%|██▉       | 29802/100000 [00:02<00:04, 14809.92it/s]

MorphologyExtractor transform:  32%|███▏      | 32001/100000 [00:02<00:04, 14609.67it/s]

MorphologyExtractor transform:  34%|███▍      | 34363/100000 [00:02<00:04, 15003.01it/s]

MorphologyExtractor transform:  37%|███▋      | 36647/100000 [00:02<00:04, 15330.56it/s]

MorphologyExtractor transform:  39%|███▉      | 39024/100000 [00:02<00:03, 15668.78it/s]

MorphologyExtractor transform:  41%|████▏     | 41347/100000 [00:02<00:03, 15959.97it/s]

MorphologyExtractor transform:  44%|████▎     | 43679/100000 [00:02<00:03, 16232.82it/s]

MorphologyExtractor transform:  46%|████▌     | 46198/100000 [00:02<00:03, 16553.79it/s]

MorphologyExtractor transform:  49%|████▊     | 48683/100000 [00:02<00:03, 16834.81it/s]

MorphologyExtractor transform:  51%|█████     | 51058/100000 [00:03<00:02, 16635.24it/s]

MorphologyExtractor transform:  54%|█████▎    | 53683/100000 [00:03<00:02, 16937.25it/s]

MorphologyExtractor transform:  56%|█████▋    | 56271/100000 [00:03<00:02, 17210.45it/s]

MorphologyExtractor transform:  59%|█████▉    | 58962/100000 [00:03<00:02, 17498.31it/s]

MorphologyExtractor transform:  61%|██████▏   | 61406/100000 [00:03<00:02, 17694.78it/s]

MorphologyExtractor transform:  64%|██████▍   | 63984/100000 [00:03<00:02, 17921.15it/s]

MorphologyExtractor transform:  67%|██████▋   | 66579/100000 [00:03<00:01, 18139.43it/s]

MorphologyExtractor transform:  69%|██████▉   | 69204/100000 [00:03<00:01, 18354.45it/s]

MorphologyExtractor transform:  72%|███████▏  | 71849/100000 [00:03<00:01, 18563.41it/s]

MorphologyExtractor transform:  74%|███████▍  | 74426/100000 [00:04<00:01, 18330.57it/s]

MorphologyExtractor transform:  77%|███████▋  | 77055/100000 [00:04<00:01, 18521.78it/s]

MorphologyExtractor transform:  80%|███████▉  | 79653/100000 [00:04<00:01, 18696.72it/s]

MorphologyExtractor transform:  82%|████████▏ | 82377/100000 [00:04<00:00, 18892.12it/s]

MorphologyExtractor transform:  85%|████████▌ | 85024/100000 [00:04<00:00, 19061.48it/s]

MorphologyExtractor transform:  88%|████████▊ | 87712/100000 [00:04<00:00, 19232.34it/s]

MorphologyExtractor transform:  90%|█████████ | 90348/100000 [00:04<00:00, 19385.22it/s]

MorphologyExtractor transform:  93%|█████████▎| 92973/100000 [00:04<00:00, 19529.14it/s]

MorphologyExtractor transform:  96%|█████████▌| 95654/100000 [00:04<00:00, 19678.87it/s]

MorphologyExtractor transform:  98%|█████████▊| 98285/100000 [00:04<00:00, 19793.33it/s]

MorphologyExtractor transform: 100%|██████████| 100000/100000 [00:05<00:00, 19864.93it/s]




MorphologyExtractor transform:   0%|          | 0/100000 [00:00<?, ?it/s]

MorphologyExtractor transform:   1%|          | 1215/100000 [00:00<00:08, 12141.39it/s]

MorphologyExtractor transform:   2%|▏         | 2498/100000 [00:00<00:09, 10592.06it/s]

MorphologyExtractor transform:   4%|▍         | 4415/100000 [00:00<00:07, 13142.65it/s]

MorphologyExtractor transform:   6%|▋         | 6473/100000 [00:00<00:06, 14847.69it/s]

MorphologyExtractor transform:   9%|▊         | 8564/100000 [00:00<00:05, 15978.27it/s]

MorphologyExtractor transform:  11%|█         | 10706/100000 [00:00<00:05, 16832.43it/s]

MorphologyExtractor transform:  13%|█▎        | 12939/100000 [00:00<00:04, 17578.27it/s]

MorphologyExtractor transform:  15%|█▌        | 15088/100000 [00:00<00:04, 18045.30it/s]

MorphologyExtractor transform:  17%|█▋        | 17279/100000 [00:00<00:04, 18457.41it/s]

MorphologyExtractor transform:  20%|█▉        | 19667/100000 [00:01<00:04, 18977.67it/s]

MorphologyExtractor transform:  22%|██▏       | 21799/100000 [00:01<00:04, 17993.65it/s]

MorphologyExtractor transform:  24%|██▍       | 24199/100000 [00:01<00:04, 18450.78it/s]

MorphologyExtractor transform:  26%|██▋       | 26475/100000 [00:01<00:03, 18755.70it/s]

MorphologyExtractor transform:  29%|██▉       | 29094/100000 [00:01<00:03, 19247.14it/s]

MorphologyExtractor transform:  31%|███▏      | 31463/100000 [00:01<00:03, 19522.57it/s]

MorphologyExtractor transform:  34%|███▍      | 33815/100000 [00:01<00:03, 19755.94it/s]

MorphologyExtractor transform:  36%|███▋      | 36250/100000 [00:01<00:03, 20009.44it/s]

MorphologyExtractor transform:  39%|███▊      | 38684/100000 [00:01<00:03, 20235.56it/s]

MorphologyExtractor transform:  41%|████      | 41056/100000 [00:02<00:03, 19449.92it/s]

MorphologyExtractor transform:  44%|████▎     | 43532/100000 [00:02<00:02, 19686.95it/s]

MorphologyExtractor transform:  46%|████▌     | 45986/100000 [00:02<00:02, 19896.63it/s]

MorphologyExtractor transform:  49%|████▉     | 48763/100000 [00:02<00:02, 20223.22it/s]

MorphologyExtractor transform:  51%|█████▏    | 51405/100000 [00:02<00:02, 20469.88it/s]

MorphologyExtractor transform:  54%|█████▍    | 54192/100000 [00:02<00:02, 20753.13it/s]

MorphologyExtractor transform:  57%|█████▋    | 56761/100000 [00:02<00:02, 20928.17it/s]

MorphologyExtractor transform:  59%|█████▉    | 59440/100000 [00:02<00:01, 21135.90it/s]

MorphologyExtractor transform:  62%|██████▏   | 62209/100000 [00:02<00:01, 21360.90it/s]

MorphologyExtractor transform:  65%|██████▍   | 64935/100000 [00:03<00:01, 21556.69it/s]

MorphologyExtractor transform:  68%|██████▊   | 67607/100000 [00:03<00:01, 20990.39it/s]

MorphologyExtractor transform:  70%|███████   | 70101/100000 [00:03<00:01, 21109.26it/s]

MorphologyExtractor transform:  73%|███████▎  | 72700/100000 [00:03<00:01, 21251.79it/s]

MorphologyExtractor transform:  75%|███████▌  | 75370/100000 [00:03<00:01, 21405.66it/s]

MorphologyExtractor transform:  78%|███████▊  | 78032/100000 [00:03<00:01, 21549.63it/s]

MorphologyExtractor transform:  81%|████████  | 80643/100000 [00:03<00:00, 21671.88it/s]

MorphologyExtractor transform:  83%|████████▎ | 83423/100000 [00:03<00:00, 21831.83it/s]

MorphologyExtractor transform:  86%|████████▌ | 86085/100000 [00:03<00:00, 21953.89it/s]

MorphologyExtractor transform:  89%|████████▊ | 88722/100000 [00:04<00:00, 22051.49it/s]

MorphologyExtractor transform:  91%|█████████▏| 91342/100000 [00:04<00:00, 22127.87it/s]

MorphologyExtractor transform:  94%|█████████▍| 93928/100000 [00:04<00:00, 22212.79it/s]

MorphologyExtractor transform:  97%|█████████▋| 96509/100000 [00:04<00:00, 22294.10it/s]

MorphologyExtractor transform:  99%|█████████▉| 99137/100000 [00:04<00:00, 22383.99it/s]

MorphologyExtractor transform: 100%|██████████| 100000/100000 [00:04<00:00, 21697.81it/s]




MorphologyExtractor transform:   0%|          | 0/100000 [00:00<?, ?it/s]

MorphologyExtractor transform:   1%|          | 1175/100000 [00:00<00:08, 11636.76it/s]

MorphologyExtractor transform:   3%|▎         | 3264/100000 [00:00<00:05, 16237.75it/s]

MorphologyExtractor transform:   5%|▌         | 5303/100000 [00:00<00:05, 17616.12it/s]

MorphologyExtractor transform:   7%|▋         | 7445/100000 [00:00<00:04, 18563.30it/s]

MorphologyExtractor transform:  10%|▉         | 9666/100000 [00:00<00:04, 19290.56it/s]

MorphologyExtractor transform:  12%|█▏        | 11918/100000 [00:00<00:04, 19825.86it/s]

MorphologyExtractor transform:  14%|█▍        | 14358/100000 [00:00<00:04, 20478.02it/s]

MorphologyExtractor transform:  16%|█▋        | 16403/100000 [00:00<00:04, 20327.20it/s]

MorphologyExtractor transform:  18%|█▊        | 18415/100000 [00:00<00:04, 19603.37it/s]

MorphologyExtractor transform:  20%|██        | 20279/100000 [00:01<00:04, 18802.20it/s]

MorphologyExtractor transform:  22%|██▏       | 22303/100000 [00:01<00:04, 18922.40it/s]

MorphologyExtractor transform:  25%|██▍       | 24596/100000 [00:01<00:03, 19231.25it/s]

MorphologyExtractor transform:  27%|██▋       | 26884/100000 [00:01<00:03, 19495.81it/s]

MorphologyExtractor transform:  30%|██▉       | 29591/100000 [00:01<00:03, 20005.45it/s]

MorphologyExtractor transform:  32%|███▏      | 31920/100000 [00:01<00:03, 20210.03it/s]

MorphologyExtractor transform:  34%|███▍      | 34343/100000 [00:01<00:03, 20447.48it/s]

MorphologyExtractor transform:  37%|███▋      | 36790/100000 [00:01<00:03, 20674.58it/s]

MorphologyExtractor transform:  39%|███▉      | 39144/100000 [00:01<00:03, 19583.61it/s]

MorphologyExtractor transform:  42%|████▏     | 41677/100000 [00:02<00:02, 19856.51it/s]

MorphologyExtractor transform:  44%|████▍     | 43931/100000 [00:02<00:02, 19978.50it/s]

MorphologyExtractor transform:  46%|████▌     | 46104/100000 [00:02<00:02, 19818.63it/s]

MorphologyExtractor transform:  48%|████▊     | 48137/100000 [00:02<00:02, 19583.04it/s]

MorphologyExtractor transform:  50%|█████     | 50023/100000 [00:02<00:02, 19539.61it/s]

MorphologyExtractor transform:  53%|█████▎    | 52660/100000 [00:02<00:02, 19795.56it/s]

MorphologyExtractor transform:  55%|█████▌    | 55302/100000 [00:02<00:02, 20029.06it/s]

MorphologyExtractor transform:  58%|█████▊    | 57984/100000 [00:02<00:02, 20266.17it/s]

MorphologyExtractor transform:  61%|██████    | 60526/100000 [00:02<00:01, 20440.17it/s]

MorphologyExtractor transform:  63%|██████▎   | 63123/100000 [00:03<00:01, 20620.20it/s]

MorphologyExtractor transform:  66%|██████▌   | 65798/100000 [00:03<00:01, 20814.02it/s]

MorphologyExtractor transform:  68%|██████▊   | 68337/100000 [00:03<00:01, 20173.27it/s]

MorphologyExtractor transform:  71%|███████   | 71125/100000 [00:03<00:01, 20394.13it/s]

MorphologyExtractor transform:  74%|███████▎  | 73747/100000 [00:03<00:01, 20556.45it/s]

MorphologyExtractor transform:  76%|███████▌  | 76155/100000 [00:03<00:01, 20647.54it/s]

MorphologyExtractor transform:  79%|███████▊  | 78746/100000 [00:03<00:01, 20786.27it/s]

MorphologyExtractor transform:  81%|████████▏ | 81254/100000 [00:03<00:00, 20894.85it/s]

MorphologyExtractor transform:  84%|████████▍ | 83782/100000 [00:03<00:00, 21004.40it/s]

MorphologyExtractor transform:  86%|████████▋ | 86369/100000 [00:04<00:00, 21123.34it/s]

MorphologyExtractor transform:  89%|████████▉ | 88990/100000 [00:04<00:00, 21244.41it/s]

MorphologyExtractor transform:  92%|█████████▏| 91538/100000 [00:04<00:00, 21332.56it/s]

MorphologyExtractor transform:  94%|█████████▍| 94191/100000 [00:04<00:00, 21450.80it/s]

MorphologyExtractor transform:  97%|█████████▋| 96796/100000 [00:04<00:00, 21552.54it/s]

MorphologyExtractor transform:  99%|█████████▉| 99399/100000 [00:04<00:00, 21650.07it/s]

MorphologyExtractor transform: 100%|██████████| 100000/100000 [00:04<00:00, 21657.18it/s]


