In [1]:
import numpy as np
import pandas as pd
import scipy

import string
import re
import gc

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer



In [37]:
train = pd.read_csv('data/train.csv', encoding='utf-8')
train.dropna(inplace=True)
train = train[train.sentence_id != 492799]
train.reset_index(inplace=True)

#### Prepare full data

In [71]:
vectorizer = CountVectorizer (
    analyzer='char_wb',
    ngram_range=(1, 2),
    lowercase=False
)

In [73]:
%time vectorizer.fit(train['before'].values)

CPU times: user 2min 17s, sys: 5.31 s, total: 2min 23s
Wall time: 2min 24s


CountVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [90]:
len(vectorizer.get_feature_names())

16351

In [77]:
labels = np.array(train['after'].values != train['before'].values, dtype=np.uint8)

enc = LabelEncoder()
labels = enc.fit_transform(labels)

#### Prepare context data

In [91]:
def get_context_value(values, sentences, offset):
    offset_values = np.asarray([u'' for v in range(len(values))], dtype=object)
    for i in range(len(values)):
        offset_value = ""
        i_offset = i + offset
        if i_offset >= 0 and i_offset < len(values) \
                and sentences[i_offset] == sentences[i]:
            offset_value = values[i_offset]
        offset_values[i] = offset_value

    return offset_values

In [40]:
train['before_m_2'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, -2))
train['before_m_1'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, -1))
train['before_p_1'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, 1))
train['before_p_2'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, 2))

In [93]:
def generate_context_ngrams(data, columns):
    return scipy.sparse.hstack([vectorizer.transform(data[c].values) for c in columns])

In [43]:
context_cols = ['before_m_2', 'before_m_1', 'before', 'before_p_1', 'before_p_2']

#### Prepare train data

In [79]:
sample_sizes = {
    "PUNCT" : 25000,
    "PLAIN" : 30000,
    "CARDINAL" : 20000,
    "LETTERS" : 20000,
    "DATE" : 20000,
    "ORDINAL" : 15000,
    "MEASURE" : 15000
}

# train.reset_index(inplace=True)
train_data = pd.DataFrame(columns=train.columns)
sample_max_size = 10000

for c in train['class'].unique():
    class_sample = train[train['class'] == c]
    sample_size = sample_sizes[c] if c in sample_sizes else sample_max_size
    if len(class_sample) > sample_max_size:
        train_data = train_data.append(class_sample.sample(sample_size))
    else:
        train_data = train_data.append(class_sample)

In [94]:
train_ngrams = generate_context_ngrams(train_data, context_cols)

In [96]:
labels_train = np.array(train_data['after'].values != train_data['before'].values, dtype=np.uint8)
labels_train = enc.transform(labels_train)

### Train XGBoost classifier 

In [97]:
xtr, xcv, ytr, ycv = train_test_split(train_ngrams, labels_train, test_size = 0.05, random_state = 42)
feature_names = ['f_{}'.format(i) for i in range(train_ngrams.shape[1])]

In [98]:
dtrain = xgb.DMatrix(xtr, label=ytr, feature_names=feature_names)
dvalid = xgb.DMatrix(xcv, label=ycv, feature_names=feature_names)

In [99]:
params = [("objective", "binary:logistic"),
          ("booster", "gbtree"),
          ("nthread", 3),
          ("eta", 0.01),
          ("max_depth", 8),
          ("subsample", 0.9),
          ("min_child_weight", 1),
          ("colsample_bytree", 0.7),
          ("eval_metric", 'error'),
          ("eval_metric", 'auc'),
         ]
num_rounds = 10000
stop = 60

In [100]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [101]:
gbm = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=stop)

[0]	train-error:0.15299	train-auc:0.926252	valid-error:0.15659	valid-auc:0.923645
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 60 rounds.
[1]	train-error:0.094578	train-auc:0.969127	valid-error:0.095706	valid-auc:0.968482
[2]	train-error:0.08131	train-auc:0.973787	valid-error:0.080859	valid-auc:0.973352
[3]	train-error:0.086724	train-auc:0.973908	valid-error:0.087161	valid-auc:0.9735
[4]	train-error:0.081333	train-auc:0.973835	valid-error:0.080966	valid-auc:0.973216
[5]	train-error:0.075115	train-auc:0.979218	valid-error:0.076266	valid-auc:0.97869
[6]	train-error:0.065344	train-auc:0.982773	valid-error:0.065798	valid-auc:0.982741
[7]	train-error:0.068492	train-auc:0.984472	valid-error:0.068789	valid-auc:0.984588
[8]	train-error:0.068526	train-auc:0.986457	valid-error:0.069109	valid-auc:0.986652
[9]	train-error:0.067267	train-auc:0.987555	valid-error:0.067614	valid-auc:0.987935
[10]	train-error:0.0652

[96]	train-error:0.055832	train-auc:0.990762	valid-error:0.055757	valid-auc:0.990693
[97]	train-error:0.055854	train-auc:0.990764	valid-error:0.055757	valid-auc:0.990688
[98]	train-error:0.055843	train-auc:0.99148	valid-error:0.055757	valid-auc:0.991638
[99]	train-error:0.055792	train-auc:0.991467	valid-error:0.055757	valid-auc:0.991639
[100]	train-error:0.055815	train-auc:0.991471	valid-error:0.055757	valid-auc:0.991638
[101]	train-error:0.055804	train-auc:0.991474	valid-error:0.055757	valid-auc:0.991651
[102]	train-error:0.05582	train-auc:0.991477	valid-error:0.055864	valid-auc:0.991666
[103]	train-error:0.05582	train-auc:0.992308	valid-error:0.055864	valid-auc:0.992477
[104]	train-error:0.055849	train-auc:0.992314	valid-error:0.055864	valid-auc:0.992495
[105]	train-error:0.055815	train-auc:0.992314	valid-error:0.055864	valid-auc:0.992475
[106]	train-error:0.055854	train-auc:0.992316	valid-error:0.055864	valid-auc:0.992494
[107]	train-error:0.055849	train-auc:0.992781	valid-error:0.0

[192]	train-error:0.030145	train-auc:0.996692	valid-error:0.029801	valid-auc:0.99673
[193]	train-error:0.027008	train-auc:0.996873	valid-error:0.025208	valid-auc:0.99699
[194]	train-error:0.026991	train-auc:0.996875	valid-error:0.025208	valid-auc:0.996987
[195]	train-error:0.031466	train-auc:0.996872	valid-error:0.030015	valid-auc:0.996983
[196]	train-error:0.029729	train-auc:0.997033	valid-error:0.028626	valid-auc:0.997069
[197]	train-error:0.02965	train-auc:0.997038	valid-error:0.028626	valid-auc:0.997058
[198]	train-error:0.029684	train-auc:0.997047	valid-error:0.028733	valid-auc:0.997069
[199]	train-error:0.027632	train-auc:0.997077	valid-error:0.027558	valid-auc:0.997083
[200]	train-error:0.027643	train-auc:0.99708	valid-error:0.027558	valid-auc:0.997087
[201]	train-error:0.028846	train-auc:0.997069	valid-error:0.029374	valid-auc:0.997072
[202]	train-error:0.027683	train-auc:0.997093	valid-error:0.027665	valid-auc:0.997099
[203]	train-error:0.027615	train-auc:0.997119	valid-error:

[288]	train-error:0.021504	train-auc:0.997795	valid-error:0.021897	valid-auc:0.997752
[289]	train-error:0.021526	train-auc:0.997798	valid-error:0.021897	valid-auc:0.997757
[290]	train-error:0.021538	train-auc:0.997804	valid-error:0.021897	valid-auc:0.99776
[291]	train-error:0.021526	train-auc:0.997807	valid-error:0.02179	valid-auc:0.997764
[292]	train-error:0.021526	train-auc:0.99781	valid-error:0.02179	valid-auc:0.997767
[293]	train-error:0.021521	train-auc:0.997819	valid-error:0.02179	valid-auc:0.997774
[294]	train-error:0.021487	train-auc:0.997819	valid-error:0.02179	valid-auc:0.997775
[295]	train-error:0.021504	train-auc:0.997821	valid-error:0.021897	valid-auc:0.997777
[296]	train-error:0.02151	train-auc:0.997821	valid-error:0.021897	valid-auc:0.997782
[297]	train-error:0.02151	train-auc:0.997822	valid-error:0.02179	valid-auc:0.997784
[298]	train-error:0.02151	train-auc:0.997828	valid-error:0.021897	valid-auc:0.997794
[299]	train-error:0.02151	train-auc:0.997822	valid-error:0.02189

[384]	train-error:0.021251	train-auc:0.998129	valid-error:0.021683	valid-auc:0.998088
[385]	train-error:0.02124	train-auc:0.998132	valid-error:0.021683	valid-auc:0.998089
[386]	train-error:0.02124	train-auc:0.998133	valid-error:0.021683	valid-auc:0.998088
[387]	train-error:0.02124	train-auc:0.998186	valid-error:0.021683	valid-auc:0.998152
[388]	train-error:0.021223	train-auc:0.998188	valid-error:0.021577	valid-auc:0.998155
[389]	train-error:0.021223	train-auc:0.998189	valid-error:0.021577	valid-auc:0.99816
[390]	train-error:0.021217	train-auc:0.99819	valid-error:0.021577	valid-auc:0.998163
[391]	train-error:0.021217	train-auc:0.998194	valid-error:0.021577	valid-auc:0.998163
[392]	train-error:0.021212	train-auc:0.998197	valid-error:0.021577	valid-auc:0.998166
[393]	train-error:0.021217	train-auc:0.998198	valid-error:0.021577	valid-auc:0.998164
[394]	train-error:0.021189	train-auc:0.998208	valid-error:0.021577	valid-auc:0.998176
[395]	train-error:0.021212	train-auc:0.998224	valid-error:0

[480]	train-error:0.016871	train-auc:0.998529	valid-error:0.017197	valid-auc:0.998513
[481]	train-error:0.01686	train-auc:0.998533	valid-error:0.017304	valid-auc:0.998519
[482]	train-error:0.016821	train-auc:0.998534	valid-error:0.017304	valid-auc:0.998519
[483]	train-error:0.016984	train-auc:0.998536	valid-error:0.01709	valid-auc:0.998517
[484]	train-error:0.016984	train-auc:0.998538	valid-error:0.01709	valid-auc:0.998518
[485]	train-error:0.016961	train-auc:0.998541	valid-error:0.01709	valid-auc:0.998517
[486]	train-error:0.01722	train-auc:0.998542	valid-error:0.017731	valid-auc:0.998516
[487]	train-error:0.017209	train-auc:0.998543	valid-error:0.017731	valid-auc:0.998517
[488]	train-error:0.017203	train-auc:0.998547	valid-error:0.017731	valid-auc:0.99852
[489]	train-error:0.017344	train-auc:0.998548	valid-error:0.017945	valid-auc:0.998517
[490]	train-error:0.017344	train-auc:0.998554	valid-error:0.017945	valid-auc:0.99853
[491]	train-error:0.017361	train-auc:0.998555	valid-error:0.0

[576]	train-error:0.015545	train-auc:0.998753	valid-error:0.01677	valid-auc:0.998717
[577]	train-error:0.015528	train-auc:0.998753	valid-error:0.01677	valid-auc:0.998721
[578]	train-error:0.015545	train-auc:0.998754	valid-error:0.01677	valid-auc:0.998722
[579]	train-error:0.015533	train-auc:0.998757	valid-error:0.01677	valid-auc:0.998724
[580]	train-error:0.015539	train-auc:0.998759	valid-error:0.01677	valid-auc:0.998725
[581]	train-error:0.015533	train-auc:0.998759	valid-error:0.01677	valid-auc:0.998725
[582]	train-error:0.015528	train-auc:0.99876	valid-error:0.01677	valid-auc:0.998723
[583]	train-error:0.015511	train-auc:0.998762	valid-error:0.01677	valid-auc:0.998724
[584]	train-error:0.015494	train-auc:0.998764	valid-error:0.016663	valid-auc:0.998725
[585]	train-error:0.0155	train-auc:0.998766	valid-error:0.016663	valid-auc:0.998726
[586]	train-error:0.0155	train-auc:0.998767	valid-error:0.016663	valid-auc:0.998727
[587]	train-error:0.0155	train-auc:0.998769	valid-error:0.016663	va

[672]	train-error:0.013465	train-auc:0.998901	valid-error:0.014206	valid-auc:0.99886
[673]	train-error:0.013448	train-auc:0.998909	valid-error:0.014206	valid-auc:0.998873
[674]	train-error:0.013448	train-auc:0.998909	valid-error:0.014206	valid-auc:0.998873
[675]	train-error:0.013453	train-auc:0.998912	valid-error:0.014206	valid-auc:0.998874
[676]	train-error:0.013324	train-auc:0.998911	valid-error:0.013886	valid-auc:0.998875
[677]	train-error:0.013318	train-auc:0.998912	valid-error:0.013886	valid-auc:0.998874
[678]	train-error:0.013273	train-auc:0.998913	valid-error:0.013779	valid-auc:0.998875
[679]	train-error:0.013313	train-auc:0.998914	valid-error:0.013779	valid-auc:0.998876
[680]	train-error:0.013279	train-auc:0.998915	valid-error:0.013779	valid-auc:0.998875
[681]	train-error:0.013268	train-auc:0.998916	valid-error:0.013779	valid-auc:0.998877
[682]	train-error:0.01324	train-auc:0.998919	valid-error:0.013672	valid-auc:0.998878
[683]	train-error:0.013234	train-auc:0.99892	valid-error

[768]	train-error:0.011896	train-auc:0.999006	valid-error:0.012497	valid-auc:0.99898
[769]	train-error:0.01189	train-auc:0.999007	valid-error:0.012497	valid-auc:0.998981
[770]	train-error:0.01189	train-auc:0.999008	valid-error:0.012497	valid-auc:0.998983
[771]	train-error:0.011896	train-auc:0.999008	valid-error:0.012497	valid-auc:0.998983
[772]	train-error:0.011902	train-auc:0.999007	valid-error:0.012497	valid-auc:0.99898
[773]	train-error:0.011907	train-auc:0.999008	valid-error:0.012497	valid-auc:0.998982
[774]	train-error:0.011907	train-auc:0.999008	valid-error:0.012497	valid-auc:0.998984
[775]	train-error:0.011907	train-auc:0.99901	valid-error:0.012497	valid-auc:0.998988
[776]	train-error:0.011907	train-auc:0.999011	valid-error:0.012497	valid-auc:0.998989
[777]	train-error:0.011902	train-auc:0.999011	valid-error:0.012497	valid-auc:0.998986
[778]	train-error:0.011907	train-auc:0.999012	valid-error:0.012497	valid-auc:0.998987
[779]	train-error:0.011907	train-auc:0.999012	valid-error:0

[864]	train-error:0.011682	train-auc:0.999072	valid-error:0.012391	valid-auc:0.999037
[865]	train-error:0.011677	train-auc:0.999073	valid-error:0.012391	valid-auc:0.999038
[866]	train-error:0.011677	train-auc:0.999074	valid-error:0.012391	valid-auc:0.99904
[867]	train-error:0.011677	train-auc:0.999074	valid-error:0.012391	valid-auc:0.999039
[868]	train-error:0.011671	train-auc:0.999074	valid-error:0.012391	valid-auc:0.999047
[869]	train-error:0.011677	train-auc:0.999076	valid-error:0.012391	valid-auc:0.999045
[870]	train-error:0.011671	train-auc:0.999076	valid-error:0.012391	valid-auc:0.999045
[871]	train-error:0.011682	train-auc:0.999076	valid-error:0.012391	valid-auc:0.999045
[872]	train-error:0.011682	train-auc:0.999077	valid-error:0.012391	valid-auc:0.999046
[873]	train-error:0.011688	train-auc:0.999077	valid-error:0.012391	valid-auc:0.999046
[874]	train-error:0.011671	train-auc:0.999087	valid-error:0.012391	valid-auc:0.999059
[875]	train-error:0.01166	train-auc:0.999088	valid-erro

[960]	train-error:0.011418	train-auc:0.99915	valid-error:0.012177	valid-auc:0.999131
[961]	train-error:0.011413	train-auc:0.999153	valid-error:0.012177	valid-auc:0.999132
[962]	train-error:0.011418	train-auc:0.999153	valid-error:0.012177	valid-auc:0.999133
[963]	train-error:0.011418	train-auc:0.999154	valid-error:0.012177	valid-auc:0.999133
[964]	train-error:0.011418	train-auc:0.999156	valid-error:0.012177	valid-auc:0.999137
[965]	train-error:0.011418	train-auc:0.999156	valid-error:0.012177	valid-auc:0.999138
[966]	train-error:0.011418	train-auc:0.999156	valid-error:0.012177	valid-auc:0.999139
[967]	train-error:0.011413	train-auc:0.999157	valid-error:0.012177	valid-auc:0.999139
[968]	train-error:0.011407	train-auc:0.999158	valid-error:0.012177	valid-auc:0.99914
[969]	train-error:0.011413	train-auc:0.999158	valid-error:0.012177	valid-auc:0.99914
[970]	train-error:0.011356	train-auc:0.999159	valid-error:0.01207	valid-auc:0.999141
[971]	train-error:0.011362	train-auc:0.99916	valid-error:0

[1056]	train-error:0.010991	train-auc:0.999212	valid-error:0.011963	valid-auc:0.999194
[1057]	train-error:0.010991	train-auc:0.999213	valid-error:0.011963	valid-auc:0.999194
[1058]	train-error:0.010997	train-auc:0.999213	valid-error:0.011963	valid-auc:0.999195
[1059]	train-error:0.010997	train-auc:0.999213	valid-error:0.011963	valid-auc:0.999195
[1060]	train-error:0.010997	train-auc:0.999213	valid-error:0.011963	valid-auc:0.999196
[1061]	train-error:0.010997	train-auc:0.999214	valid-error:0.011963	valid-auc:0.999196
[1062]	train-error:0.010991	train-auc:0.999214	valid-error:0.011963	valid-auc:0.999197
[1063]	train-error:0.010997	train-auc:0.999215	valid-error:0.011963	valid-auc:0.999197
[1064]	train-error:0.010997	train-auc:0.999215	valid-error:0.011963	valid-auc:0.999197
[1065]	train-error:0.010997	train-auc:0.999217	valid-error:0.011963	valid-auc:0.999198
[1066]	train-error:0.010997	train-auc:0.999217	valid-error:0.011963	valid-auc:0.999198
[1067]	train-error:0.01098	train-auc:0.9992

[1151]	train-error:0.010384	train-auc:0.999267	valid-error:0.011429	valid-auc:0.999245
[1152]	train-error:0.010378	train-auc:0.999268	valid-error:0.011429	valid-auc:0.999246
[1153]	train-error:0.010378	train-auc:0.999273	valid-error:0.011429	valid-auc:0.99926
[1154]	train-error:0.010373	train-auc:0.999273	valid-error:0.011429	valid-auc:0.999261
[1155]	train-error:0.010378	train-auc:0.999274	valid-error:0.011429	valid-auc:0.999261
[1156]	train-error:0.010373	train-auc:0.999274	valid-error:0.011429	valid-auc:0.999261
[1157]	train-error:0.010378	train-auc:0.999275	valid-error:0.011429	valid-auc:0.999262
[1158]	train-error:0.010344	train-auc:0.999275	valid-error:0.011429	valid-auc:0.999263
[1159]	train-error:0.010344	train-auc:0.999275	valid-error:0.011429	valid-auc:0.999263
[1160]	train-error:0.010344	train-auc:0.999276	valid-error:0.011429	valid-auc:0.999263
[1161]	train-error:0.010339	train-auc:0.999276	valid-error:0.011429	valid-auc:0.999264
[1162]	train-error:0.010339	train-auc:0.9992

[1246]	train-error:0.010198	train-auc:0.999322	valid-error:0.011002	valid-auc:0.999302
[1247]	train-error:0.010204	train-auc:0.999323	valid-error:0.011002	valid-auc:0.999302
[1248]	train-error:0.010198	train-auc:0.999323	valid-error:0.011002	valid-auc:0.999302
[1249]	train-error:0.010209	train-auc:0.999323	valid-error:0.011002	valid-auc:0.999302
[1250]	train-error:0.010181	train-auc:0.999323	valid-error:0.011002	valid-auc:0.999302
[1251]	train-error:0.010198	train-auc:0.999324	valid-error:0.011002	valid-auc:0.999302
[1252]	train-error:0.010193	train-auc:0.999324	valid-error:0.011002	valid-auc:0.999302
[1253]	train-error:0.010198	train-auc:0.999325	valid-error:0.011002	valid-auc:0.999303
[1254]	train-error:0.010193	train-auc:0.999325	valid-error:0.011002	valid-auc:0.999303
[1255]	train-error:0.010187	train-auc:0.999326	valid-error:0.011002	valid-auc:0.999303
[1256]	train-error:0.010187	train-auc:0.999326	valid-error:0.011002	valid-auc:0.999304
[1257]	train-error:0.010181	train-auc:0.999

[1341]	train-error:0.010114	train-auc:0.999365	valid-error:0.010895	valid-auc:0.999333
[1342]	train-error:0.01012	train-auc:0.999365	valid-error:0.010895	valid-auc:0.999333
[1343]	train-error:0.010114	train-auc:0.999366	valid-error:0.010895	valid-auc:0.999335
[1344]	train-error:0.010114	train-auc:0.999366	valid-error:0.010895	valid-auc:0.999335
[1345]	train-error:0.010114	train-auc:0.999367	valid-error:0.010895	valid-auc:0.999335
[1346]	train-error:0.010114	train-auc:0.999368	valid-error:0.010895	valid-auc:0.999336
[1347]	train-error:0.010114	train-auc:0.999368	valid-error:0.010895	valid-auc:0.999336
[1348]	train-error:0.010114	train-auc:0.999369	valid-error:0.010895	valid-auc:0.999337
[1349]	train-error:0.010108	train-auc:0.999369	valid-error:0.010895	valid-auc:0.999336
[1350]	train-error:0.010103	train-auc:0.999369	valid-error:0.010895	valid-auc:0.999336
[1351]	train-error:0.010114	train-auc:0.99937	valid-error:0.010895	valid-auc:0.999336
[1352]	train-error:0.010108	train-auc:0.99937

[1436]	train-error:0.009822	train-auc:0.999407	valid-error:0.010895	valid-auc:0.999354
[1437]	train-error:0.009822	train-auc:0.999408	valid-error:0.010895	valid-auc:0.999354
[1438]	train-error:0.009822	train-auc:0.999408	valid-error:0.010895	valid-auc:0.999355
[1439]	train-error:0.009822	train-auc:0.999409	valid-error:0.010895	valid-auc:0.999355
[1440]	train-error:0.009827	train-auc:0.999409	valid-error:0.010895	valid-auc:0.999356
[1441]	train-error:0.009822	train-auc:0.999409	valid-error:0.010895	valid-auc:0.999357
[1442]	train-error:0.009822	train-auc:0.99941	valid-error:0.010895	valid-auc:0.999357
[1443]	train-error:0.009822	train-auc:0.99941	valid-error:0.010895	valid-auc:0.999357
[1444]	train-error:0.009822	train-auc:0.99941	valid-error:0.010895	valid-auc:0.999356
[1445]	train-error:0.009816	train-auc:0.999411	valid-error:0.010895	valid-auc:0.999356
[1446]	train-error:0.009816	train-auc:0.999411	valid-error:0.010895	valid-auc:0.999355
[1447]	train-error:0.009771	train-auc:0.999412

[1531]	train-error:0.009636	train-auc:0.999444	valid-error:0.010788	valid-auc:0.999372
[1532]	train-error:0.00963	train-auc:0.999444	valid-error:0.010788	valid-auc:0.999372
[1533]	train-error:0.00963	train-auc:0.999444	valid-error:0.010788	valid-auc:0.999373
[1534]	train-error:0.00963	train-auc:0.999445	valid-error:0.010788	valid-auc:0.999374
[1535]	train-error:0.00963	train-auc:0.999445	valid-error:0.010788	valid-auc:0.999373
[1536]	train-error:0.00963	train-auc:0.999445	valid-error:0.010788	valid-auc:0.999374
[1537]	train-error:0.009636	train-auc:0.999446	valid-error:0.010788	valid-auc:0.999374
[1538]	train-error:0.009625	train-auc:0.999446	valid-error:0.010788	valid-auc:0.999374
[1539]	train-error:0.00963	train-auc:0.999446	valid-error:0.010788	valid-auc:0.999373
[1540]	train-error:0.00963	train-auc:0.999447	valid-error:0.010788	valid-auc:0.999374
[1541]	train-error:0.009625	train-auc:0.999447	valid-error:0.010788	valid-auc:0.999374
[1542]	train-error:0.00963	train-auc:0.999447	vali

[1626]	train-error:0.009546	train-auc:0.99948	valid-error:0.010575	valid-auc:0.999401
[1627]	train-error:0.009546	train-auc:0.99948	valid-error:0.010575	valid-auc:0.999402
[1628]	train-error:0.009546	train-auc:0.999481	valid-error:0.010575	valid-auc:0.999403
[1629]	train-error:0.00954	train-auc:0.999482	valid-error:0.010575	valid-auc:0.999403
[1630]	train-error:0.00954	train-auc:0.999482	valid-error:0.010575	valid-auc:0.999403
[1631]	train-error:0.00954	train-auc:0.999482	valid-error:0.010575	valid-auc:0.999403
[1632]	train-error:0.009535	train-auc:0.999483	valid-error:0.010575	valid-auc:0.999403
[1633]	train-error:0.009535	train-auc:0.999483	valid-error:0.010575	valid-auc:0.999404
[1634]	train-error:0.009512	train-auc:0.999484	valid-error:0.010575	valid-auc:0.999404
[1635]	train-error:0.009512	train-auc:0.999484	valid-error:0.010575	valid-auc:0.999404
[1636]	train-error:0.00949	train-auc:0.999485	valid-error:0.010575	valid-auc:0.999404
[1637]	train-error:0.00949	train-auc:0.999485	val

[1721]	train-error:0.009372	train-auc:0.99951	valid-error:0.010254	valid-auc:0.999423
[1722]	train-error:0.009377	train-auc:0.99951	valid-error:0.010254	valid-auc:0.999423
[1723]	train-error:0.009377	train-auc:0.99951	valid-error:0.010254	valid-auc:0.999423
[1724]	train-error:0.009377	train-auc:0.999511	valid-error:0.010254	valid-auc:0.999423
[1725]	train-error:0.009372	train-auc:0.999511	valid-error:0.010254	valid-auc:0.999424
[1726]	train-error:0.009372	train-auc:0.999511	valid-error:0.010254	valid-auc:0.999424
[1727]	train-error:0.009377	train-auc:0.999512	valid-error:0.010254	valid-auc:0.999424
[1728]	train-error:0.009383	train-auc:0.999512	valid-error:0.010254	valid-auc:0.999424
[1729]	train-error:0.009383	train-auc:0.999512	valid-error:0.010254	valid-auc:0.999424
[1730]	train-error:0.009389	train-auc:0.999512	valid-error:0.010254	valid-auc:0.999424
[1731]	train-error:0.009389	train-auc:0.999512	valid-error:0.010254	valid-auc:0.999424
[1732]	train-error:0.009389	train-auc:0.999512

[1816]	train-error:0.009237	train-auc:0.999537	valid-error:0.010361	valid-auc:0.999441
[1817]	train-error:0.009231	train-auc:0.999537	valid-error:0.010361	valid-auc:0.999441
[1818]	train-error:0.009242	train-auc:0.999537	valid-error:0.010361	valid-auc:0.999441
[1819]	train-error:0.009242	train-auc:0.999538	valid-error:0.010361	valid-auc:0.999441
[1820]	train-error:0.009242	train-auc:0.999538	valid-error:0.010361	valid-auc:0.999441
[1821]	train-error:0.009242	train-auc:0.999538	valid-error:0.010361	valid-auc:0.999441
[1822]	train-error:0.009231	train-auc:0.999539	valid-error:0.010361	valid-auc:0.999441
[1823]	train-error:0.009231	train-auc:0.999539	valid-error:0.010361	valid-auc:0.999441
[1824]	train-error:0.00922	train-auc:0.99954	valid-error:0.010361	valid-auc:0.99944
[1825]	train-error:0.009214	train-auc:0.99954	valid-error:0.010361	valid-auc:0.99944
[1826]	train-error:0.009214	train-auc:0.99954	valid-error:0.010361	valid-auc:0.99944
[1827]	train-error:0.009192	train-auc:0.99954	vali

[1911]	train-error:0.009175	train-auc:0.999563	valid-error:0.010254	valid-auc:0.999453
[1912]	train-error:0.009169	train-auc:0.999563	valid-error:0.010254	valid-auc:0.999453
[1913]	train-error:0.009169	train-auc:0.999564	valid-error:0.010254	valid-auc:0.999455
[1914]	train-error:0.009169	train-auc:0.999564	valid-error:0.010254	valid-auc:0.999455
[1915]	train-error:0.009169	train-auc:0.999564	valid-error:0.010254	valid-auc:0.999456
[1916]	train-error:0.009175	train-auc:0.999565	valid-error:0.010254	valid-auc:0.999457
[1917]	train-error:0.009175	train-auc:0.999565	valid-error:0.010254	valid-auc:0.999457
[1918]	train-error:0.009175	train-auc:0.999565	valid-error:0.010254	valid-auc:0.999457
[1919]	train-error:0.009175	train-auc:0.999565	valid-error:0.010254	valid-auc:0.999457
[1920]	train-error:0.009175	train-auc:0.999565	valid-error:0.010254	valid-auc:0.999456
[1921]	train-error:0.009175	train-auc:0.999565	valid-error:0.010254	valid-auc:0.999456
[1922]	train-error:0.009175	train-auc:0.999

[2006]	train-error:0.009102	train-auc:0.999585	valid-error:0.010254	valid-auc:0.999469
[2007]	train-error:0.009102	train-auc:0.999585	valid-error:0.010361	valid-auc:0.999469
[2008]	train-error:0.009102	train-auc:0.999585	valid-error:0.010361	valid-auc:0.999471
[2009]	train-error:0.009102	train-auc:0.999585	valid-error:0.010361	valid-auc:0.999471
[2010]	train-error:0.009096	train-auc:0.999586	valid-error:0.010361	valid-auc:0.999471
[2011]	train-error:0.009091	train-auc:0.999586	valid-error:0.010254	valid-auc:0.999471
[2012]	train-error:0.009091	train-auc:0.999586	valid-error:0.010254	valid-auc:0.999471
[2013]	train-error:0.009091	train-auc:0.999586	valid-error:0.010254	valid-auc:0.999471
[2014]	train-error:0.009091	train-auc:0.999586	valid-error:0.010254	valid-auc:0.999471
[2015]	train-error:0.009091	train-auc:0.999587	valid-error:0.010254	valid-auc:0.999471
[2016]	train-error:0.009091	train-auc:0.999587	valid-error:0.010254	valid-auc:0.999471
[2017]	train-error:0.009096	train-auc:0.999

[2101]	train-error:0.009034	train-auc:0.999605	valid-error:0.010147	valid-auc:0.999482
[2102]	train-error:0.009018	train-auc:0.999605	valid-error:0.010147	valid-auc:0.999481
[2103]	train-error:0.009001	train-auc:0.999605	valid-error:0.010147	valid-auc:0.999481
[2104]	train-error:0.009006	train-auc:0.999605	valid-error:0.010147	valid-auc:0.999482
[2105]	train-error:0.009001	train-auc:0.999606	valid-error:0.010147	valid-auc:0.999482
[2106]	train-error:0.009001	train-auc:0.999606	valid-error:0.010147	valid-auc:0.999482
[2107]	train-error:0.009001	train-auc:0.999606	valid-error:0.010147	valid-auc:0.999482
[2108]	train-error:0.00899	train-auc:0.999606	valid-error:0.010147	valid-auc:0.999483
[2109]	train-error:0.0089	train-auc:0.999607	valid-error:0.010147	valid-auc:0.999483
[2110]	train-error:0.008905	train-auc:0.999607	valid-error:0.010147	valid-auc:0.999482
[2111]	train-error:0.008883	train-auc:0.999607	valid-error:0.010147	valid-auc:0.999482
[2112]	train-error:0.008877	train-auc:0.999608

[2196]	train-error:0.008686	train-auc:0.999625	valid-error:0.010147	valid-auc:0.99949
[2197]	train-error:0.008686	train-auc:0.999625	valid-error:0.010147	valid-auc:0.99949
[2198]	train-error:0.008686	train-auc:0.999625	valid-error:0.010147	valid-auc:0.99949
[2199]	train-error:0.00868	train-auc:0.999625	valid-error:0.010147	valid-auc:0.99949
[2200]	train-error:0.008686	train-auc:0.999625	valid-error:0.010147	valid-auc:0.999492
[2201]	train-error:0.008686	train-auc:0.999625	valid-error:0.010147	valid-auc:0.999492
[2202]	train-error:0.00868	train-auc:0.999625	valid-error:0.010147	valid-auc:0.999492
[2203]	train-error:0.008675	train-auc:0.999626	valid-error:0.010147	valid-auc:0.999492
[2204]	train-error:0.00868	train-auc:0.999626	valid-error:0.010147	valid-auc:0.999493
[2205]	train-error:0.008675	train-auc:0.999626	valid-error:0.010147	valid-auc:0.999492
[2206]	train-error:0.008675	train-auc:0.999626	valid-error:0.010147	valid-auc:0.999492
[2207]	train-error:0.008675	train-auc:0.999627	val

[2291]	train-error:0.008484	train-auc:0.999642	valid-error:0.009507	valid-auc:0.999505
[2292]	train-error:0.008484	train-auc:0.999642	valid-error:0.009507	valid-auc:0.999504
[2293]	train-error:0.008484	train-auc:0.999642	valid-error:0.009507	valid-auc:0.999504
[2294]	train-error:0.008484	train-auc:0.999642	valid-error:0.009507	valid-auc:0.999504
[2295]	train-error:0.008484	train-auc:0.999642	valid-error:0.009507	valid-auc:0.999504
[2296]	train-error:0.008478	train-auc:0.999643	valid-error:0.009507	valid-auc:0.999504
[2297]	train-error:0.008484	train-auc:0.999643	valid-error:0.009507	valid-auc:0.999504
[2298]	train-error:0.008478	train-auc:0.999643	valid-error:0.009507	valid-auc:0.999505
[2299]	train-error:0.008478	train-auc:0.999643	valid-error:0.009507	valid-auc:0.999505
[2300]	train-error:0.008478	train-auc:0.999643	valid-error:0.009507	valid-auc:0.999505
[2301]	train-error:0.008478	train-auc:0.999644	valid-error:0.009507	valid-auc:0.999505
[2302]	train-error:0.008472	train-auc:0.999

[2386]	train-error:0.008292	train-auc:0.999658	valid-error:0.009507	valid-auc:0.999515
[2387]	train-error:0.008292	train-auc:0.999658	valid-error:0.009507	valid-auc:0.999517
[2388]	train-error:0.008287	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999517
[2389]	train-error:0.008287	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999517
[2390]	train-error:0.008287	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999518
[2391]	train-error:0.008287	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999518
[2392]	train-error:0.008281	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999518
[2393]	train-error:0.008281	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999518
[2394]	train-error:0.008281	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999517
[2395]	train-error:0.008281	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999517
[2396]	train-error:0.008281	train-auc:0.999659	valid-error:0.009507	valid-auc:0.999517
[2397]	train-error:0.008276	train-auc:0.999

[2482]	train-error:0.007865	train-auc:0.999676	valid-error:0.009293	valid-auc:0.999528
[2483]	train-error:0.007865	train-auc:0.999676	valid-error:0.009293	valid-auc:0.999527
[2484]	train-error:0.007865	train-auc:0.999676	valid-error:0.009293	valid-auc:0.999527
[2485]	train-error:0.007865	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999527
[2486]	train-error:0.007865	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999528
[2487]	train-error:0.007865	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999528
[2488]	train-error:0.007865	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999528
[2489]	train-error:0.007865	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999528
[2490]	train-error:0.007859	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999527
[2491]	train-error:0.007859	train-auc:0.999677	valid-error:0.009293	valid-auc:0.999528
[2492]	train-error:0.007859	train-auc:0.999678	valid-error:0.009293	valid-auc:0.999528
[2493]	train-error:0.007848	train-auc:0.999

[2577]	train-error:0.007264	train-auc:0.99969	valid-error:0.008545	valid-auc:0.99954
[2578]	train-error:0.007269	train-auc:0.99969	valid-error:0.008545	valid-auc:0.99954
[2579]	train-error:0.007269	train-auc:0.99969	valid-error:0.008545	valid-auc:0.99954
[2580]	train-error:0.007269	train-auc:0.999691	valid-error:0.008545	valid-auc:0.99954
[2581]	train-error:0.007224	train-auc:0.999691	valid-error:0.008438	valid-auc:0.99954
[2582]	train-error:0.007224	train-auc:0.999691	valid-error:0.008438	valid-auc:0.999541
[2583]	train-error:0.007224	train-auc:0.999691	valid-error:0.008438	valid-auc:0.999541
[2584]	train-error:0.007224	train-auc:0.999691	valid-error:0.008438	valid-auc:0.999541
[2585]	train-error:0.007224	train-auc:0.999691	valid-error:0.008438	valid-auc:0.999541
[2586]	train-error:0.007224	train-auc:0.999692	valid-error:0.008438	valid-auc:0.999541
[2587]	train-error:0.007224	train-auc:0.999692	valid-error:0.008438	valid-auc:0.999541
[2588]	train-error:0.007219	train-auc:0.999692	vali

[2672]	train-error:0.006904	train-auc:0.999704	valid-error:0.008225	valid-auc:0.999549
[2673]	train-error:0.006909	train-auc:0.999704	valid-error:0.008225	valid-auc:0.999549
[2674]	train-error:0.006909	train-auc:0.999704	valid-error:0.008225	valid-auc:0.99955
[2675]	train-error:0.006909	train-auc:0.999704	valid-error:0.008225	valid-auc:0.99955
[2676]	train-error:0.006915	train-auc:0.999705	valid-error:0.008225	valid-auc:0.99955
[2677]	train-error:0.006921	train-auc:0.999705	valid-error:0.008225	valid-auc:0.99955
[2678]	train-error:0.006915	train-auc:0.999705	valid-error:0.008225	valid-auc:0.99955
[2679]	train-error:0.006898	train-auc:0.999705	valid-error:0.008225	valid-auc:0.99955
[2680]	train-error:0.006904	train-auc:0.999705	valid-error:0.008225	valid-auc:0.99955
[2681]	train-error:0.006893	train-auc:0.999705	valid-error:0.008225	valid-auc:0.99955
[2682]	train-error:0.006898	train-auc:0.999706	valid-error:0.008225	valid-auc:0.99955
[2683]	train-error:0.006898	train-auc:0.999706	valid

[2767]	train-error:0.004604	train-auc:0.999717	valid-error:0.005982	valid-auc:0.999557
[2768]	train-error:0.004604	train-auc:0.999717	valid-error:0.005982	valid-auc:0.999557
[2769]	train-error:0.004604	train-auc:0.999717	valid-error:0.005982	valid-auc:0.999557
[2770]	train-error:0.004604	train-auc:0.999717	valid-error:0.005982	valid-auc:0.999557
[2771]	train-error:0.004604	train-auc:0.999717	valid-error:0.005982	valid-auc:0.999557
[2772]	train-error:0.004604	train-auc:0.999717	valid-error:0.005982	valid-auc:0.999558
[2773]	train-error:0.004604	train-auc:0.999718	valid-error:0.005982	valid-auc:0.999558
[2774]	train-error:0.004599	train-auc:0.999718	valid-error:0.005982	valid-auc:0.999557
[2775]	train-error:0.004604	train-auc:0.999718	valid-error:0.005982	valid-auc:0.999557
[2776]	train-error:0.004604	train-auc:0.999718	valid-error:0.005982	valid-auc:0.999557
[2777]	train-error:0.004593	train-auc:0.999718	valid-error:0.005982	valid-auc:0.999557
[2778]	train-error:0.004593	train-auc:0.999

[2862]	train-error:0.004402	train-auc:0.99973	valid-error:0.005554	valid-auc:0.999566
[2863]	train-error:0.004402	train-auc:0.99973	valid-error:0.005554	valid-auc:0.999566
[2864]	train-error:0.004357	train-auc:0.99973	valid-error:0.005448	valid-auc:0.999566
[2865]	train-error:0.004357	train-auc:0.99973	valid-error:0.005448	valid-auc:0.999565
[2866]	train-error:0.004357	train-auc:0.99973	valid-error:0.005341	valid-auc:0.999565
[2867]	train-error:0.004357	train-auc:0.99973	valid-error:0.005341	valid-auc:0.999565
[2868]	train-error:0.004363	train-auc:0.999731	valid-error:0.005554	valid-auc:0.999566
[2869]	train-error:0.004363	train-auc:0.999731	valid-error:0.005554	valid-auc:0.999566
[2870]	train-error:0.004363	train-auc:0.999731	valid-error:0.005554	valid-auc:0.999566
[2871]	train-error:0.004357	train-auc:0.999731	valid-error:0.005554	valid-auc:0.999566
[2872]	train-error:0.004357	train-auc:0.999731	valid-error:0.005554	valid-auc:0.999566
[2873]	train-error:0.004351	train-auc:0.999732	va

[2957]	train-error:0.004256	train-auc:0.999742	valid-error:0.005127	valid-auc:0.999575
[2958]	train-error:0.004261	train-auc:0.999742	valid-error:0.005127	valid-auc:0.999575
[2959]	train-error:0.004256	train-auc:0.999742	valid-error:0.00502	valid-auc:0.999574
[2960]	train-error:0.004256	train-auc:0.999742	valid-error:0.00502	valid-auc:0.999574
[2961]	train-error:0.004256	train-auc:0.999742	valid-error:0.00502	valid-auc:0.999574
[2962]	train-error:0.00425	train-auc:0.999742	valid-error:0.00502	valid-auc:0.999575
[2963]	train-error:0.00425	train-auc:0.999742	valid-error:0.00502	valid-auc:0.999574
[2964]	train-error:0.00425	train-auc:0.999743	valid-error:0.00502	valid-auc:0.999573
[2965]	train-error:0.00425	train-auc:0.999742	valid-error:0.00502	valid-auc:0.999574
[2966]	train-error:0.00425	train-auc:0.999743	valid-error:0.00502	valid-auc:0.999573
[2967]	train-error:0.00425	train-auc:0.999743	valid-error:0.00502	valid-auc:0.999573
[2968]	train-error:0.00425	train-auc:0.999743	valid-error:

[3053]	train-error:0.004115	train-auc:0.999753	valid-error:0.005127	valid-auc:0.999579
[3054]	train-error:0.004104	train-auc:0.999753	valid-error:0.005127	valid-auc:0.99958
[3055]	train-error:0.004104	train-auc:0.999753	valid-error:0.005127	valid-auc:0.999579
[3056]	train-error:0.004104	train-auc:0.999753	valid-error:0.005127	valid-auc:0.999579
[3057]	train-error:0.004104	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3058]	train-error:0.004104	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3059]	train-error:0.004104	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3060]	train-error:0.00411	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3061]	train-error:0.004115	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3062]	train-error:0.004121	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3063]	train-error:0.004132	train-auc:0.999754	valid-error:0.005127	valid-auc:0.999579
[3064]	train-error:0.004121	train-auc:0.99975

In [102]:
print gbm.eval(dvalid)

[0]	eval-error:0.004807	eval-auc:0.999578


### Validate classification accuracy

In [103]:
%time data_ngrams = generate_context_ngrams(train, context_cols)

CPU times: user 11min 49s, sys: 1min 2s, total: 12min 52s
Wall time: 13min 12s


In [105]:
%time dtest = xgb.DMatrix(data_ngrams, label=labels, feature_names=feature_names)

CPU times: user 7min 20s, sys: 4min 41s, total: 12min 2s
Wall time: 14min 39s


In [106]:
%time predictions = gbm.predict(dtest)

CPU times: user 2h 12min 44s, sys: 31.5 s, total: 2h 13min 15s
Wall time: 47min 43s


In [107]:
predictions

array([ 0.01158588,  0.00036614,  0.01372064, ...,  0.0202615 ,
        0.00632452,  0.00073591], dtype=float32)

In [108]:
from sklearn.metrics import accuracy_score

for i in range(10):
    predictions_current = predictions > i / 10.0
    print ("0.{}: {}".format(i, accuracy_score(labels, predictions_current)))

0.0: 0.125082403416
0.1: 0.97505953236
0.2: 0.984837582407
0.3: 0.98892325983
0.4: 0.991354863932
0.5: 0.993797437818
0.6: 0.994522200149
0.7: 0.994743770888
0.8: 0.994721358483
0.9: 0.992554069454


In [109]:
best_threshold = 0.7
best_accuracy = accuracy_score(labels, predictions > best_threshold)
print 'Best accuracy: ', best_accuracy
print 'Number of errors: ', len(train) - best_accuracy * len(train)

Best accuracy:  0.994743770888
Number of errors:  55582.0


In [110]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(labels, predictions > best_threshold).ravel()

In [111]:
print tn, fp 
print fn, '\t', tp

9215552 36265
19317 	1303367
