In [1]:
import numpy as np
import pandas as pd
import scipy

import string
import re
import gc

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer



In [37]:
train = pd.read_csv('data/train.csv', encoding='utf-8')
train.dropna(inplace=True)
train = train[train.sentence_id != 492799]
train.reset_index(inplace=True)

#### Prepare full data

In [4]:
vectorizer = CountVectorizer (
    analyzer='char_wb',
    ngram_range=(1, 1),
    lowercase=False
)

In [5]:
%time data_ngrams = vectorizer.fit_transform(train['before'].values)

CPU times: user 1min 30s, sys: 1.87 s, total: 1min 32s
Wall time: 1min 33s


In [6]:
data_ngrams.shape

(10574501, 2660)

In [7]:
labels = np.array(train['after'].values != train['before'].values, dtype=np.uint8)

enc = LabelEncoder()
labels = enc.fit_transform(labels)

#### Prepare context data

In [38]:
def get_context_value(values, sentences, offset):
    offset_values = np.asarray([u'' for v in range(len(values))], dtype=object)
    for i in range(len(values)):
        offset_value = ""
        i_offset = i + offset
        if i_offset >= 0 and i_offset < len(values) \
                and sentences[i_offset] == sentences[i]:
            offset_value = values[i_offset]
        offset_values[i] = offset_value

    return offset_values

In [40]:
train['before_m_2'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, -2))
train['before_m_1'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, -1))
train['before_p_1'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, 1))
train['before_p_2'] = pd.Series(get_context_value(train.before.values, train.sentence_id.values, 2))

In [42]:
def generate_context_ngrams(data, columns):
    return scipy.sparse.hstack([vectorizer.transform(data[c].values) for c in columns])

In [43]:
context_cols = ['before_m_2', 'before_m_1', 'before', 'before_p_1', 'before_p_2']

#### Prepare train data

In [44]:
sample_sizes = {
    "PUNCT" : 25000,
    "PLAIN" : 30000,
    "CARDINAL" : 20000,
    "LETTERS" : 20000,
    "DATE" : 20000,
    "ORDINAL" : 15000,
    "MEASURE" : 15000
}

train.reset_index(inplace=True)
train_data = pd.DataFrame(columns=train.columns)
sample_max_size = 10000

for c in train['class'].unique():
    class_sample = train[train['class'] == c]
    sample_size = sample_sizes[c] if c in sample_sizes else sample_max_size
    if len(class_sample) > sample_max_size:
        train_data = train_data.append(class_sample.sample(sample_size))
    else:
        train_data = train_data.append(class_sample)

In [45]:
train_ngrams = generate_context_ngrams(train_data, context_cols)

In [46]:
labels_train = np.array(train_data['after'].values != train_data['before'].values, dtype=np.uint8)
labels_train = enc.transform(labels_train)

### Train XGBoost classifier 

In [47]:
xtr, xcv, ytr, ycv = train_test_split(train_ngrams, labels_train, test_size = 0.05, random_state = 42)
feature_names = ['f_{}'.format(i) for i in range(train_ngrams.shape[1])]

In [48]:
dtrain = xgb.DMatrix(xtr, label=ytr, feature_names=feature_names)
dvalid = xgb.DMatrix(xcv, label=ycv, feature_names=feature_names)

In [49]:
params = [("objective", "binary:logistic"),
          ("booster", "gbtree"),
          ("nthread", 3),
          ("eta", 0.01),
          ("max_depth", 8),
          ("subsample", 0.9),
          ("min_child_weight", 1),
          ("colsample_bytree", 0.7),
          ("eval_metric", 'error'),
          ("eval_metric", 'auc'),
         ]
num_rounds = 10000
stop = 60

In [50]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [51]:
gbm = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=stop)

[0]	train-error:0.127067	train-auc:0.938747	valid-error:0.128285	valid-auc:0.938557
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 60 rounds.
[1]	train-error:0.095191	train-auc:0.959619	valid-error:0.094958	valid-auc:0.960181
[2]	train-error:0.090491	train-auc:0.964873	valid-error:0.089084	valid-auc:0.965013
[3]	train-error:0.079191	train-auc:0.976891	valid-error:0.077654	valid-auc:0.977523
[4]	train-error:0.07986	train-auc:0.981235	valid-error:0.077868	valid-auc:0.981791
[5]	train-error:0.071691	train-auc:0.98442	valid-error:0.071566	valid-auc:0.984667
[6]	train-error:0.073754	train-auc:0.98422	valid-error:0.073809	valid-auc:0.984462
[7]	train-error:0.068689	train-auc:0.985439	valid-error:0.068575	valid-auc:0.985504
[8]	train-error:0.067132	train-auc:0.985538	valid-error:0.066973	valid-auc:0.985509
[9]	train-error:0.073743	train-auc:0.985854	valid-error:0.0721	valid-auc:0.985919
[10]	train-error:0.067

[96]	train-error:0.060678	train-auc:0.992004	valid-error:0.061205	valid-auc:0.991726
[97]	train-error:0.060768	train-auc:0.992006	valid-error:0.061205	valid-auc:0.991713
[98]	train-error:0.060734	train-auc:0.992001	valid-error:0.060991	valid-auc:0.991654
[99]	train-error:0.060756	train-auc:0.992027	valid-error:0.061312	valid-auc:0.991723
[100]	train-error:0.060745	train-auc:0.992067	valid-error:0.061098	valid-auc:0.991781
[101]	train-error:0.060543	train-auc:0.992079	valid-error:0.060564	valid-auc:0.991814
[102]	train-error:0.060571	train-auc:0.992081	valid-error:0.060884	valid-auc:0.991802
[103]	train-error:0.060582	train-auc:0.992083	valid-error:0.060884	valid-auc:0.991826
[104]	train-error:0.060582	train-auc:0.992085	valid-error:0.060671	valid-auc:0.991816
[105]	train-error:0.06043	train-auc:0.992099	valid-error:0.060564	valid-auc:0.991837
[106]	train-error:0.060425	train-auc:0.992095	valid-error:0.06035	valid-auc:0.991809
[107]	train-error:0.060442	train-auc:0.992372	valid-error:0.

[192]	train-error:0.056079	train-auc:0.994686	valid-error:0.057573	valid-auc:0.994531
[193]	train-error:0.056023	train-auc:0.994699	valid-error:0.057466	valid-auc:0.994544
[194]	train-error:0.055545	train-auc:0.994833	valid-error:0.056932	valid-auc:0.994647
[195]	train-error:0.055545	train-auc:0.994823	valid-error:0.056932	valid-auc:0.994622
[196]	train-error:0.055579	train-auc:0.994798	valid-error:0.056932	valid-auc:0.994597
[197]	train-error:0.055472	train-auc:0.994846	valid-error:0.056825	valid-auc:0.994652
[198]	train-error:0.055427	train-auc:0.994834	valid-error:0.056719	valid-auc:0.994646
[199]	train-error:0.055309	train-auc:0.994849	valid-error:0.056719	valid-auc:0.994651
[200]	train-error:0.055331	train-auc:0.994869	valid-error:0.056825	valid-auc:0.994675
[201]	train-error:0.05523	train-auc:0.994951	valid-error:0.056612	valid-auc:0.994754
[202]	train-error:0.055219	train-auc:0.994954	valid-error:0.056612	valid-auc:0.994759
[203]	train-error:0.055196	train-auc:0.994949	valid-err

[288]	train-error:0.030404	train-auc:0.996035	valid-error:0.029801	valid-auc:0.995883
[289]	train-error:0.029414	train-auc:0.996055	valid-error:0.02852	valid-auc:0.995903
[290]	train-error:0.029144	train-auc:0.996063	valid-error:0.028413	valid-auc:0.995913
[291]	train-error:0.028689	train-auc:0.996075	valid-error:0.028092	valid-auc:0.995922
[292]	train-error:0.028363	train-auc:0.996091	valid-error:0.027665	valid-auc:0.995954
[293]	train-error:0.02834	train-auc:0.99609	valid-error:0.027665	valid-auc:0.995947
[294]	train-error:0.028284	train-auc:0.996093	valid-error:0.027451	valid-auc:0.995941
[295]	train-error:0.028273	train-auc:0.9961	valid-error:0.027451	valid-auc:0.995944
[296]	train-error:0.028627	train-auc:0.996134	valid-error:0.027558	valid-auc:0.995984
[297]	train-error:0.028323	train-auc:0.996131	valid-error:0.027451	valid-auc:0.995985
[298]	train-error:0.028323	train-auc:0.996149	valid-error:0.027451	valid-auc:0.995999
[299]	train-error:0.028267	train-auc:0.99615	valid-error:0.

[384]	train-error:0.02685	train-auc:0.996775	valid-error:0.025849	valid-auc:0.996621
[385]	train-error:0.026834	train-auc:0.996782	valid-error:0.025849	valid-auc:0.996629
[386]	train-error:0.026817	train-auc:0.996787	valid-error:0.025849	valid-auc:0.996633
[387]	train-error:0.026811	train-auc:0.996792	valid-error:0.025849	valid-auc:0.996639
[388]	train-error:0.026805	train-auc:0.996795	valid-error:0.025849	valid-auc:0.996648
[389]	train-error:0.025889	train-auc:0.996798	valid-error:0.024888	valid-auc:0.996652
[390]	train-error:0.025507	train-auc:0.996806	valid-error:0.023927	valid-auc:0.996656
[391]	train-error:0.025479	train-auc:0.996816	valid-error:0.023927	valid-auc:0.99666
[392]	train-error:0.025569	train-auc:0.996822	valid-error:0.024354	valid-auc:0.99667
[393]	train-error:0.025462	train-auc:0.996829	valid-error:0.023927	valid-auc:0.996679
[394]	train-error:0.025434	train-auc:0.996828	valid-error:0.023927	valid-auc:0.996678
[395]	train-error:0.025265	train-auc:0.996831	valid-error

[480]	train-error:0.022297	train-auc:0.997313	valid-error:0.021043	valid-auc:0.997184
[481]	train-error:0.022285	train-auc:0.997317	valid-error:0.021043	valid-auc:0.997187
[482]	train-error:0.02228	train-auc:0.99732	valid-error:0.021043	valid-auc:0.997188
[483]	train-error:0.022297	train-auc:0.997328	valid-error:0.021043	valid-auc:0.997194
[484]	train-error:0.022269	train-auc:0.99733	valid-error:0.021043	valid-auc:0.997196
[485]	train-error:0.022044	train-auc:0.997334	valid-error:0.020829	valid-auc:0.997202
[486]	train-error:0.022032	train-auc:0.997335	valid-error:0.020829	valid-auc:0.997197
[487]	train-error:0.021987	train-auc:0.997342	valid-error:0.020829	valid-auc:0.997204
[488]	train-error:0.021971	train-auc:0.997345	valid-error:0.020722	valid-auc:0.997207
[489]	train-error:0.021796	train-auc:0.997349	valid-error:0.020615	valid-auc:0.997216
[490]	train-error:0.021774	train-auc:0.997353	valid-error:0.020615	valid-auc:0.99722
[491]	train-error:0.021774	train-auc:0.997355	valid-error:

[576]	train-error:0.019604	train-auc:0.997709	valid-error:0.018479	valid-auc:0.997564
[577]	train-error:0.019598	train-auc:0.997711	valid-error:0.018479	valid-auc:0.997565
[578]	train-error:0.019581	train-auc:0.997719	valid-error:0.018372	valid-auc:0.997573
[579]	train-error:0.019564	train-auc:0.997722	valid-error:0.018372	valid-auc:0.997573
[580]	train-error:0.019553	train-auc:0.997728	valid-error:0.018372	valid-auc:0.997582
[581]	train-error:0.019542	train-auc:0.997732	valid-error:0.018372	valid-auc:0.99759
[582]	train-error:0.019542	train-auc:0.997742	valid-error:0.018372	valid-auc:0.997603
[583]	train-error:0.019525	train-auc:0.997745	valid-error:0.018372	valid-auc:0.997605
[584]	train-error:0.019497	train-auc:0.997749	valid-error:0.018372	valid-auc:0.997608
[585]	train-error:0.019424	train-auc:0.997754	valid-error:0.018372	valid-auc:0.997612
[586]	train-error:0.019401	train-auc:0.997759	valid-error:0.018372	valid-auc:0.997619
[587]	train-error:0.019401	train-auc:0.997764	valid-err

[672]	train-error:0.017951	train-auc:0.998069	valid-error:0.017518	valid-auc:0.997836
[673]	train-error:0.017957	train-auc:0.998072	valid-error:0.017518	valid-auc:0.997834
[674]	train-error:0.017917	train-auc:0.998085	valid-error:0.017518	valid-auc:0.997851
[675]	train-error:0.0179	train-auc:0.998087	valid-error:0.017518	valid-auc:0.997852
[676]	train-error:0.0179	train-auc:0.99809	valid-error:0.017518	valid-auc:0.997855
[677]	train-error:0.017895	train-auc:0.998093	valid-error:0.017518	valid-auc:0.99786
[678]	train-error:0.0179	train-auc:0.998095	valid-error:0.017411	valid-auc:0.997861
[679]	train-error:0.017889	train-auc:0.998096	valid-error:0.017411	valid-auc:0.997862
[680]	train-error:0.017867	train-auc:0.998101	valid-error:0.017518	valid-auc:0.997868
[681]	train-error:0.017867	train-auc:0.998104	valid-error:0.017411	valid-auc:0.997868
[682]	train-error:0.017855	train-auc:0.998105	valid-error:0.017411	valid-auc:0.997872
[683]	train-error:0.01785	train-auc:0.998112	valid-error:0.017

[768]	train-error:0.016152	train-auc:0.998307	valid-error:0.015488	valid-auc:0.998057
[769]	train-error:0.016146	train-auc:0.998309	valid-error:0.015488	valid-auc:0.998061
[770]	train-error:0.016141	train-auc:0.998312	valid-error:0.015488	valid-auc:0.998061
[771]	train-error:0.016118	train-auc:0.998313	valid-error:0.015488	valid-auc:0.998063
[772]	train-error:0.016107	train-auc:0.998315	valid-error:0.015488	valid-auc:0.998066
[773]	train-error:0.016096	train-auc:0.998316	valid-error:0.015488	valid-auc:0.998071
[774]	train-error:0.016073	train-auc:0.998319	valid-error:0.015275	valid-auc:0.998074
[775]	train-error:0.016101	train-auc:0.998321	valid-error:0.015381	valid-auc:0.998074
[776]	train-error:0.016084	train-auc:0.998322	valid-error:0.015381	valid-auc:0.998078
[777]	train-error:0.016079	train-auc:0.998322	valid-error:0.015381	valid-auc:0.998078
[778]	train-error:0.016056	train-auc:0.998325	valid-error:0.015381	valid-auc:0.99808
[779]	train-error:0.016056	train-auc:0.998326	valid-err

[864]	train-error:0.014887	train-auc:0.998463	valid-error:0.013779	valid-auc:0.998177
[865]	train-error:0.014887	train-auc:0.998464	valid-error:0.013779	valid-auc:0.998179
[866]	train-error:0.014893	train-auc:0.998465	valid-error:0.013779	valid-auc:0.998181
[867]	train-error:0.014881	train-auc:0.998466	valid-error:0.013779	valid-auc:0.998181
[868]	train-error:0.014887	train-auc:0.998468	valid-error:0.013779	valid-auc:0.998183
[869]	train-error:0.014876	train-auc:0.998469	valid-error:0.013779	valid-auc:0.998187
[870]	train-error:0.014876	train-auc:0.998471	valid-error:0.013779	valid-auc:0.99819
[871]	train-error:0.014859	train-auc:0.998472	valid-error:0.013779	valid-auc:0.998191
[872]	train-error:0.014864	train-auc:0.998473	valid-error:0.013779	valid-auc:0.998194
[873]	train-error:0.014859	train-auc:0.998474	valid-error:0.013779	valid-auc:0.998197
[874]	train-error:0.014842	train-auc:0.998476	valid-error:0.013779	valid-auc:0.998199
[875]	train-error:0.014836	train-auc:0.998479	valid-err

[960]	train-error:0.014409	train-auc:0.998579	valid-error:0.013352	valid-auc:0.998281
[961]	train-error:0.014403	train-auc:0.99858	valid-error:0.013352	valid-auc:0.998281
[962]	train-error:0.014398	train-auc:0.99858	valid-error:0.013352	valid-auc:0.998281
[963]	train-error:0.014403	train-auc:0.998581	valid-error:0.013352	valid-auc:0.998281
[964]	train-error:0.014409	train-auc:0.998583	valid-error:0.013352	valid-auc:0.998283
[965]	train-error:0.014392	train-auc:0.998584	valid-error:0.013352	valid-auc:0.998285
[966]	train-error:0.014375	train-auc:0.998585	valid-error:0.013245	valid-auc:0.998284
[967]	train-error:0.01437	train-auc:0.998587	valid-error:0.013245	valid-auc:0.998284
[968]	train-error:0.01437	train-auc:0.998587	valid-error:0.013245	valid-auc:0.998289
[969]	train-error:0.014364	train-auc:0.998588	valid-error:0.013245	valid-auc:0.99829
[970]	train-error:0.014375	train-auc:0.998589	valid-error:0.013245	valid-auc:0.998291
[971]	train-error:0.014364	train-auc:0.998592	valid-error:0

[1055]	train-error:0.013819	train-auc:0.998685	valid-error:0.012711	valid-auc:0.998371
[1056]	train-error:0.013824	train-auc:0.998685	valid-error:0.012711	valid-auc:0.998372
[1057]	train-error:0.013802	train-auc:0.998688	valid-error:0.012711	valid-auc:0.998371
[1058]	train-error:0.013802	train-auc:0.998689	valid-error:0.012711	valid-auc:0.998373
[1059]	train-error:0.013757	train-auc:0.99869	valid-error:0.012711	valid-auc:0.998372
[1060]	train-error:0.013779	train-auc:0.998691	valid-error:0.012711	valid-auc:0.998372
[1061]	train-error:0.013768	train-auc:0.998693	valid-error:0.012711	valid-auc:0.998372
[1062]	train-error:0.013729	train-auc:0.998694	valid-error:0.012711	valid-auc:0.998372
[1063]	train-error:0.013644	train-auc:0.998694	valid-error:0.012497	valid-auc:0.998372
[1064]	train-error:0.013639	train-auc:0.998695	valid-error:0.012497	valid-auc:0.998374
[1065]	train-error:0.013633	train-auc:0.998696	valid-error:0.012497	valid-auc:0.998375
[1066]	train-error:0.013543	train-auc:0.9986

[1150]	train-error:0.013178	train-auc:0.998783	valid-error:0.012177	valid-auc:0.998458
[1151]	train-error:0.013172	train-auc:0.998784	valid-error:0.012177	valid-auc:0.998458
[1152]	train-error:0.013139	train-auc:0.998785	valid-error:0.012177	valid-auc:0.99846
[1153]	train-error:0.013139	train-auc:0.998785	valid-error:0.012177	valid-auc:0.998461
[1154]	train-error:0.013122	train-auc:0.998786	valid-error:0.012177	valid-auc:0.998462
[1155]	train-error:0.013127	train-auc:0.998786	valid-error:0.012177	valid-auc:0.998461
[1156]	train-error:0.013127	train-auc:0.998787	valid-error:0.012177	valid-auc:0.998462
[1157]	train-error:0.013133	train-auc:0.998791	valid-error:0.012177	valid-auc:0.998467
[1158]	train-error:0.013122	train-auc:0.998792	valid-error:0.012177	valid-auc:0.998468
[1159]	train-error:0.013116	train-auc:0.998793	valid-error:0.012177	valid-auc:0.998469
[1160]	train-error:0.013127	train-auc:0.998794	valid-error:0.012177	valid-auc:0.99847
[1161]	train-error:0.013139	train-auc:0.99879

[1245]	train-error:0.012616	train-auc:0.998876	valid-error:0.01175	valid-auc:0.998564
[1246]	train-error:0.012616	train-auc:0.998876	valid-error:0.01175	valid-auc:0.998565
[1247]	train-error:0.01261	train-auc:0.998877	valid-error:0.01175	valid-auc:0.998565
[1248]	train-error:0.012616	train-auc:0.998878	valid-error:0.01175	valid-auc:0.998567
[1249]	train-error:0.012616	train-auc:0.998879	valid-error:0.01175	valid-auc:0.998568
[1250]	train-error:0.012616	train-auc:0.99888	valid-error:0.01175	valid-auc:0.998568
[1251]	train-error:0.012621	train-auc:0.998882	valid-error:0.01175	valid-auc:0.998568
[1252]	train-error:0.012604	train-auc:0.998883	valid-error:0.01175	valid-auc:0.998569
[1253]	train-error:0.012604	train-auc:0.998884	valid-error:0.01175	valid-auc:0.998569
[1254]	train-error:0.012599	train-auc:0.998885	valid-error:0.01175	valid-auc:0.99857
[1255]	train-error:0.012604	train-auc:0.998886	valid-error:0.01175	valid-auc:0.998573
[1256]	train-error:0.012604	train-auc:0.998886	valid-erro

[1340]	train-error:0.012245	train-auc:0.998957	valid-error:0.011216	valid-auc:0.998633
[1341]	train-error:0.012228	train-auc:0.998958	valid-error:0.011216	valid-auc:0.998633
[1342]	train-error:0.012211	train-auc:0.998958	valid-error:0.011216	valid-auc:0.998632
[1343]	train-error:0.012228	train-auc:0.998958	valid-error:0.011216	valid-auc:0.998633
[1344]	train-error:0.012205	train-auc:0.998959	valid-error:0.011109	valid-auc:0.998635
[1345]	train-error:0.0122	train-auc:0.99896	valid-error:0.011109	valid-auc:0.998635
[1346]	train-error:0.012183	train-auc:0.998961	valid-error:0.011109	valid-auc:0.998635
[1347]	train-error:0.012188	train-auc:0.998961	valid-error:0.011109	valid-auc:0.998635
[1348]	train-error:0.012166	train-auc:0.998963	valid-error:0.011109	valid-auc:0.998638
[1349]	train-error:0.012166	train-auc:0.998963	valid-error:0.011109	valid-auc:0.998641
[1350]	train-error:0.012166	train-auc:0.998964	valid-error:0.011109	valid-auc:0.998641
[1351]	train-error:0.01216	train-auc:0.998964	

[1435]	train-error:0.011542	train-auc:0.999025	valid-error:0.010788	valid-auc:0.998697
[1436]	train-error:0.011542	train-auc:0.999025	valid-error:0.010788	valid-auc:0.998697
[1437]	train-error:0.011531	train-auc:0.999026	valid-error:0.010788	valid-auc:0.998698
[1438]	train-error:0.011531	train-auc:0.999026	valid-error:0.010788	valid-auc:0.998699
[1439]	train-error:0.011531	train-auc:0.999027	valid-error:0.010788	valid-auc:0.9987
[1440]	train-error:0.011536	train-auc:0.999028	valid-error:0.010788	valid-auc:0.998699
[1441]	train-error:0.011548	train-auc:0.999028	valid-error:0.010788	valid-auc:0.998699
[1442]	train-error:0.011491	train-auc:0.999029	valid-error:0.010788	valid-auc:0.9987
[1443]	train-error:0.011486	train-auc:0.99903	valid-error:0.010788	valid-auc:0.998702
[1444]	train-error:0.011486	train-auc:0.999031	valid-error:0.010788	valid-auc:0.998703
[1445]	train-error:0.011486	train-auc:0.999031	valid-error:0.010788	valid-auc:0.998702
[1446]	train-error:0.011486	train-auc:0.999032	v

[1530]	train-error:0.010963	train-auc:0.999079	valid-error:0.010468	valid-auc:0.998748
[1531]	train-error:0.010963	train-auc:0.999079	valid-error:0.010361	valid-auc:0.998749
[1532]	train-error:0.010957	train-auc:0.999079	valid-error:0.010361	valid-auc:0.99875
[1533]	train-error:0.010957	train-auc:0.99908	valid-error:0.010361	valid-auc:0.99875
[1534]	train-error:0.010805	train-auc:0.999081	valid-error:0.010361	valid-auc:0.998751
[1535]	train-error:0.010805	train-auc:0.999081	valid-error:0.010468	valid-auc:0.998752
[1536]	train-error:0.010817	train-auc:0.999082	valid-error:0.010468	valid-auc:0.998753
[1537]	train-error:0.010794	train-auc:0.999083	valid-error:0.010468	valid-auc:0.998752
[1538]	train-error:0.010772	train-auc:0.999083	valid-error:0.010468	valid-auc:0.998753
[1539]	train-error:0.010766	train-auc:0.999084	valid-error:0.010468	valid-auc:0.998754
[1540]	train-error:0.010755	train-auc:0.999084	valid-error:0.010468	valid-auc:0.998754
[1541]	train-error:0.010749	train-auc:0.999085

[1625]	train-error:0.010507	train-auc:0.999128	valid-error:0.010147	valid-auc:0.998787
[1626]	train-error:0.010507	train-auc:0.999129	valid-error:0.010147	valid-auc:0.998787
[1627]	train-error:0.010502	train-auc:0.999129	valid-error:0.010147	valid-auc:0.998788
[1628]	train-error:0.010502	train-auc:0.999129	valid-error:0.010147	valid-auc:0.998788
[1629]	train-error:0.010507	train-auc:0.999129	valid-error:0.010147	valid-auc:0.998789
[1630]	train-error:0.010485	train-auc:0.99913	valid-error:0.010147	valid-auc:0.998788
[1631]	train-error:0.010485	train-auc:0.99913	valid-error:0.010147	valid-auc:0.998789
[1632]	train-error:0.010468	train-auc:0.999131	valid-error:0.010147	valid-auc:0.998789
[1633]	train-error:0.010457	train-auc:0.999132	valid-error:0.010147	valid-auc:0.998791
[1634]	train-error:0.010451	train-auc:0.999132	valid-error:0.010147	valid-auc:0.998791
[1635]	train-error:0.010446	train-auc:0.999132	valid-error:0.010147	valid-auc:0.998791
[1636]	train-error:0.01044	train-auc:0.999133

[1720]	train-error:0.01008	train-auc:0.999177	valid-error:0.009613	valid-auc:0.998815
[1721]	train-error:0.010086	train-auc:0.999178	valid-error:0.009613	valid-auc:0.998814
[1722]	train-error:0.010086	train-auc:0.999178	valid-error:0.009613	valid-auc:0.998814
[1723]	train-error:0.010086	train-auc:0.999178	valid-error:0.009613	valid-auc:0.998814
[1724]	train-error:0.010086	train-auc:0.999179	valid-error:0.009613	valid-auc:0.998815
[1725]	train-error:0.01008	train-auc:0.999179	valid-error:0.009613	valid-auc:0.998816
[1726]	train-error:0.01008	train-auc:0.999179	valid-error:0.009613	valid-auc:0.998816
[1727]	train-error:0.01008	train-auc:0.99918	valid-error:0.009613	valid-auc:0.998817
[1728]	train-error:0.01008	train-auc:0.99918	valid-error:0.009613	valid-auc:0.998818
[1729]	train-error:0.01008	train-auc:0.999181	valid-error:0.009613	valid-auc:0.998818
[1730]	train-error:0.01008	train-auc:0.999181	valid-error:0.009613	valid-auc:0.998819
[1731]	train-error:0.010086	train-auc:0.999182	valid

[1815]	train-error:0.009777	train-auc:0.999223	valid-error:0.009079	valid-auc:0.998854
[1816]	train-error:0.009782	train-auc:0.999224	valid-error:0.009079	valid-auc:0.998856
[1817]	train-error:0.009771	train-auc:0.999224	valid-error:0.009079	valid-auc:0.998856
[1818]	train-error:0.009777	train-auc:0.999224	valid-error:0.009079	valid-auc:0.998856
[1819]	train-error:0.009777	train-auc:0.999225	valid-error:0.009079	valid-auc:0.998857
[1820]	train-error:0.009777	train-auc:0.999225	valid-error:0.009079	valid-auc:0.998857
[1821]	train-error:0.009777	train-auc:0.999226	valid-error:0.009079	valid-auc:0.99886
[1822]	train-error:0.009782	train-auc:0.999226	valid-error:0.009079	valid-auc:0.998859
[1823]	train-error:0.00972	train-auc:0.999227	valid-error:0.008972	valid-auc:0.998861
[1824]	train-error:0.00972	train-auc:0.999227	valid-error:0.008972	valid-auc:0.998858
[1825]	train-error:0.00972	train-auc:0.999228	valid-error:0.008972	valid-auc:0.998858
[1826]	train-error:0.009715	train-auc:0.999228	

[1910]	train-error:0.009557	train-auc:0.999261	valid-error:0.008866	valid-auc:0.998889
[1911]	train-error:0.009557	train-auc:0.999261	valid-error:0.008866	valid-auc:0.998889
[1912]	train-error:0.009557	train-auc:0.999262	valid-error:0.008866	valid-auc:0.998889
[1913]	train-error:0.009552	train-auc:0.999262	valid-error:0.008866	valid-auc:0.998889
[1914]	train-error:0.009552	train-auc:0.999263	valid-error:0.008866	valid-auc:0.998888
[1915]	train-error:0.009557	train-auc:0.999263	valid-error:0.008866	valid-auc:0.998889
[1916]	train-error:0.009563	train-auc:0.999263	valid-error:0.008866	valid-auc:0.998889
[1917]	train-error:0.009552	train-auc:0.999263	valid-error:0.008866	valid-auc:0.99889
[1918]	train-error:0.00954	train-auc:0.999264	valid-error:0.008866	valid-auc:0.998889
[1919]	train-error:0.00954	train-auc:0.999265	valid-error:0.008866	valid-auc:0.99889
[1920]	train-error:0.009529	train-auc:0.999265	valid-error:0.008866	valid-auc:0.99889
[1921]	train-error:0.009501	train-auc:0.999266	v

[2005]	train-error:0.009214	train-auc:0.999298	valid-error:0.008545	valid-auc:0.998911
[2006]	train-error:0.009209	train-auc:0.999298	valid-error:0.008545	valid-auc:0.998911
[2007]	train-error:0.009209	train-auc:0.999299	valid-error:0.008545	valid-auc:0.998911
[2008]	train-error:0.009209	train-auc:0.999299	valid-error:0.008545	valid-auc:0.998911
[2009]	train-error:0.009198	train-auc:0.999299	valid-error:0.008545	valid-auc:0.998911
[2010]	train-error:0.009198	train-auc:0.999299	valid-error:0.008545	valid-auc:0.998911
[2011]	train-error:0.009198	train-auc:0.9993	valid-error:0.008545	valid-auc:0.998912
[2012]	train-error:0.009192	train-auc:0.9993	valid-error:0.008545	valid-auc:0.998912
[2013]	train-error:0.009181	train-auc:0.9993	valid-error:0.008545	valid-auc:0.998912
[2014]	train-error:0.009181	train-auc:0.9993	valid-error:0.008545	valid-auc:0.998912
[2015]	train-error:0.009175	train-auc:0.999301	valid-error:0.008545	valid-auc:0.998912
[2016]	train-error:0.009169	train-auc:0.999301	vali

[2100]	train-error:0.008877	train-auc:0.99933	valid-error:0.008332	valid-auc:0.998936
[2101]	train-error:0.008871	train-auc:0.99933	valid-error:0.008332	valid-auc:0.998937
[2102]	train-error:0.008871	train-auc:0.999331	valid-error:0.008332	valid-auc:0.998937
[2103]	train-error:0.008866	train-auc:0.999332	valid-error:0.008332	valid-auc:0.998937
[2104]	train-error:0.008866	train-auc:0.999332	valid-error:0.008332	valid-auc:0.998938
[2105]	train-error:0.008866	train-auc:0.999332	valid-error:0.008332	valid-auc:0.998937
[2106]	train-error:0.008866	train-auc:0.999333	valid-error:0.008332	valid-auc:0.998938
[2107]	train-error:0.008871	train-auc:0.999333	valid-error:0.008332	valid-auc:0.998938
[2108]	train-error:0.00886	train-auc:0.999334	valid-error:0.008332	valid-auc:0.998938
[2109]	train-error:0.008855	train-auc:0.999334	valid-error:0.008332	valid-auc:0.998938
[2110]	train-error:0.008855	train-auc:0.999334	valid-error:0.008332	valid-auc:0.998938
[2111]	train-error:0.00886	train-auc:0.999335	

[2195]	train-error:0.008663	train-auc:0.999362	valid-error:0.008332	valid-auc:0.998961
[2196]	train-error:0.008663	train-auc:0.999362	valid-error:0.008332	valid-auc:0.998961
[2197]	train-error:0.008663	train-auc:0.999362	valid-error:0.008332	valid-auc:0.998962
[2198]	train-error:0.008663	train-auc:0.999363	valid-error:0.008332	valid-auc:0.998963
[2199]	train-error:0.008663	train-auc:0.999363	valid-error:0.008332	valid-auc:0.998963
[2200]	train-error:0.00863	train-auc:0.999363	valid-error:0.008332	valid-auc:0.998963
[2201]	train-error:0.00863	train-auc:0.999363	valid-error:0.008332	valid-auc:0.998963
[2202]	train-error:0.008658	train-auc:0.999363	valid-error:0.008332	valid-auc:0.998963
[2203]	train-error:0.008658	train-auc:0.999364	valid-error:0.008225	valid-auc:0.998964
[2204]	train-error:0.008652	train-auc:0.999364	valid-error:0.008225	valid-auc:0.998964
[2205]	train-error:0.008652	train-auc:0.999364	valid-error:0.008225	valid-auc:0.998964
[2206]	train-error:0.008647	train-auc:0.99936

[2290]	train-error:0.008382	train-auc:0.999389	valid-error:0.008225	valid-auc:0.998984
[2291]	train-error:0.008388	train-auc:0.999389	valid-error:0.008225	valid-auc:0.998984
[2292]	train-error:0.008388	train-auc:0.99939	valid-error:0.008225	valid-auc:0.998984
[2293]	train-error:0.008399	train-auc:0.99939	valid-error:0.008225	valid-auc:0.998984
[2294]	train-error:0.008394	train-auc:0.999391	valid-error:0.008225	valid-auc:0.998984
[2295]	train-error:0.008382	train-auc:0.999391	valid-error:0.008225	valid-auc:0.998984
[2296]	train-error:0.008382	train-auc:0.999391	valid-error:0.008225	valid-auc:0.998984
[2297]	train-error:0.008382	train-auc:0.999391	valid-error:0.008225	valid-auc:0.998984
[2298]	train-error:0.008388	train-auc:0.999391	valid-error:0.008225	valid-auc:0.998984
[2299]	train-error:0.008371	train-auc:0.999392	valid-error:0.008225	valid-auc:0.998984
[2300]	train-error:0.008377	train-auc:0.999392	valid-error:0.008225	valid-auc:0.998985
[2301]	train-error:0.008382	train-auc:0.99939

[2385]	train-error:0.008202	train-auc:0.999416	valid-error:0.008118	valid-auc:0.998999
[2386]	train-error:0.008208	train-auc:0.999416	valid-error:0.008118	valid-auc:0.998999
[2387]	train-error:0.008202	train-auc:0.999416	valid-error:0.008118	valid-auc:0.998998
[2388]	train-error:0.008197	train-auc:0.999416	valid-error:0.008118	valid-auc:0.998998
[2389]	train-error:0.00818	train-auc:0.999417	valid-error:0.008118	valid-auc:0.998999
[2390]	train-error:0.00818	train-auc:0.999417	valid-error:0.008118	valid-auc:0.998999
[2391]	train-error:0.008191	train-auc:0.999417	valid-error:0.008118	valid-auc:0.998999
[2392]	train-error:0.008163	train-auc:0.999417	valid-error:0.008118	valid-auc:0.998999
[2393]	train-error:0.008163	train-auc:0.999418	valid-error:0.008118	valid-auc:0.999
[2394]	train-error:0.008163	train-auc:0.999418	valid-error:0.008118	valid-auc:0.999
[2395]	train-error:0.008163	train-auc:0.999418	valid-error:0.008118	valid-auc:0.999001
[2396]	train-error:0.008163	train-auc:0.999418	vali

[2480]	train-error:0.007966	train-auc:0.999439	valid-error:0.008011	valid-auc:0.999017
[2481]	train-error:0.007961	train-auc:0.999439	valid-error:0.008118	valid-auc:0.999018
[2482]	train-error:0.007955	train-auc:0.99944	valid-error:0.008118	valid-auc:0.999018
[2483]	train-error:0.007955	train-auc:0.99944	valid-error:0.008118	valid-auc:0.999018
[2484]	train-error:0.007949	train-auc:0.99944	valid-error:0.008011	valid-auc:0.999018
[2485]	train-error:0.007949	train-auc:0.999441	valid-error:0.008011	valid-auc:0.999018
[2486]	train-error:0.007944	train-auc:0.999441	valid-error:0.008011	valid-auc:0.99902
[2487]	train-error:0.007944	train-auc:0.999441	valid-error:0.008011	valid-auc:0.99902
[2488]	train-error:0.007944	train-auc:0.999441	valid-error:0.008011	valid-auc:0.999021
[2489]	train-error:0.007944	train-auc:0.999441	valid-error:0.008011	valid-auc:0.999021
[2490]	train-error:0.007961	train-auc:0.999442	valid-error:0.008011	valid-auc:0.999021
[2491]	train-error:0.007955	train-auc:0.999442	v

[2575]	train-error:0.007691	train-auc:0.999461	valid-error:0.008011	valid-auc:0.99904
[2576]	train-error:0.007696	train-auc:0.999461	valid-error:0.008011	valid-auc:0.999041
[2577]	train-error:0.007674	train-auc:0.999462	valid-error:0.008011	valid-auc:0.999041
[2578]	train-error:0.00768	train-auc:0.999462	valid-error:0.008011	valid-auc:0.999041
[2579]	train-error:0.00768	train-auc:0.999462	valid-error:0.008011	valid-auc:0.999041
[2580]	train-error:0.00768	train-auc:0.999462	valid-error:0.008011	valid-auc:0.999041
[2581]	train-error:0.00768	train-auc:0.999462	valid-error:0.008011	valid-auc:0.999041
[2582]	train-error:0.00768	train-auc:0.999463	valid-error:0.008011	valid-auc:0.999041
[2583]	train-error:0.00768	train-auc:0.999463	valid-error:0.008011	valid-auc:0.999041
[2584]	train-error:0.007674	train-auc:0.999463	valid-error:0.008011	valid-auc:0.999041
[2585]	train-error:0.007657	train-auc:0.999463	valid-error:0.008011	valid-auc:0.999041
[2586]	train-error:0.00764	train-auc:0.999464	vali

[2670]	train-error:0.007438	train-auc:0.999482	valid-error:0.007797	valid-auc:0.999051
[2671]	train-error:0.007438	train-auc:0.999482	valid-error:0.007797	valid-auc:0.999051
[2672]	train-error:0.007432	train-auc:0.999482	valid-error:0.007797	valid-auc:0.999053
[2673]	train-error:0.007432	train-auc:0.999483	valid-error:0.007797	valid-auc:0.999053
[2674]	train-error:0.007427	train-auc:0.999483	valid-error:0.007797	valid-auc:0.999053
[2675]	train-error:0.007415	train-auc:0.999483	valid-error:0.007797	valid-auc:0.999053
[2676]	train-error:0.007415	train-auc:0.999484	valid-error:0.007797	valid-auc:0.999053
[2677]	train-error:0.007404	train-auc:0.999484	valid-error:0.007797	valid-auc:0.999052
[2678]	train-error:0.007404	train-auc:0.999484	valid-error:0.007797	valid-auc:0.999052
[2679]	train-error:0.007404	train-auc:0.999484	valid-error:0.007797	valid-auc:0.999052
[2680]	train-error:0.007398	train-auc:0.999484	valid-error:0.007797	valid-auc:0.999052
[2681]	train-error:0.007398	train-auc:0.999

[2765]	train-error:0.006994	train-auc:0.999504	valid-error:0.007477	valid-auc:0.99906
[2766]	train-error:0.006988	train-auc:0.999504	valid-error:0.007477	valid-auc:0.99906
[2767]	train-error:0.006982	train-auc:0.999504	valid-error:0.007477	valid-auc:0.999061
[2768]	train-error:0.006977	train-auc:0.999504	valid-error:0.007477	valid-auc:0.999061
[2769]	train-error:0.00696	train-auc:0.999505	valid-error:0.007477	valid-auc:0.99906
[2770]	train-error:0.00696	train-auc:0.999505	valid-error:0.007477	valid-auc:0.999061
[2771]	train-error:0.006966	train-auc:0.999505	valid-error:0.007477	valid-auc:0.999061
[2772]	train-error:0.006949	train-auc:0.999506	valid-error:0.007477	valid-auc:0.999062
[2773]	train-error:0.006954	train-auc:0.999506	valid-error:0.007477	valid-auc:0.999062
[2774]	train-error:0.006949	train-auc:0.999506	valid-error:0.007477	valid-auc:0.999062
[2775]	train-error:0.006949	train-auc:0.999506	valid-error:0.007477	valid-auc:0.999062
[2776]	train-error:0.006954	train-auc:0.999506	v

[2861]	train-error:0.00651	train-auc:0.999526	valid-error:0.006943	valid-auc:0.999074
[2862]	train-error:0.00651	train-auc:0.999526	valid-error:0.006943	valid-auc:0.999074
[2863]	train-error:0.006493	train-auc:0.999527	valid-error:0.006943	valid-auc:0.999074
[2864]	train-error:0.006493	train-auc:0.999527	valid-error:0.006943	valid-auc:0.999074
[2865]	train-error:0.006488	train-auc:0.999527	valid-error:0.006943	valid-auc:0.999074
[2866]	train-error:0.006488	train-auc:0.999528	valid-error:0.006943	valid-auc:0.999074
[2867]	train-error:0.006499	train-auc:0.999528	valid-error:0.006943	valid-auc:0.999074
[2868]	train-error:0.006488	train-auc:0.999528	valid-error:0.006943	valid-auc:0.999075
[2869]	train-error:0.006488	train-auc:0.999528	valid-error:0.006943	valid-auc:0.999075
[2870]	train-error:0.006488	train-auc:0.999529	valid-error:0.006943	valid-auc:0.999075
[2871]	train-error:0.006476	train-auc:0.999529	valid-error:0.006943	valid-auc:0.999074
[2872]	train-error:0.006471	train-auc:0.99952

[2956]	train-error:0.00633	train-auc:0.999544	valid-error:0.006516	valid-auc:0.999082
[2957]	train-error:0.006325	train-auc:0.999544	valid-error:0.006516	valid-auc:0.999082
[2958]	train-error:0.006308	train-auc:0.999544	valid-error:0.006516	valid-auc:0.999083
[2959]	train-error:0.006308	train-auc:0.999545	valid-error:0.006516	valid-auc:0.999083
[2960]	train-error:0.006308	train-auc:0.999545	valid-error:0.006516	valid-auc:0.999083
[2961]	train-error:0.006313	train-auc:0.999545	valid-error:0.006516	valid-auc:0.999082
[2962]	train-error:0.006308	train-auc:0.999545	valid-error:0.006516	valid-auc:0.999082
[2963]	train-error:0.006308	train-auc:0.999545	valid-error:0.006516	valid-auc:0.999082
[2964]	train-error:0.006308	train-auc:0.999546	valid-error:0.006516	valid-auc:0.999082
[2965]	train-error:0.006308	train-auc:0.999546	valid-error:0.006516	valid-auc:0.999082
[2966]	train-error:0.006297	train-auc:0.999546	valid-error:0.006516	valid-auc:0.999083
[2967]	train-error:0.006297	train-auc:0.9995

[3051]	train-error:0.006139	train-auc:0.999561	valid-error:0.006516	valid-auc:0.999089
[3052]	train-error:0.006134	train-auc:0.999561	valid-error:0.006516	valid-auc:0.999089
[3053]	train-error:0.006134	train-auc:0.999562	valid-error:0.006516	valid-auc:0.99909
[3054]	train-error:0.006128	train-auc:0.999561	valid-error:0.006516	valid-auc:0.999089
[3055]	train-error:0.006128	train-auc:0.999562	valid-error:0.006516	valid-auc:0.99909
[3056]	train-error:0.006117	train-auc:0.999562	valid-error:0.006516	valid-auc:0.99909
[3057]	train-error:0.006117	train-auc:0.999562	valid-error:0.006516	valid-auc:0.99909
[3058]	train-error:0.006128	train-auc:0.999563	valid-error:0.006516	valid-auc:0.99909
[3059]	train-error:0.006128	train-auc:0.999563	valid-error:0.006516	valid-auc:0.99909
[3060]	train-error:0.006134	train-auc:0.999563	valid-error:0.006516	valid-auc:0.999089
[3061]	train-error:0.00615	train-auc:0.999564	valid-error:0.006516	valid-auc:0.999091
[3062]	train-error:0.00615	train-auc:0.999564	vali

[3146]	train-error:0.005869	train-auc:0.999579	valid-error:0.006302	valid-auc:0.999102
[3147]	train-error:0.005869	train-auc:0.999579	valid-error:0.006302	valid-auc:0.999103
[3148]	train-error:0.005864	train-auc:0.999579	valid-error:0.006302	valid-auc:0.999103
[3149]	train-error:0.005858	train-auc:0.99958	valid-error:0.006302	valid-auc:0.999103
[3150]	train-error:0.005864	train-auc:0.99958	valid-error:0.006302	valid-auc:0.999104
[3151]	train-error:0.005858	train-auc:0.99958	valid-error:0.006302	valid-auc:0.999104
[3152]	train-error:0.005858	train-auc:0.99958	valid-error:0.006302	valid-auc:0.999105
[3153]	train-error:0.005852	train-auc:0.99958	valid-error:0.006302	valid-auc:0.999105
[3154]	train-error:0.005852	train-auc:0.999581	valid-error:0.006302	valid-auc:0.999105
[3155]	train-error:0.005852	train-auc:0.999581	valid-error:0.006195	valid-auc:0.999106
[3156]	train-error:0.005852	train-auc:0.999581	valid-error:0.006195	valid-auc:0.999106
[3157]	train-error:0.005852	train-auc:0.999581	v

[3241]	train-error:0.005723	train-auc:0.999594	valid-error:0.005875	valid-auc:0.999113
[3242]	train-error:0.005712	train-auc:0.999594	valid-error:0.005875	valid-auc:0.999113
[3243]	train-error:0.005706	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999114
[3244]	train-error:0.005712	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999114
[3245]	train-error:0.005718	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999114
[3246]	train-error:0.005706	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999114
[3247]	train-error:0.005706	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999114
[3248]	train-error:0.005701	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999115
[3249]	train-error:0.005701	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999115
[3250]	train-error:0.005706	train-auc:0.999595	valid-error:0.005875	valid-auc:0.999115
[3251]	train-error:0.005701	train-auc:0.999596	valid-error:0.005875	valid-auc:0.999114
[3252]	train-error:0.005689	train-auc:0.999

[3336]	train-error:0.005442	train-auc:0.999611	valid-error:0.005448	valid-auc:0.999126
[3337]	train-error:0.005431	train-auc:0.999611	valid-error:0.005448	valid-auc:0.999126
[3338]	train-error:0.005425	train-auc:0.999611	valid-error:0.005448	valid-auc:0.999126
[3339]	train-error:0.005425	train-auc:0.999611	valid-error:0.005448	valid-auc:0.999126
[3340]	train-error:0.005425	train-auc:0.999611	valid-error:0.005448	valid-auc:0.999126
[3341]	train-error:0.005414	train-auc:0.999611	valid-error:0.005448	valid-auc:0.999126
[3342]	train-error:0.005414	train-auc:0.999612	valid-error:0.005448	valid-auc:0.999126
[3343]	train-error:0.005414	train-auc:0.999612	valid-error:0.005448	valid-auc:0.999126
[3344]	train-error:0.005408	train-auc:0.999612	valid-error:0.005448	valid-auc:0.999126
[3345]	train-error:0.005403	train-auc:0.999612	valid-error:0.005448	valid-auc:0.999127
[3346]	train-error:0.005403	train-auc:0.999612	valid-error:0.005448	valid-auc:0.999128
[3347]	train-error:0.005403	train-auc:0.999

[3431]	train-error:0.005251	train-auc:0.999624	valid-error:0.005234	valid-auc:0.999134
[3432]	train-error:0.005257	train-auc:0.999624	valid-error:0.005234	valid-auc:0.999134
[3433]	train-error:0.005245	train-auc:0.999624	valid-error:0.005234	valid-auc:0.999134
[3434]	train-error:0.005245	train-auc:0.999624	valid-error:0.005234	valid-auc:0.999134
[3435]	train-error:0.005245	train-auc:0.999624	valid-error:0.005234	valid-auc:0.999134
[3436]	train-error:0.005245	train-auc:0.999624	valid-error:0.005234	valid-auc:0.999133
[3437]	train-error:0.00524	train-auc:0.999625	valid-error:0.005234	valid-auc:0.999133
[3438]	train-error:0.005234	train-auc:0.999625	valid-error:0.005234	valid-auc:0.999133
[3439]	train-error:0.005234	train-auc:0.999625	valid-error:0.005234	valid-auc:0.999133
[3440]	train-error:0.005234	train-auc:0.999625	valid-error:0.005234	valid-auc:0.999133
[3441]	train-error:0.005234	train-auc:0.999625	valid-error:0.005234	valid-auc:0.999133
[3442]	train-error:0.005234	train-auc:0.9996

[3526]	train-error:0.005167	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999138
[3527]	train-error:0.005167	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999139
[3528]	train-error:0.005155	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999139
[3529]	train-error:0.00515	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999139
[3530]	train-error:0.005144	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999139
[3531]	train-error:0.005138	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999139
[3532]	train-error:0.005127	train-auc:0.999637	valid-error:0.005127	valid-auc:0.999139
[3533]	train-error:0.005127	train-auc:0.999638	valid-error:0.005127	valid-auc:0.99914
[3534]	train-error:0.005122	train-auc:0.999638	valid-error:0.005127	valid-auc:0.99914
[3535]	train-error:0.00511	train-auc:0.999638	valid-error:0.005127	valid-auc:0.99914
[3536]	train-error:0.00511	train-auc:0.999638	valid-error:0.005127	valid-auc:0.99914
[3537]	train-error:0.005105	train-auc:0.999639	val

[3621]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999146
[3622]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999146
[3623]	train-error:0.004992	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999146
[3624]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999146
[3625]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999146
[3626]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999147
[3627]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999147
[3628]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999147
[3629]	train-error:0.004998	train-auc:0.999651	valid-error:0.005127	valid-auc:0.999147
[3630]	train-error:0.004998	train-auc:0.999652	valid-error:0.005127	valid-auc:0.999147
[3631]	train-error:0.004992	train-auc:0.999652	valid-error:0.005127	valid-auc:0.999147
[3632]	train-error:0.004992	train-auc:0.999

[3716]	train-error:0.004857	train-auc:0.999661	valid-error:0.005127	valid-auc:0.999154
[3717]	train-error:0.004857	train-auc:0.999661	valid-error:0.005127	valid-auc:0.999154
[3718]	train-error:0.004857	train-auc:0.999662	valid-error:0.005127	valid-auc:0.999155
[3719]	train-error:0.004857	train-auc:0.999662	valid-error:0.005127	valid-auc:0.999155
[3720]	train-error:0.004852	train-auc:0.999662	valid-error:0.005127	valid-auc:0.999155
[3721]	train-error:0.004846	train-auc:0.999663	valid-error:0.005127	valid-auc:0.999154
[3722]	train-error:0.004846	train-auc:0.999663	valid-error:0.005127	valid-auc:0.999154
[3723]	train-error:0.004852	train-auc:0.999663	valid-error:0.005127	valid-auc:0.999154
[3724]	train-error:0.004852	train-auc:0.999663	valid-error:0.005127	valid-auc:0.999155
[3725]	train-error:0.004846	train-auc:0.999663	valid-error:0.005127	valid-auc:0.999155
[3726]	train-error:0.004846	train-auc:0.999663	valid-error:0.005127	valid-auc:0.999155
[3727]	train-error:0.004857	train-auc:0.999

In [52]:
print gbm.eval(dvalid)

[0]	eval-error:0.005127	eval-auc:0.999154


### Validate classification accuracy

In [53]:
%time data_ngrams = generate_context_ngrams(train, context_cols)

CPU times: user 7min 34s, sys: 23.7 s, total: 7min 57s
Wall time: 8min 2s


In [54]:
dtest = xgb.DMatrix(data_ngrams, label=labels, feature_names=feature_names)

In [55]:
%time predictions = gbm.predict(dtest)

CPU times: user 2h 49min 15s, sys: 14.2 s, total: 2h 49min 29s
Wall time: 1h 40s


In [56]:
predictions

array([ 0.01649959,  0.00064542,  0.00664725, ...,  0.008603  ,
        0.00384797,  0.00434028], dtype=float32)

In [57]:
from sklearn.metrics import accuracy_score

for i in range(10):
    predictions_current = predictions > i / 10.0
    print ("0.{}: {}".format(i, accuracy_score(labels, predictions_current)))

0.0: 0.125082403416
0.1: 0.954131736334
0.2: 0.975757343065
0.3: 0.985090927695
0.4: 0.988508204784
0.5: 0.992159630038
0.6: 0.993216133792
0.7: 0.993637619402
0.8: 0.993493971961
0.9: 0.991273630784


In [63]:
best_threshold = 0.7
best_accuracy = accuracy_score(labels, predictions > best_threshold)
print 'Best accuracy: ', best_accuracy
print 'Number of errors: ', len(train) - best_accuracy * len(train)

Best accuracy:  0.993637619402
Number of errors:  67279.0


In [60]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(labels, predictions > 0.7).ravel()

In [70]:
print tn, fp 
print fn, '\t', tp

9213176 38641
28638 	1294046
