In [45]:
%matplotlib inline

import xgboost as xgb
import seaborn as sns
import pandas as pd

sns.set(font_scale = 1.5)



In [2]:
dtrain = xgb.DMatrix('train.txt')
dtest = xgb.DMatrix('test.txt')

In [3]:
print("Train dataset contains {0} rows and {1} columns".format(dtrain.num_row(), dtrain.num_col()))
print("Test dataset contains {0} rows and {1} columns".format(dtest.num_row(), dtest.num_col()))

Train dataset contains 18500 rows and 9 columns
Test dataset contains 820 rows and 9 columns


In [4]:
print("Train possible labels: ")
print(np.unique(dtrain.get_label()))

print("\nTest possible labels: ")
print(np.unique(dtest.get_label()))

Train possible labels: 
[0. 1.]

Test possible labels: 
[0. 1.]


In [117]:
def misclassified(pred_probs, dtrain):
    labels = dtrain.get_label() # obtain true labels
    preds = pred_probs > 0.5 # obtain predicted values
    return 'misclassified', np.sum(labels != preds)

In [120]:
params = {
    'objective':'binary:logistic',
    'max_depth':30,
    'min_child_weight':0,
    'silent':0,
    'eta':1,
    'eval_metric':'auc'
}

num_rounds = 100

bst = xgb.train(params, dtrain, num_rounds)

watchlist  = [(dtest,'test'), (dtrain,'train')] # native interface only
bst = xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds=10)

preds_prob = bst.predict(dtest)
preds_prob

Will train until train error hasn't decreased in 10 rounds.
[0]	test-auc:0.855432	train-auc:0.894909
[1]	test-auc:0.887417	train-auc:0.939228
[2]	test-auc:0.929087	train-auc:0.973951
[3]	test-auc:0.936072	train-auc:0.985368
[4]	test-auc:0.940163	train-auc:0.990909
[5]	test-auc:0.943394	train-auc:0.994730
[6]	test-auc:0.946598	train-auc:0.996750
[7]	test-auc:0.949326	train-auc:0.998139
[8]	test-auc:0.950015	train-auc:0.998851
[9]	test-auc:0.948924	train-auc:0.999265
[10]	test-auc:0.947811	train-auc:0.999530
[11]	test-auc:0.946697	train-auc:0.999681
[12]	test-auc:0.948477	train-auc:0.999760
[13]	test-auc:0.947750	train-auc:0.999816
[14]	test-auc:0.948561	train-auc:0.999861
[15]	test-auc:0.948773	train-auc:0.999884
[16]	test-auc:0.949121	train-auc:0.999903
[17]	test-auc:0.949727	train-auc:0.999915
[18]	test-auc:0.949773	train-auc:0.999927
[19]	test-auc:0.949129	train-auc:0.999931
[20]	test-auc:0.949182	train-auc:0.999940
[21]	test-auc:0.949439	train-auc:0.999946
[22]	test-auc:0.949189	tra

array([9.94809031e-01, 9.97767568e-01, 9.86515760e-01, 7.51314282e-01,
       9.96812403e-01, 9.99621987e-01, 9.98105884e-01, 2.31059000e-01,
       9.99644756e-01, 9.99845147e-01, 9.99757707e-01, 9.86155570e-01,
       9.99767601e-01, 1.75876244e-06, 9.99179900e-01, 9.99338329e-01,
       1.59872510e-02, 9.84334886e-01, 9.99617457e-01, 9.86267805e-01,
       5.20982325e-01, 9.74714994e-01, 8.51521671e-01, 9.92329061e-01,
       9.99953389e-01, 9.87513602e-01, 9.99807894e-01, 8.14934552e-01,
       9.90571380e-01, 9.95499194e-01, 9.99742210e-01, 9.92189288e-01,
       9.99810159e-01, 2.88965493e-01, 8.99622858e-01, 9.99864221e-01,
       1.40063509e-01, 7.43878901e-01, 9.98752952e-01, 9.95457649e-01,
       9.99998808e-01, 6.11754879e-02, 9.65311289e-01, 9.99883056e-01,
       9.99191344e-01, 9.93057609e-01, 9.99356806e-01, 9.99948621e-01,
       9.99796331e-01, 9.97245073e-01, 9.99699712e-01, 8.70365262e-01,
       9.98886406e-01, 9.99951243e-01, 7.70998716e-01, 2.96024859e-01,
      

In [122]:
print("Booster best train score: {}".format(bst.best_score))
print("Booster best iteration: {}".format(bst.best_iteration))
print("Booster best number of trees limit: {}".format(bst.best_ntree_limit))

Booster best train score: 0.999999
Booster best iteration: 73
Booster best number of trees limit: 74


In [121]:
labels = dtest.get_label()
preds = preds_prob > 0.5 # threshold
correct = 0

for i in range(len(preds)):
    if (labels[i] == preds[i]):
        correct += 1

print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-correct/len(preds)))

Predicted correctly: 706/820
Error: 1.0000
