In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

### data

In [2]:
def summary(data, start):
    data['mean'] = data.iloc[:,start:start+6].mean(axis=1)
    data['median'] = data.iloc[:,start:start+6].median(axis=1)
    data['min'] = data.iloc[:,start:start+6].min(axis=1)
    data['max'] = data.iloc[:,start:start+6].max(axis=1)
    data['diff'] = data['mean'] - data['min']
    return data

In [3]:
train_score = pd.read_csv('../data/train_scores.csv')
train_score = summary(train_score, 2).drop('stack', axis=1)
train_image = pd.read_csv('../data/train_xgb.csv')
train_data = train_score.merge(train_image, on='id')
print(train_data.shape)
train_matrix = xgb.DMatrix(data=train_data.iloc[:,2:], label=train_data['label'])

(1604, 24)


In [4]:
test_score = pd.read_csv('../data/test_scores.csv')
test_score = summary(test_score, 2).drop('stack', axis=1)
test_image = pd.read_csv('../data/test_xgb.csv')
test_data = test_score.merge(test_image, on='id')
print(test_data.shape)
test_matrix = xgb.DMatrix(data=test_data.iloc[:,1:])

(8424, 23)


### xgboost - cv

In [5]:
booster = {}
booster['booster'] = 'gbtree'
booster['nthread'] = 6
booster['max_depth'] = 5
booster['min_child_weight'] = 4
booster['subsample'] = 0.8
booster['colsample_bytree'] = 1.0
booster['colsample_bylevel'] = 0.8
booster['lambda'] = 4.0
booster['alpha'] = 3.0
booster['objective'] = 'binary:logistic'
booster['eval_metric'] = ['logloss']
booster['seed'] = 2017
booster['eta'] = 0.01

In [6]:
params = {}
params['params'] = booster
params['dtrain'] = train_matrix
params['num_boost_round'] = 3000
params['early_stopping_rounds'] = 200
params['verbose_eval'] = 400

In [7]:
x = xgb.cv(**params, nfold=5)

[0]	train-logloss:0.685422+6.69669e-05	test-logloss:0.685666+0.000257372
[400]	train-logloss:0.124171+0.00472596	test-logloss:0.164404+0.0260864
[800]	train-logloss:0.093035+0.00332345	test-logloss:0.150434+0.0289718
[1200]	train-logloss:0.0802404+0.00279669	test-logloss:0.148112+0.0297278
[1600]	train-logloss:0.0723432+0.00251912	test-logloss:0.147316+0.0298523
[2000]	train-logloss:0.066837+0.00230856	test-logloss:0.146955+0.0299523


### xgboost - model

In [8]:
params = {}
params['params'] = booster
params['dtrain'] = train_matrix
params['num_boost_round'] = 2400
model = xgb.train(**params)
sorted(model.get_fscore().items(), key=lambda x : x[1], reverse=True)[:30]

[('inc_angle', 1694),
 ('feat_1', 1245),
 ('feat_8', 1151),
 ('feat_5', 902),
 ('feat_3', 901),
 ('feat_9', 869),
 ('feat_2', 833),
 ('feat_4', 758),
 ('diff', 718),
 ('model_6', 718),
 ('model_5', 595),
 ('feat_7', 583),
 ('min', 569),
 ('max', 543),
 ('model_4', 512),
 ('feat_6', 508),
 ('model_1', 452),
 ('model_3', 434),
 ('model_2', 413),
 ('feat_0', 374),
 ('median', 260),
 ('mean', 228)]

In [9]:
scores = model.predict(test_matrix)

In [10]:
submit = test_data[['id']].copy()
submit['is_iceberg'] = scores
submit.to_csv('../data/xgb_stack.csv', index=False)
print(submit.shape)

(8424, 2)


In [11]:
submit.head()

Unnamed: 0,id,is_iceberg
0,5941774d,0.005709
1,4023181e,0.63955
2,b20200e4,0.003614
3,e7f018bb,0.995691
4,4371c8c3,0.088955
