In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

### Features

In [2]:
def _row_summaries(path):
    """Load the array stored at `path` once and summarise each of its rows.

    Intended for 2-D arrays (one sample per row). Returns a pair of column
    vectors with one entry per input row: the row mean and the number of
    non-zero entries in the row.
    """
    values = np.load(path)
    row_mean = values.mean(axis=1).reshape(-1, 1)
    row_nonzero = np.count_nonzero(values, axis=1).reshape(-1, 1)
    return row_mean, row_nonzero


def _load_scores(model_name):
    """Return the (train, valid, score) arrays saved for `model_name` in ../data/scores/."""
    return tuple(
        np.load('../data/scores/{}_{}.npy'.format(model_name, split))
        for split in ('train', 'valid', 'score')
    )


# Labels are stored with one row per sample; argmax(axis=1) reduces every row
# to the column index of its largest entry (presumably one-hot rows -- confirm).
y_train = np.load('../data/y_train.npy').argmax(axis=1)
y_valid = np.load('../data/y_valid.npy').argmax(axis=1)
print(y_train.shape, y_valid.shape)

# Each raw matrix is read from disk only once and yields both hand-made features.
# Naming caveat: `sum_*` actually holds the per-row MEAN and `max_*` the per-row
# count of NON-ZERO entries; the names are kept because later cells use them.
sum_train, max_train = _row_summaries('../data/X_train.npy')
sum_valid, max_valid = _row_summaries('../data/X_valid.npy')
sum_score, max_score = _row_summaries('../data/X_score.npy')
print(sum_train.shape, sum_valid.shape, sum_score.shape)
print(max_train.shape, max_valid.shape, max_score.shape)

# Saved outputs of the first-level models, one file per model and data split.
knn_train, knn_valid, knn_score = _load_scores('knn')
print(knn_train.shape, knn_valid.shape, knn_score.shape)

cnn1_train, cnn1_valid, cnn1_score = _load_scores('cnn1')
print(cnn1_train.shape, cnn1_valid.shape, cnn1_score.shape)

cnn2_train, cnn2_valid, cnn2_score = _load_scores('cnn2')
print(cnn2_train.shape, cnn2_valid.shape, cnn2_score.shape)

cnn3_train, cnn3_valid, cnn3_score = _load_scores('cnn3')
print(cnn3_train.shape, cnn3_valid.shape, cnn3_score.shape)

(36000,) (6000,)
(36000, 1) (6000, 1) (28000, 1)
(36000, 1) (6000, 1) (28000, 1)
(36000, 10) (6000, 10) (28000, 10)
(36000, 10) (6000, 10) (28000, 10)
(36000, 10) (6000, 10) (28000, 10)
(36000, 10) (6000, 10) (28000, 10)


In [3]:
# Assemble one feature matrix per split by placing the blocks side by side:
# row mean, non-zero count, then the kNN and three CNN score blocks.
X_train = np.hstack((sum_train, max_train, knn_train, cnn1_train, cnn2_train, cnn3_train))
X_valid = np.hstack((sum_valid, max_valid, knn_valid, cnn1_valid, cnn2_valid, cnn3_valid))
X_score = np.hstack((sum_score, max_score, knn_score, cnn1_score, cnn2_score, cnn3_score))
print(X_train.shape, X_valid.shape, X_score.shape)

(36000, 42) (6000, 42) (28000, 42)


In [4]:
# Both labelled matrices are carved out of the validation split: its first
# 3000 rows fit the booster and the remaining rows monitor it.
# X_train / y_train are not used in this cell.
# NOTE(review): presumably the first-level scores on the training rows are too
# optimistic to learn from, hence the held-out rows -- confirm.
train_matrix = xgb.DMatrix(X_valid[:3000], label=y_valid[:3000])
valid_matrix = xgb.DMatrix(X_valid[3000:], label=y_valid[3000:])
# The rows to be predicted carry no labels.
score_matrix = xgb.DMatrix(X_score)

### Model

In [5]:
# Booster-level settings handed to xgboost through the `params` argument.
booster = {
    # tree booster and threading
    'booster': 'gbtree',
    'nthread': 15,
    # tree shape
    'max_depth': 6,
    'min_child_weight': 1,
    # row / column sampling
    'subsample': 0.8,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.9,
    # L2 / L1 regularisation
    'lambda': 1.0,
    'alpha': 0.0,
    # 10-class problem, per-class probabilities as output
    'objective': 'multi:softprob',
    'num_class': 10,
    # multiclass error rate is what gets reported during training
    'eval_metric': ['merror'],
    'seed': 108,
}

In [6]:
# Keyword arguments for xgb.train(), collected in a dict so the training cell
# can pass them with **params.

# Upper bound on boosting rounds; the learning-rate schedule below needs one
# entry per round, so both are derived from this single value.
num_rounds = 2000

# A constant learning rate of 0.001 applied at every round.
learning_rates = [0.001] * num_rounds

# Newer xgboost releases provide LearningRateScheduler and no longer ship the
# legacy reset_learning_rate helper; older releases only have the latter.
if hasattr(xgb.callback, 'LearningRateScheduler'):
    lr_callback = xgb.callback.LearningRateScheduler(learning_rates)
else:
    lr_callback = xgb.callback.reset_learning_rate(learning_rates)

params = {}
params['params'] = booster
params['dtrain'] = train_matrix
params['num_boost_round'] = num_rounds
# The last entry of `evals` is the one early stopping watches
# (the training log names 'valid_matrix-merror').
params['evals'] = [(train_matrix,'train_matrix'),(valid_matrix,'valid_matrix')]
params['early_stopping_rounds'] = 10
params['verbose_eval'] = 1
params['callbacks'] = [lr_callback]

In [7]:
# Fit the booster using the keyword arguments assembled in `params`
# (booster settings, training matrix, evaluation sets, early stopping and the
# learning-rate callback). With verbose_eval=1 the metric is logged every round.
model = xgb.train(**params)

[0]	train_matrix-merror:0.003667	valid_matrix-merror:0.005
Multiple eval metrics have been passed: 'valid_matrix-merror' will be used for early stopping.

Will train until valid_matrix-merror hasn't improved in 10 rounds.
[1]	train_matrix-merror:0.002667	valid_matrix-merror:0.005
[2]	train_matrix-merror:0.002667	valid_matrix-merror:0.005
[3]	train_matrix-merror:0.002667	valid_matrix-merror:0.005
[4]	train_matrix-merror:0.002667	valid_matrix-merror:0.005
[5]	train_matrix-merror:0.002667	valid_matrix-merror:0.004667
[6]	train_matrix-merror:0.002667	valid_matrix-merror:0.004667
[7]	train_matrix-merror:0.002667	valid_matrix-merror:0.004333
[8]	train_matrix-merror:0.002667	valid_matrix-merror:0.004333
[9]	train_matrix-merror:0.002667	valid_matrix-merror:0.004333
[10]	train_matrix-merror:0.002667	valid_matrix-merror:0.004667
[11]	train_matrix-merror:0.002667	valid_matrix-merror:0.004667
[12]	train_matrix-merror:0.002667	valid_matrix-merror:0.004667
[13]	train_matrix-merror:0.002667	valid_mat

In [8]:
# The model emits one row of class probabilities per sample; argmax(axis=1)
# picks the most probable class as the predicted label.
# NOTE(review): predict() is called without restricting it to the best
# early-stopping iteration -- confirm that is intended.
predict = pd.DataFrame({'Label': model.predict(score_matrix).argmax(axis=1)})
# ImageId is the 1-based row position, placed in front of Label.
predict.insert(0, 'ImageId', predict.index + 1)

In [9]:
# Write the ImageId/Label table to disk; index=False keeps the DataFrame's own
# row index out of the CSV so only the two named columns are written.
predict.to_csv('../data/submit_v4.csv', index=False)

In [None]:
### Submission scored 0.99471 on LB