In [15]:
# Run configuration for this notebook.
# y_id is passed to load_data() and err1_calc(), is embedded in the name of the
# feature-importance CSV, and is also the CSV column that is read from it.
y_id = 2
# track_id only appears in the names of the prediction files that are written.
track_id = 1
# NOTE(review): `server` is not referenced anywhere else in the visible notebook.
server = 4
# Multiplier applied to the mean absolute error inside scorer().
error_rate = 300.0

In [2]:
# import module
import sys
sys.path.insert(0, '../')
from utils.training_utils import *
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost import XGBRegressor

In [25]:
# Hyper-parameter grid handed to the main GridSearchCV (`model`).
# 2 * 3 * 2 * 3 * 3 * 2 = 216 combinations, each evaluated with 3-fold CV.
params = {
    'booster': ['gbtree', 'dart'],
    'n_estimators': [50, 100, 500],
    'max_depth': [3, 6],
    'subsample': [0.25, 0.5, 0.75],
    'learning_rate': [0.1, 0.05, 0.01], 
    'tree_method': ['hist', 'auto']
}
# Single hand-picked configuration handed to the second search (`model2`).
# Note that max_depth=9 lies outside the values explored by `params` above.
params_test = {
    'booster': ['dart'],
    'n_estimators': [500],
    'max_depth': [9],
    'subsample': [0.5],
    'learning_rate': [0.1], 
    'tree_method': ['hist']
}

In [4]:
# Load the data for target `y_id`; load_data() returns the three arrays in the
# order (test_x, train_x, train_y).
# NOTE(review): load_data presumably comes from the `utils.training_utils` star import — confirm.
test_x, train_x, train_y = load_data(y_id)
print(test_x.shape, train_x.shape, train_y.shape)

(2500, 10000) (47500, 10000) (47500,)


In [5]:
# Keep only the feature columns whose recorded importance is non-negligible.
# Row number in the CSV is used as the feature (column) index, and column
# `y_id` of each row holds the importance value that is tested against 1e-9.
importance_path = '../29/adaboost' + str(y_id) + '_feature.csv'
with open(importance_path, 'r') as importance_file:
    idx = [
        row_no
        for row_no, row in enumerate(importance_file)
        if np.abs(float(row.replace('\n', '').split(',')[y_id])) > 1e-9
    ]
# NOTE(review): train_x/test_x are overwritten here, so re-running this cell
# on its own would index into the already-reduced arrays.
train_x = train_x[:, idx]
test_x = test_x[:, idx]
print(train_x.shape)

(47500, 565)


In [6]:
# define my own scorer
from sklearn.metrics import make_scorer

def scorer(y, y_pred, scale=None):
    """Return the negated, scaled mean absolute error between ``y`` and ``y_pred``.

    The result is negated so that a larger score means a smaller error, which
    is what ``make_scorer`` expects with its default ``greater_is_better=True``.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted target values, same length as ``y``.
    scale : float, optional
        Multiplier applied to the mean absolute error. When omitted, the
        notebook-level ``error_rate`` is used, exactly as before.

    Returns
    -------
    float
        ``-scale * sum(|y - y_pred|) / len(y)``.
    """
    if scale is None:
        # Fall back to the notebook-wide setting to stay backward-compatible.
        scale = error_rate
    # Accept plain Python sequences as well as arrays.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    return -np.sum(np.abs(y - y_pred)) * scale / len(y)

In [10]:
# Build the exhaustive hyper-parameter search over `params`.
# NOTE(review): the regressor asks for 8 threads and the search runs 4 workers
# in parallel, so up to 32 threads may be requested at once — confirm the
# machine can actually provide them.
base_regressor = XGBRegressor(verbosity=2, n_jobs=8)
search_options = dict(
    param_grid=params,
    scoring=make_scorer(scorer),  # scorer() is already negated: higher is better
    cv=3,
    verbose=20,
    n_jobs=4,
    return_train_score=True
)
model = GridSearchCV(estimator=base_regressor, **search_options)

In [11]:
# Run the full grid search: 216 candidates x 3 folds = 648 fits. The captured
# log shows this run was still going after more than 200 minutes.
model.fit(train_x, train_y)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   10.5s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   11.1s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   13.1s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   20.9s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:   22.5s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:   23.2s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:   24.6s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:   34.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   37.5s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:   39.0s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:   39.6s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:   46.5s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:   47.7s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:   50.8s
[Parallel(

[Parallel(n_jobs=4)]: Done 135 tasks      | elapsed: 36.3min
[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed: 36.6min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 36.7min
[Parallel(n_jobs=4)]: Done 138 tasks      | elapsed: 36.8min
[Parallel(n_jobs=4)]: Done 139 tasks      | elapsed: 36.8min
[Parallel(n_jobs=4)]: Done 140 tasks      | elapsed: 36.9min
[Parallel(n_jobs=4)]: Done 141 tasks      | elapsed: 37.2min
[Parallel(n_jobs=4)]: Done 142 tasks      | elapsed: 37.3min
[Parallel(n_jobs=4)]: Done 143 tasks      | elapsed: 37.4min
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed: 37.5min
[Parallel(n_jobs=4)]: Done 145 tasks      | elapsed: 38.2min
[Parallel(n_jobs=4)]: Done 146 tasks      | elapsed: 38.3min
[Parallel(n_jobs=4)]: Done 147 tasks      | elapsed: 38.6min
[Parallel(n_jobs=4)]: Done 148 tasks      | elapsed: 39.1min
[Parallel(n_jobs=4)]: Done 149 tasks      | elapsed: 40.0min
[Parallel(n_jobs=4)]: Done 150 tasks      | elapsed: 40.1min
[Parallel(n_jobs=4)]: Do

[Parallel(n_jobs=4)]: Done 270 tasks      | elapsed: 80.4min
[Parallel(n_jobs=4)]: Done 271 tasks      | elapsed: 80.7min
[Parallel(n_jobs=4)]: Done 272 tasks      | elapsed: 80.7min
[Parallel(n_jobs=4)]: Done 273 tasks      | elapsed: 80.9min
[Parallel(n_jobs=4)]: Done 274 tasks      | elapsed: 81.0min
[Parallel(n_jobs=4)]: Done 275 tasks      | elapsed: 81.0min
[Parallel(n_jobs=4)]: Done 276 tasks      | elapsed: 81.1min
[Parallel(n_jobs=4)]: Done 277 tasks      | elapsed: 81.3min
[Parallel(n_jobs=4)]: Done 278 tasks      | elapsed: 81.3min
[Parallel(n_jobs=4)]: Done 279 tasks      | elapsed: 81.5min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 81.5min
[Parallel(n_jobs=4)]: Done 281 tasks      | elapsed: 81.8min
[Parallel(n_jobs=4)]: Done 282 tasks      | elapsed: 81.8min
[Parallel(n_jobs=4)]: Done 283 tasks      | elapsed: 82.0min
[Parallel(n_jobs=4)]: Done 284 tasks      | elapsed: 82.0min
[Parallel(n_jobs=4)]: Done 285 tasks      | elapsed: 82.3min
[Parallel(n_jobs=4)]: Do

[Parallel(n_jobs=4)]: Done 403 tasks      | elapsed: 126.4min
[Parallel(n_jobs=4)]: Done 404 tasks      | elapsed: 126.4min
[Parallel(n_jobs=4)]: Done 405 tasks      | elapsed: 126.9min
[Parallel(n_jobs=4)]: Done 406 tasks      | elapsed: 127.0min
[Parallel(n_jobs=4)]: Done 407 tasks      | elapsed: 127.5min
[Parallel(n_jobs=4)]: Done 408 tasks      | elapsed: 127.5min
[Parallel(n_jobs=4)]: Done 409 tasks      | elapsed: 128.1min
[Parallel(n_jobs=4)]: Done 410 tasks      | elapsed: 128.2min
[Parallel(n_jobs=4)]: Done 411 tasks      | elapsed: 128.8min
[Parallel(n_jobs=4)]: Done 412 tasks      | elapsed: 129.0min
[Parallel(n_jobs=4)]: Done 413 tasks      | elapsed: 129.3min
[Parallel(n_jobs=4)]: Done 414 tasks      | elapsed: 129.5min
[Parallel(n_jobs=4)]: Done 415 tasks      | elapsed: 134.1min
[Parallel(n_jobs=4)]: Done 416 tasks      | elapsed: 134.1min
[Parallel(n_jobs=4)]: Done 417 tasks      | elapsed: 134.3min
[Parallel(n_jobs=4)]: Done 418 tasks      | elapsed: 134.5min
[Paralle

[Parallel(n_jobs=4)]: Done 540 tasks      | elapsed: 204.6min
[Parallel(n_jobs=4)]: Done 541 tasks      | elapsed: 204.7min
[Parallel(n_jobs=4)]: Done 542 tasks      | elapsed: 204.9min
[Parallel(n_jobs=4)]: Done 543 tasks      | elapsed: 205.0min
[Parallel(n_jobs=4)]: Done 544 tasks      | elapsed: 205.1min
[Parallel(n_jobs=4)]: Done 545 tasks      | elapsed: 205.3min
[Parallel(n_jobs=4)]: Done 546 tasks      | elapsed: 205.4min
[Parallel(n_jobs=4)]: Done 547 tasks      | elapsed: 205.6min
[Parallel(n_jobs=4)]: Done 548 tasks      | elapsed: 205.8min
[Parallel(n_jobs=4)]: Done 549 tasks      | elapsed: 206.0min
[Parallel(n_jobs=4)]: Done 550 tasks      | elapsed: 206.1min
[Parallel(n_jobs=4)]: Done 551 tasks      | elapsed: 206.3min
[Parallel(n_jobs=4)]: Done 552 tasks      | elapsed: 206.4min
[Parallel(n_jobs=4)]: Done 553 tasks      | elapsed: 206.6min
[Parallel(n_jobs=4)]: Done 554 tasks      | elapsed: 206.8min
[Parallel(n_jobs=4)]: Done 555 tasks      | elapsed: 207.0min
[Paralle

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=8,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, verbosity=2),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'booster': ['gbtree', 'dart'], 'n_estimators': [50, 100, 500], 'max_depth': [3, 6], 'subsample': [0.25, 0.5, 0.75], 'learning_rate': [0.1, 0.05, 0.01], 'tree_method': ['hist', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(scorer), verbose=20)

In [12]:
# Write the searched model's predictions for the training and the test set.
# reshape((-1, 1)) yields one prediction per row without hard-coding the
# number of samples, so the cell keeps working if the data size changes.
train_pred_path = 'train_y' + str(y_id) + '_' + str(track_id) + '.txt'
test_pred_path = 'test_y' + str(y_id) + '_' + str(track_id) + '.txt'
write_prediction(train_pred_path, 'w', model.predict(train_x).reshape((-1, 1)).astype('str'))
write_prediction(test_pred_path, 'w', model.predict(test_x).reshape((-1, 1)).astype('str'))

In [13]:
# NOTE(review): the predictions are made on the same train_x the search was
# fitted on, so this is a training-set figure, not a held-out estimate.
# err1_calc is presumably an error metric from utils.training_utils — confirm.
print(err1_calc(model.predict(train_x), train_y, y_id))

1.1123971307389506


In [14]:
# Show the estimator that the search refitted with the best-scoring parameters.
print(model.best_estimator_)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=8,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.75, tree_method='hist', verbosity=2)


In [17]:
# test: fit a single hand-picked configuration (params_test) and evaluate it

In [26]:
# Same search setup as `model`, but restricted to the single params_test
# configuration (one candidate, three folds).
single_config_regressor = XGBRegressor(verbosity=2, n_jobs=8)
single_config_options = dict(
    param_grid=params_test,
    scoring=make_scorer(scorer),
    cv=3,
    verbose=20,
    n_jobs=4,
    return_train_score=True
)
model2 = GridSearchCV(estimator=single_config_regressor, **single_config_options)

In [27]:
# Fit the single params_test candidate (3 fits).
# NOTE(review): the saved output of this run ends in KeyboardInterrupt, and the
# cells that follow carry lower execution counts, so their outputs come from
# an earlier fit — re-run from a fresh kernel before trusting them.
model2.fit(train_x, train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    4.1s remaining:    0.0s


KeyboardInterrupt: 

In [20]:
# err1_calc on model2's predictions for the training data (not a held-out estimate).
print(err1_calc(model2.predict(train_x), train_y, y_id))

0.16900158048345704


In [21]:
# err2_calc on model2's predictions for the training data; unlike err1_calc it
# is called without y_id.
print(err2_calc(model2.predict(train_x), train_y))

0.0007744558685535031


In [22]:
# Show the refitted estimator of the single-configuration search.
# NOTE(review): the saved output reports subsample=0.75 while params_test now
# specifies subsample=[0.5], so this output predates the current params_test.
print(model2.best_estimator_)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=8,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.75, tree_method='hist', verbosity=2)


In [23]:
# Write model2's test-set predictions as a single column. reshape((-1, 1))
# infers the row count instead of hard-coding the number of test samples.
write_prediction('tmp3.txt', 'w', model2.predict(test_x).reshape((-1, 1)).astype('str'))