In [3]:
import os
import pandas as pd
import xgboost
import utils
import scoring
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import auc
from matplotlib import pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.preprocessing import scale
from sklearn import  preprocessing
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
pd.options.display.max_columns = 1000

In [4]:
import warnings
warnings.catch_warnings()



## load train and validation

In [6]:
DATA_PATH = "../data"
train_df, test_df = utils.load_data_csv(DATA_PATH, utils.SIMPLE_FEATURE_COLUMNS)

train_hit = pd.read_csv('../data/train_closest_hits_features.csv')
test_hit = pd.read_csv('../data/test_closest_hits_features.csv')

print ('train shape {} test shape {}'.format(train_df.shape, test_df.shape))
print ('train shape {} test shape {}'.format(train_hit.shape, test_hit.shape))


train_df = pd.concat([train_df, train_hit], axis=1)
test_df = pd.concat([test_df, test_hit], axis=1)
del train_hit, test_hit
# print ('After concating train shape {} test shape {}'.format(train_df.shape, test_df.shape))

train_df.drop('Unnamed: 0', axis = 1, inplace=True)
test_df.drop('Unnamed: 0', axis = 1, inplace=True)

  mask |= (ar1 == a)


train shape (5445705, 67) test shape (726095, 65)
train shape (5445705, 25) test shape (726095, 25)


In [7]:
train_part, valid_df = train_test_split(train_df, test_size = 0.05, random_state = 0, shuffle=True)
train_features = [x for x in train_df.columns.tolist() if x not in ['label', 'weight'] ]
X_train = train_part[train_features]
y_train = train_part.label
w_train = train_part.weight

X_valid = valid_df[train_features]
y_valid = valid_df.label
w_valid = valid_df.weight

In [10]:
print (train_df.shape, valid_df.shape, test_df.shape, X_train.shape, X_valid.shape)
del valid_df, train_df, train_part

## Hyperopt param search

In [6]:
def get_score(valid_pred, dvalid):
    w_valid = dvalid.get_weight()  
    actuals = dvalid.get_label()
    score = scoring.rejection90(actuals.values, valid_pred, sample_weight = w_valid.values)
    return 'score', score, True

In [8]:
def score(params):
    print ("Training with params : ")
    print (params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = lgb.Dataset(X_train, label = y_train, weight = w_train)
    dvalid = lgb.Dataset(X_valid, label = y_valid, weight = w_valid)
    model = lgb.train(params, dtrain, num_round, valid_sets = [dtrain, dvalid], feval = get_score, \
                      verbose_eval = 20, early_stopping_rounds = 50)
    valid_pred = model.predict(X_valid, num_iteration = model.best_iteration)
    score = scoring.rejection90(y_valid, valid_pred, sample_weight = w_valid.values)
    print ("\t\tScore {0}\n\n".format(score))
    print( "\t\tBest iteration = ", model.best_iteration )
    return {'loss': -score, 'status': STATUS_OK}

In [9]:
def optimize(trials):
    space = {
             'n_estimators' : 2000,
             'learning_rate' : hp.quniform('learning_rate', 0.02, 0.3, 0.025),
             'scale_pos_weight' : hp.choice('scale_pos_weight', np.arange(10, 25, dtype=int)),
             'max_depth' : hp.choice('max_depth', np.arange(3, 12, dtype=int)), 
#              'min_data_in_leaf' : hp.choice('min_data_in_leaf', np.arange(20, 100, dtype=int)), 
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.1),
             'subsample' : hp.quniform('subsample', 0.6, 1, 0.1),
             'lambda_l1' : hp.quniform('lambda_l1', 1, 6, 1),
             'lambda_l2' : hp.quniform('lambda_l2', 1, 6, 1),
             'metric': 'auc',
             'objective': 'binary',
             'nthread' : 6,
             'silent' : 1,
             'seed' : '0'
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals= 20)
    print (best)

In [None]:
#Trials object where the history of search will be stored
trials = Trials()
optimize(trials)

Training with params : 
{'colsample_bytree': 0.7000000000000001, 'lambda_l1': 6.0, 'lambda_l2': 3.0, 'learning_rate': 0.225, 'max_depth': 6, 'metric': 'auc', 'n_estimators': 2000, 'nthread': 6, 'objective': 'binary', 'scale_pos_weight': 23, 'seed': '0', 'silent': 1, 'subsample': 0.8}


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 50 rounds.
[20]	training's auc: 0.911088	training's score: 0.770665	valid_1's auc: 0.897659	valid_1's score: 0.755536
[40]	training's auc: 0.937031	training's score: 0.809902	valid_1's auc: 0.907342	valid_1's score: 0.774467
[60]	training's auc: 0.946903	training's score: 0.82358	valid_1's auc: 0.907074	valid_1's score: 0.779375
[80]	training's auc: 0.951296	training's score: 0.828664	valid_1's auc: 0.908627	valid_1's score: 0.777944
Early stopping, best iteration is:
[46]	training's auc: 0.941833	training's score: 0.816817	valid_1's auc: 0.90684	valid_1's score: 0.785078
		Score 0.7850780039420767


		Best iteration =  46
Training with params : 
{'colsample_bytree': 0.9, 'lambda_l1': 6.0, 'lambda_l2': 6.0, 'learning_rate': 0.05, 'max_depth': 4, 'metric': 'auc', 'n_estimators': 2000, 'nthread': 6, 'objective': 'binary', 'scale_pos_weight': 16, 'seed': '0', 'silent': 1, 'subsample': 0.7000000000000001}
Training until validation scores d

[80]	training's auc: 0.918745	training's score: 0.783358	valid_1's auc: 0.902942	valid_1's score: 0.762583
[100]	training's auc: 0.924586	training's score: 0.790692	valid_1's auc: 0.90531	valid_1's score: 0.766485
[120]	training's auc: 0.928657	training's score: 0.798159	valid_1's auc: 0.906894	valid_1's score: 0.768756
[140]	training's auc: 0.932351	training's score: 0.802671	valid_1's auc: 0.907488	valid_1's score: 0.778747
[160]	training's auc: 0.936264	training's score: 0.807486	valid_1's auc: 0.90935	valid_1's score: 0.779903
[180]	training's auc: 0.938706	training's score: 0.81211	valid_1's auc: 0.910287	valid_1's score: 0.780666
[200]	training's auc: 0.941502	training's score: 0.815872	valid_1's auc: 0.910392	valid_1's score: 0.784197
[220]	training's auc: 0.944043	training's score: 0.819531	valid_1's auc: 0.908635	valid_1's score: 0.781847
[240]	training's auc: 0.945037	training's score: 0.820657	valid_1's auc: 0.908281	valid_1's score: 0.782018
Early stopping, best iteration i

In [12]:
best_params = {'boosting': 'gbdt', 
'objective': 'binary', 
 'metric': 'auc',
 'colsample_bytree': 0.8,
 'lambda_l1': 3.0,
 'lambda_l2': 5.0,
 'learning_rate': 0.05,
 'max_depth': 5,
 'scale_pos_weight': 23,
 'subsample': 0.9,
 'is_training_metric': False,
  'nthread': 6,
  'seed' : 0}

In [13]:
dtrain = lgb.Dataset(X_train, label = y_train, weight = w_train)
dvalid = lgb.Dataset(X_valid, label = y_valid, weight = w_valid)
fit_model = lgb.train( best_params, dtrain, num_boost_round = 3000, valid_sets = [dtrain, dvalid], feval = get_score, \
                       verbose_eval = 10, early_stopping_rounds = 50)


Training until validation scores don't improve for 50 rounds.
[10]	training's auc: 0.771179	training's score: 0.446321	valid_1's auc: 0.765805	valid_1's score: 0.438928
[20]	training's auc: 0.816645	training's score: 0.601605	valid_1's auc: 0.806559	valid_1's score: 0.586473
[30]	training's auc: 0.851019	training's score: 0.654385	valid_1's auc: 0.842817	valid_1's score: 0.649826
[40]	training's auc: 0.860569	training's score: 0.679201	valid_1's auc: 0.850049	valid_1's score: 0.662324
[50]	training's auc: 0.877199	training's score: 0.71473	valid_1's auc: 0.862129	valid_1's score: 0.70163
[60]	training's auc: 0.889694	training's score: 0.738107	valid_1's auc: 0.871746	valid_1's score: 0.715877
[120]	training's auc: 0.923698	training's score: 0.789315	valid_1's auc: 0.901529	valid_1's score: 0.760246
[130]	training's auc: 0.926487	training's score: 0.794562	valid_1's auc: 0.903393	valid_1's score: 0.762089
[140]	training's auc: 0.929501	training's score: 0.799072	valid_1's auc: 0.904204	

In [16]:
print( " Best iteration = ", fit_model.best_iteration )

valid_pred = fit_model.predict(X_valid, num_iteration=fit_model.best_iteration)
print( "validation score = ", scoring.rejection90(y_valid.values, valid_pred, sample_weight = w_valid.values))

train_pred = fit_model.predict(X_train, num_iteration=fit_model.best_iteration)
print( "train score = ", scoring.rejection90(y_train.values, train_pred, sample_weight = w_train.values))

test_pred = fit_model.predict(test_df, num_iteration=fit_model.best_iteration)
test_pred[:20]

 Best iteration =  328
validation score =  0.7886786285212015
train score =  0.835148607396624


array([0.99742913, 0.98885396, 0.97097484, 0.9829058 , 0.99499633,
       0.97353369, 0.97339083, 0.97731266, 0.97610309, 0.99744541,
       0.99328967, 0.97463811, 0.98768186, 0.98859473, 0.99516877,
       0.99387499, 0.99501488, 0.97787196, 0.96813551, 0.98951552])

In [17]:
fit_model.best_iteration

328

In [18]:
pd.DataFrame(data={"prediction":  test_pred}, index=test_df.index).to_csv(
    "../submissions/lgb_90perc_7886.csv", index_label=utils.ID_COLUMN)

In [19]:
fit_model.save_model("../models/track1_lgb_90perc_7886_n305.lgb")

<lightgbm.basic.Booster at 0x7f05d0b58780>