In [81]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn import *
from scipy import *

from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy.optimize import minimize_scalar

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer


In [5]:
# Load the list of documents
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [18]:
train.shape 

(595212, 59)

In [19]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [10]:
all_data = pd.concat((train.loc[:,'ps_ind_01':'ps_calc_20_bin'],
                      test.loc[:,'ps_ind_01':'ps_calc_20_bin']))

In [11]:
all_data.shape

(1488028, 57)

In [12]:
#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])



In [15]:
all_data.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,1.098612,1.098612,5,1,0.0,0,0.693147,0.0,0.0,0.0,...,9,0.693147,5,8,0.0,1,1,0.0,0,0.693147
1,0.693147,0.693147,7,0,0.0,0,0.0,0.693147,0.0,0.0,...,3,0.693147,1,9,0.0,1,1,0.0,1,0.0
2,1.791759,1.609438,9,1,0.0,0,0.0,0.693147,0.0,0.0,...,4,1.098612,7,7,0.0,1,1,0.0,1,0.0
3,0.0,0.693147,2,0,0.0,1,0.0,0.0,0.0,0.0,...,2,1.098612,4,9,0.0,0,0,0.0,0,0.0
4,0.0,1.098612,0,1,0.0,1,0.0,0.0,0.0,0.0,...,3,0.693147,1,3,0.0,0,0,0.693147,1,0.0


In [14]:
all_data = pd.get_dummies(all_data)

In [17]:
all_data = all_data.fillna(all_data.mean())

In [44]:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y_train = train['target']

In [47]:
y_train.shape

(595212,)

In [70]:
# LightGBM parameters
denom = 0
fold = 5 #Change to 5, 1 for Kaggle Limits
for i in range(fold):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        #'num_class': 9,
        'learning_rate': 0.01,
        'max_depth': 10,
        'num_leaves': 20,
        'min_data_in_leaf': 5,
        #'lambda_l1': 0.05,
        'num_iteration': 2000,
        'feature_fraction': 0.9, 
        'bagging_fraction': 0.9, 
        'bagging_freq': 5
        #'verbose': 0
    }
    X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_train, y_train, test_size=0.18, random_state=i)
        
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv)
    lgb_eval = lgb.Dataset(X_test_cv, y_test_cv, reference=lgb_train)

# train
    gbm = lgb.train(params,
            lgb_train,
            #num_boost_round=rnds,
            valid_sets=lgb_eval,
            verbose_eval=10,
            early_stopping_rounds=20)
    
    if denom != 0:
        pred = gbm.predict(X_test, num_iteration=gbm.best_iteration+50)      
        preds += pred
    else:
        pred = gbm.predict(X_test, num_iteration=gbm.best_iteration+50)
        preds = pred.copy()
    denom += 1
    #submission = pd.DataFrame(pred, columns=['class'+str(c+1) for c in range(9)])
    #submission = submission.reset_index()
    #submission.columns = ["ID","class1","class2","class3","class4","class5","class6","class7","class8","class9"]
    #submission['ID'] = pid
    #submission.to_csv('submission_xgb_fold_'  + str(i) + '.csv', index=False)
    
preds /= denom
submission = pd.read_csv("sample_submission.csv")
submission['target'] = preds
submission = submission.reset_index()
submission.columns = ["id","target"]
submission.to_csv("sub_LGBM.csv", index=False)
#submission['ID'] = pid
#y_pred = gbm.predict(X_test_vect_lgbm, num_iteration=gbm.best_iteration)
#y_pred = np.argmax(y_pred, axis=1)



Training until validation scores don't improve for 20 rounds.
[10]	valid_0's auc: 0.615138
[20]	valid_0's auc: 0.61793
[30]	valid_0's auc: 0.617871
[40]	valid_0's auc: 0.618526
[50]	valid_0's auc: 0.618574
[60]	valid_0's auc: 0.619019
[70]	valid_0's auc: 0.618733
[80]	valid_0's auc: 0.619482
[90]	valid_0's auc: 0.620181
[100]	valid_0's auc: 0.620869
[110]	valid_0's auc: 0.620932
[120]	valid_0's auc: 0.621034
[130]	valid_0's auc: 0.621276
[140]	valid_0's auc: 0.621709
[150]	valid_0's auc: 0.622143
[160]	valid_0's auc: 0.622024
[170]	valid_0's auc: 0.622265
[180]	valid_0's auc: 0.622497
[190]	valid_0's auc: 0.622972
[200]	valid_0's auc: 0.623393
[210]	valid_0's auc: 0.624017
[220]	valid_0's auc: 0.624444
[230]	valid_0's auc: 0.624556
[240]	valid_0's auc: 0.625018
[250]	valid_0's auc: 0.625367
[260]	valid_0's auc: 0.625792
[270]	valid_0's auc: 0.626055
[280]	valid_0's auc: 0.626586
[290]	valid_0's auc: 0.627094
[300]	valid_0's auc: 0.627951
[310]	valid_0's auc: 0.628179
[320]	valid_0's au

In [80]:
def gini(truth, predictions):
    g = np.asarray(np.c_[truth, predictions, np.arange(len(truth)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(truth) + 1) / 2.
    return gs / len(truth)

def gini_xgb(predictions, truth):
    truth = truth.get_label()
    return 'gini', -1.0 * gini(truth, predictions) / gini(truth, truth)

def gini_lgb(truth, predictions):
    score = gini(truth, predictions) / gini(truth, truth)
    return 'gini', score, True

def gini_sklearn(truth, predictions):
    return gini(truth, predictions) / gini(truth, truth)

gini_scorer = make_scorer(gini_sklearn, greater_is_better=True, needs_proba=True)

In [86]:
def objective(params):
    params = {
        'num_leaves': int(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'max_depth': int(params['max_depth']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'lambda_l1': '{:.3f}'.format(params['lambda_l1']),
    }
    
    clf = lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.01,
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring=gini_scorer, cv=StratifiedKFold(5)).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return score

space = {
    'num_leaves': hp.quniform('num_leaves', 100, 200, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'max_depth': hp.quniform('max_depth', 5, 30, 2),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 20, 1),
    'lambda_l1': hp.uniform('lambda_l1', 0.01, 0.05),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30)

Gini -0.276 params {'num_leaves': 166, 'colsample_bytree': '0.407', 'max_depth': 18, 'min_data_in_leaf': 11, 'lambda_l1': '0.046'}
Gini -0.274 params {'num_leaves': 110, 'colsample_bytree': '0.966', 'max_depth': 28, 'min_data_in_leaf': 12, 'lambda_l1': '0.019'}
Gini -0.274 params {'num_leaves': 198, 'colsample_bytree': '0.704', 'max_depth': 16, 'min_data_in_leaf': 10, 'lambda_l1': '0.033'}
Gini -0.275 params {'num_leaves': 100, 'colsample_bytree': '0.439', 'max_depth': 8, 'min_data_in_leaf': 6, 'lambda_l1': '0.041'}
Gini -0.275 params {'num_leaves': 112, 'colsample_bytree': '0.833', 'max_depth': 10, 'min_data_in_leaf': 7, 'lambda_l1': '0.039'}
Gini -0.275 params {'num_leaves': 130, 'colsample_bytree': '0.446', 'max_depth': 16, 'min_data_in_leaf': 13, 'lambda_l1': '0.039'}
Gini -0.277 params {'num_leaves': 136, 'colsample_bytree': '0.354', 'max_depth': 28, 'min_data_in_leaf': 8, 'lambda_l1': '0.020'}
Gini -0.273 params {'num_leaves': 104, 'colsample_bytree': '0.350', 'max_depth': 18, 'm

In [87]:
print("Hyperopt estimated optimum {}".format(best))

Hyperopt estimated optimum {'colsample_bytree': 0.3542618105439753, 'lambda_l1': 0.020025402620244666, 'max_depth': 28.0, 'min_data_in_leaf': 8.0, 'num_leaves': 136.0}


### LGBM optimum v1
Hyperopt estimated optimum {'colsample_bytree': 0.3542618105439753, 'lambda_l1': 0.020025402620244666, 'max_depth': 28.0, 'min_data_in_leaf': 8.0, 'num_leaves': 136.0}

In [89]:
import xgboost as xgb

In [93]:
def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = xgb.XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        nthread=4,
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring=gini_scorer, cv=StratifiedKFold(5)).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return(score)

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

Gini -0.254 params {'max_depth': 2, 'gamma': '0.055', 'colsample_bytree': '0.543'}
Gini -0.266 params {'max_depth': 4, 'gamma': '0.110', 'colsample_bytree': '0.760'}
Gini -0.278 params {'max_depth': 6, 'gamma': '0.307', 'colsample_bytree': '0.632'}
Gini -0.278 params {'max_depth': 7, 'gamma': '0.208', 'colsample_bytree': '0.451'}
Gini -0.279 params {'max_depth': 6, 'gamma': '0.221', 'colsample_bytree': '0.422'}
Gini -0.253 params {'max_depth': 6, 'gamma': '0.018', 'colsample_bytree': '0.957'}
Gini -0.278 params {'max_depth': 7, 'gamma': '0.291', 'colsample_bytree': '0.523'}
Gini -0.277 params {'max_depth': 7, 'gamma': '0.117', 'colsample_bytree': '0.406'}
Gini -0.269 params {'max_depth': 3, 'gamma': '0.090', 'colsample_bytree': '0.358'}
Gini -0.275 params {'max_depth': 8, 'gamma': '0.429', 'colsample_bytree': '0.646'}


In [94]:
print("Hyperopt estimated optimum {}".format(best))

Hyperopt estimated optimum {'colsample_bytree': 0.42220114349224486, 'gamma': 0.22144103013844213, 'max_depth': 6.0}


### xgboost optimum v1
Hyperopt estimated optimum {'colsample_bytree': 0.42220114349224486, 'gamma': 0.22144103013844213, 'max_depth': 6.0}

In [116]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
    }
    clf = RandomForestClassifier(
        n_jobs=4, 
        class_weight='balanced', 
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring=gini_scorer, cv=StratifiedKFold()).mean()
    
    print("Gini {:.3f} params {}".format(score, params))
    
    return(score)

space = {
    'n_estimators': hp.quniform('n_estimators', 25, 500, 25),
    'max_depth': hp.quniform('max_depth', 1, 10, 1)
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [109]:
X_train.isnull().values.any()

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ..., 
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]], dtype=bool)