In [None]:
import os
import datetime as dt
import time
import json

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn import metrics

# https://stackoverflow.com/questions/14254203/mixing-categorial-and-continuous-data-in-naive-bayes-classifier-using-scikit-lea?rq=1
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [None]:
# Date of this run; it is stamped into the submission file name further down.
today = dt.datetime.now().date()

In [None]:
# should've done this in cleaning
# rscaler = RobustScaler()
# rscaler.fit(x_train)

# x_train_scaled = rscaler.transform(x_train)
# x_test_scaled = rscaler.transform(x_test)

## Overview

This notebook is going to be used a few times to get parameters for models and test out different configurations or sets of features. 

The first time through I tried out a bunch of different algorithms (mainly tree algorithms) and parameters.
I attempted to grid-search parameters, but doing so was too resource-intensive and would have taken days to finish even for a modest parameter search space. I ended up running Bayesian optimization on the model parameters and found that LightGBM offered the best combination of speed and AUC (approximately the same AUC as XGBoost in a fraction of the time).

For further explorations and iterations I stuck with LGBM.

### Config Vars

In [None]:
# Column names: row identifier and label
PK, TARGET = 'sk_id_curr', 'target'

# Cross-validation fold count and the random seed shared by all experiments
N_CV, SEED = 4, 1111

# Input / output locations, all relative to DATA_DIR
DATA_DIR = 'clean_data/'
TRAIN_FILE, TEST_FILE, SUBMISSION_OUTPUT_FILE = (
    os.path.join(DATA_DIR, fname)
    for fname in ('mrgd_train.csv', 'mrgd_test.csv', 'submission_out.csv')
)

# Identifier-like columns are read as strings rather than parsed as numbers
DTYPES = dict.fromkeys(
    ['sk_id_curr', 'sk_id_bureau', 'sk_id_prev', 'num_instalment_version'], str
)



### Load Data

In [None]:
# Read the merged train / test tables, forcing the id-like columns to str
train, test = (pd.read_csv(path, dtype=DTYPES) for path in (TRAIN_FILE, TEST_FILE))
(train.shape, test.shape)

In [None]:
# Quick look at the first rows of the training frame
train.head()

Separate keys, target, train and test

In [None]:
# Target var: pull the labels out as a plain array for the estimators
y_train = train[TARGET].values
# Class balance. Series.value_counts() is used because the top-level
# pd.value_counts() helper is deprecated in recent pandas releases.
train[TARGET].value_counts()

In [None]:
# Keep the row identifiers aside before the key column is dropped from the frames
train_id, test_id = train[PK], test[PK]

In [None]:
# From here on the frames hold model inputs only (no key, no label)
train = train.drop(columns=[PK, TARGET])
test = test.drop(columns=PK)

In [None]:
# Shapes after removing key / target
print(train.shape, test.shape, sep=',')

In [None]:
# Ordered list of model input columns, taken from the training frame
features = train.columns

In [None]:
# make arrays
# Both matrices are built from the same ordered column list, so column i means
# the same feature in train and test. A bare `test.values` would instead follow
# whatever column order the test file happens to have, and the previous second
# pair of assignments silently overwrote the aligned arrays with exactly that.
x_train = train[features].values
x_test = test[features].values

### Grid Search Params

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Fixed (non-searched) constructor arguments for each model family.
# They are splatted into the estimator together with the searched parameters.

# Extra trees
et_default_params = dict(n_jobs=-1, verbose=1)

# Random forest
rf_default_params = dict(n_jobs=-1, verbose=1)

# sklearn gradient boosting
gb_default_params = dict(verbose=1)

# XGBoost
xgb_default_params = dict(
    silent=0,
    n_jobs=-1,
    objective='binary:logistic',
    eval_metric='auc',
)

# LightGBM
lgb_default_params = dict(
    silent=0,
    n_jobs=-1,
    objective='binary',
    metric='auc',
    #'is_unbalanced':True
)

In [None]:
# Candidate values per hyper-parameter for each model family. The same dicts
# feed the grid search and (through their min / max) the Bayesian search bounds.

# Negative-to-positive class ratio of the target, computed once. It caps the
# scale_pos_weight candidates at twice the (truncated) imbalance ratio.
n_neg = int((y_train == 0).sum())
n_pos = int((y_train == 1).sum())
pos_weight_grid = list(range(2, int(n_neg / n_pos) * 2, 4))

et_params = {
    'n_estimators': list(range(25, 201, 25)),
    'max_features': [0.01, 0.05, 0.1, 0.3, 0.5, 0.75],
    'max_depth': list(range(5, 26, 5)),
}

rf_params = {
    'n_estimators': list(range(25, 201, 25)),
    'max_features': [0.01, 0.1, 0.5, 0.8],
    'max_depth': list(range(5, 26, 5)),
}


gb_params = {
    'n_estimators': list(range(20, 101, 20)),
    'learning_rate': [0.01, 0.1, 0.5, 0.8],
    'max_features': [0.01, 0.1, 0.5, 0.8],
    'max_depth': list(range(5, 26, 5)),
}


xgb_params = {
    'colsample_bytree': [0.01, 0.1, 0.5, 0.8],
    'subsample': [0.01, 0.1, 0.5, 0.8],
    'learning_rate': [0.01, 0.1, 0.5, 0.8],
    'max_depth': list(range(5, 26, 5)),
    'scale_pos_weight': pos_weight_grid,
    # NOTE: xgb_func reads this key by name; keep the two in sync.
    'nrounds': [2**n for n in range(5, 9)]
}

lgb_params = {
    'boosting': ['gbdt', 'dart'],
    'num_iterations': [2**n for n in range(5, 10)],
    'learning_rate': [0.01, 0.1, 0.5, 0.8],
    'max_depth': list(range(5, 26, 5)),
    'scale_pos_weight': pos_weight_grid,
    'subsample': [0.01, 0.1, 0.5, 0.8],
    'colsample_bytree': [0.01, 0.1, 0.5, 0.8],
}

# potential other params to try afterwards
# 'lambda_l1': [0, 0.6,]
# 'lambda_l2': [0, 0.6,]

In [None]:
# Number of combinations an exhaustive grid search over xgb_params would fit
np.prod([len(candidates) for candidates in xgb_params.values()])

In [None]:
# OVERSAMPLE
def get_weights(ser):
    """Return a per-class sampling weight for every label in ``ser``.

    Each distinct label maps to ``1 / (n_classes * class_size)``. Summed over
    all rows of ``ser`` the weights add up to 1, and every class carries the
    same total weight regardless of how many rows it has.

    Parameters
    ----------
    ser : pandas.Series
        Class labels, one per row.

    Returns
    -------
    dict
        ``{label: weight}`` for each distinct label.
    """
    counts = ser.value_counts()
    # Equal share of the total probability for each class ...
    per_class_share = len(counts) ** -1
    # ... split evenly among the rows of that class.
    per_row_weight = per_class_share / counts.values
    return dict(zip(counts.index.tolist(), per_row_weight))

def oversample(ser, size):
    """Draw ``size`` index labels of ``ser`` with class-balancing probabilities.

    Every row gets the sampling probability that ``get_weights`` assigns to
    its class, so rows of smaller classes are proportionally more likely to be
    picked. Sampling uses NumPy's global random state.

    Parameters
    ----------
    ser : pandas.Series
        Class labels; its index supplies the values that are sampled.
    size : int or tuple of ints
        Passed through to ``np.random.choice`` as the output shape.

    Returns
    -------
    numpy.ndarray
        Sampled index labels.
    """
    row_probabilities = ser.map(get_weights(ser))
    return np.random.choice(ser.index, size, p=row_probabilities)


def grid_search_params(algo, default_params, grid_params, cv=N_CV, 
                       x_train=x_train, y_train=y_train):
    """Exhaustively grid-search ``grid_params`` for one estimator class.

    Parameters
    ----------
    algo : callable
        Estimator class (or factory); called as ``algo(**default_params)``.
    default_params : dict
        Fixed constructor arguments that are not searched.
    grid_params : dict
        Mapping of parameter name to the candidate values to try.
    cv : int or CV splitter
        Passed to ``GridSearchCV``; defaults to ``N_CV``.
    x_train, y_train : array-like
        Training data. The defaults are the module-level arrays as they were
        when this function was defined.

    Returns
    -------
    tuple
        ``(best_estimator, best_params)`` from the fitted search.
    """
    est = algo(**default_params)
    grid = GridSearchCV(est, grid_params, scoring='roc_auc', cv=cv, n_jobs=-1)
    print('Grid Searching Params')

    start = time.time()
    grid.fit(x_train, y_train)
    print('Fitting Completed in {} minutes'.format((time.time()-start)/60))
    print('Best Score', grid.best_score_)

    # Fitted attributes carry a trailing underscore; the un-suffixed names
    # do not exist and raised AttributeError.
    return grid.best_estimator_, grid.best_params_

In [None]:
# results={}

In [None]:
# names = ['et', 'rf', 'gb', 'xgb', 'lgb']
# algos = [ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier, LGBMClassifier]
# default_params = [et_default_params, rf_default_params, gb_default_params, xgb_default_params, lgb_default_params]
# param_grids = [et_params, rf_params, gb_params, xgb_params, lgb_params]


# for name, algo, dparams, gparams in zip(names, algos, default_params, param_grids):
#     best_estimator, best_params = grid_search_params(algo, dparams, gparams)
#     results[name] = (best_estimator, best_params)

### Bayesian Param Search

New to this - exploring using bayes_opt python package for finding optimal parameters

In [None]:
from bayes_opt import BayesianOptimization

In [None]:
# Best result per model family, keyed by a short model name
all_results = dict()

#### Random Forest

In [None]:
def rf_func(**params):
    """Objective for the Bayesian search over random-forest hyper-parameters.

    Parameters
    ----------
    **params
        Hyper-parameters proposed by the optimiser, all as floats.
        ``n_estimators`` and ``max_depth`` are rounded to integers here.

    Returns
    -------
    float
        Mean ROC AUC over ``N_CV`` shuffled stratified folds.
    """
    # The optimiser works in a continuous space; these two must be integers.
    params['n_estimators'] = int(np.round(params['n_estimators']))
    params['max_depth'] = int(np.round(params['max_depth']))

    est = RandomForestClassifier(**rf_default_params, **params)
    # shuffle / random_state are keyword-only in current scikit-learn releases.
    cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
    fold_aucs = cross_val_score(est, x_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    return np.mean(fold_aucs)

In [None]:
# (low, high) search bounds derived from the extremes of each grid list
{name: (min(grid), max(grid)) for name, grid in rf_params.items()}

In [None]:
start = time.time()
# Search bounds: the extremes of each grid list
rf_bounds = {name: (min(grid), max(grid)) for name, grid in rf_params.items()}
# Seed the optimiser so the sequence of probed points is reproducible
rfBO = BayesianOptimization(rf_func, rf_bounds, random_state=SEED)
# 10 random warm-up evaluations, then 100 guided iterations
rfBO.maximize(init_points=10, n_iter=100)
print('TIME TAKEN (MIN):', (time.time() - start)/60)

In [None]:
# Best record found by the optimiser; its 'max_params' entry is read back later
result = rfBO.res['max']
all_results['rf'] = result
print(result)

#### Extra Trees

In [None]:
def et_func(**params):
    """Objective for the Bayesian search over extra-trees hyper-parameters.

    Parameters
    ----------
    **params
        Hyper-parameters proposed by the optimiser, all as floats.
        ``n_estimators`` and ``max_depth`` are rounded to integers here.

    Returns
    -------
    float
        Mean ROC AUC over ``N_CV`` shuffled stratified folds.
    """
    # The optimiser works in a continuous space; these two must be integers.
    params['n_estimators'] = int(np.round(params['n_estimators']))
    params['max_depth'] = int(np.round(params['max_depth']))

    # shuffle / random_state are keyword-only in current scikit-learn releases.
    cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
    # Extra trees, not a random forest: this section previously re-ran the
    # random-forest search under a different name.
    est = ExtraTreesClassifier(**et_default_params, **params)
    fold_aucs = cross_val_score(est, x_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    return np.mean(fold_aucs)

In [None]:
start = time.time()
# Search bounds: the extremes of each grid list
et_bounds = {name: (min(grid), max(grid)) for name, grid in et_params.items()}
# Seed the optimiser so the sequence of probed points is reproducible
etBO = BayesianOptimization(et_func, et_bounds, random_state=SEED)
# 10 random warm-up evaluations, then 100 guided iterations
etBO.maximize(init_points=10, n_iter=100)
print('TIME TAKEN (MIN):', (time.time() - start)/60)

In [None]:
# Best record found by the optimiser for extra trees
result = etBO.res['max']
all_results['et'] = result
print(result)

#### LGBM

In [None]:
# (low, high) bounds from the extremes of each grid list ...
lgbm_bounds = {name: (min(grid), max(grid)) for name, grid in lgb_params.items()}
# ... except the categorical 'boosting' choice, which is searched as a number
# in [0, 1] and decoded back to a name inside lgb_func.
lgbm_bounds['boosting'] = (0, 1)

In [None]:
def lgb_func(**params):
    """Objective for the Bayesian search over LightGBM hyper-parameters.

    Parameters
    ----------
    **params
        Hyper-parameters proposed by the optimiser, all as floats. They are
        decoded / rounded / clipped here before the estimator is built.

    Returns
    -------
    float
        Mean ROC AUC over ``N_CV`` shuffled stratified folds.
    """
    # 'boosting' is searched in [0, 1]: rounds to 0 -> gbdt, otherwise dart.
    params['boosting'] = 'gbdt' if np.round(params['boosting']) == 0 else 'dart'
    params['num_iterations'] = int(np.round(params['num_iterations']))
    params['max_depth'] = int(np.round(params['max_depth']))
    # Keep rate / fraction parameters inside [0.0001, 1].
    for bounded in ('learning_rate', 'subsample', 'colsample_bytree'):
        params[bounded] = np.clip(params[bounded], 0.0001, 1)

    # shuffle / random_state are keyword-only in current scikit-learn releases.
    cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
    est = LGBMClassifier(**lgb_default_params, **params)
    fold_aucs = cross_val_score(est, x_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    return np.mean(fold_aucs)

In [None]:
start = time.time()
# Seed the optimiser so the sequence of probed points is reproducible
lgbBO = BayesianOptimization(lgb_func, lgbm_bounds, random_state=SEED)
# 10 random warm-up evaluations, then 100 guided iterations
lgbBO.maximize(init_points=10, n_iter=100)
print('TIME TAKEN (MIN):', (time.time() - start)/60)

In [None]:
# Best record found by the optimiser for LightGBM
result = lgbBO.res['max']
all_results['lgb'] = result
print(result)

#### XGB

In [None]:
def xgb_func(**params):
    """Objective for the Bayesian search over XGBoost hyper-parameters.

    Parameters
    ----------
    **params
        Hyper-parameters proposed by the optimiser, all as floats. The search
        space names the number of boosting rounds ``nrounds``.

    Returns
    -------
    float
        Mean ROC AUC over ``N_CV`` shuffled stratified folds.
    """
    # XGBClassifier takes the number of boosting rounds as `n_estimators`;
    # an `nrounds` keyword is not a recognised parameter, so the searched
    # value never changed the model. Translate it here.
    params['n_estimators'] = int(np.round(params.pop('nrounds')))
    params['max_depth'] = int(np.round(params['max_depth']))
    # Keep rate / fraction parameters inside [0.0001, 1].
    for bounded in ('learning_rate', 'subsample', 'colsample_bytree'):
        params[bounded] = np.clip(params[bounded], 0.0001, 1)

    # shuffle / random_state are keyword-only in current scikit-learn releases.
    cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
    est = XGBClassifier(**xgb_default_params, **params)
    fold_aucs = cross_val_score(est, x_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    return np.mean(fold_aucs)

In [None]:
start = time.time()
# Search bounds: the extremes of each grid list
xgb_bounds = {name: (min(grid), max(grid)) for name, grid in xgb_params.items()}
# Seed the optimiser so the sequence of probed points is reproducible
xgbBO = BayesianOptimization(xgb_func, xgb_bounds, random_state=SEED)
# 10 random warm-up evaluations, then 100 guided iterations
xgbBO.maximize(init_points=10, n_iter=100)
print('TIME TAKEN (MIN):', (time.time() - start)/60)

In [None]:
# Best record found by the optimiser for XGBoost
result = xgbBO.res['max']
all_results['xgb'] = result
print(result)

In [None]:
# Persist the best result of every search so later sessions can skip the search
with open('extra/bayesian_opt_params.json', 'w') as out_file:
    out_file.write(json.dumps(all_results))

### Load Results and Explore

In [None]:
# Reload the saved search results
with open('extra/bayesian_opt_params.json', 'r') as in_file:
    all_results = json.loads(in_file.read())

In [None]:
# Work on a copy so the loaded results stay untouched and this cell can be re-run
params = dict(all_results['lgb']['max_params'])
# 'boosting' was searched as a number in [0, 1]; decode it the way lgb_func does.
# (The closing parenthesis used to sit after `== 0`, which rounded a boolean and
# selected 'dart' for every value except exactly 0.)
if not isinstance(params['boosting'], str):
    params['boosting'] = 'gbdt' if np.round(params['boosting']) == 0 else 'dart'
# These were searched as floats; round them to integers
for int_key in ('max_depth', 'num_iterations', 'scale_pos_weight'):
    params[int_key] = int(np.round(params[int_key]))
params

In [None]:
# Re-score the best LightGBM configuration with the same CV scheme as the search.
# N_CV replaces the hard-coded 4; shuffle / random_state are keyword-only in
# current scikit-learn releases.
cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
est = LGBMClassifier(**lgb_default_params, **params)
fold_aucs = cross_val_score(est, x_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
roc_auc = np.mean(fold_aucs)
print(roc_auc)

### Explore and Add in Engineered Ftrs

In [None]:
# Engineered-feature tables live next to the merged data; build the paths from
# DATA_DIR rather than repeating the directory name.
eng_ftrs_train = pd.read_csv(os.path.join(DATA_DIR, 'eng_ftrs_train.csv'))
eng_ftrs_test = pd.read_csv(os.path.join(DATA_DIR, 'eng_ftrs_test.csv'))
eng_ftrs_train.shape, eng_ftrs_test.shape

In [None]:
# Put the original and engineered feature columns side by side
xtrain_all = pd.concat([train, eng_ftrs_train], axis='columns')
xtest_all = pd.concat([test, eng_ftrs_test], axis='columns')

(xtrain_all.shape, xtest_all.shape)

In [None]:
# engineered features _alone_ do pretty well
# shuffle / random_state are keyword-only in current scikit-learn releases.
cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
est = LGBMClassifier(**lgb_default_params, **params)
fold_aucs = cross_val_score(est, eng_ftrs_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
roc_auc = np.mean(fold_aucs)
print(roc_auc)

In [None]:
# :-/ combined no improvement
# shuffle / random_state are keyword-only in current scikit-learn releases.
cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
est = LGBMClassifier(**lgb_default_params, **params)
fold_aucs = cross_val_score(est, xtrain_all.values, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
roc_auc = np.mean(fold_aucs)
print(roc_auc)

Filter to Use only Selected Features From Feature Selection

In [None]:
# The file has no header row: column names are supplied here
selected_ftrs = pd.read_csv(
    'extra/all_selected_ftrs.csv',
    header=None,
    names=['ftr', 'score'],
)
selected_ftrs.head()

In [None]:
# don't need all 500 - same performance w 300
final_ftrs = selected_ftrs['ftr'].iloc[:300]

In [None]:
# :-( still no improvement
# shuffle / random_state are keyword-only in current scikit-learn releases.
cv = StratifiedKFold(n_splits=N_CV, shuffle=True, random_state=SEED)
est = LGBMClassifier(**lgb_default_params, **params)
fold_aucs = cross_val_score(est, xtrain_all[final_ftrs].values, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
roc_auc = np.mean(fold_aucs)
print(roc_auc)

### Scratch Code for Generating Submission Files

In [None]:
# Work on a copy so the loaded results stay untouched and this cell can be re-run
params = dict(all_results['lgb']['max_params'])
# 'boosting' was searched as a number in [0, 1]; decode it the way lgb_func does.
# It may already be a name if an earlier cell decoded the loaded dict in place.
if not isinstance(params['boosting'], str):
    params['boosting'] = 'gbdt' if np.round(params['boosting']) == 0 else 'dart'
# The search space calls the round count 'num_iterations'; there is no 'n_iter'
# key in the saved parameters, so the old lookup raised KeyError.
for int_key in ('max_depth', 'num_iterations', 'scale_pos_weight'):
    params[int_key] = int(np.round(params[int_key]))
params

In [None]:
# Fit on the full training set, then score the test set.
# predict_proba returns one column per class; the submission uses column 1.
lgb_clf = LGBMClassifier(**lgb_default_params, **params).fit(x_train, y_train)
probs = lgb_clf.predict_proba(x_test)

In [None]:
# Build the frame column-wise so SK_ID_CURR stays a string column and TARGET a
# float column. The previous list-of-rows + transpose construction produced
# object-dtype columns.
submission = pd.DataFrame(
    {'SK_ID_CURR': test_id.values, 'TARGET': probs[:, 1]},
    columns=['SK_ID_CURR', 'TARGET'],
)

In [None]:
# File name carries the model tag and the run date, e.g. lgbm_YYYYMMDD.csv
algo = 'lgbm'
submission_path = 'submissions/{}_{}.csv'.format(algo, today.strftime('%Y%m%d'))
submission.to_csv(submission_path, index=False)

In [None]:

# dtrain = xgb.DMatrix(x_train, label=y_train)
# nrounds = xgb_params.pop('nrounds', 75)
# clf = xgb.train(xgb_params, dtrain, nrounds)


In [None]:
# clf.best_iteration

In [None]:
# dtest = xgb.DMatrix(x_test)
# preds = clf.predict(dtest)

In [None]:
# submission['TARGET'] = preds

In [None]:
# algo = 'xgboost'
# submission.to_csv('submissions/{}_{}.csv'.format(algo, today.strftime('%Y%m%d')), index=False)