In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import lightgbm as lgb
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Let wide frames render up to 500 columns before pandas truncates the display.
pd.set_option('display.max_columns', 500)
#import pandas_profiling

In [2]:
# Load the train and test CSVs; the paths are relative to the working directory.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
# Seed NumPy's global RNG; augmentation() draws from it through np.random.shuffle.
# Note: the CV splitter and BayesSearchCV further down use their own hard-coded
# seeds (26 and 22), not this value.
random_state = 42
np.random.seed(random_state)

In [4]:

def augmentation(x=None, y=None, upsample_times=2):
    """
    Build an augmented training set by shuffling feature columns among the
    positive rows.

    Each synthetic block is produced by shuffling every column of the positive
    rows (``y == 1``) independently, so a synthetic row mixes feature values
    taken from different positive rows.

    x = DataFrame of features
    y = labels aligned with x (Series or array); only rows with label 1 or 0
        are kept, any other label is dropped
    upsample_times = number of shuffled copies of the positives to add.
        With 0, the positives are *replaced* by a single shuffled copy
        instead of being kept alongside it.

    Returns (aug_x, aug_y):
        aug_x = DataFrame: positive block(s) first, then the rows with y == 0.
                Shuffled blocks carry a fresh 0..n-1 index while the original
                rows keep theirs, so the resulting index is not unique.
        aug_y = 1-D float ndarray: ones for every positive row, followed by
                the labels of the y == 0 rows.

    Randomness comes from NumPy's global RNG (np.random.shuffle); seed it
    with np.random.seed for reproducible output.
    """
    pos_mask = y == 1
    neg_mask = y == 0

    x_pos = x[pos_mask]
    n_pos = x_pos.shape[0]

    # Private float working copy of the positives. Shuffling happens in this
    # buffer only, so neither `x` nor the original positive rows are touched
    # (and no read-only pandas view is ever shuffled).
    pool = np.array(x_pos, dtype=float)

    def _shuffled_block():
        # Shuffle each column independently, in column order. The buffer is
        # shuffled cumulatively: every round starts from the previous order.
        for col in range(pool.shape[1]):
            np.random.shuffle(pool[:, col])
        return pd.DataFrame(pool.copy(), columns=x_pos.columns)

    if upsample_times == 0:
        # Replacement mode: one shuffled block stands in for the positives.
        pos_blocks = [_shuffled_block()]
    else:
        # Original positives followed by `upsample_times` shuffled blocks.
        pos_blocks = [x_pos.copy()]
        pos_blocks.extend(_shuffled_block() for _ in range(upsample_times))

    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported equivalent and avoids repeated re-allocation.
    aug_x = pd.concat(pos_blocks + [x[neg_mask]])

    # One label per positive row actually emitted, then the negative labels.
    aug_y = np.append(np.ones(n_pos * len(pos_blocks)), y[neg_mask])

    return aug_x, aug_y
    
    
    
    

In [5]:
# train_df[var_list] = np.exp(train_df[var_list])
# test_df[var_list] = np.exp(test_df[var_list])

In [6]:
#exclusion = ['ID_code', 'target']+ rank_var_list
# Columns that are left out of the model inputs.
exclusion = ['ID_code', 'target']
# Keep every other train_df column, preserving the column order.
feats = list(filter(lambda col: col not in exclusion, train_df.columns))

In [13]:
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta

In [14]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a search optimizer, print a short performance summary and return
    the best parameters it found.

    optimizer = a sklearn or a skopt optimizer; after fitting it must expose
                best_score_, best_index_, best_params_ and cv_results_
                (with 'std_test_score' and 'params' entries)
    X = the training set
    y = our target
    title = a string label for the experiment (printed verbatim)
    callbacks = optional callback(s) forwarded to optimizer.fit as
                `callback=`; when falsy, fit is called without that keyword

    Returns optimizer.best_params_.

    Relies on `time` (from time import time) and `pprint` being imported at
    module level before the first call.
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    # Stop the clock right after fitting so only the search itself is timed.
    elapsed = time() - start

    best_score = optimizer.best_score_
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_

    # Format the numbers first and prepend the title afterwards: a title that
    # contains '%' must not be interpreted as part of the format template.
    stats = " took %.2f seconds,  candidates checked: %d, best CV score: %.3f \u00B1 %.3f" % (
        elapsed,
        len(optimizer.cv_results_['params']),
        best_score,
        best_score_std)
    print(title + stats)
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [15]:
from sklearn.metrics import average_precision_score, roc_auc_score, mean_absolute_error
from sklearn.metrics import make_scorer
from time import time
import pprint

In [16]:
# Scorer handed to the hyper-parameter search: roc_auc_score evaluated on
# predicted probabilities (needs_proba=True), where a higher value is better.
# NOTE(review): newer scikit-learn releases deprecate `needs_proba` in favour of
# `response_method` — confirm against the installed version.
auc = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)


In [17]:
# Cross-validation settings consumed by the StratifiedKFold splitter below.
n_folds = 5
# Seed for the fold shuffling only (distinct from `random_state` defined earlier).
random_seed = 26

In [18]:
# Shuffled, stratified CV splitter shared by every candidate in the search.
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

# Base estimator; every hyper-parameter listed in `search_spaces` is set by the search.
# NOTE(review): n_jobs is left at its default here while the search itself uses
# n_jobs=-1 — check for CPU oversubscription.
clf = lgb.LGBMClassifier(boosting_type='gbdt',
                         #is_unbalance ='true',
                         objective='binary',
                         #n_jobs=1, 
                         verbose=0)


# Search space: Real(low, high, prior) for continuous parameters and
# Integer(low, high) for integer ones, keyed by LGBMClassifier parameter name.
search_spaces = {
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'num_leaves': Integer(2, 500),
        'max_depth': Integer(0, 500),
        'min_child_samples': Integer(0, 200),
        'max_bin': Integer(100, 100000),
        'subsample': Real(0.01, 1.0, 'uniform'),
        'subsample_freq': Integer(0, 10),
        'colsample_bytree': Real(0.01, 1.0, 'uniform'),
        'min_child_weight': Integer(0, 10),
        'subsample_for_bin': Integer(100000, 500000),
        'reg_lambda': Real(1e-9, 1000, 'log-uniform'),
        'reg_alpha': Real(1e-9, 1.0, 'log-uniform'),
        'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
        'n_estimators': Integer(500, 30000)        
        }

# Bayesian search with a Gaussian-process surrogate, scored by the `auc`
# scorer on the `folds` splitter; refit=True refits on the full data with the
# best parameters found.
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=auc,
                    cv=folds,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
X = train_df[feats]
y = train_df['target']    
# Run the search with two stopping callbacks: DeltaXStopper(0.001) and a
# DeadlineStopper budget of 60*5 = 300 seconds.
# NOTE(review): the recorded output of this cell reports 1 candidate checked in
# ~23307 s despite n_iter=40 and the 300 s deadline — presumably the callbacks
# are only evaluated between iterations, so a single slow candidate used up the
# whole budget. Confirm, and consider narrowing n_estimators / max_bin or
# raising the deadline; with one candidate the "best" parameters are just the
# first sample.
best_params = report_perf(opt, X, y,'LightGBM', 
                          callbacks=[DeltaXStopper(0.001), 
                                     DeadlineStopper(60*5)])

LightGBM took 23306.91 seconds,  candidates checked: 1, best CV score: 0.882 ± 0.001
Best parameters:
{'colsample_bytree': 0.3706219857878677,
 'learning_rate': 0.16624226726409647,
 'max_bin': 93400,
 'max_depth': 134,
 'min_child_samples': 22,
 'min_child_weight': 4,
 'n_estimators': 24176,
 'num_leaves': 27,
 'reg_alpha': 1.081049236893711e-05,
 'reg_lambda': 1.043686239159047,
 'scale_pos_weight': 0.19222548462579486,
 'subsample': 0.6941640075502717,
 'subsample_for_bin': 375140,
 'subsample_freq': 7}



In [20]:
# Display the parameters returned by report_perf in the search cell.
best_params

{'colsample_bytree': 0.3706219857878677,
 'learning_rate': 0.16624226726409647,
 'max_bin': 93400,
 'max_depth': 134,
 'min_child_samples': 22,
 'min_child_weight': 4,
 'n_estimators': 24176,
 'num_leaves': 27,
 'reg_alpha': 1.081049236893711e-05,
 'reg_lambda': 1.043686239159047,
 'scale_pos_weight': 0.19222548462579486,
 'subsample': 0.6941640075502717,
 'subsample_for_bin': 375140,
 'subsample_freq': 7}

In [26]:
# Persist the chosen parameters as a one-row table (index=[0] turns the dict of
# scalars into a single row).
# NOTE(review): the file name has no .csv extension, and to_csv is called with
# its default settings, so the index column is written as well — confirm that
# whatever reads this file expects that layout.
pd.DataFrame(best_params, index=[0]).to_csv('best_params_20190323')