# SETUP

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer

import xgboost as xgb

import lightgbm as lgbm
from sklearn.utils import shuffle, resample
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
# Similarly LGBMRegressor can also be imported for a regression model.
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/santander-customer-transaction-prediction/train.csv",
        "../input/santander-customer-transaction-prediction/test.csv",
    )
)

In [None]:
# Preview the first two rows of the training table.
train.iloc[:2]

In [None]:
# List every training column that has no counterpart in the test table,
# one name per line after the header text.
print('Column/Columns that are not in test data: ', end = '')
for column_name in train.columns:
    if column_name in test.columns:
        continue
    print(column_name)

In [None]:
# Plot the class balance of the target column.
# The column is passed explicitly as ``x=``: newer seaborn releases interpret
# the first positional argument as ``data`` rather than ``x``, which would not
# produce one bar per class.  The trailing semicolon suppresses the notebook's
# echo of the returned Axes object.
sns.countplot(x=train['target']);

# Bayesian Optimization

In [None]:
# Keep only the columns matched by the 'int' and 'float' dtype selectors in
# both tables, then split the training table into labels and features.
train, test = (
    frame.select_dtypes(include=['int','float'])
    for frame in (train, test)
)
y = train["target"]
X = train.drop(["target"], axis=1)

In [None]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6, output_process=False):
    """Tune LightGBM hyper-parameters by Bayesian optimization of cross-validated AUC.

    Parameters
    ----------
    X : feature table passed to ``lgb.Dataset`` as ``data``.
    y : labels passed to ``lgb.Dataset`` as ``label``.
    init_round : number of random exploration steps (``init_points``).
    opt_round : number of Bayesian optimization steps (``n_iter``).
    n_folds : number of stratified folds used by ``lgb.cv`` for each candidate.
    random_seed : seed forwarded to ``lgb.cv``.
    output_process : unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(best_target, best_params)`` -- the highest score recorded by the
        optimizer and the parameter values it proposed for that run.  The
        values are the optimizer's raw floats; the objective rounds/clamps
        them before handing them to LightGBM.
    """
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    def lgb_eval(learning_rate, num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, lambda_l1, lambda_l2, early_stopping_round):
        """Score one candidate: the best mean AUC reached during cross-validation."""
        params = {
            'application': 'binary',
            'metric': 'auc',
            # Fractions and the learning rate are clamped to [0, 1].
            'learning_rate': max(min(learning_rate, 1), 0),
            'feature_fraction': max(min(feature_fraction, 1), 0),
            'bagging_fraction': max(min(bagging_fraction, 1), 0),
            # The optimizer proposes floats; these are passed as integers.
            'num_leaves': int(round(num_leaves)),
            'max_depth': int(round(max_depth)),
            # Fixed: this was previously derived from max_depth, so the
            # max_bin search dimension had no effect on the model.
            'max_bin': int(round(max_bin)),
            'lambda_l1': int(round(lambda_l1)),
            'lambda_l2': int(round(lambda_l2)),
            'early_stopping_round': int(round(early_stopping_round)),
        }

        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =250,num_boost_round=1500, metrics=['auc'])
        return max(cv_result['auc-mean'])

    # Search bounds (lower, upper) for every tuned parameter.
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.04, 0.65),
                                            'num_leaves': (8, 240),
                                            'feature_fraction': (0.21, 0.9),
                                            'bagging_fraction': (0.2, 0.9),
                                            'max_depth': (4, 25),
                                            'max_bin':(5,65),
                                            'early_stopping_round' : (10,300),
                                            'lambda_l1': (0, 10),
                                            'lambda_l2':  (0, 10)}, random_state=200)

    # n_iter: how many steps of Bayesian optimization to perform. More steps make it more likely that a good maximum is found.
    # init_points: how many steps of random exploration to perform. Random exploration helps by diversifying the explored space.
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # Pick the run with the highest score (the first one on ties).
    best_run = max(lgbBO.res, key=lambda run: run['target'])

    # return best parameters
    return best_run['target'], best_run['params']

# Run the search with 5 random exploration steps followed by 10 optimization
# steps, scoring each candidate with 3-fold cross-validation.
opt_params = bayes_parameter_opt_lgb(
    X,
    y,
    init_round=5,
    opt_round=10,
    n_folds=3,
    random_seed=6,
)

# Best Params

In [None]:
# opt_params is the (best_score, best_params) tuple returned by the search.
# The optimizer reports raw floats, while the tuning objective rounds every
# integer-valued parameter before scoring.  Apply the same rounding to all of
# them (not only num_leaves/max_depth/max_bin) so the reported dictionary
# matches what was actually evaluated.
opt_params[1].update({
    name: int(round(opt_params[1][name]))
    for name in (
        "num_leaves",
        'max_depth',
        'max_bin',
        'lambda_l1',
        'lambda_l2',
        'early_stopping_round',
    )
})
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
#opt_params[1]['is_unbalance']=True
#opt_params[1]['boost_from_average']=False
# Keep only the parameter dictionary and display it.
opt_params=opt_params[1]
opt_params

# Undersampling

In [None]:
# Split the training rows by label and shuffle each part with a fixed seed.
normal_class = train[train['target'] == 0].sample(frac=1, random_state=1210)
fraudulent_class = train[train['target'] == 1].sample(frac=1, random_state=1210)

# Draw 4.4 label-0 rows for every label-1 row, then rebuild the training
# table as all label-1 rows followed by the sampled label-0 rows.
n_majority_kept = int(fraudulent_class.shape[0] * 4.4)
resampled = normal_class.sample(n=n_majority_kept, random_state=1210)
train = pd.concat([fraudulent_class, resampled])

In [None]:
# Keep only the columns matched by the 'int' and 'float' dtype selectors.
train, test = (
    frame.select_dtypes(include=['int','float'])
    for frame in (train, test)
)

In [None]:
# Print a summary of the current training table.
train.info()

In [None]:
# Labels first, then every remaining column as the feature table.
y = train["target"]
X = train.drop(["target"], axis=1)

# Bayesian Opt with Undersampling

In [None]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6, output_process=False):
    """Tune LightGBM hyper-parameters by Bayesian optimization of cross-validated AUC.

    Parameters
    ----------
    X : feature table passed to ``lgb.Dataset`` as ``data``.
    y : labels passed to ``lgb.Dataset`` as ``label``.
    init_round : number of random exploration steps (``init_points``).
    opt_round : number of Bayesian optimization steps (``n_iter``).
    n_folds : number of stratified folds used by ``lgb.cv`` for each candidate.
    random_seed : seed forwarded to ``lgb.cv``.
    output_process : unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(best_target, best_params)`` -- the highest score recorded by the
        optimizer and the parameter values it proposed for that run.  The
        values are the optimizer's raw floats; the objective rounds/clamps
        them before handing them to LightGBM.
    """
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    def lgb_eval(learning_rate, num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, lambda_l1, lambda_l2, early_stopping_round):
        """Score one candidate: the best mean AUC reached during cross-validation."""
        params = {
            'application': 'binary',
            'metric': 'auc',
            # Fractions and the learning rate are clamped to [0, 1].
            'learning_rate': max(min(learning_rate, 1), 0),
            'feature_fraction': max(min(feature_fraction, 1), 0),
            'bagging_fraction': max(min(bagging_fraction, 1), 0),
            # The optimizer proposes floats; these are passed as integers.
            'num_leaves': int(round(num_leaves)),
            'max_depth': int(round(max_depth)),
            # Fixed: this was previously derived from max_depth, so the
            # max_bin search dimension had no effect on the model.
            'max_bin': int(round(max_bin)),
            'lambda_l1': int(round(lambda_l1)),
            'lambda_l2': int(round(lambda_l2)),
            'early_stopping_round': int(round(early_stopping_round)),
        }

        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =250,num_boost_round=1500, metrics=['auc'])
        return max(cv_result['auc-mean'])

    # Search bounds (lower, upper) for every tuned parameter.
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.04, 0.65),
                                            'num_leaves': (8, 240),
                                            'feature_fraction': (0.21, 0.9),
                                            'bagging_fraction': (0.2, 0.9),
                                            'max_depth': (4, 25),
                                            'max_bin':(5,65),
                                            'early_stopping_round' : (10,300),
                                            'lambda_l1': (0, 10),
                                            'lambda_l2':  (0, 10)}, random_state=200)

    # n_iter: how many steps of Bayesian optimization to perform. More steps make it more likely that a good maximum is found.
    # init_points: how many steps of random exploration to perform. Random exploration helps by diversifying the explored space.
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # Pick the run with the highest score (the first one on ties).
    best_run = max(lgbBO.res, key=lambda run: run['target'])

    # return best parameters
    return best_run['target'], best_run['params']

# Repeat the search on the current X / y: 5 random exploration steps followed
# by 10 optimization steps, each scored with 3-fold cross-validation.
opt_params = bayes_parameter_opt_lgb(
    X,
    y,
    init_round=5,
    opt_round=10,
    n_folds=3,
    random_seed=6,
)

# Undersampled data best params

In [None]:
# opt_params is the (best_score, best_params) tuple returned by the search.
# The optimizer reports raw floats, while the tuning objective rounds every
# integer-valued parameter before scoring.  Apply the same rounding to all of
# them (not only num_leaves/max_depth/max_bin) so the reported dictionary
# matches what was actually evaluated.
opt_params[1].update({
    name: int(round(opt_params[1][name]))
    for name in (
        "num_leaves",
        'max_depth',
        'max_bin',
        'lambda_l1',
        'lambda_l2',
        'early_stopping_round',
    )
})
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
#opt_params[1]['is_unbalance']=True
#opt_params[1]['boost_from_average']=False
# Keep only the parameter dictionary and display it.
opt_params=opt_params[1]
opt_params

# Undersampled and base data params

In [None]:
# Hand-entered comparison table: one row per dataset, one column per LightGBM
# parameter (values are hard-coded here, not read from the search results).
d = dict([
    ('Datasets', ['Undersampled_Data', 'Base_Data']),
    ('bagging_fraction', [0.9, 0.9]),
    ('early_stopping_round', [72, 76]),
    ('feature_fraction', [0.21, 0.21]),
    ('lambda_l1', [0, 0]),
    ('lambda_l2', [10, 10]),
    ('learning_rate', [0.04, 0.04]),
    ('max_bin', [34, 35]),
    ('max_depth', [25, 25]),
    ('num_leaves', [83, 107]),
    ('objective', ['binary', 'binary']),
    ('metric', ['auc', 'auc']),
])
df = pd.DataFrame(d)
df