<div style="color:#D81F26;
           display:block;
           border-style: solid;
           border-color:#C1C1C1;
           font-size:14px;
           font-family:Calibri;
           background-color:#373737;">
<h2 style="text-align: center;
           padding: 10px;
           color:#FFFFFF;">
======= Playground Nov 2021 =======
</h2>
</div>

# About this notebook

This notebook is a submission for the Kaggle Tabular Playground Series (November 2021). It trains a LightGBM classifier (`LGBMClassifier`).

## Summary

* Check whether any feature contains null values
* Standardize the feature values
* Optimize the LightGBM hyperparameters with a randomized search
* Use the optimal hyperparameters for cross-validation and for the final submission

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Additional packages
import random
import os, psutil

import numpy as np
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from sklearn import model_selection, metrics
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

#Lgbm
import lightgbm as lgb

In [None]:
# Run configuration (tune these to trade run time against search quality)
p_verbose = 250            # verbosity value passed to the search fit and to RandomizedSearchCV
p_estimators_CV = 1_000    # n_estimators of the estimator used in the hyperparameter search
p_estimators_FNL = 1_500   # n_estimators of the final cross-validated model
p_iter = 35                # number of parameter settings sampled by the randomized search

In [None]:
# Load the competition files: test features, training data and the submission template
INPUT_DIR = '../input/tabular-playground-series-nov-2021'

df_test = pd.read_csv(f'{INPUT_DIR}/test.csv')
df_train = pd.read_csv(f'{INPUT_DIR}/train.csv')
df_sub = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

# Exploratory Data Analysis

In [None]:
# Summary of the training frame: column dtypes, non-null counts and memory usage
df_train.info()

In [None]:
# Same summary for the test frame, to compare its columns and dtypes with the training frame
df_test.info()

## Check for null values

In [None]:
# Null value analysis
# Report every training column that contains nulls, with its share of null rows.
null_counts = df_train.isnull().sum()   # one pass over the frame: nulls per column
n_rows = len(df_train)

counter = 0
for col, n_null in null_counts.items():
    if n_null > 0:
        counter += 1
        print('{} with {:.4%} null value'.format(col, n_null / n_rows))

if not counter:
    print('There is no variable with null value')

# Feature scaling

In [None]:
# Standardize the features. The scaler is fitted on the training columns only
# (everything except the first and the last column) and then re-used unchanged
# on the test columns (everything except the first column).
standardEncoder = StandardScaler()

train_feature_cols = df_train.columns[1:-1]
test_feature_cols = df_test.columns[1:]

df_train_std = pd.DataFrame(
    standardEncoder.fit_transform(df_train.iloc[:, 1:-1]),
    columns=train_feature_cols,
)
df_test_std = pd.DataFrame(
    standardEncoder.transform(df_test.iloc[:, 1:]),
    columns=test_feature_cols,
)

print(df_train_std.shape)
# One histogram per standardized feature; tick labels are hidden to keep the grid readable.
df_train_std.hist(figsize=(16, 20), color='g', xlabelsize=0, ylabelsize=0)

# Modelling data preparation

In [None]:
# Feature matrix (standardized training columns) and label vector used for modelling
X, Y = df_train_std, df_train['target']

# Print both shapes, separated by a ruler line
print(X.shape, '=' * 30, Y.shape, sep='\n')

In [None]:
# Hold out 5% of the rows; the held-out part serves as the evaluation set
# for early stopping during the hyperparameter search.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=4355,
)

In [None]:
# Drop the names of frames that are no longer needed.
# X, Y and df_test_std are what the rest of the notebook works with.
del df_train, df_test, df_train_std

# Model  - LightGBM

In [None]:
def _decayed_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Return an exponentially decayed learning rate, bounded below by ``floor``.

    The rate is ``base_learning_rate * decay ** current_iter``; once that value
    is no longer greater than ``floor``, ``floor`` itself is returned.

    Parameters
    ----------
    current_iter : int
        Index of the boosting iteration the rate is requested for.
    base_learning_rate : float
        Learning rate at iteration 0.
    decay : float
        Multiplicative factor applied once per iteration.
    floor : float, default 1e-3
        Smallest learning rate that will be returned.
    """
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Schedule starting at 0.1 and decaying by a factor of 0.99 per iteration (floor 1e-3)."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.1, decay=.99)


def learning_rate_010_decay_power_0995(current_iter):
    """Schedule starting at 0.1 and decaying by a factor of 0.995 per iteration (floor 1e-3)."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.1, decay=.995)


def learning_rate_005_decay_power_099(current_iter):
    """Schedule starting at 0.05 and decaying by a factor of 0.99 per iteration (floor 1e-3)."""
    return _decayed_learning_rate(current_iter, base_learning_rate=0.05, decay=.99)

# Extra keyword arguments forwarded to the estimator's `fit` during the search:
# early stopping on the AUC of the held-out split, with a decaying learning rate.
# NOTE(review): newer LightGBM releases replaced the `early_stopping_rounds` and
# `verbose` fit keywords with callbacks - confirm against the installed version.
fit_params = {
    "early_stopping_rounds": 10,
    "eval_metric": 'auc',
    "eval_set": [(X_test, Y_test)],
    'eval_names': ['valid'],
    'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    'verbose': p_verbose,
}

# Search space for the randomized search. Entries that are scipy distributions
# are sampled from; entries that are lists are drawn from as discrete choices.
# (An earlier definition built with `random.uniform` was removed: it was
# overwritten right away and would have fixed each parameter to a single float,
# including the integer-valued `num_leaves` and `min_child_samples`.)
param_test = {
    'num_leaves': sp_randint(900, 1000),
    'min_child_samples': sp_randint(900, 1000),
    'min_child_weight': sp_uniform(loc=9.0, scale=2.0),
    'subsample': sp_uniform(loc=0.150, scale=0.1),
    'colsample_bytree': [0.70, 0.72, 0.74, 0.76, 0.78, 0.80],
    'reg_alpha': [0.01, 0.04, 0.08, 0.1, 0.14, 0.18, 0.2],
    'reg_lambda': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
}

In [None]:
# Base estimator for the hyperparameter search; the searched parameters are
# set on top of these fixed settings.
clf = lgb.LGBMClassifier(
    max_depth=-1,
    random_state=1234,
    silent=True,
    metric='auc',
    n_estimators=p_estimators_CV,
    class_weight='balanced',
    n_jobs=-1,
)

In [None]:
# Randomized search over `param_test`: `p_iter` sampled settings, each scored
# by ROC AUC with 3-fold cross-validation; the best setting is refitted on the
# full data passed to `fit`.
search_settings = dict(
    estimator=clf,
    param_distributions=param_test,
    n_iter=p_iter,
    scoring='roc_auc',
    n_jobs=-1,
    cv=3,
    refit=True,
    verbose=p_verbose,
    random_state=4563,
)
gs = RandomizedSearchCV(**search_settings)

In [None]:
# Run the search on the training split; `fit_params` is forwarded to every
# underlying estimator fit (early stopping is evaluated on the held-out split).
gs.fit(X_train, Y_train, **fit_params)

In [None]:
# Report the best mean cross-validated score and the parameter setting that produced it
print(f'Best score reached: {gs.best_score_} with params: {gs.best_params_} ')

### Use the optimal parameters

In [None]:
# Hard-coded hyperparameter set, kept as a reference alternative to the
# values found by the search in this run.
opt_parameters = dict(
    colsample_bytree=0.7563492589437595,
    min_child_samples=977,
    min_child_weight=10.0,
    num_leaves=934,
    reg_alpha=0.1,
    reg_lambda=10,
    subsample=0.21965621584761524,
)

# gs = lgb.LGBMClassifier(max_depth=-1, random_state=1234, silent=True, metric='auc', n_estimators=p_estimators_FNL,  class_weight='balanced', n_jobs = -1)
#set optimal parameters
# gs.set_params(**opt_parameters)

#gs.fit(X_train, Y_train, **fit_params ) 

# preds = gs.predict_proba(df_test_std)[:,1]
# df_rst = pd.concat([df_sub.iloc[:,0:1], pd.DataFrame(preds, columns = ['target'])], axis = 1)
# df_rst.to_csv("./submission.csv",index=False)
# print('Done!')

### Use the best parameters from the hyperparameter search

In [None]:

# Configure the final model from the hyperparameter optimisation.
# Start from the full parameter set of the best estimator found by the search
# and override only what differs for the final fit (more trees, a different
# seed). Previously the tuned estimator was overwritten by an untuned one, so
# the search result was never used.
final_params = gs.best_estimator_.get_params()
final_params.update(n_estimators=p_estimators_FNL, random_state=3453456)

# Unfitted template carrying the final configuration.
clf_final = lgb.LGBMClassifier(**final_params)

preds = np.zeros(df_test_std.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=434512, shuffle=True)

auc = []        # validation AUC of each fold
model_lst = []  # one independently fitted model per fold

for fold_no, (train_idx, test_idx) in enumerate(kf.split(X, Y), start=1):
    x_train, x_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[test_idx]

    # A fresh estimator per fold: re-fitting one shared object would leave
    # every entry of `model_lst` pointing at the model of the last fold.
    fold_model = lgb.LGBMClassifier(**final_params)
    fold_model.fit(
        x_train,
        y_train,
        eval_set=[(x_val, y_val)],
        early_stopping_rounds=30,
        eval_metric="auc",
        verbose=100,
    )

    model_lst.append(fold_model)
    auc.append(roc_auc_score(y_val, fold_model.predict_proba(x_val)[:, 1]))
    print(f"fold: {fold_no}, auc: {auc[-1]}")


In [None]:

# Prediction
# Average the positive-class probability over the fold models. `preds` is
# rebuilt from scratch here, so re-running this cell gives the same result
# instead of adding on top of a previous run.
assert model_lst, "no fitted fold models - run the cross-validation cell first"
fold_preds = [model.predict_proba(df_test_std)[:, 1] for model in model_lst]
preds = np.mean(fold_preds, axis=0)

# Submission file: first column of the sample submission plus the averaged predictions.
df_rst = pd.concat([df_sub.iloc[:, 0:1], pd.DataFrame(preds, columns=['target'])], axis=1)
df_rst.to_csv("./submission.csv", index=False)
print('Done!')
    

In [None]:
# Data Submission
# preds = gs.predict_proba(df_test_std)[:,1]
# df_rst = pd.concat([df_sub.iloc[:,0:1], pd.DataFrame(preds, columns = ['target'])], axis = 1)
# df_rst.to_csv("./submission.csv",index=False)
# print('Done!')

