 # Importing Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
import gc
import optuna
# Let pandas display up to 1000 columns so wide frames are not truncated in cell output.
pd.set_option('display.max_columns', 1000)

# Loading the Data

In [None]:
# Read the three competition CSVs in one pass: training set, test set and the
# submission template (in that order).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2021/train.csv',
        '../input/tabular-playground-series-jan-2021/test.csv',
        '../input/tabular-playground-series-jan-2021/sample_submission.csv',
    )
)

# Correlation in Data

In this notebook, I measure the correlation between each feature and the target using correlation coefficients.

Correlation coefficients are used to measure how strong a relationship is between two variables. Correlation coefficient formulas quantify that strength and return a value between -1 and 1, where:

* 1 indicates a perfect positive linear relationship.
* -1 indicates a perfect negative linear relationship.

A result of zero indicates no linear relationship at all.

<img src="https://www.statisticshowto.com/wp-content/uploads/2012/10/pearson-2-small.png" width="500">




In [None]:
# Pearson correlation of every column with the target.
corr = train.corr()["target"]
# Remove the target's trivial self-correlation by name (rather than by assuming
# it sorts last) and show the remaining coefficients in ascending order.
# Label-based drop + sort_values avoids indexing a string-labelled Series with
# integer positions, which pandas has deprecated.
corr.drop(labels="target").sort_values()

# Plotting correlations

In [None]:
# Horizontal bar chart of the Pearson correlation between each numeric feature
# and the target.
num_feat = train.columns[train.dtypes != object]
# Drop the last numeric column so the target is not correlated with itself
# (assumes `target` is the last numeric column of `train`).
num_feat = num_feat[:-1]
labels = list(num_feat)
values = [
    np.corrcoef(train[col].values, train.target.values)[0, 1]
    for col in num_feat
]

ind = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(8, 15))
rects = ax.barh(ind, np.array(values), color='red')
# barh centres every bar on its y position, so the tick for a bar belongs at
# `ind` itself; the previous half-width offset shifted labels away from bars.
ax.set_yticks(ind)
ax.set_yticklabels(labels, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation Coefficients each feature with target");

# Reading & Preparing the Data

In [None]:
# Model inputs are the 14 continuous columns cont1 .. cont14; `target` is the
# regression label.
features = [f'cont{i}' for i in range(1, 15)]
target = 'target'

xTrain = train[features]
xTest = test[features]
yTrain = train[target]

print('******** Finished Reading & Preparing the Data**********')


# OPTUNA

Optuna uses Bayesian methods (by default, a Tree-structured Parzen Estimator sampler) to search for a good set of hyperparameters. For more information on Bayesian methods for searching optimal parameters, check out this wonderful article: https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f

## Objective Function

In [None]:
from optuna import Trial

def objective(trial:"Trial",fastCheck=True,targetMeter=0,returnInfo=False):
    """Optuna objective: mean validation RMSE of LightGBM over K-fold CV.

    For each of the 10 folds of the module-level ``xTrain`` / ``yTrain`` a
    model is trained through ``fitLGBM`` with hyperparameters sampled from
    ``trial``; the fold's ``validRMSE`` is collected and the average over all
    folds is returned.

    Parameters
    ----------
    trial : optuna Trial
        Trial handed over by ``study.optimize``; forwarded to ``fitLGBM``.
    fastCheck, targetMeter, returnInfo :
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    float
        Average of the per-fold validation RMSE values.
    """
    folds = 10
    seed = 0
    shuffle = False
    # scikit-learn rejects a random_state when shuffling is off, so the seed
    # is only handed over when it would actually be used.
    kf = KFold(n_splits=folds, shuffle=shuffle,
               random_state=seed if shuffle else None)
    gc.collect()
    foldScores = []
    for trainIdx, validIdx in kf.split(xTrain, yTrain):
        # KFold yields positional indices, so features AND labels are selected
        # with .iloc (label-based lookup breaks on a non-default index).
        trainData = xTrain.iloc[trainIdx, :], yTrain.iloc[trainIdx]
        validData = xTrain.iloc[validIdx, :], yTrain.iloc[validIdx]
        _, _, log = fitLGBM(trial, trainData, validData, numRounds=5000)
        foldScores.append(log["validRMSE"])
        gc.collect()
    return sum(foldScores) / len(foldScores)

## Defining Parameter Space for OPTUNA

In [None]:
def fitLGBM(trial,train,val,numRounds=5000):
    """Train one LightGBM model with hyperparameters sampled from an Optuna trial.

    Parameters
    ----------
    trial : optuna Trial
        Source of the sampled hyperparameters; also passed to the pruning
        callback.
    train : tuple
        ``(features, labels)`` used to fit the model.
    val : tuple
        ``(features, labels)`` used for early stopping and the validation score.
    numRounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(model, pred_val, log)``: the trained booster, its predictions on the
        validation features at the best iteration, and a dict with the keys
        ``'trainRMSE'`` and ``'validRMSE'`` taken from ``model.best_score``.
    """
    xTrainLGBM, yTrainLGBM = train
    xValidLGBM, yValidLGBM = val
    boosting_list = ['gbdt', 'goss']
    objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
    params = {
        'boosting': trial.suggest_categorical('boosting', boosting_list),
        'num_leaves': trial.suggest_int('num_leaves', 2, 2**11),
        'max_depth': trial.suggest_int('max_depth', 2, 25),
        # The third positional argument of suggest_int is `step`. A step larger
        # than the range collapses the search to the lower bound, so only
        # (low, high) are passed for the next three parameters.
        'max_bin': trial.suggest_int('max_bin', 32, 450),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 150),
        'min_data_in_bin': trial.suggest_int('min_data_in_bin', 1, 150),
        'min_gain_to_split': trial.suggest_discrete_uniform('min_gain_to_split', 0.1, 5, 0.01),
        # Optuna names now match the LightGBM keys (were misspelt 'lamda_*'),
        # so study.best_trial.params can be reused as LightGBM parameters.
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'metric': trial.suggest_categorical('metric', ['RMSE']),
        'objective': trial.suggest_categorical('objective', objective_list_reg),
        # NOTE(review): no bagging_freq is set; per the LightGBM parameter docs
        # bagging is disabled in that case, so this value is sampled but
        # presumably has no effect. Confirm before relying on it.
        'bagging_fraction': trial.suggest_discrete_uniform('bagging_fraction', 0.4, 1, 0.01),
        'feature_fraction': trial.suggest_discrete_uniform('feature_fraction', 0.4, 1, 0.01),
    }
    earlyStop = 1000
    verboseEval = 100
    dTrain = lgb.Dataset(xTrainLGBM, label=yTrainLGBM)
    dValid = lgb.Dataset(xValidLGBM, label=yValidLGBM)
    # Order matters: the score keys 'training' and 'valid_1' read below refer
    # to the first and second entry of this list.
    watchlist = [dTrain, dValid]

    # Callback for pruning: reports the validation RMSE to Optuna.
    lgbmPruningCallback = optuna.integration.LightGBMPruningCallback(trial, 'rmse', valid_name='valid_1')

    model = lgb.train(
        params,
        train_set=dTrain,
        num_boost_round=numRounds,
        valid_sets=watchlist,
        verbose_eval=verboseEval,
        early_stopping_rounds=earlyStop,
        callbacks=[lgbmPruningCallback],
    )

    # Predict the validation fold with the best iteration found by early stopping.
    pred_val = model.predict(xValidLGBM, num_iteration=model.best_iteration)
    log = {
        'trainRMSE': model.best_score['training']['rmse'],
        'validRMSE': model.best_score['valid_1']['rmse'],
    }
    return model, pred_val, log

## Optimization & STUDY Object

In [None]:
# `objective` returns a cross-validated RMSE, so the study minimises it.
# The sampler is seeded so that the sequence of suggested hyperparameters is
# repeatable across runs; trials whose intermediate score is worse than the
# median of earlier trials are pruned after 5 warm-up steps.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=0),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
# Raise n_trials for a more thorough search, at the cost of a longer run.
study.optimize(objective, n_trials=200)

## OPTUNA Study History : Analysis & Visualization

In [None]:
# Report the score and hyperparameters of the best trial in the study.
print(
    'Best trial: score {}, params {}'.format(
        study.best_trial.value,
        study.best_trial.params,
    )
)

## optuna.visualization.plot_optimization_history(study) : This function plots optimization history of all trials in a study

In [None]:
# Plot the optimization history of all trials in `study`; as the last expression
# of the cell, the returned figure is rendered as the cell output.
optuna.visualization.plot_optimization_history(study)

## optuna.visualization.plot_slice(study, params=None) : This function plots the parameter relationship as slice plot in a study

In [None]:
# Slice plot of the parameter relationships in `study` (params=None is the
# default, so no parameter subset is selected here).
optuna.visualization.plot_slice(study)

## References

* https://www.kaggle.com/kst6690/dsb2019-tuning-lightgbm-parameter-using-optuna
* https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/#Pearson