# Data

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="darkgrid")

import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition training set and preview its first rows.
df_train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
df_train.head()

In [None]:
# Load the competition test set and preview its first rows.
df_test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
df_test.head()

In [None]:
# Load the sample submission file and preview its first rows.
samp_sub = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
samp_sub.head()

In [None]:
# Print (rows, columns) for the train, test and sample-submission frames, in that order.
for frame in (df_train, df_test, samp_sub):
    print(frame.shape)

In [None]:
# Print a per-column overview (dtype, non-null count) of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Bar chart of how many training rows carry each distinct target value.
sns.countplot(x=df_train.target,palette='Set2')

# Missing values

In [None]:
# Number of missing values in each column of the training frame.
print(df_train.isnull().sum())

In [None]:
def train_missing_perecentage(idx):
    """Convert missing-value counts into a percentage of the training rows.

    ``idx`` is divided by ``len(df_train)`` (read at call time) and multiplied
    by 100, so a scalar, Series or DataFrame of counts all work and keep their
    shape. The (misspelled) name is kept so existing callers continue to work.
    """
    return (idx / len(df_train)) * 100


# Per-column share of missing values, computed in one vectorised pass.
# This avoids the label-by-label ``float(frame.loc[name])`` lookup, which relies
# on converting a one-element Series to float (deprecated in recent pandas) and
# breaks on non-string column labels.
features = list(df_train.columns)
percentage = train_missing_perecentage(df_train.isna().sum()).tolist()

# Tidy two-column table (one row per feature) used by the plot that follows.
missing_values = pd.DataFrame({'Feature': features, 'Percentage': percentage})

In [None]:
import plotly.express as px
# Interactive scatter plot of the missing-value percentage of each feature.
px.scatter(data_frame=missing_values,x='Feature',y='Percentage',template='plotly_dark')

# Modeling

In [None]:
# Feature matrix: every training column except the row id and the label.
X = df_train.drop(columns=['id', 'target'])
# Label vector aligned with X.
Y = df_train['target']

In [None]:
import optuna

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

## Let's build our optimization function using Optuna

In [None]:
def objective(trial,data=X,target=Y):
    """Optuna objective: fit LightGBM on an 80/20 hold-out split and return the hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tunable hyper-parameters.
    data, target
        Features and labels to split. They default to the notebook-level
        ``X`` / ``Y`` as they exist when this cell is executed, so re-run the
        cell after rebuilding them.

    Returns
    -------
    float
        Root-mean-squared error of the model's predictions on the 20% hold-out
        split (lower is better).

    Notes
    -----
    NOTE(review): the model is an ``LGBMRegressor`` trained with
    ``objective='binary'`` and scored with RMSE; if a classifier and/or AUC was
    intended, change it together with the study direction -- confirm.
    """
    # Fixed seed: every trial is evaluated on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)

    param = {
        # Fixed settings shared by all trials.
        'random_state': 42,
        'n_estimators': 500,
        'task': 'train',
        'objective': 'binary',
        'metric':'binary_logloss',
        'learning_rate': 5e-3,
        'max_depth': -1,
        'importance_type': 'gain',
        # Settings searched by Optuna (each drawn from a small discrete set).
        'reg_alpha': trial.suggest_categorical('reg_alpha', [1,10.0]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [1e-1,1e-2]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.4,0.6,0.8]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.6,0.8]),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1,2]),
        'num_leaves' : trial.suggest_categorical('num_leaves', [128,512]),
        'min_child_weight' : trial.suggest_categorical('min_child_weight', [128,256]),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [20,100]),
    }
    model = lgb.LGBMRegressor(**param)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keywords of fit() were removed in
    # LightGBM 4. Requires LightGBM >= 3.3 for `log_evaluation`.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=250),
        ],
    )

    preds = model.predict(test_x)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # keyword is deprecated/removed in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

In [None]:
# Seed the sampler (TPE is Optuna's default) so the sequence of suggested
# hyper-parameters is the same on every Restart & Run All.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 10 trials of the objective; the study keeps the trial with the lowest value.
study.optimize(objective, n_trials=10)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Tabular view of the study: one row per trial.
study.trials_dataframe()

# Visualization for Hyperparameter Optimization Analysis

In [None]:
# Objective value of each trial, together with the best value found so far.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view relating sampled parameter values to the objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# One panel per hyper-parameter: sampled value against objective value.
optuna.visualization.plot_slice(study)

In [None]:
# Visualize how much each hyper-parameter contributes to the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Empirical distribution function of the objective values reached in the study.
optuna.visualization.plot_edf(study)

## Let's create an LGBM model with the best hyperparameters