# Data

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="darkgrid")

import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Training set: feature columns plus the `id` and `claim` columns used later on.
df_train = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/train.csv'
)
df_train.head()

In [None]:
# Test set to be scored at inference time.
df_test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv'
)
df_test.head()

In [None]:
# Sample submission; reused later as the template for every prediction file.
samp_sub = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
)
samp_sub.head()

In [None]:
# (rows, columns) of the train, test and sample-submission frames, in that order.
for frame in (df_train, df_test, samp_sub):
    print(frame.shape)

In [None]:
# Concise structural summary of the training frame.
df_train.info()

In [None]:
# Descriptive statistics of the training columns.
df_train.describe()

In [None]:
# Class balance of the `claim` target.
sns.countplot(x=df_train['claim'], palette='Set2')

# Missing values

In [None]:
# Number of missing cells in each training column.
null_counts = df_train.isna().sum()
print(null_counts)

In [None]:
# Fork of https://www.kaggle.com/mrigendraagrawal/tps-sep-eda-and-starter?scriptVersionId=73721669&cellId=21

def train_missing_perecentage(idx):
    """Convert missing-value counts into a percentage of the training rows.

    Parameters
    ----------
    idx : scalar, pandas.Series or pandas.DataFrame
        Missing-value count(s).

    Returns
    -------
    Same kind of object as ``idx``
        ``idx / len(df_train) * 100``, computed element-wise.
    """
    return (idx/len(df_train))*100

# One vectorised pass: per-column missing counts -> percentages, indexed by
# column name in the same order as `df_train.columns`. This replaces the
# per-column `.loc` lookup wrapped in `float(...)`, which relied on converting
# a one-element Series to a scalar (deprecated in recent pandas releases).
missing_pct = train_missing_perecentage(df_train.isna().sum())

features = list(df_train.columns)
percentage = missing_pct.tolist()

# Long-form table (one row per column) consumed by the scatter plot below.
missing_values = pd.DataFrame({'Feature':features,'Percentage':percentage})

In [None]:
import plotly.express as px

# Percentage of missing values per feature, one marker per column.
missing_fig = px.scatter(
    missing_values,
    x='Feature',
    y='Percentage',
    template='plotly_dark',
)
missing_fig

# Modeling

In [None]:
# Model inputs: every column except the row identifier and the target.
X = df_train.drop(columns=['id', 'claim'])
# Target column.
Y = df_train['claim']

In [None]:
import optuna

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Let's build our optimization function using optuna

Fork of https://www.kaggle.com/hamzaghanmi/lgbm-hyperparameter-tuning-using-optuna?scriptVersionId=53513594&cellId=14

In [None]:
def objective(trial,data=X,target=Y):
    """Optuna objective: fit one LightGBM candidate and score it on a hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : feature table, default ``X``
        Passed to ``train_test_split`` together with ``target``.
    target : labels, default ``Y``
        Target values aligned with ``data``.

    Returns
    -------
    float
        Root-mean-squared error between the hold-out labels and the model's
        predictions (lower is better).
    """
    # Fixed seed: every trial is evaluated on the same 80/20 split, so trial
    # scores are directly comparable.
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)
    param = {
        'random_state': 42,
        'n_estimators': 500,
        'task': 'train',
        'objective': 'binary',
        'metric':'binary_logloss',
        'reg_alpha': trial.suggest_categorical('reg_alpha', [1,10.0]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [1e-1,1e-2]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.4,0.6,0.8]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.6,0.8]),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1,2]),
        'learning_rate': trial.suggest_categorical('learning_rate', [5e-3,2e-2]),
        'max_depth': -1,
        'num_leaves' : trial.suggest_categorical('num_leaves', [128,512]),
        'min_child_weight' : trial.suggest_categorical('min_child_weight', [128,256]),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [20,100]),
        'importance_type': 'gain'
    }
    model = lgb.LGBMRegressor(**param)

    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword
    # arguments presumably tie this cell to LightGBM < 4.0 (newer releases
    # expect callbacks instead) - confirm against the installed version.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=500)

    preds = model.predict(test_x)

    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated and later removed from scikit-learn, while
    # sqrt(MSE) yields the same value on every version.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

In [None]:
# Seed the sampler so the search (and therefore the reported best parameters)
# can be reproduced on a fresh kernel; an unseeded sampler draws different
# trials on every run.
sampler = optuna.samplers.TPESampler(seed=42)

# `objective` returns an RMSE, hence direction='minimize'.
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=20)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Tabular view of the trials recorded in `study`.
study.trials_dataframe()

# Visualization for Hyperparameter Optimization Analysis

In [None]:
# Optuna's built-in optimization-history plot for `study`.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Optuna's built-in parallel-coordinate plot for `study`.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Optuna's built-in slice plot for `study`.
optuna.visualization.plot_slice(study)

In [None]:
# Visualize parameter importances for `study`.
optuna.visualization.plot_param_importances(study)

In [None]:
# Optuna's built-in empirical-distribution-function plot for `study`.
optuna.visualization.plot_edf(study)

# Training 
## (Let's train LightGBM models with the best hyperparameters)
Best hyperparameters: `'reg_alpha': 1, 'reg_lambda': 0.1, 'colsample_bytree': 0.8, 'subsample': 0.8, 'subsample_freq': 2, 'learning_rate': 0.02, 'num_leaves': 512, 'min_child_weight': 128, 'min_child_samples': 100`

In [None]:
# Parameters handed to `lgb.train` in the K-fold loop below.
params = {
        'random_state': 42,
        # Upper bound on boosting rounds (the Optuna objective used 500);
        # the training loop also applies early stopping.
        'n_estimators': 1000,
        'task': 'train',
        'objective': 'binary',
        'metric':'binary_logloss',
        # Hard-coded hyperparameter values; each one is a member of the
        # candidate list searched in `objective`.
        'reg_alpha': 1, 
        'reg_lambda': 0.1, 
        'colsample_bytree': 0.8, 
        'subsample': 0.8, 
        'subsample_freq': 2, 
        'learning_rate': 0.02, 
        'num_leaves': 512, 
        'min_child_weight': 128, 
        'min_child_samples': 100,
        'max_depth': -1,
        'importance_type': 'gain'
    }

In [None]:
# 5-fold cross-validated training. For every fold this trains a booster on the
# training part, scores it (ROC AUC) on the held-out part, and writes both the
# out-of-fold predictions and the fitted model to disk.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(folds.split(df_train)):
    print(f'fold {fold} starting...')
    fold_train = df_train.iloc[train_idx]
    train_x = fold_train.drop(['id', 'claim'], axis = 1)
    train_y = fold_train.claim
    dtrain = lgb.Dataset(train_x,label=train_y)

    fold_valid = df_train.iloc[valid_idx]
    valid_x = fold_valid.drop(['id', 'claim'], axis = 1)
    valid_y = fold_valid.claim
    # Pass the label by keyword and tie the validation set to the training
    # set so both share the same feature binning.
    dvalid = lgb.Dataset(valid_x,label=valid_y,reference=dtrain)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` as keyword
    # arguments presumably tie this cell to LightGBM < 4.0 (newer releases
    # expect callbacks instead) - confirm against the installed version.
    model = lgb.train(params,
                    train_set=dtrain,
                    valid_sets=[dvalid],
                    early_stopping_rounds=100,
                    verbose_eval=1000
                    )

    valid_pred = model.predict(valid_x)
    score = roc_auc_score(valid_y,valid_pred)
    print(f"Valid score for {fold} is: {score}")

    # Label the out-of-fold rows with the real `id` column. The DataFrame
    # index is only the row position in `df_train` and matches the ids by
    # coincidence at best.
    oof = pd.DataFrame({'id':fold_valid['id'].values,'claim':valid_pred})
    oof.to_csv(f'{fold}_oof.csv',index=False)
    model.save_model(f'lightgbm_{fold}.txt')
    print(f' fold {fold} completed')

# Inference

In [None]:
from tqdm import tqdm

In [None]:
# Remove the identifier so the test columns match the training features.
# errors='ignore' keeps this cell re-runnable: once `id` is gone, a second
# execution would otherwise raise KeyError.
df_test = df_test.drop(columns=['id'], errors='ignore')

In [None]:
# Score the test set with each saved fold model and write one submission file
# per fold, using the sample submission as the template.
for fold in tqdm(range(5)):
    model_path = f'./lightgbm_{fold}.txt'
    model = lgb.Booster(model_file=model_path)
    preds = model.predict(df_test)
    submission = samp_sub.assign(claim=preds)
    submission.to_csv(f'submission_{fold}.csv',index=False)

In [None]:
# Reload the five per-fold submission files written by the inference loop.
sub0, sub1, sub2, sub3, sub4 = (
    pd.read_csv(f'./submission_{fold_no}.csv') for fold_no in range(5)
)

In [None]:
import plotly.figure_factory as ff
import plotly.express as px

In [None]:
# Compare the predicted-`claim` distributions of the five fold models.
group_labels = ['sub0', 'sub1', 'sub2', 'sub3', 'sub4']
hist_data = [frame.claim for frame in (sub0, sub1, sub2, sub3, sub4)]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.2,
    show_hist=False,
    show_rug=False,
)
fig.show()

In [None]:
# Pairwise correlation between the five folds' predictions, shown as a heatmap.
claim_rows = [frame.claim for frame in (sub0, sub1, sub2, sub3, sub4)]
data = np.corrcoef(claim_rows)

fig = px.imshow(data, x=group_labels, y=group_labels)
fig.show()

In [None]:
# Equal-weight blend of the five fold submissions.
# Only the `claim` column is averaged. The previous form combined the whole
# frames (so `id` was averaged too) and assigned the resulting two-column
# DataFrame to a single column, which worked only through pandas' implicit
# column alignment.
sub = sub1.copy()
sub['claim'] = (
    0.2 * sub0['claim']
    + 0.2 * sub1['claim']
    + 0.2 * sub2['claim']
    + 0.2 * sub3['claim']
    + 0.2 * sub4['claim']
)
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview of the blended submission written to submission.csv.
sub.head()