## Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

## Read the data files

In [None]:
# Load the pre-folded training set, indexed by `id`.
# Later cells read its `claim` (target) and `fold` (CV fold number) columns.
train = pd.read_csv('../input/tpssep2021dataset10folds/train_10_folds.csv', index_col='id')
print(train.shape)
train.head()

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

In [None]:
# Load the competition test set, indexed by `id`.
test = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/test.csv', index_col='id')
print(test.shape)
test.head()

In [None]:
# Summary statistics for the columns of the test frame.
test.describe()

In [None]:
# Load the sample submission; later cells overwrite its `claim` column with predictions.
# Note: `id` is kept as a regular column here (no index_col), unlike train/test.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv')
submission.head()

## Introducing Additional Features

In [None]:
# Adding the number of missing values in a row as a feature increases the score significantly
# The count is taken across every column currently in each frame; for `train`
# that includes `claim` and `fold` as well as the features.
train["missing_value_cnt"] = train.isnull().sum(axis=1)
test["missing_value_cnt"] = test.isnull().sum(axis=1)

train.head()

## Imputation for Handling Missing Values

In [None]:
def imputation(X_train, X_valid, X_test = None):
    """Mean-impute missing values using statistics learned from ``X_train`` only.

    The imputer is fitted on ``X_train`` and then applied unchanged to
    ``X_valid`` and (if given) ``X_test``, so no information from the
    validation/test rows leaks into the fill values.

    Args:
        X_train: DataFrame used to fit the imputer.
        X_valid: DataFrame transformed with the fitted imputer.
        X_test: Optional DataFrame transformed with the fitted imputer.

    Returns:
        Tuple ``(imputed_X_train, imputed_X_valid, imputed_X_test)`` of
        DataFrames that keep the column names *and row index* of their
        inputs. ``imputed_X_test`` is ``None`` when ``X_test`` is ``None``.
    """
    imputer = SimpleImputer(strategy='mean')

    def _as_frame(values, like):
        # The imputer returns a bare array: put back the column names and the
        # row index so the result still lines up with the target Series.
        return pd.DataFrame(values, index=like.index, columns=like.columns)

    imputed_X_train = _as_frame(imputer.fit_transform(X_train), X_train)
    imputed_X_valid = _as_frame(imputer.transform(X_valid), X_valid)

    imputed_X_test = None
    if X_test is not None:
        imputed_X_test = _as_frame(imputer.transform(X_test), X_test)

    return imputed_X_train, imputed_X_valid, imputed_X_test


## Feature Scaling

In [None]:
def feature_scaling(X_train, X_valid, X_test = None):
    """Standardize features using a scaler fitted on ``X_train`` only.

    The scaler is fitted on ``X_train`` and then applied unchanged to
    ``X_valid`` and (if given) ``X_test``.

    Args:
        X_train: DataFrame used to fit the scaler.
        X_valid: DataFrame transformed with the fitted scaler.
        X_test: Optional DataFrame transformed with the fitted scaler.

    Returns:
        Tuple ``(scaled_X_train, scaled_X_valid, scaled_X_test)`` of
        DataFrames that keep the column names *and row index* of their
        inputs. ``scaled_X_test`` is ``None`` when ``X_test`` is ``None``.
    """
    scaler = StandardScaler()

    def _as_frame(values, like):
        # The scaler returns a bare array: put back the column names and the
        # row index so the result still lines up with the target Series.
        return pd.DataFrame(values, index=like.index, columns=like.columns)

    scaled_X_train = _as_frame(scaler.fit_transform(X_train), X_train)
    scaled_X_valid = _as_frame(scaler.transform(X_valid), X_valid)

    scaled_X_test = None
    if X_test is not None:
        scaled_X_test = _as_frame(scaler.transform(X_test), X_test)

    return scaled_X_train, scaled_X_valid, scaled_X_test

## Choose Between LGBMClassifier and LGBMRegressor

In [None]:
# y = train.claim
# X = train.drop(columns = ['claim'])
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, stratify = y, random_state = 1234)

# # Perform imputation
# imputed_X_train, imputed_X_valid, _ = imputation(X_train, X_valid)

# # Perform Feature Scaling
# scaled_X_train, scaled_X_valid, _ = feature_scaling(imputed_X_train, imputed_X_valid)  

In [None]:
# cls_model = LGBMClassifier(device='gpu', random_state = 1234)
# cls_model.fit(scaled_X_train, y_train)
# valid_predictions = cls_model.predict_proba(scaled_X_valid)
# print("Roc AUC score for Classifier : ", roc_auc_score(y_valid, valid_predictions[:, 1]))

In [None]:
# reg_model = LGBMRegressor(device='gpu', random_state = 1234)
# reg_model.fit(scaled_X_train, y_train)
# valid_predictions = reg_model.predict(scaled_X_valid)
# print("Roc AUC score for Regressor : ",roc_auc_score(y_valid, valid_predictions))

## Hyperparameter Tuning Using Optuna

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of an LGBMClassifier for one trial.

    Samples a hyperparameter set from ``trial``, fits the classifier on a
    stratified 90% split of the global ``train`` frame (after mean imputation
    and standard scaling fitted on that split) and scores it on the other 10%.

    Every ``suggest_*`` name below is the exact ``LGBMClassifier`` keyword it
    feeds. Optuna records parameters under the suggested name, and later cells
    build the final models with ``LGBMClassifier(**study.best_params)``;
    suggesting under different names (e.g. ``gamma`` / ``alpha``) would mean
    the tuned ``min_split_gain`` / ``reg_alpha`` are not applied there.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate AUC values to the pruner.

    Returns:
        ROC AUC of the fitted model on the held-out validation split.
    """
    y = train.claim
    X = train.drop(columns = ['claim', 'fold'])

    params = {
        'objective': 'binary',
        # Single-choice categoricals are used so the fixed values still end up
        # in study.best_params.
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "num_leaves": trial.suggest_int("num_leaves", 8, 1024, step = 8),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200,10000, step = 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.5, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 15.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 100.0, step=0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 100.0, step=0.1),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "subsample_freq": trial.suggest_categorical("subsample_freq", [1]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0)
    }

    # Fixed random_state: every trial is scored on the same split.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.9, stratify = y, random_state = 1234)

    # Perform imputation
    X_train, X_valid, _ = imputation(X_train, X_valid)

    # Perform Feature Scaling
    X_train, X_valid, _ = feature_scaling(X_train, X_valid)

    model = LGBMClassifier(**params,
                           random_state = 1234)

    # Reports the validation AUC of each boosting round to Optuna so the
    # study's pruner can stop unpromising trials early.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")

    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are
    # tied to the LightGBM version this notebook was written for; newer
    # releases expect callbacks instead. Confirm before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="auc",
        early_stopping_rounds=15,
        verbose=False,
        callbacks=[
            pruning_callback
        ])

    y_pred = model.predict_proba(X_valid)
    roc_auc = roc_auc_score(y_valid, y_pred[:, 1])

    return roc_auc

In [None]:
# Median pruner with a 15-step warm-up, then a study that maximizes the value
# returned by `objective` (validation ROC AUC).
# The search stops after 500 trials or 7200 seconds, whichever comes first.
# No sampler seed is set, so results may differ between runs.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=15)
study = optuna.create_study(pruner= pruner, study_name="lgbmc-study", direction="maximize")
study.optimize(objective, n_trials=500, timeout=7200)

In [None]:
# Report how many trials were run and the score/parameters of the best one.
print("Number of finished trials: ", len(study.trials))
trial = study.best_trial
print("Best trial validation score: {}".format(trial.value))

print("The best parameters are: ")
# Last expression of the cell: the notebook displays the parameter dict.
study.best_params

In [None]:
# best_params = {'n_estimators': 10000,
#  'num_leaves': 1200,
#  'max_depth': 7,
#  'min_data_in_leaf': 8200,
#  'learning_rate': 0.027488306224509,
#  'gamma': 2.7677862282755576,
#  'lambda': 10.404083471231429,
#  'alpha': 32.57511055307707,
#  'subsample': 0.3748489217712975,
#  'subsample_freq': 1,
#  'colsample_bytree': 0.7259693776119154}

# print(best_params)

## Training Model with 10-Fold Cross-Validation

In [None]:
# 10-fold training loop: one LGBMClassifier per fold, built from the tuned
# parameters in `study.best_params`. Collects out-of-fold predictions for the
# training rows and one set of test predictions per fold.
all_test_predictions = []  # one array of test-set probabilities per fold
# Out-of-fold predictions, indexed like `train`; filled in fold by fold.
valid_predictions = pd.DataFrame(np.zeros(train.index.shape), index = train.index, columns=['LGBM_preds'])
# print(valid_predictions.shape)
auc_scores = []  # validation ROC AUC of each fold

for fold in range(10):
    # Rows whose `fold` value equals the current fold are held out for validation.
    X_train =  train[train.fold != fold]
    X_valid = train[train.fold == fold]
    X_test = test.copy()
    
    # Remember the validation ids now: imputation/scaling below return frames
    # whose index is not guaranteed to be the original ids.
    valid_ids = X_valid.index.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim
    
    X_train = X_train.drop(columns=['claim', 'fold'])
    X_valid = X_valid.drop(columns=['claim', 'fold'])
    
    # Perform imputation
    X_train, X_valid, X_test = imputation(X_train, X_valid, X_test)

    # Perform Feature Scaling
    X_train, X_valid, X_test = feature_scaling(X_train, X_valid, X_test) 
    
    model = LGBMClassifier(**study.best_params,
                           objective = 'binary',
                           random_state = 1234)

    # Early stopping monitors AUC on the held-out fold (100 rounds of patience).
    model.fit(X_train, 
              y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric="auc",
              verbose=200,
              early_stopping_rounds=100)
    
    # Keep only the second probability column (index 1) from predict_proba.
    valid_preds = model.predict_proba(X_valid)[:,1]
    test_preds = model.predict_proba(X_test)[:,1]
    all_test_predictions.append(test_preds)
    valid_predictions.loc[valid_ids, 'LGBM_preds'] = valid_preds
    
    roc_auc = roc_auc_score(y_valid, valid_preds)
    print("Validation score for fold {}: {}".format(fold, roc_auc))
    auc_scores.append(roc_auc)

print("Validation scores mean : {} and Standard deviation : {}".format(np.mean(auc_scores), np.std(auc_scores)))

In [None]:
# Move the index back into a column, label the two columns explicitly and
# write the out-of-fold predictions to disk.
valid_predictions = valid_predictions.reset_index().set_axis(["id", "LGBM_preds"], axis=1)
valid_predictions.to_csv("LGBM_train_predictions.csv", index=False)

In [None]:
# Quick check of the saved out-of-fold prediction frame.
print(valid_predictions.shape)
valid_predictions.head()

In [None]:
# Average the per-fold test predictions element-wise, store them in a copy of
# the sample submission, relabel the columns and write the result to disk.
test_predictions = submission.copy()
test_predictions.claim = np.array(all_test_predictions).mean(axis=0)
test_predictions = test_predictions.set_axis(["id", "LGBM_preds"], axis=1)
test_predictions.to_csv("LGBM_test_predictions.csv", index=False)

In [None]:
# Quick check of the saved fold-averaged test prediction frame.
print(test_predictions.shape)
test_predictions.head()

## Training Model with Whole Training Data

In [None]:
# Prepare the complete training set (no hold-out) and the test set.
X_train = train.copy()
X_test = test.copy()

y_train = train.claim
X_train = X_train.drop(columns = ['claim', 'fold'])

# The test frame is passed in the helpers' `X_valid` slot: it is transformed
# with statistics fitted on the training data, and the unused third return
# value (None) is discarded.
# Perform imputation
X_train, X_test, _ = imputation(X_train, X_test)

# Perform Feature Scaling
X_train, X_test, _ = feature_scaling(X_train, X_test) 

In [None]:
# Fit one model on the whole training set with the tuned parameters.
# NOTE(review): unlike the fold models, no eval_set / early stopping is passed
# here, and `n_estimators` comes from study.best_params (the objective only
# ever suggests 10000) - confirm that training every round is intended.
model = LGBMClassifier(**study.best_params,
                       objective = 'binary',
                       random_state = 1234)

model.fit(X_train, y_train, verbose=50)
# Keep only the second probability column (index 1) from predict_proba.
test_preds = model.predict_proba(X_test)[:,1]

In [None]:
# ROC AUC on the same rows the model was fitted on (not a held-out estimate).
print(roc_auc_score(y_train, model.predict_proba(X_train)[:,1]))

## Submission

In [None]:
# Write the submission from `test_preds` as last assigned. When the cells run
# in order, that is the full-data model's output, not the fold average.
submission['claim'] = test_preds
submission.to_csv('lgbm_output.csv', index = False)

In [None]:
# Display the first ten rows of the submission frame.
submission.head(10)