# **1. DATASET PREPARATION**

## Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import time
import optuna
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Notebook display settings: no limit on the number of rows/columns shown
# when a DataFrame is rendered, and floats printed in fixed-point notation.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('float_format', '{:f}'.format),
):
    pd.set_option(option_name, option_value)

## Reading the dataset files

In [None]:
# Load the competition files: training data, test data and the sample
# submission template (in that order).
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

# **2. DATASET OVERVIEW**

## Train Dataset

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

### Printing the information about actual and missing values in Training Data

In [None]:
# Report the size of the training set and the total count of missing cells.
print(
    f'Number of Rows in Training Dataset: {train.shape[0]}  \n'
    f'Number of Columns in Training Dataset: {train.shape[1]} \n'
    f'Total Number of missing values in Training Dataset: {train.isna().sum().sum()}'
)

### Printing the basic statistics for each variable in Training Dataset which contain information on count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
train.describe(percentiles=[0.25, 0.5, 0.75])

## Test Dataset

In [None]:
# Preview the first five rows of the test data.
test.head(n=5)

### Printing the information about actual and missing values in Testing Data

In [None]:
# Report the size of the test set and the total count of missing cells.
# The missing-value total must be computed from `test`; taking it from
# `train` would print the training set's count under the "Testing" label.
print(
    f'Number of Rows in Testing Dataset: {test.shape[0]}  \n'
    f'Number of Columns in Testing Dataset: {test.shape[1]} \n'
    f'Total Number of missing values in Testing Dataset: {test.isna().sum().sum()}'
)

### Printing the basic statistics for each variable in Testing Dataset which contain information on count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
test.describe(percentiles=[0.25, 0.5, 0.75])

# **3. EXPLORATORY DATA ANALYSIS**

### Correlation Plot of the Dataset

### We observe that the majority of the correlation values are close to 0, which indicates that no pair of features is strongly linearly correlated.

In [None]:
# Pairwise correlation between the columns of the training data.
corr = train.corr()
# Render as a colour-coded table with two decimals. `Styler.format` is used
# instead of `Styler.set_precision`, which was deprecated in pandas 1.3 and
# removed in pandas 2.0; a plain format string works on old and new versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

### Visualizing the distribution of data into Training and Testing Set

In [None]:
# Pie chart comparing the number of rows in the train and test sets.
fig, ax = plt.subplots(figsize=(5, 5))
fig.set_facecolor('white')
ax.set_title("Dataset length comparison\n", fontsize=18)

pie = ax.pie(
    x=[len(train), len(test)],
    labels=["Train dataset", "Test dataset"],
    colors=["orange", "lightblue"],
    autopct='%1.1f%%',
    textprops={"fontsize": 15},
)
# Equal axis scaling keeps the pie circular.
ax.axis("equal")
plt.show()

### Visualizing the distribution of Claim (Target) in the Training Set

### Here we can observe that the target class is well balanced. This helps us to proceed with applying suitable techniques on the data during the data modeling phase. 

In [None]:
# Bar chart of how many training rows fall into each target class.
sns.set(font_scale=1.4)
train['claim'].value_counts().plot.bar(figsize=(7, 6), rot=0)
plt.title("Claim Value Distribution", y=1.02)
plt.xlabel("Claim (target) value", labelpad=14)
# Trailing semicolon keeps the notebook from echoing the returned Text object.
plt.ylabel("Values", labelpad=14);

#### Analysing the features and their corresponding missing values in the Training Dataset

In [None]:
# Per-feature count of missing values in the training data ('id' and the
# target 'claim' are excluded), as a two-column frame: feature, count.
missing_train_df = (
    train.isna()
    .sum()
    .drop(['id', 'claim'])
    .rename_axis('feature')
    .reset_index(name='count')
)
# Share of training rows in which each feature is missing, in percent.
missing_train_df['percentage'] = missing_train_df['count'].div(train.shape[0]).mul(100)
missing_train_df.head()

#### Missing feature values distribution in the Train dataset

In [None]:
# Bar chart: number of missing values per feature in the training data.
fig, ax = plt.subplots(figsize=(20, 10))

bars = ax.bar(
    x=missing_train_df['feature'],
    height=missing_train_df['count'],
    width=0.5,
    color="lightskyblue",
    edgecolor="black",
)

# Titles and axis labels.
ax.set_title("Missing feature values distribution in the train dataset", fontsize=20, pad=15)
ax.set_xlabel("Feature", fontsize=14, labelpad=15)
ax.set_ylabel("Missing values", fontsize=14, labelpad=15)

# Axis cosmetics: vertical feature names, tight x margin, horizontal grid lines.
ax.tick_params(axis="x", rotation=90, labelsize=8)
ax.margins(x=0.005, y=0.12)
ax.grid(axis="y")

plt.show()

#### Analysing the features and their corresponding missing values in the Testing Dataset

In [None]:
# Per-feature count of missing values in the test data ('id' is excluded),
# as a two-column frame: feature, count.
missing_test_df = pd.DataFrame(test.isna().sum())
missing_test_df = missing_test_df.drop(['id']).reset_index()
missing_test_df.columns = ['feature', 'count']
# Percentage of *test* rows with the feature missing: the denominator has to
# be the number of test rows, not the number of training rows.
missing_test_df['percentage'] = (missing_test_df['count'] / test.shape[0]) * 100
missing_test_df.head()

#### Missing feature values distribution in the Test dataset

In [None]:
# Bar chart: number of missing values per feature in the test data.
fig, ax = plt.subplots(figsize=(20, 10))

bars = ax.bar(
    x=missing_test_df['feature'],
    height=missing_test_df['count'],
    width=0.5,
    color="lightskyblue",
    edgecolor="black",
)

# Titles and axis labels.
ax.set_title("Missing feature values distribution in the test dataset", fontsize=20, pad=15)
ax.set_xlabel("Feature", fontsize=14, labelpad=15)
ax.set_ylabel("Missing values", fontsize=14, labelpad=15)

# Axis cosmetics: vertical feature names, tight x margin, horizontal grid lines.
ax.tick_params(axis="x", rotation=90, labelsize=8)
ax.margins(x=0.005, y=0.12)
ax.grid(axis="y")

plt.show()

# **4. DATASET PREPROCESSING**

#### Creating the list of feature columns by excluding the id and claim columns

In [None]:
# Names of the model input columns: everything except the row identifier
# and the target. Kept as a plain list because it is extended later.
features = [
    column
    for column in train.columns
    if column not in {'id', 'claim'}
]

#### Calculating, for each row, the number of missing values and the standard deviation across its features. These two values are added as new columns to the original datasets so that they can be used in the modeling phase

In [None]:
# Row-level summary features, computed over the original feature columns.
#   num_missing: how many feature values are missing in the row.
#   std_dev:     standard deviation of the row's feature values (NaNs skipped).
# The standard deviation is taken over the values themselves; taking it over
# the boolean `isna()` mask would only yield a function of `num_missing`
# and add no new information.
train['num_missing'] = train[features].isna().sum(axis=1)
train['std_dev'] = train[features].std(axis=1)

test['num_missing'] = test[features].isna().sum(axis=1)
test['std_dev'] = test[features].std(axis=1)

# Register the two engineered columns as model inputs.
features += ['num_missing', 'std_dev']

#### Imputing the null values with the mean of the respective column

In [None]:
# Fill missing values with per-column means estimated on the training data.
# The same training means are applied to the test set so that both sets go
# through an identical transformation, consistent with the scaler, which is
# fit on the training data only.
feature_means = train[features].mean()
train[features] = train[features].fillna(feature_means)
test[features] = test[features].fillna(feature_means)

#### Scaling the training and testing features

In [None]:
# Standardise the features: the scaler's statistics are learned from the
# training data and then applied to both the training and the test set.
scaler = StandardScaler().fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
# (rows, columns) of the training set followed by those of the test set.
tuple(frame.shape for frame in (train, test))

In [None]:
# Target vector, and the model input matrices for train and test
# (identifier and target columns removed).
y = train["claim"]
X = train.drop(columns=["id", "claim"])
X_test = test.drop(columns="id")

# **5. HYPERPARAMETER OPTIMIZATION USING OPTUNA**

#### Partial code for Hyperparameter Optimization using Optuna. This can further be extended to find the optimal hyperparameter value for training the Light GBM model.

In [None]:
def train_model_optuna(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit one LightGBM classifier and score it on the hold-out set.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.
    X_train, y_train
        Features and labels the model is fitted on.
    X_valid, y_valid
        Hold-out features and labels; used as the early-stopping evaluation
        set and for the returned score.

    Returns
    -------
    float
        ROC AUC of column 1 of ``predict_proba(X_valid)`` against ``y_valid``.
    """
    # Search space. 'objective' and 'n_estimators' have a single candidate,
    # so only the learning rate actually varies between trials.
    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ['binary']),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.001, 0.005]),
        "n_estimators": trial.suggest_categorical("n_estimators", [20000])
    }

    model = LGBMClassifier(**lgbm_params, device='gpu')
    # NOTE(review): the `early_stopping_rounds` and `verbose` fit arguments
    # were removed in LightGBM 4.0 in favour of callbacks - confirm the
    # installed version before upgrading.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              early_stopping_rounds=100,
              verbose=False
             )

    print(f"Number of boosting rounds: {model.best_iteration_}")
    oof = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, oof)

In [None]:
# Hold out 20% of the rows, stratified on the target, for the Optuna objective.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)

# Upper bound on the search duration: four hours, expressed in seconds.
time_limit = 4 * 60 * 60

# Maximise the validation score returned by the objective.
study = optuna.create_study(direction='maximize')
study.optimize(
    func=lambda trial: train_model_optuna(trial, X_train, X_valid, y_train, y_valid),
    n_trials=1,
    timeout=time_limit,
)

print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

# **6. DATA MODELING**

## **LIGHTGBM CLASSIFIER WITH 5 FOLDS CV**

In [None]:
# Fixed LightGBM configuration used for the 5-fold cross-validation run.
lgb_params = {
    # Task and boosting schedule.
    "objective": "binary",
    "n_estimators": 20000,
    "random_state": 2021,
    "learning_rate": 0.005,
    # Row / column subsampling.
    "subsample": 0.6,
    "subsample_freq": 1,
    "colsample_bytree": 0.4,
    # Regularisation and leaf constraints.
    "reg_alpha": 10.0,
    "reg_lambda": 0.1,
    "min_child_weight": 256,
    "min_child_samples": 20,
    # Reporting and hardware.
    "importance_type": "gain",
    "device": "gpu",
}

In [None]:
# Out-of-fold predictions for every training row, and test predictions
# averaged over the folds.
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])
# Per-fold feature-importance frames; concatenated once after the loop
# (`DataFrame.append` was removed in pandas 2.0).
fold_importances = []

kf = KFold(n_splits=5, shuffle=True, random_state=2021)
# Number of folds, taken from the splitter so the test-prediction average
# stays correct if `n_splits` is changed.
n_folds = kf.get_n_splits()

# The test matrix is identical for every fold, so it is selected once.
X_test = test[features]

for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
    print(f"****** Fold {fold} ******")
    X_train = train[features].iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = train[features].iloc[val_idx]
    y_valid = y.iloc[val_idx]

    start = time.time()
    model = LGBMClassifier(**lgb_params)
    # NOTE(review): the `early_stopping_rounds` and `verbose` fit arguments
    # were removed in LightGBM 4.0 in favour of callbacks - confirm the
    # installed version before upgrading.
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              early_stopping_rounds=200,
              verbose=1000
    )

    # Feature importances of this fold's model, tagged with fold and seed.
    fi_temp = pd.DataFrame()
    fi_temp['feature'] = model.feature_name_
    fi_temp['importance'] = model.feature_importances_
    fi_temp['fold'] = fold
    fi_temp['seed'] = 2021
    fold_importances.append(fi_temp)

    # Last predict_proba column = probability of the last class.
    lgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    lgb_pred += model.predict_proba(X_test)[:, -1] / n_folds

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, lgb_oof[val_idx])
    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Same result as repeatedly appending each fold's frame to an empty frame.
lgb_importances = pd.concat(fold_importances)

In [None]:
# ROC AUC of the out-of-fold predictions over the whole training set.
print("oof lgb roc = {}".format(roc_auc_score(y, lgb_oof)))

In [None]:
# Build the submission table (test ids with the fold-averaged predictions)
# and write it to disk without the index column.
output = pd.DataFrame(
    {
        'id': test['id'],
        'claim': lgb_pred,
    }
)
output.to_csv(path_or_buf='submission_lgbm_hyper.csv', index=False)