In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score, plot_roc_curve

## Train set summary

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
df_train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
df_train.head()

In [None]:
# Report the (rows, columns) shape of the training frame.
print(f'Train set shape:   {df_train.shape}')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
df_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
df_train.describe()

In [None]:
# Number of missing values per training column, smallest count first.
df_train.isnull().sum().sort_values()

**'id' and 'claim' columns have no missing values.**

In [None]:
# Percentage of missing values in each column of the training frame.
percentage_of_nulls = df_train.isnull().mean() * 100
# Sort once and reuse the result for both the bar lengths and the bar labels.
nulls_ascending = percentage_of_nulls.sort_values()

plt.figure(figsize = (5, 30))
plots = sns.barplot(x = nulls_ascending, y = nulls_ascending.index)

# Write each bar's value (its width) just beyond the end of the bar,
# slightly below the bar's vertical midpoint.
for bar in plots.patches:
    bar_length = bar.get_width()
    label_y = bar.get_y() + 0.55 * bar.get_height()
    plt.text(
        0.3 + bar_length,
        label_y,
        '{:1.3f}'.format(bar_length),
        ha = 'center',
        va = 'center',
    )

plt.xlabel('% of nulls')
plt.ylabel('Feature')
plt.grid()

## Test set summary

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
df_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
df_test.head()

In [None]:
# Report the (rows, columns) shape of the test frame.
print(f'Test set shape:   {df_test.shape}')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the test frame.
df_test.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test columns.
df_test.describe()

In [None]:
# Number of missing values per test column, smallest count first.
df_test.isnull().sum().sort_values()

**'id' column has no missing values.**

In [None]:
# Percentage of missing values in each column of the test frame.
percentage_of_nulls = df_test.isnull().mean() * 100
# Sort once and reuse the result for both the bar lengths and the bar labels.
nulls_ascending = percentage_of_nulls.sort_values()

plt.figure(figsize = (5, 30))
plots = sns.barplot(x = nulls_ascending, y = nulls_ascending.index)

# Write each bar's value (its width) just beyond the end of the bar,
# slightly below the bar's vertical midpoint.
for bar in plots.patches:
    bar_length = bar.get_width()
    label_y = bar.get_y() + 0.55 * bar.get_height()
    plt.text(
        0.3 + bar_length,
        label_y,
        '{:1.3f}'.format(bar_length),
        ha = 'center',
        va = 'center',
    )

plt.xlabel('% of nulls')
plt.ylabel('Feature')
plt.grid()

## Target summary

In [None]:
# Count plot of the `claim` column: one bar per distinct target value in the training frame.
plt.figure(figsize = (5,5))
sns.countplot(x = df_train['claim'])
plt.grid()

**Classes are balanced.**

## Missing values

In [None]:
# Names of the raw input columns, captured before any engineered column is
# appended: everything except the row id (and, for train, the `claim` target).
train_raw_cols = df_train.columns.drop(['id', 'claim'])
test_raw_cols = df_test.columns.drop('id')

# Number of missing cells in each row, counted over the raw columns only.
df_train['n_nulls'] = df_train[train_raw_cols].isnull().sum(axis = 1)
df_test['n_nulls'] = df_test[test_raw_cols].isnull().sum(axis = 1)

# Row-wise mean of the raw columns.
df_train['mean'] = df_train[train_raw_cols].mean(axis = 1)
df_test['mean'] = df_test[test_raw_cols].mean(axis = 1)

# Row-wise standard deviation of the raw columns.
df_train['std'] = df_train[train_raw_cols].std(axis = 1)
df_test['std'] = df_test[test_raw_cols].std(axis = 1)

In [None]:
# Fill every missing cell with the mean of its own column, separately per frame.
# `DataFrame.mean()` is used rather than `np.mean(frame)`: NumPy forwards
# `axis=None`, which pandas >= 2.0 interprets as "reduce over both axes", so
# `np.mean(frame)` yields one scalar and every gap would get the same value.
# NOTE(review): the test frame is imputed with its own column means, not the
# training means — confirm this is intended.
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

In [None]:
# Preview the training frame after the engineered columns and imputation.
df_train.head()

In [None]:
# Preview the test frame after the engineered columns and imputation.
df_test.head()

## Standard Scaler

In [None]:
# Model inputs: every column whose name contains 'f', plus the three engineered
# row statistics.
features = [i for i in df_train.columns if 'f' in i] + ['n_nulls', 'mean', 'std']

# The scaler is fitted on the training frame only; the test frame is transformed
# with the training statistics.
scaler = StandardScaler()

# Assign the scaled arrays positionally. Wrapping them in a new DataFrame would
# make the assignment index-aligned against a fresh RangeIndex, which silently
# produces NaN whenever the target frame's index is not the default one.
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

In [None]:
# Preview the training frame after scaling the feature columns.
df_train.head()

In [None]:
# Preview the test frame after scaling the feature columns.
df_test.head()

## LGBM

In [None]:
# Design matrix (the scaled feature columns) and the `claim` target used for training.
X = df_train[features]
y = df_train['claim']

In [None]:
# Parameters improved by Optuna
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging) only takes
# effect when `subsample_freq` is set to a non-zero value; none is set here —
# confirm whether row subsampling is meant to be active.

params = {'n_estimators': 1000, 'subsample': 0.7, 'colsample_bytree': 0.7,
         'max_depth': 5, 'reg_alpha': 20, 'reg_lambda': 20, 'learning_rate': 0.0632598738459901}

# `device = 'gpu'` asks LightGBM to train on a GPU; presumably this needs a
# GPU-enabled LightGBM build and an available device — verify in the target environment.
model = lgb.LGBMClassifier(**params, device = 'gpu')

In [None]:
def check_model(model, n_splits = 10, random_state = 42):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and print ROC AUC.

    The rows are split with a shuffled ``KFold``. For every fold the model is
    refitted on the training part, with AUC-based early stopping (patience of
    50 rounds) monitored on the held-out part, and is then scored on that same
    held-out part. Because the held-out fold also drives early stopping, the
    printed scores are likely to be slightly optimistic.

    Parameters
    ----------
    model : estimator
        Object whose ``fit`` accepts ``eval_set``, ``eval_metric`` and
        ``callbacks`` and which provides ``predict_proba`` (for example
        ``lgb.LGBMClassifier``). It is refitted in place on every fold, so
        after the call it holds the fit from the last fold only.
    n_splits : int, default 10
        Number of folds passed to ``KFold``.
    random_state : int or None, default 42
        Seed for the fold shuffling, so repeated runs evaluate the same
        splits. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    None
        The mean and standard deviation of the per-fold scores are printed.
    """
    cv = KFold(n_splits, shuffle = True, random_state = random_state)
    scores = []

    for train_idx, test_idx in cv.split(X):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Early stopping goes through the callback API: the
        # `early_stopping_rounds` keyword of `fit` was removed in LightGBM 4.0.
        model.fit(X_train, y_train, eval_set = [(X_test, y_test)], eval_metric = 'auc',
                  callbacks = [lgb.early_stopping(stopping_rounds = 50)])

        # The last column of predict_proba is the probability of the positive class.
        preds = model.predict_proba(X_test)[:, -1]
        scores.append(roc_auc_score(y_test, preds))

    print('************************************')    
    print(f"Mean AUCROC:       {np.mean(scores)}")
    print(f"Std AUCROC:        {np.std(scores)}")

In [None]:
%%time

# Run the cross-validation; this refits `model` once per fold and leaves it
# fitted on the last fold's training split.
check_model(model = model)

In [None]:
# ROC curve of the currently fitted model over the full training data (X, y),
# which includes rows the model was fitted on.
# NOTE(review): `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of `RocCurveDisplay.from_estimator` — confirm the installed version.
plot_roc_curve(model, X, y)
plt.grid()

In [None]:
# Feature-importance chart for the currently fitted model; the tall figure
# leaves room for one bar per feature.
lgb.plot_importance(model, figsize = (10, 30))

In [None]:
# Predicted probability of the last class for every test row, taken from the
# model as it was left by the most recent fit.
preds = model.predict_proba(df_test[features])[:, -1]

## Submission

In [None]:
# Start from the sample submission file and overwrite its `claim` column with
# the predicted probabilities, then preview the result.
# Assumes the sample file's rows are in the same order as the test CSV — TODO confirm.
sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
sub['claim'] = preds
sub.head()

In [None]:
# Write the submission without the index column. The file name carries an
# explicit `.csv` extension so the output is recognised as a CSV file.
sub.to_csv('lgbm_final.csv', index = False)