In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Hi,\
**Welcome to my Notebook !!**.\
The following notebook deals with basic EDA and model training.

**Please leave an upvote or feedback if you like it or use any part of it. \
It would encourage me to share more.**

I obtained the optimised parameters with Optuna in the following notebook:\
https://www.kaggle.com/skiller/best-params-detection-optuna 

\
Bonus Notebooks\
If you are interested in **NLP**, do check out my notebooks on NER and text classification:
* https://www.kaggle.com/skiller/yes-torch-bert-finetuning-top-5
* https://www.kaggle.com/skiller/ner-pytorch-masked-accuracy-and-loss


Warm Regards

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import warnings

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

warnings.filterwarnings('ignore')

# Load the Data

In [None]:
# Read the competition CSVs: training set, test set and the sample submission template.
INPUT_DIR = '/kaggle/input/tabular-playground-series-sep-2021'
train_df = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_df = pd.read_csv(f'{INPUT_DIR}/test.csv')
submission = pd.read_csv(f'{INPUT_DIR}/sample_solution.csv')

# Let's look at the data

In [None]:
# Summary statistics of the training frame (pandas `describe`).
train_df.describe()

In [None]:
# Summary statistics of the test frame (pandas `describe`).
test_df.describe()

In [None]:
def _missing_by_feature(frame, non_feature_cols):
    """Return one row per feature with its NaN count and the fraction of rows that are NaN.

    `non_feature_cols` (identifier / target columns) are left out of the table.
    The `count_percent` column is a fraction in [0, 1], not a percentage.
    """
    table = (
        frame.isna()
        .sum()
        .drop(non_feature_cols)
        .rename_axis('feature')
        .reset_index(name='count')
    )
    table['count_percent'] = table['count'] / len(frame)
    return table


# Per-feature missing-value tables for the train and test sets.
missing_train_df = _missing_by_feature(train_df, ['id', 'claim'])
missing_test_df = _missing_by_feature(test_df, ['id'])

In [None]:
# Number of missing feature values in each training row (identifier and target excluded).
missing_train_row = train_df.drop(columns=['id', 'claim']).isna().sum(axis='columns')
# Fraction of training rows that have 0, 1, 2, ... missing features.
missing_train_feature_numbers = (
    missing_train_row.value_counts()
    .div(len(train_df))
    .rename_axis('no_of_feature')
    .reset_index(name='count_percent')
)

# Same two steps for the test set (it has no target column).
missing_test_row = test_df.drop(columns=['id']).isna().sum(axis='columns')
missing_test_feature_numbers = (
    missing_test_row.value_counts()
    .div(len(test_df))
    .rename_axis('no_of_feature')
    .reset_index(name='count_percent')
)

# Let's quickly visualise these tables

In [None]:
# Horizontal bar chart: fraction of training rows in which each feature is missing.
fig, ax0_sns = plt.subplots(figsize=(16, 16))
sns.barplot(
    x=missing_train_df['count_percent'],
    y=missing_train_df['feature'],
    orient='h',
    saturation=1,
    alpha=1,
    linewidth=0,
    zorder=2,
    ax=ax0_sns,
)
ax0_sns.set_xlabel("missing values", weight='bold')
ax0_sns.set_ylabel("features", weight='bold')
# Light major grid on both axes, drawn behind the bars (zorder 0 < bar zorder 2).
for grid_axis in ('x', 'y'):
    ax0_sns.grid(which='major', axis=grid_axis, zorder=0, color='#EEEEEE')

As the chart shows, every feature is missing in less than 2% of the rows.

In [None]:
# Horizontal bar chart: fraction of training rows (x) that have a given number of
# missing features (y). The axis labels name those two quantities explicitly.
fig = plt.figure(figsize=(16, 16))
ax0_sns = sns.barplot(y=missing_train_feature_numbers['no_of_feature'], x=missing_train_feature_numbers['count_percent'], 
                      zorder=2, linewidth=0, orient='h', saturation=1, alpha=1)
ax0_sns.set_xlabel("fraction of rows", weight='bold')
ax0_sns.set_ylabel("missing features per row", weight='bold')
# Thin major grid on both axes, drawn behind the bars.
ax0_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
ax0_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)

Well, we can't simply throw away the rows containing nulls: that would discard more than 37% of the data.

Let's check the same for the test set.

In [None]:
# Horizontal bar chart: fraction of test rows in which each feature is missing.
# Both axes are taken from the test-set table so bars and feature labels match.
fig = plt.figure(figsize=(16, 16))
ax0_sns = sns.barplot(y=missing_test_df['feature'], x=missing_test_df['count_percent'], 
                      zorder=2, linewidth=0, orient='h', saturation=1, alpha=1)
ax0_sns.set_xlabel("missing values", weight='bold')
ax0_sns.set_ylabel("features", weight='bold')
# Light major grid on both axes, drawn behind the bars.
ax0_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE')
ax0_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE')

In [None]:
# Horizontal bar chart: fraction of test rows (x) that have a given number of
# missing features (y). The axis labels name those two quantities explicitly.
fig = plt.figure(figsize=(16, 16))
ax0_sns = sns.barplot(y=missing_test_feature_numbers['no_of_feature'], x=missing_test_feature_numbers['count_percent'], 
                      zorder=2, linewidth=0, orient='h', saturation=1, alpha=1)
ax0_sns.set_xlabel("fraction of rows", weight='bold')
ax0_sns.set_ylabel("missing features per row", weight='bold')
# Thin major grid on both axes, drawn behind the bars.
ax0_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
ax0_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)

# A lot of missing values. Let's see the correlation

Let's see why people are obsessed with the null counts.

In [None]:
# Engineered feature: how many feature values are missing in each row.
# The identifier (and, for train, the target) is excluded from the count.
train_df['num_nulls'] = train_df.drop(columns=['id', 'claim']).isna().sum(axis='columns')
test_df['num_nulls'] = test_df.drop(columns=['id']).isna().sum(axis='columns')

In [None]:
# Correlation between the per-row missing-value count and the target.
train_df['num_nulls'].corr(train_df['claim'])

Damn!! That's a large correlation. We need to keep this feature.

# Let's also check whether the classes are imbalanced

In [None]:
# Number of training rows per target class.
train_df.claim.value_counts()

Good to go: there is no class imbalance.

# The remaining work is handling the null values (plus a bit of preprocessing)
But we can't drop the rows, owing to the large number of rows with a single null value.

In [None]:
# import packages
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier 
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer, RobustScaler
from sklearn.impute import SimpleImputer

In [None]:
%%time

# Every column except the target and the row identifier is a model input
# (this includes the engineered `num_nulls` column once it has been added).
features = [col for col in train_df.columns if col not in ('claim', 'id')]

# Preprocessing chain:
#   1. replace NaNs with the per-feature median,
#   2. map each feature onto a uniform distribution using 64 quantiles,
#   3. discretise the result into 64 equal-width ordinal bins.
pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
        # random_state fixes the row subsample used to estimate the quantiles, so a
        # fresh "Run All" reproduces the same transformed features. A literal is used
        # because the notebook's SEED constant is only defined in a later cell.
        ("scaler", QuantileTransformer(n_quantiles=64, output_distribution='uniform', random_state=42)),
        ('bin', KBinsDiscretizer(n_bins=64, encode='ordinal', strategy='uniform'))
        ])
# Fit on the training features only, then apply the fitted transform to the test set.
train_df[features] = pipe.fit_transform(train_df[features])
test_df[features] = pipe.transform(test_df[features])

# Let's begin the training

In [None]:
# Cross-validation / training configuration shared by the cells below.
N_SPLITS = 5                  # number of stratified CV folds
EARLY_STOPPING_ROUNDS = 200   # forwarded to fit() as early_stopping_rounds
SEED = 42                     # random_state for fold shuffling and the stacking model
# One on/off switch per base model; the training cells check these keys.
TRAINING_METHODS = dict.fromkeys(('XGB', 'LGDM', 'CAT'), True)

# Let's define the training code

In [None]:
def cross_validate(
    model,
    train_df,
    test_df,
    early_stopping=True
):
    """Fit `model` with stratified K-fold CV and collect out-of-fold and test predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing `fit` and `predict_proba`. The same instance is re-fitted
        on every fold. When `early_stopping` is enabled, its `fit` must accept
        `eval_set`, `early_stopping_rounds` and `verbose`.
        NOTE(review): some library versions no longer accept early-stopping
        arguments in `fit` - confirm against the installed XGBoost/LightGBM.
    train_df : pandas.DataFrame
        Training frame containing the feature columns plus `claim` (target) and `id`.
    test_df : pandas.DataFrame
        Frame to predict on. `claim` / `id` columns are dropped if present, so both
        a raw and an already-stripped frame work.
    early_stopping : bool, default True
        Whether to fit with the validation fold as `eval_set` and early stopping.
        String flags are accepted for backward compatibility ('True', 'False', ...).
        Any string outside the recognised true-ish set is treated as False.

    Returns
    -------
    train_oof : numpy.ndarray
        Positive-class probability for every training row, predicted by the fold
        model that did not see that row.
    predictions : numpy.ndarray
        Positive-class probability for every test row, averaged over the folds.
    model : estimator
        The estimator as left after the final fold's fit (not an ensemble of folds).
    """
    # The old default was the string 'True'; a string such as 'False' is truthy, so
    # normalise strings explicitly instead of relying on truthiness.
    if isinstance(early_stopping, str):
        early_stopping = early_stopping.strip().lower() in ('true', '1', 'yes', 'y')

    non_feature_cols = ['claim', 'id']
    features = train_df.drop(columns=non_feature_cols)
    target = train_df['claim']
    # Tolerate a test frame that still carries the identifier/target columns.
    test_features = test_df.drop(columns=non_feature_cols, errors='ignore')

    train_oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_features))

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    n_folds = skf.get_n_splits()

    for fold, (train_idx, valid_idx) in tqdm(enumerate(skf.split(features, target)), total=n_folds):
        X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
        y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]

        if early_stopping:
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose=0
            )
        else:
            model.fit(
                X_train,
                y_train
            )

        # Column 1 of predict_proba is used as the positive-class probability for both
        # the validation fold and the test set (the test set previously used column -1).
        temp_oof = model.predict_proba(X_valid)[:, 1]
        train_oof[valid_idx] = temp_oof
        print(f'Fold {fold} AUC: ', roc_auc_score(y_valid, temp_oof))
        # Each fold contributes an equal share to the averaged test prediction.
        predictions += model.predict_proba(test_features)[:, 1] / n_folds

    print('OOF AUC: ', roc_auc_score(target, train_oof))

    return train_oof, predictions, model

In [None]:
# Drop the identifier so the test frame holds only model inputs.
# errors='ignore' keeps this cell safe to re-run after `id` has already been removed.
test_df = test_df.drop(columns='id', errors='ignore')

# XGBOOST

In [None]:
# Keyword arguments for XGBClassifier (depth-1 trees, GPU histogram tree method).
xgb_params = dict(
    n_estimators=16939,
    learning_rate=0.1876042995729744,
    subsample=0.9947704250490819,
    colsample_bytree=0.714913373260802,
    max_depth=1,
    min_child_weight=300,
    reg_lambda=2.520228860596293e-05,
    reg_alpha=0.00045044167069949973,
    tree_method='gpu_hist',
)

In [None]:
%%time
if TRAINING_METHODS['XGB']:
    # Cross-validate XGBoost: out-of-fold predictions, fold-averaged test
    # predictions, and the estimator as fitted on the final fold.
    xgb_estimator = XGBClassifier(**xgb_params)
    xg_train_oof, xg_predictions, model = cross_validate(xgb_estimator, train_df, test_df)
    # Persist the model and both prediction arrays (np.save appends ".npy").
    model.save_model('xgb_model')
    np.save('xg_train_oof', xg_train_oof)
    np.save('xg_predictions', xg_predictions)

# LGBM CLASSIFIER

In [None]:
# Keyword arguments for LGBMClassifier (binary objective, GBDT boosting, GPU device).
# NOTE(review): `feature_fraction` and `colsample_bytree` look like two names for the
# same LightGBM setting - confirm which of the two values actually takes effect.
lgb_params = dict(
    n_estimators=12000,
    learning_rate=0.027934730713420564,
    reg_alpha=1.1799328678792862e-05,
    reg_lambda=0.38585046073832296,
    num_leaves=23,
    feature_fraction=0.5301717514985537,
    bagging_fraction=0.7745063435612487,
    bagging_freq=6,
    min_child_samples=19,
    min_child_weight=193,
    colsample_bytree=0.5145963018815463,
    objective="binary",
    metric="binary_logloss",
    boosting_type="gbdt",
    device_type="gpu",
)

In [None]:
%%time
if TRAINING_METHODS['LGDM']:
    # Cross-validate LightGBM: out-of-fold predictions, fold-averaged test
    # predictions, and the estimator as fitted on the final fold.
    lgbm_estimator = LGBMClassifier(**lgb_params)
    lgd_train_oof, lgd_predictions, model = cross_validate(lgbm_estimator, train_df, test_df)
    # Persist both prediction arrays (np.save appends ".npy").
    np.save('lgd_train_oof', lgd_train_oof)
    np.save('lgd_predictions', lgd_predictions)
    # Save the underlying booster, truncated at the final fold's best iteration.
    best_iteration = model.best_iteration_
    model.booster_.save_model('lgdm_model', num_iteration=best_iteration)

# CATBOOST

In [None]:
# Keyword arguments for CatBoostClassifier (Logloss objective, Bayesian bootstrap, GPU).
cat_params = dict(
    iterations=12000,
    objective='Logloss',
    bootstrap_type='Bayesian',
    od_wait=1491,
    learning_rate=0.07733510576652604,
    reg_lambda=6.067283648607877,
    random_strength=19.03761597798964,
    depth=4,
    min_data_in_leaf=17,
    leaf_estimation_iterations=8,
    bagging_temperature=0.7761781866167776,
    task_type='GPU',
)

In [None]:
%%time
# Gated by the 'CAT' switch so CatBoost can be toggled independently of LightGBM.
if TRAINING_METHODS['CAT']:
    # Cross-validate CatBoost: out-of-fold predictions, fold-averaged test
    # predictions, and the estimator as fitted on the final fold.
    cat_train_oof, cat_predictions, model = cross_validate(
                                            CatBoostClassifier(**cat_params),
                                        train_df,
                                        test_df,
                                    )
    # Persist both prediction arrays (np.save appends ".npy") and the model.
    np.save('cat_train_oof', cat_train_oof)
    np.save('cat_predictions', cat_predictions)
    model.save_model('cat_boost')


# Let's stack them for ensemble modelling

Create a dataframe for final ensemble input

In [None]:
cols = ["lgb", "xgb", "cat"]
# Level-1 training matrix: one column of out-of-fold predictions per base model,
# in the same order as `cols`.
df_oof = pd.DataFrame(
    np.column_stack([lgd_train_oof, xg_train_oof, cat_train_oof]),
    columns=cols,
)
# Matching matrix of fold-averaged test predictions.
df_pred = pd.DataFrame(
    np.column_stack([lgd_predictions, xg_predictions, cat_predictions]),
    columns=cols,
)

In [None]:
# cross_validate expects `claim` and `id` columns on its training frame:
# attach the target and use the row index as a stand-in identifier.
df_oof = df_oof.assign(claim=train_df['claim'], id=df_oof.index)

# Train the Multiple ensemble model

In [None]:
# params = {"objective": "binary", "metric": "binary_logloss", "random_state": SEED, "device_type" : "gpu", 'verbose':0, "n_estimators" : 2000} 
# oof_lgb2, pred_lgb2, _ = cross_validate(
#     LGBMClassifier(**params),
#     df_oof,
#     df_pred
# )

# params = {"objective": "binary:logistic", "random_state": SEED, 'tree_method': 'gpu_hist', 'verbose':0, 'n_estimators': 2000}
# oof_xgb2, pred_xgb2, _ = cross_validate(
#     XGBClassifier(**params),
#     df_oof,
#     df_pred
# )

# params = {"random_state": SEED, 'task_type': 'GPU', 'verbose':0, 'iterations': 2000}
# oof_cat2, pred_cat2, _ = cross_validate(
#     CatBoostClassifier(**params),
#     df_oof, 
#     df_pred
# )

# Level-2 (stacking) model: logistic regression fitted on the base models'
# out-of-fold predictions. It has no eval_set support, so early stopping is off.
params = dict(random_state=SEED, n_jobs=-1, C=1000, max_iter=1000)
meta_model = LogisticRegression(**params)
oof_log2, pred_log2, _ = cross_validate(meta_model, df_oof, df_pred, early_stopping=False)


In [None]:
# ensemble_predictions = np.array([pred_lgb2, pred_xgb2, pred_cat2, pred_log2]).mean(axis=0)

In [None]:
# ensemble_predictions

In [None]:
# Fill the sample-submission template with the stacked predictions and write it out.
submission = submission.assign(claim=pred_log2)
submission.to_csv(path_or_buf='submission.csv', index=False)