<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Importing Libraries</center></h1>
</div>

In [None]:
import random
random.seed(123)

import pandas as pd
import numpy as np
import datatable as dt
import warnings
warnings.filterwarnings("ignore")

# importing feature selection and processing packages

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest,mutual_info_classif,SelectPercentile,VarianceThreshold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS, ExhaustiveFeatureSelector as EFS
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler,PowerTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# importing modelling packages

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

# Optimisation Packages

import optuna
from optuna import trial
from optuna.samplers import TPESampler
import pprint
import joblib
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Inputting Data</center></h1>
</div>

In [None]:
# Read the three competition CSVs with datatable (used here for faster loading)
# and convert each one to a pandas DataFrame right away.

train, test, sub = (
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        r'../input/tabular-playground-series-sep-2021/train.csv',
        r'../input/tabular-playground-series-sep-2021/test.csv',
        r'../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Data Processing and Feature Engineering</center></h1>
</div>

In [None]:
# Build working frames without the 'id' column. DataFrame.drop returns a new
# frame, so the frames loaded into `train` / `test` are left as they were.
train_data = train.drop(columns='id')
test_data = test.drop(columns='id')

# All columns except the last one of the training frame
# (presumably the last column is the 'claim' target -- verify against the CSV).
features = train_data.columns[:-1]

In [None]:
# "Magic" feature: the number of missing values in each row, counted over the
# columns listed in `features`, added to both the train and the test frame.

train_data['n_missing'] = train_data[features].isnull().sum(axis='columns')
test_data['n_missing'] = test_data[features].isnull().sum(axis='columns')

In [None]:
# Separate the predictors from the target, then mean-impute missing values.

y = train_data['claim']  # the target variable
X = train_data.drop(columns='claim')

# Every column has its NaNs replaced by that column's own mean.
# NOTE(review): the test frame is imputed with its own column means rather
# than the training means -- confirm this is intended.
X = X.apply(lambda column: column.fillna(column.mean()))
test_data = test_data.apply(lambda column: column.fillna(column.mean()))

In [None]:
# Min-max scale the features. The scaler is fitted on the training features
# only and the same fitted scaler is then applied to the test features.
# Both results are the arrays returned by the scaler, not DataFrames.

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
test_for_model = scaler.transform(test_data)

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Optuna on LightGBM</center></h1>
</div>

The hyper-parameter ranges below are inspired by this notebook:
https://www.kaggle.com/bextuychiev/lgbm-optuna-hyperparameter-tuning-w-understanding

Please do upvote his work and check out his other illuminating notebooks. They are great if you have just started out on Kaggle.
This is the first time I set the lgbm ranges after understanding the parameters. Thanks a lot Bex!

In [None]:
def train_model_optuna(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit one LightGBM candidate and return its validation AUC.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyper-parameters below.
    X_train, y_train : data the classifier is fitted on.
    X_valid, y_valid : hold-out data used for early stopping and for scoring.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation data.
    """
    # Search space sampled by Optuna for this trial. Single-choice categoricals
    # (n_estimators, bagging_freq) keep fixed values visible in the trial params.
    lgbm_params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_categorical("n_estimators", [20000]),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=1),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=1),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # Upper bound is 0.9 (not 0.95) so the range is a whole number of 0.1
        # steps; Optuna would otherwise truncate it to 0.9 and emit a warning.
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.9, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
        'device': 'gpu',
    }

    model = LGBMClassifier(**lgbm_params)
    # The validation set drives early stopping (100 rounds without AUC gain).
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="auc",
              early_stopping_rounds=100, verbose=False)

    print(f"Number of boosting rounds: {model.best_iteration_}")
    oof = model.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, oof)

In [None]:
# Tune LightGBM on a stratified split: 67% for training, 33% for validation.

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# Maximise the validation AUC returned by the objective over 10 trials.
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda t: train_model_optuna(t, X_train, X_valid, y_train, y_valid),
    n_trials=10,
)

# Summary of the search

print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Optuna on CatBoost</center></h1>
</div>

References for parameter ranges -
1. https://www.kaggle.com/mlanhenke/tps-09-optuna-study-catboostclassifier
2. https://www.kaggle.com/ranjeetshrivastav/catboost-lightgbm

Please do upvote their work. I have used a combination from both these notebooks.

In [None]:
def train_model_optuna_2(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit one CatBoost candidate and return its validation AUC.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyper-parameters below.
    X_train, y_train : data the classifier is fitted on.
    X_valid, y_valid : hold-out data passed as eval_set and used for scoring.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation data.
    """
    # Continuous ranges use suggest_float (the replacement for the deprecated
    # suggest_uniform), matching the LightGBM objective in this notebook.
    params = {
        'iterations': trial.suggest_int("iterations", 1000, 20000),
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'task_type': "GPU",
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        'subsample': trial.suggest_float('subsample', 0.9, 1.0),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'bootstrap_type': 'Poisson',
    }

    model = CatBoostClassifier(**params)
    # NOTE(review): both 'od_wait' (sampled above) and early_stopping_rounds are
    # supplied -- confirm which of the two CatBoost actually honours.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
              early_stopping_rounds=150, verbose=False)

    oof = model.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, oof)

In [None]:
# Tune CatBoost on a stratified split: 67% for training, 33% for validation
# (same split settings as the LightGBM study).

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# Maximise the validation AUC returned by the objective over 10 trials.
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda t: train_model_optuna_2(t, X_train, X_valid, y_train, y_valid),
    n_trials=10,
)

# Summary of the search

print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Tuned Hyperparameters</center></h1>
</div>

In [None]:
# LightGBM tuned parameters, hard-coded so the studies need not be re-run.
# NOTE(review): the original note here said 2000 estimators were used instead
# of 20000 for faster training, but the value below is 20000 -- confirm which
# one is intended.

lgbm_params = {
    'num_leaves': 2680,
    'max_depth': 3,
    'learning_rate': 0.1909226205418589,
    'n_estimators': 20000,
    'lambda_l1': 65,
    'lambda_l2': 76,
    'min_gain_to_split': 6.480697089399107,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'feature_fraction': 0.6000000000000001,
}

# CatBoost tuned parameters

cat_params = {
    'iterations': 13969,
    'od_wait': 1958,
    'learning_rate': 0.04291773425770468,
    'reg_lambda': 15.189348850727315,
    'subsample': 0.9136087381151102,
    'random_strength': 28.743778165335534,
    'depth': 5,
    'min_data_in_leaf': 25,
    'leaf_estimation_iterations': 9,
    'bootstrap_type': 'Poisson',
}

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Ensembling</center></h1>
</div>

In [None]:
# 10-fold stratified CV of the tuned LightGBM and CatBoost models.
# Out-of-fold predictions are kept for the whole training set, and the test
# predictions are averaged over the folds.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=228)

# Fold-averaged test predictions, one slot per test row.
predictions_lgb = np.zeros(len(test_for_model))
predictions_cb = np.zeros(len(test_for_model))

# Out-of-fold predictions, one slot per training row.
lgb_oof = np.zeros(X.shape[0])
cat_oof = np.zeros(X.shape[0])

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"Fold: {fold+1}")
    X_train, y_train = X[trn_idx], y[trn_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Fresh models for every fold.
    model_lgb = LGBMClassifier(device='gpu', **lgbm_params)
    model_cb = CatBoostClassifier(task_type='GPU', **cat_params, verbose=0)

    # LightGBM: fit, score the held-out fold, store its OOF predictions.
    model_lgb.fit(X_train, y_train)
    pred_lgb = model_lgb.predict_proba(X_val)[:, 1]
    lgb_oof[val_idx] = pred_lgb
    print('ROC of LGB: ', roc_auc_score(y_val, pred_lgb))

    # CatBoost: same procedure.
    model_cb.fit(X_train, y_train)
    pred_cb = model_cb.predict_proba(X_val)[:, 1]
    cat_oof[val_idx] = pred_cb
    print('ROC of CB: ', roc_auc_score(y_val, pred_cb))

    print("-"*50)

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions_lgb += model_lgb.predict_proba(test_for_model)[:, 1] / folds.n_splits
    predictions_cb += model_cb.predict_proba(test_for_model)[:, 1] / folds.n_splits

In [None]:
# calculating appropriate weights for ensemble

import scipy
def class_optimizer(X, a0, a1):
    """Objective for the blend-weight search: 1 - ROC AUC of the weighted blend.

    X      : sequence whose first element is the weight applied to `a0`;
             `a1` receives one minus that weight.
    a0, a1 : the two prediction arrays being blended.

    The blend is scored against the module-level target `y`.
    """
    weight = X[0]
    blended = weight * a0 + (1 - weight) * a1
    return 1 - roc_auc_score(y, blended)

# Search for the LightGBM weight, starting from an even 0.5 / 0.5 blend.
# The two out-of-fold arrays are forwarded to the objective as extra arguments.
res = scipy.optimize.minimize(
    class_optimizer,
    [0.5],
    args=(lgb_oof, cat_oof),
    method='BFGS',
    options={'maxiter': 1000},
)

print(res)
print(f"coef0 {res.x[0]}, coef1 {1-res.x[0]}")

In [None]:
# Blend the out-of-fold and the test predictions with the fitted weights:
# the optimised value weights LightGBM, its complement weights CatBoost.

lgb_weight = res.x[0]
cat_weight = 1-res.x[0]

ensemble_oof = lgb_weight * lgb_oof + cat_weight * cat_oof
ensemble_pred = lgb_weight * predictions_lgb + cat_weight * predictions_cb

# tuned lgbm gave 0.81658 score and tuned catboost gave a score 0.81782

# Out-of-fold AUC of the blend.
print(roc_auc_score(y, ensemble_oof))

In [None]:
# Store the blended test predictions in the submission frame and write it out.
sub = sub.assign(claim=ensemble_pred)
sub.to_csv('submission_cb_lgb_tuned_ensemble.csv', index=False)  # gave a score of 0.81804

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Stacking</center></h1>
</div>

In [None]:
def Stacking(model, model_name, x_train, y_train, x_test, fold):
    """Produce out-of-fold and fold-averaged test predictions for one base model.

    Parameters
    ----------
    model : classifier exposing ``fit`` and ``predict_proba``; it is re-fitted
        on every fold.
    model_name : 'cat' or 'lgbm' selects the matching ``fit`` call (with an
        eval set); any other value falls back to a plain ``fit``.
    x_train, y_train : training features and target (NumPy arrays or pandas
        objects).
    x_test : test features to predict with every fold's model.
    fold : number of stratified folds.

    Returns
    -------
    (train_fold_pred, test_pred_mean)
        ``train_fold_pred`` has shape (n_train, 1) and holds the out-of-fold
        positive-class probabilities; ``test_pred_mean`` has shape (n_test, 1)
        and holds the test probabilities averaged over the folds.
    """
    def _rows(data, index):
        # Positional row selection that works for pandas objects and arrays.
        return data.iloc[index] if hasattr(data, 'iloc') else data[index]

    stk = StratifiedKFold(n_splits = fold, random_state = 42, shuffle = True)

    # Declaration Pred Datasets
    train_fold_pred = np.zeros((x_train.shape[0], 1))
    test_pred = np.zeros((x_test.shape[0], fold))

    for counter, (train_index, valid_index) in enumerate(stk.split(x_train, y_train)):
        # Slice the function's own arguments (not module-level data), and use
        # separate names so the parameters are not overwritten inside the loop.
        x_fit, y_fit = _rows(x_train, train_index), _rows(y_train, train_index)
        x_valid, y_valid = _rows(x_train, valid_index), _rows(y_train, valid_index)

        print('------------ Fold', counter+1, 'Start! ------------')
        if model_name == 'cat':
            model.fit(x_fit, y_fit, eval_set=[(x_valid, y_valid)])
        elif model_name == 'lgbm':
            model.fit(x_fit, y_fit, eval_set=[(x_valid, y_valid)], eval_metric = 'auc')
        else:
            # Unknown model family: still fit, just without an eval set, so the
            # predictions below never come from an unfitted or stale model.
            model.fit(x_fit, y_fit)
        print('------------ Fold', counter+1, 'Done! ------------')

        train_fold_pred[valid_index, :] = model.predict_proba(x_valid)[:, 1].reshape(-1, 1)
        test_pred[:, counter] = model.predict_proba(x_test)[:, 1]

    test_pred_mean = np.mean(test_pred, axis = 1).reshape(-1, 1)

    print('Done!')

    return train_fold_pred, test_pred_mean

In [None]:
# Fresh GPU models with the tuned parameters, each run through 10-fold stacking.
model_lgb = LGBMClassifier(device='gpu', verbose=0, **lgbm_params)
model_cb = CatBoostClassifier(task_type='GPU', verbose=0, **cat_params)

# CatBoost first, then LightGBM.
cat_train, cat_test = Stacking(model_cb, 'cat', X, y, test_for_model, 10)
lgbm_train, lgbm_test = Stacking(model_lgb, 'lgbm', X, y, test_for_model, 10)

In [None]:
# Build the inputs of the meta-classifier: the engineered 'n_missing' feature
# plus the LightGBM and CatBoost predictions held in lgb_oof / cat_oof
# (training rows) and predictions_lgb / predictions_cb (test rows).

is_feature = train_data.columns != 'claim'
new_features = train_data.columns[is_feature]
imp_features = ['n_missing']

# Re-attach column names to the scaled arrays, then keep the selected feature.
X_new = pd.DataFrame(X, columns=new_features)
test_for_model_new = pd.DataFrame(test_for_model, columns=new_features)
train_new = X_new[imp_features]
test_new = test_for_model_new[imp_features]

# The test frames reuse the '*_train' column names so that the train and test
# meta-feature frames end up with identical columns.
lgbm_train_1 = pd.DataFrame({'LGBM_train': lgb_oof})
cat_train_1 = pd.DataFrame({'CAT_train': cat_oof})
lgbm_test_1 = pd.DataFrame({'LGBM_train': predictions_lgb})
cat_test_1 = pd.DataFrame({'CAT_train': predictions_cb})

stack_x_train = pd.concat([train_new, lgbm_train_1, cat_train_1], axis='columns')
stack_x_test = pd.concat([test_new, lgbm_test_1, cat_test_1], axis='columns')

I have used an untuned Logistic Regression model as the meta-classifier, based on what I learnt from the following notebook:
https://www.kaggle.com/junhyeok99/stacking-ensemble-for-beginner

Please do upvote it. It helped me implement and learn stacking, being the beginner that I am, for the very first time!

In [None]:
# Cross-validate the meta-classifier on the stacked features.
# shuffle=True is required for random_state to be accepted by StratifiedKFold
# (recent scikit-learn raises a ValueError otherwise); it also matches the
# other splitters used in this notebook.
stk = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

test_pred = 0   # running SUM of the per-fold test predictions (averaged later)
fold = 1
total_auc = 0   # mean validation AUC over the folds

for train_index, valid_index in stk.split(stack_x_train, y):
    # Positional indexing on both the frame and the target.
    x_train, y_train = stack_x_train.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = stack_x_train.iloc[valid_index], y.iloc[valid_index]

    # Logistic regression provides predict_proba, which the lines below need;
    # RidgeClassifier does not implement it.
    lr = LogisticRegression(n_jobs = -1, random_state = 42, C = 1000, max_iter = 1000)
    lr.fit(x_train, y_train)

    valid_pred = lr.predict_proba(x_valid)[:, 1]
    test_pred += lr.predict_proba(stack_x_test)[:, 1]
    auc = roc_auc_score(y_valid, valid_pred)
    # Average over the number of folds actually run, not a hard-coded 10.
    total_auc += auc / stk.get_n_splits()
    print('Fold', fold, 'AUC :', auc)
    fold += 1

print('Total AUC score :', total_auc)

In [None]:
# test_pred holds the SUM of the per-fold meta-model predictions; divide by the
# splitter's fold count (not a hard-coded 10) to obtain their average.
sub['claim'] = test_pred / stk.get_n_splits()
sub.to_csv('submission_lgb_cb_tuned_log_FE_stacking.csv', index = 0) # gave a score of 0.81805

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Blending</center></h1>
</div>

In [None]:
# Blending is a subtype of stacking: a single stratified hold-out split
# (67% train / 33% validation) is used here instead of 10 folds.

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=2021
)

In [None]:
# Base models for blending, all with default hyper-parameters on GPU.
model_lgb =  LGBMClassifier(device = 'gpu',verbose=0)
model_cb =  CatBoostClassifier(task_type='GPU',verbose=0)
# XGBoost's logging parameter is `verbosity`; `verbose` is not one of its
# parameters and only triggers an "unused parameter" warning.
model_xgb =  XGBClassifier(tree_method='gpu_hist',verbosity=0)

# Placeholders for the test predictions of each base model.
predictions_lgb = np.zeros(len(test_for_model))
predictions_cb = np.zeros(len(test_for_model))
predictions_xgb = np.zeros(len(test_for_model))

# Placeholders for the hold-out (validation) predictions of each base model.
lgb_oof = np.zeros(len(X_val))
cat_oof = np.zeros(len(X_val))
xgb_oof = np.zeros(len(X_val))

In [None]:
# Fit each base model on the training split, then keep its positive-class
# probabilities for the hold-out split and for the test set.

# 1) LightGBM
model_lgb.fit(X_train, y_train)
lgb_oof = model_lgb.predict_proba(X_val)[:, 1]
predictions_lgb = model_lgb.predict_proba(test_for_model)[:, 1]

# 2) CatBoost
model_cb.fit(X_train, y_train)
cat_oof = model_cb.predict_proba(X_val)[:, 1]
predictions_cb = model_cb.predict_proba(test_for_model)[:, 1]

# 3) XGBoost
model_xgb.fit(X_train, y_train)
xgb_oof = model_xgb.predict_proba(X_val)[:, 1]
predictions_xgb = model_xgb.predict_proba(test_for_model)[:, 1]

In [None]:
# Inputs of the blending meta-classifier: all original features followed by
# the three base-model prediction columns ('lgbm', 'cat', 'xgb').

blend_x_val = pd.concat(
    [
        pd.DataFrame(X_val),
        pd.DataFrame({'lgbm': lgb_oof, 'cat': cat_oof, 'xgb': xgb_oof}),
    ],
    axis='columns',
)
blend_x_test = pd.concat(
    [
        pd.DataFrame(test_for_model),
        pd.DataFrame({'lgbm': predictions_lgb, 'cat': predictions_cb, 'xgb': predictions_xgb}),
    ],
    axis='columns',
)

In [None]:
# Meta-classifier: a default CatBoost model trained on the hold-out blend
# features, then used to score the test blend features.
model = CatBoostClassifier(verbose=0, task_type='GPU')
model.fit(blend_x_val, y_val)

test_proba = model.predict_proba(blend_x_test)
predictions = test_proba[:, 1]  # positive-class probability

In [None]:
# Store the blended predictions in the submission frame and write it out.
sub = sub.assign(claim=predictions)
sub.to_csv('submission_lgb_cb_cb_FE_blending.csv', index=False)  # Only for illustration

<div style="background-color:rgba(215, 79, 21, 0.5);">
    <h1><center>Please upvote if you liked my notebook! Thanks :)</center></h1>
</div>