In [None]:
import pandas as pd, numpy as np
import os
import math
from math import ceil, floor, log
import random

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report 
from sklearn.model_selection import train_test_split
import sklearn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns

from yellowbrick.classifier import ClassificationReport
import scikitplot as skplt

from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
import catboost
print(catboost.__version__)
from catboost import *
from catboost import datasets
from catboost import CatBoostClassifier

import scikitplot as skplt

import time
from sklearn.model_selection import cross_val_score

# Data loading and setup

In [None]:
# Single seed shared by Python's `random` module and every `random_state=` argument below.
SEED = 2021
random.seed(SEED)

# Location of the competition CSV files.
path = '../input/healthinsurance/'

# Let wide frames render without column truncation or early line wrapping.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

Data loading and preprocessing (feature engineering)...

A quick check confirms that there are no empty or NaN values.

In [None]:
# Read the train and test tables from the input directory.
df_train, df_test = (pd.read_csv(path + file_name) for file_name in ("train.csv", "test.csv"))

# Per-column count of missing values, train first and then test.
print(df_train.isna().sum())
print(df_test.isna().sum())

In [None]:
# Discard every row that has at least one missing value, then re-print the
# per-column null counts as a sanity check.
df_train = df_train.dropna(axis=0, how='any')
print(df_train.isna().sum())

# Use the 'id' column as the row index from here on.
df_train = df_train.set_index('id')
df_train

In [None]:
# Every column name except the first one.
col_list = list(df_train.columns[1:])

# Working copy that receives the integer encoding; `df_train` itself keeps the raw text values.
df_train_corr = df_train.copy()
# Rows with Response == 1, captured before the text columns are encoded.
df_train_ones = df_train_corr[df_train_corr['Response'] == 1].copy()

categorical_features = [
    'Gender', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel',
]
text_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

# Replace each text column by integer codes; the single encoder is re-fitted per
# column, so afterwards `le` only remembers the classes of the last column.
le = preprocessing.LabelEncoder()
for feature in text_features:
    df_train_corr[feature] = le.fit_transform(df_train_corr[feature])

Let's check the feature correlations.

In [None]:
# Pairwise correlations of the columns up to and including 'Vintage'.
corr = df_train_corr.loc[:, :'Vintage'].corr()

# Boolean mask covering the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

In [None]:
def plot_ROC(fpr, tpr, m_name):
    """Plot a ROC curve together with the chance diagonal and show the figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates; used as the x / y values of the
        curve and passed to ``sklearn.metrics.auc`` for the legend entry.
    m_name : str
        Name inserted into the figure title.
    """
    line_width = 2
    label_size = 16
    area_under_curve = sklearn.metrics.auc(fpr, tpr)

    plt.figure(figsize=(6, 6))

    # The curve itself, labelled with its area.
    plt.plot(fpr, tpr,
             color='darkorange', lw=line_width, alpha=0.5,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1],
             color='navy', lw=line_width, linestyle='--', alpha=0.5)

    # Axis ranges, ticks and grid.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=label_size)
    plt.yticks(fontsize=label_size)
    plt.grid(True)

    # Labels, title and legend.
    plt.xlabel('False Positive Rate', fontsize=label_size)
    plt.ylabel('True Positive Rate', fontsize=label_size)
    plt.title('Receiver operating characteristic for %s'%m_name, fontsize=20)
    plt.legend(loc="lower right", fontsize=label_size)

    plt.show()

# Oversampling and Feature Engineering
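Brute-force oversampling of the positive `Response` class, and engineering of two new synthetic features (note: the only feature-engineering line in the cell below, `old_damaged`, is currently commented out).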

In [None]:
def upsample(df, u_feature, n_upsampling):
    """Return ``n_upsampling`` randomly jittered copies of ``df`` stacked row-wise.

    Each copy is identical to ``df`` except for column ``u_feature``, whose
    values are perturbed with Python's ``random`` module (seed it beforehand
    for reproducible output):

    * ``'Annual_Premium'``: every value is shifted by exactly -3 %, 0 % or +3 %.
    * any other column (used for ``'Age'`` and ``'Vintage'`` in this notebook):
      every value is shifted by a random integer in ``[-3, 3]``.

    Every copy is jittered from the original values of ``df``, so perturbations
    do not accumulate across copies. ``df`` itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to replicate; must contain ``u_feature``.
    u_feature : str
        Name of the numeric column to perturb.
    n_upsampling : int
        Number of jittered copies to generate.

    Returns
    -------
    pandas.DataFrame
        ``n_upsampling * len(df)`` rows that keep the original index labels
        (so labels repeat). An empty frame with the columns of ``df`` is
        returned when ``n_upsampling`` is 0.
    """
    if u_feature == 'Annual_Premium':
        def jitter(x):
            # change Annual_Premium by -3 %, 0 % or +3 %
            return x + random.randint(-1, 1) * x * 0.03
    else:
        def jitter(x):
            # change the value by up to 3 units (years for Age)
            return x + random.randint(-3, 3)

    jittered_copies = []
    for _ in range(n_upsampling):
        # Work on a fresh copy so the caller's frame is left untouched.
        jittered = df.copy()
        jittered[u_feature] = df[u_feature].apply(jitter)
        jittered_copies.append(jittered)

    if not jittered_copies:
        # Nothing requested: return an empty frame instead of failing on an unbound name.
        return df.iloc[0:0].copy()

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(jittered_copies)

# Remove the 'bin_age' column if it is present. An explicit membership test
# replaces the former bare `except:`, which would also have hidden unrelated
# errors (and KeyboardInterrupt).
if 'bin_age' in df_train_corr.columns:
    df_train_corr = df_train_corr.drop(columns=['bin_age'])
else:
    print('already deleted')

df_train_mod = df_train_corr.copy()
#df_train_mod['old_damaged'] = df_train_mod.apply(lambda x: pow(2,x.Vehicle_Age)+pow(2,x.Vehicle_Damage), axis =1)

# we shall preserve validation set without augmentation/over-sampling:
# 80 % of the rows go to df_temp (training side), 20 % are held out.
df_temp, X_valid, _, y_valid = train_test_split(
    df_train_mod, df_train_mod['Response'], train_size=0.8, random_state=SEED
)
X_valid = X_valid.drop(columns=['Response'])

# upsampling Positive Response class only; explicit copies are handed over so
# the helper never operates on a slice of df_temp.
df_train_up_a = upsample(df_temp.loc[df_temp['Response'] == 1].copy(), 'Age', 1)
df_train_up_v = upsample(df_temp.loc[df_temp['Response'] == 1].copy(), 'Vintage', 1)

In [None]:
# Peek at the first five rows of the encoded modelling frame.
df_train_mod.head(n=5)

In [None]:
# Assemble the training frame from the training split (df_temp) plus the two
# jittered positive-class copies. The full frame df_train_mod must NOT be used
# here: it still contains the held-out validation rows, which would leak into
# training and inflate every validation metric (and make early stopping meaningless).
df_ext = pd.concat([df_temp, df_train_up_a, df_train_up_v])

X_train = df_ext.drop(columns=['Response'])
y_train = df_ext['Response']

print('Train set target class count with over-sampling:')
print(y_train.value_counts())
print('Validation set target class count: ')
print(y_valid.value_counts())
X_train.head()

In [None]:
from sklearn.model_selection import GridSearchCV
#gridsearchcv
# Base estimator for the search; `reg_alpha` is overridden by the grid below.
# `verbosity=0` replaces the obsolete `silent=True` flag, which current XGBoost
# releases no longer recognise (it only produced an "unused parameter" warning).
XGB_model_u = XGBClassifier(random_state = SEED, max_depth = 9,
                            n_estimators = 3000, reg_lambda = 1.2, reg_alpha = 1.2,
                            min_child_weight = 1, scale_pos_weight = 1,
                            objective = 'binary:logistic',
                            learning_rate = 0.15, gamma = 0.1, colsample_bytree = 0.9, subsample = 0.9,
                            eval_metric = 'auc',
                            tree_method = 'hist', verbosity = 0)

# Candidate L1-regularisation strengths.
param_test1 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}

# Rank candidates by ROC AUC, the metric used everywhere else in this notebook;
# without `scoring` the search would fall back to the classifier's default
# score (accuracy), which is a poor guide on an imbalanced target.
grid_search = GridSearchCV(XGB_model_u, param_grid = param_test1, cv = 3,
                           scoring = 'roc_auc',
                           verbose = 100, n_jobs = 1)
grid_search.fit(X_train, y_train)

grid_search.best_estimator_

In [None]:
#XGBoost1
# Depth-8 booster with learning rate 0.15; boosting stops once the validation
# AUC has not improved for 20 rounds.

start = time.time()
# `verbosity=0` replaces the obsolete `silent=True` flag.
XGB_model_u = XGBClassifier(random_state = SEED, max_depth = 8,
                            n_estimators = 3000, reg_lambda = 1.2, reg_alpha = 1.2,
                            min_child_weight = 1,
                            objective = 'binary:logistic',
                            learning_rate = 0.15, gamma = 0.3, colsample_bytree = 0.5, eval_metric = 'auc',
                            tree_method = 'hist', verbosity = 0)

XGB_model_u.fit(X_train, y_train,
                eval_set = [(X_valid, y_valid)],
                early_stopping_rounds = 20, verbose = 1000)
# Stop the clock right after fitting so the figure below covers training only,
# not the prediction passes that follow.
end = time.time()
print("Training time:", end - start)

# Class probabilities and hard labels for the validation and training sets.
XGB_preds_u = XGB_model_u.predict_proba(X_valid)
XGB_preds_t = XGB_model_u.predict_proba(X_train)
XGB_class_u = XGB_model_u.predict(X_valid)
XGB_class_t = XGB_model_u.predict(X_train)

# ROC AUC on both sets; printed so the numbers are visible, not just stored.
XGB_score_u = roc_auc_score(y_valid, XGB_preds_u[:,1])
XGB_score_t = roc_auc_score(y_train, XGB_preds_t[:,1])
print("Validation ROC AUC: %.4f" % XGB_score_u)
print("Train ROC AUC: %.4f" % XGB_score_t)
#result=cross_val_score(XGB_model_u, pd.concat([X_train,X_valid]), pd.concat([y_train,y_valid]), cv=10, scoring='roc_auc')

#print("k-fold result:%.2f"%result.mean())
(fpr, tpr, thresholds) = roc_curve(y_valid, XGB_preds_u[:,1])
plot_ROC(fpr, tpr,'XGBoost-valid')
(fpr, tpr, thresholds) = roc_curve(y_train, XGB_preds_t[:,1])
plot_ROC(fpr, tpr,'XGBoost-train')

In [None]:
#XGBoost2
# Depth-9 booster with learning rate 0.05 and row/column subsampling of 0.9;
# boosting stops once the validation AUC has not improved for 20 rounds.
# This model is the one used for the test-set predictions further down.

start = time.time()
# `verbosity=0` replaces the obsolete `silent=True` flag.
XGB_model_u = XGBClassifier(random_state = SEED, max_depth = 9,
                            n_estimators = 5000, reg_lambda = 1.2, reg_alpha = 1.2,
                            min_child_weight = 1, scale_pos_weight = 1,
                            objective = 'binary:logistic',
                            learning_rate = 0.05, gamma = 0.1, colsample_bytree = 0.9, subsample = 0.9,
                            eval_metric = 'auc',
                            tree_method = 'hist', verbosity = 0)

XGB_model_u.fit(X_train, y_train,
                eval_set = [(X_valid, y_valid)],
                early_stopping_rounds = 20, verbose = 1000)
# Stop the clock right after fitting so the figure below covers training only,
# not the prediction passes that follow.
end = time.time()
print("Training time:", end - start)

# Class probabilities and hard labels for the validation and training sets.
XGB_preds_u = XGB_model_u.predict_proba(X_valid)
XGB_preds_t = XGB_model_u.predict_proba(X_train)
XGB_class_u = XGB_model_u.predict(X_valid)
XGB_class_t = XGB_model_u.predict(X_train)

# ROC AUC on both sets; printed so the numbers are visible, not just stored.
XGB_score_u = roc_auc_score(y_valid, XGB_preds_u[:,1])
XGB_score_t = roc_auc_score(y_train, XGB_preds_t[:,1])
print("Validation ROC AUC: %.4f" % XGB_score_u)
print("Train ROC AUC: %.4f" % XGB_score_t)
#result=cross_val_score(XGB_model_u, pd.concat([X_train,X_valid]), pd.concat([y_train,y_valid]), cv=10, scoring='roc_auc')

#print("k-fold result:%.2f"%result.mean())
(fpr, tpr, thresholds) = roc_curve(y_valid, XGB_preds_u[:,1])
plot_ROC(fpr, tpr,'XGBoost-valid')
(fpr, tpr, thresholds) = roc_curve(y_train, XGB_preds_t[:,1])
plot_ROC(fpr, tpr,'XGBoost-train')

In [None]:
df_test = pd.read_csv(path + 'test.csv')
for f in text_features:
    # Learn the category -> integer mapping from the raw training column and only
    # *apply* it to the test column. Re-fitting on the test data could silently
    # assign different codes whenever the two category sets differ; with
    # `transform`, a category unseen in training raises a ValueError instead.
    le.fit(df_train[f])
    df_test[f] = le.transform(df_test[f])
df_test = df_test.set_index('id')

In [None]:
# Predicted class probabilities for the test rows, stored as a NumPy array.
XGB_class_res_proba = np.array(XGB_model_u.predict_proba(df_test))

In [None]:
# Column 1 of the probability array becomes 'Response', indexed by the test-set ids.
df_res = pd.DataFrame(
    {'Response': XGB_class_res_proba[:, 1]},
    index=df_test.index,
)
#df_res.to_csv('/kaggle/working/submission.csv')

In [None]:
# Share of test rows whose predicted Response probability exceeds 0.5 ...
print(df_res.loc[df_res['Response'] > 0.5].count() / len(df_res))
# ... next to the share of training rows whose Response exceeds 0.5 (reported per column).
print(df_train.loc[df_train['Response'] > 0.5].count() / len(df_train))