In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import warnings

In [None]:
#clean output cell
warnings.filterwarnings('ignore')

In [None]:
# Reproducibility
def set_seed(seed=42):
    """Seed NumPy, TensorFlow and Python's ``random`` with one shared value.

    After seeding, PYTHONHASHSEED (set to ``str(seed)``) and
    TF_DETERMINISTIC_OPS (set to ``'1'``) are written to the process
    environment.

    Args:
        seed: Value handed to each of the three seeding functions.
            Defaults to 42.
    """
    for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
        seed_fn(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so exporting
    # it here can only influence processes launched from this one.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })


set_seed()

In [None]:
# Load the competition CSVs, using the first column of each file as the index.
train = pd.read_csv(
    '/kaggle/input/health-insurance-cross-sell-prediction/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '/kaggle/input/health-insurance-cross-sell-prediction/test.csv',
    index_col=0,
)

# Never truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)

# X stacks the train features (target removed) on top of the test features;
# y keeps the target as a one-column DataFrame.
train_features = train.drop(columns="Response")
X = pd.concat([train_features, test], axis=0)
y = train.loc[:, ['Response']]

In [None]:
# Recover the labelled rows (x, y) and the unlabelled rows (test) from the
# stacked frame, then hold out 25% of the labelled rows for validation.
x, test = X.loc[train.index], X.loc[test.index]
y = y.loc[train.index]
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,  # keep the Response class ratio identical in both splits
    random_state=42,
)

In [None]:
# One shared instance of each preprocessing transformer.
ss, mm, le = StandardScaler(), MinMaxScaler(), LabelEncoder()

In [None]:
# Every transformer is fitted on the training split only and then applied
# unchanged to the validation and test frames.

# Standardise the Age and Vintage columns.
for column in ('Age', 'Vintage'):
    X_train[[column]] = ss.fit_transform(X_train[[column]])
    for frame in (X_valid, test):
        frame[[column]] = ss.transform(frame[[column]])

# Rescale Annual_Premium with the min/max learned from the training split.
X_train[['Annual_Premium']] = mm.fit_transform(X_train[['Annual_Premium']])
for frame in (X_valid, test):
    frame[['Annual_Premium']] = mm.transform(frame[['Annual_Premium']])

# Replace the string categories with integer codes. The single encoder is
# re-fitted per column, so afterwards it only remembers the last column.
for column in ('Vehicle_Age', 'Gender', 'Vehicle_Damage'):
    X_train[column] = le.fit_transform(X_train[column])
    for frame in (X_valid, test):
        frame[column] = le.transform(frame[column])

# Cast the code-like columns to int. The original note says CatBoost does not
# take floats for categorical variables.
# NOTE(review): no cat_features argument is passed to CatBoost in the visible
# cells, so confirm whether this cast is still needed for that reason.
for column in ('Region_Code', 'Policy_Sales_Channel'):
    for frame in (X_train, X_valid, test):
        frame[column] = frame[column].astype(int)

print(X_train.head())

In [None]:
# Keep the 8 features that score highest on the ANOVA F-test against the target.
selector = SelectKBest(f_classif, k=8)
# y_train is a one-column DataFrame; flatten it to the 1-D array the selector expects.
X_train_t = selector.fit_transform(X_train, y_train.values.ravel())

# Full-width frame in which the rejected columns are zero-filled. It is not
# needed to find the selected names any more; it is kept only so that any cell
# still referring to `selected_features` keeps working.
selected_features = pd.DataFrame(selector.inverse_transform(X_train_t),
                                 index=X_train.index,
                                 columns=X_train.columns)

# Ask the selector directly which columns it kept. Inferring this from
# "variance != 0" on the zero-filled frame would silently drop a selected
# column that happens to be constant.
selected_columns = X_train.columns[selector.get_support()]
print(selected_columns)

In [None]:
# Baseline: L1-penalised logistic regression trained on every feature.
lr = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    max_iter=100,
    n_jobs=-1,
    verbose=100,
    random_state=42,
).fit(X_train, y_train)
lr_predict = lr.predict_proba(X_valid)[:, 1]

# Variant: L2-penalised logistic regression trained on the selected features.
lr_t = LogisticRegression(
    penalty='l2',
    C=0.5,
    solver='lbfgs',
    max_iter=500,
    n_jobs=-1,
    verbose=100,
    random_state=42,
).fit(X_train_t, y_train)
lr_predict_t = lr_t.predict_proba(X_valid[selected_columns])[:, 1]

# Report validation ROC AUC for both models (positive-class probabilities).
for report_template, positive_proba in (
    ('LogisticRegression ROC AUC SCORE: {}', lr_predict),
    ('LogisticRegression using feature selection ROC AUC SCORE: {}', lr_predict_t),
):
    print(report_template.format(roc_auc_score(y_valid, positive_proba)))

In [None]:
# Decision tree on every feature, scored on the validation split.
dtc = DecisionTreeClassifier(
    splitter='random',
    max_depth=20,
    max_leaf_nodes=50,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)
dtc_predict_proba = dtc.predict_proba(X_valid)[:, 1]
dtc_auc = roc_auc_score(y_valid, dtc_predict_proba)
print('DecisionTreeClassifier ROC AUC SCORE: {}'.format(dtc_auc))

# Decision tree on the selected features, with larger leaf/split minimums.
dtc_t = DecisionTreeClassifier(
    splitter='random',
    max_depth=20,
    max_leaf_nodes=100,
    min_samples_split=20,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train_t, y_train)
dtc_predict_proba_t = dtc_t.predict_proba(X_valid[selected_columns])[:, 1]
dtc_t_auc = roc_auc_score(y_valid, dtc_predict_proba_t)
print('DecisionTreeClassifier using feature selection ROC AUC SCORE: {}'.format(dtc_t_auc))

In [None]:
# Random forest on every feature: 50 trees.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=100,
    max_leaf_nodes=100,
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=True,
    n_jobs=-1,
    verbose=100,
    random_state=42,
).fit(X_train, y_train)
pred_proba = rfc.predict_proba(X_valid)[:, 1]

# Random forest on the selected features: 500 shallower trees.
rfc_t = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    max_leaf_nodes=50,
    min_samples_split=20,
    min_samples_leaf=10,
    bootstrap=True,
    n_jobs=-1,
    verbose=100,
    random_state=42,
).fit(X_train_t, y_train)
predict_proba_t = rfc_t.predict_proba(X_valid[selected_columns])[:, 1]

# Report validation ROC AUC for both forests.
rfc_auc = roc_auc_score(y_valid, pred_proba)
rfc_t_auc = roc_auc_score(y_valid, predict_proba_t)
print('RandomForestClassifier ROC AUC SCORE: {}'.format(rfc_auc))
print('RandomForestClassifier using feature selection ROC AUC SCORE: {}'.format(rfc_t_auc))

In [None]:
# XGBoost on every feature. Training stops once validation AUC has not
# improved for 10 rounds.
# scale_pos_weight=8.7 is presumably the negative/positive class ratio — TODO confirm.
xgboost = XGBClassifier(random_state=42, n_jobs=-1, scale_pos_weight=8.7, n_estimators=500, learning_rate=0.1, subsample=0.5,objective='binary:logistic')
xgboost= xgboost.fit(X_train, y_train,eval_set=[(X_valid, y_valid)],early_stopping_rounds=10,verbose=100,eval_metric='auc')

# XGBoost on the selected features only.
xgboost_t = XGBClassifier(random_state=42, n_jobs=-1, scale_pos_weight=8.7, n_estimators=100, learning_rate=0.3, subsample=1,objective='binary:logistic')
# Train on exactly the columns used for evaluation and prediction below. A
# hand-typed column list here could drift from `selected_columns` whenever
# the feature selection changes.
X_train_t_dataframe = X_train[selected_columns]
xgboost_t= xgboost_t.fit(X_train_t_dataframe, y_train,eval_set=[(X_valid[selected_columns], y_valid)],early_stopping_rounds=10,verbose=100,eval_metric='auc')

# Predict with only the trees up to the best early-stopping iteration
# (best_ntree_limit) rather than with every tree that was built.
# NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer xgboost
# releases — confirm the installed version before upgrading.
pred_proba = xgboost.predict_proba(X_valid,ntree_limit=xgboost.best_ntree_limit)[:, 1]
pred_proba_t = xgboost_t.predict_proba(X_valid[selected_columns],ntree_limit=xgboost_t.best_ntree_limit)[:, 1]

print('XGBClassifier ROC AUC SCORE: {}'.format(roc_auc_score(y_valid, pred_proba)))
print('XGBClassifier using feature selection ROC AUC SCORE: {}'.format(roc_auc_score(y_valid, pred_proba_t)))

In [None]:
# LightGBM on every feature. Training stops once validation AUC has not
# improved for 20 rounds.
lgb = LGBMClassifier(boosting_type='gbdt',n_estimators=500, learning_rate=0.05,objective='binary',metric='auc',is_unbalance=True,
                     feature_fraction=0.6,bagging_freq=8,bagging_fraction=0.6,max_depth=10,random_state=42,n_jobs=-1)
lgb= lgb.fit(X_train, y_train, eval_metric='auc',eval_set=(X_valid, y_valid),verbose=100,early_stopping_rounds= 20)

# Same configuration, trained on the selected features only.
lgb_t = LGBMClassifier(boosting_type='gbdt',n_estimators=500, learning_rate=0.05,objective='binary',metric='auc',is_unbalance=True,
                     feature_fraction=0.6,bagging_freq=8,bagging_fraction=0.6,max_depth=10,random_state=42,n_jobs=-1)
# Train on exactly the columns used for evaluation and prediction below. A
# hand-typed column list here could drift from `selected_columns` whenever
# the feature selection changes.
X_train_t_dataframe = X_train[selected_columns]
lgb_t= lgb_t.fit(X_train_t_dataframe, y_train, eval_metric='auc',eval_set=(X_valid[selected_columns], y_valid),verbose=100,early_stopping_rounds= 20)

# Positive-class probabilities on the validation split.
pred_proba = lgb.predict_proba(X_valid)[:, 1]
pred_proba_t = lgb_t.predict_proba(X_valid[selected_columns])[:, 1]
print('Lightgbm ROC AUC SCORE: {}'.format(roc_auc_score(y_valid, pred_proba)))
print('Lightgbm using feature selection ROC AUC SCORE: {}'.format(roc_auc_score(y_valid, pred_proba_t)))

In [None]:
# CatBoost on every feature. use_best_model keeps the iteration with the best
# validation AUC, and training stops after 10 rounds without improvement.
cat = CatBoostClassifier(random_state=42,use_best_model=True, loss_function="Logloss",learning_rate=0.05, l2_leaf_reg=9, iterations=500, depth=6, eval_metric='AUC')
cat = cat.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=10, verbose=100,plot=True)

# CatBoost on the selected features only (higher learning rate, weaker L2).
cat_t = CatBoostClassifier(random_state=42,use_best_model=True, loss_function="Logloss",learning_rate=0.2, l2_leaf_reg=4, iterations=500, depth=6, eval_metric='AUC')
# Train on exactly the columns used for evaluation and prediction below. A
# hand-typed column list here could drift from `selected_columns` whenever
# the feature selection changes.
X_train_t_dataframe = X_train[selected_columns]
cat_t = cat_t.fit(X_train_t_dataframe, y_train, eval_set=(X_valid[selected_columns], y_valid), early_stopping_rounds=10, verbose=100,plot=True)


# Positive-class probabilities on the validation split.
pred_proba = cat.predict_proba(X_valid)[:, 1]
pred_proba_t = cat_t.predict_proba(X_valid[selected_columns])[:, 1]

print('CatBoost ROC AUC SCORE: {}'.format(roc_auc_score(y_valid, pred_proba)))
print('CatBoost using feature selection ROC AUC SCORE: {}'.format(roc_auc_score(y_valid, pred_proba_t)))

In [None]:
# Feed-forward binary classifier over the selected features. The input width
# is read from X_train_t rather than hard-coded, so the network stays valid
# if the number of selected features changes.
n_selected_features = X_train_t.shape[1]
model_t = keras.Sequential([
    layers.BatchNormalization(input_shape=[n_selected_features]),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.1),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid'),  # single probability for Response == 1
])
model_t.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)
# Stop after 5 epochs without an improvement of at least 0.001 and roll the
# model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
)
history_t = model_t.fit(
    X_train_t, y_train,
    validation_data=(X_valid[selected_columns], y_valid),
    batch_size=256,
    epochs=10,
    callbacks=[early_stopping],
)

# Plot the per-epoch training/validation curves and print the best values.
history_df_t = pd.DataFrame(history_t.history)
history_df_t.loc[:, ['loss', 'val_loss']].plot(title="Feature Selection Cross-entropy")
plt.show()
history_df_t.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Feature Selection Accuracy")
plt.show()
print(("Best Feature Selection Validation Loss: {:0.4f}" +\
      "\nBest Feature Selection Validation Accuracy: {:0.4f}")\
      .format(history_df_t['val_loss'].min(),
              history_df_t['val_binary_accuracy'].max()))

In [None]:
# Extract only the probability of Response == 1 from each feature-selected model.
pred_LogisticRegression = lr_t.predict_proba(test[selected_columns])[:,1]
pred_RandomForestClassifier = rfc_t.predict_proba(test[selected_columns])[:,1]
pred_XGBClassifier = xgboost_t.predict_proba(test[selected_columns])[:,1]
pred_LGBMClassifier = lgb_t.predict_proba(test[selected_columns])[:,1]
pred_CatBoostClassifier = cat_t.predict_proba(test[selected_columns])[:,1]
# The Keras network (model_t) is not part of the blend.

# Blend weights, in the same order as the predictions above.
w1 = 0.1     # logistic regression
w2 = 0.15    # random forest
w3 = 0.25    # XGBoost
w4 = 0.25    # LightGBM
w5 = 0.25    # CatBoost

# Weighted mean of the five models. np.average divides by the sum of the
# weights, so the result stays a probability in [0, 1]. The previous version
# multiplied the already-normalised weighted sum by a further 1/5, which
# squeezed every submitted score into [0, 0.2].
final = np.average(
    [
        pred_LogisticRegression,
        pred_RandomForestClassifier,
        pred_XGBClassifier,
        pred_LGBMClassifier,
        pred_CatBoostClassifier,
    ],
    axis=0,
    weights=[w1, w2, w3, w4, w5],
)

# Re-read the raw test file to recover the `id` column for the submission.
test_temp = pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/test.csv')
submit = pd.DataFrame({'id': test_temp.id, 'Response': final})
submit.to_csv('blend_results.csv', index=False)