In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from imblearn.pipeline import Pipeline, make_pipeline
# Metric list handed to every cross_validate call in the model cells below.
scoring = ['roc_auc']

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Read both CSV files and drop the `id` column from each frame right away.
train = pd.read_csv("/kaggle/input/health-insurance-cross-sell-prediction/train.csv").drop(columns=['id'])
test = pd.read_csv("/kaggle/input/health-insurance-cross-sell-prediction/test.csv").drop(columns=['id'])
train.head()

In [None]:
# Share of each Response class as a percentage of all rows.
# (The previous expression divided the count of 1s by the count of 0s, which is
# the positive-to-negative ratio, not the class percentage.)
100*train['Response'].value_counts(normalize=True)

Roughly 14 positive responses for every 100 negative ones, i.e. a class split of about 12% / 88%: a moderately imbalanced dataset.

In [None]:
#Checking for imbalance using shannon entropy
def balance(seq):
    """Return the normalised Shannon entropy of the class labels in ``seq``.

    The entropy of the observed class frequencies is divided by ``log(k)``,
    where ``k`` is the number of distinct labels, so the result is 1.0 when all
    classes are equally frequent and moves towards 0 as one class dominates.

    Parameters
    ----------
    seq : sized iterable of hashable labels (list, pandas Series, ...).

    Returns
    -------
    float
        Normalised entropy. Returns 0.0 when ``seq`` is empty or contains a
        single class, where the ratio would otherwise be 0/-inf or 0/0.
    """
    from collections import Counter
    from numpy import log

    n = len(seq)
    counts = Counter(seq)
    k = len(counts)
    # With fewer than two classes log(k) is 0 (or -inf for k == 0), so the
    # normalisation is undefined; report "no balance" instead of nan.
    if k < 2:
        return 0.0
    H = -sum((count / n) * log(count / n) for count in counts.values())
    return H / log(k)

# Normalised entropy of the target column: 1.0 would mean equally frequent classes.
balance(train['Response'])

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of missing values per column.
train.isnull().sum()

No missing values in the Dataset

In [None]:
# Row count per Gender value.
sns.countplot(data=train, x='Gender')

In [None]:
# Row count per Response value, with one bar per Gender.
sns.countplot(data=train, x='Response', hue="Gender")

In [None]:
# Only rows with Driving_License == 1 are plotted; the bars compare Gender within that group.
sns.countplot(data=train.loc[(train['Driving_License'] == 1)], x='Driving_License', hue="Gender")

In [None]:
# Row count per Response value, with one bar per Previously_Insured value.
sns.countplot(data=train, x='Response', hue="Previously_Insured")

In [None]:
# Response split, in percent, among rows where Previously_Insured == 1.
previously_insured = train['Previously_Insured'] == 1
train.loc[previously_insured, 'Response'].value_counts(normalize=True) * 100

99.9% of the customers who were previously insured gave a response of 0.

In [None]:
# Vehicle_Age counts, drawn as one panel per Response value.
sns.catplot(x="Vehicle_Age", col="Response",data=train, kind="count", height=4, aspect=.7)
# Alternative view that additionally splits every bar by Vehicle_Damage:
# sns.catplot(x="Vehicle_Age", hue="Vehicle_Damage", col="Response",data=train, kind="count", height=4, aspect=.7)

In [None]:
# Response counts split by Vehicle_Damage, drawn as one panel per Vehicle_Age value.
sns.catplot(x="Response", hue="Vehicle_Damage", col="Vehicle_Age",data=train, kind="count", height=4, aspect=.7)

In [None]:
# Distribution of Annual_Premium: density histogram with a KDE overlay.
# sns.distplot is deprecated (since seaborn 0.11); histplot with kde=True and
# stat='density' is its replacement. bins=50 mirrors distplot's bin cap.
sns.histplot(train['Annual_Premium'], bins=50, kde=True, stat='density')

In [None]:
# Distribution of Vintage: density histogram with a KDE overlay.
# sns.distplot is deprecated (since seaborn 0.11); histplot with kde=True and
# stat='density' is its replacement. bins=50 mirrors distplot's bin cap.
sns.histplot(train['Vintage'], bins=50, kde=True, stat='density')

In [None]:
#Gender
#label encode
# The encoder is fitted on the training column and reused for the test column
# so both frames share one string -> integer mapping.
from sklearn import preprocessing
le_g = preprocessing.LabelEncoder()
train['Gender'] = le_g.fit_transform(train['Gender'])
test['Gender'] = le_g.transform(test['Gender'])

# This is not the last expression of the cell, so it must be printed to be
# shown; as a bare expression its result was silently discarded.
print(train['Gender'].value_counts())
# Mean Response for each encoded Gender value.
plt.plot(train[['Gender', 'Response']].groupby(['Gender'], as_index=True).mean())

In [None]:
# Frequency of each Age value. TODO: decide whether Age should be binned into bands.
train['Age'].value_counts()

In [None]:
# Frequency of each Driving_License value. TODO: decide whether to drop this column.
train['Driving_License'].value_counts()

In [None]:
#Vehicle_Age
# Ordinal encoding: replace each age label with its rank, then cast the column
# to int. The same mapping is applied to the training and the test frame.
vehicle_age_codes = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}

for frame in (train, test):
    for label, code in vehicle_age_codes.items():
        frame.loc[frame['Vehicle_Age'] == label, 'Vehicle_Age'] = code
    frame['Vehicle_Age'] = frame['Vehicle_Age'].astype(int)

In [None]:
#Vehicle_Damage
#label encode
# Learn the label -> integer mapping from the training column only, then apply
# that same mapping to both frames.
from sklearn import preprocessing
le_vd = preprocessing.LabelEncoder().fit(train['Vehicle_Damage'])
train['Vehicle_Damage'] = le_vd.transform(train['Vehicle_Damage'])
test['Vehicle_Damage'] = le_vd.transform(test['Vehicle_Damage'])

In [None]:
#scale
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Columns to standardise; the scaler statistics come from the training frame
# and are then reused unchanged on the test frame.
scaled_columns = ['Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

ct = ColumnTransformer(
    [('somename', StandardScaler(), scaled_columns)],
    remainder='passthrough',
)

train[scaled_columns] = ct.fit_transform(train[scaled_columns])
test[scaled_columns] = ct.transform(test[scaled_columns])

In [None]:
# Separate the target column from the feature columns of the training frame.
y_train = train['Response']
x_train = train.drop('Response', axis=1)

In [None]:
# (rows, feature columns) of the training feature matrix.
x_train.shape

In [None]:
from sklearn.model_selection import KFold
# Five consecutive, unshuffled folds, passed as `cv` to the cross_validate calls below.
# NOTE(review): plain KFold does not stratify on the imbalanced target; consider StratifiedKFold.
kf = KFold(n_splits=5, shuffle=False)
def scoreline(scores, metric='roc_auc'):
    """Print the mean and standard deviation of one cross-validation metric.

    Parameters
    ----------
    scores : mapping
        Result of ``cross_validate``; must contain the key ``'test_' + metric``
        holding an array of per-fold scores.
    metric : str, default 'roc_auc'
        Scorer name whose per-fold test scores are summarised. The default
        reproduces the original ROC_AUC-only output exactly.

    Returns
    -------
    None
        The summary line is written to stdout.
    """
    fold_scores = scores['test_' + metric]
    print("%0.2f %s score with a standard deviation of %0.3f"
          % (fold_scores.mean(), metric.upper(), fold_scores.std()))

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline 1: logistic regression. The model is fitted on the full training
# set, then scored with cross-validation on the shared `kf` folds.
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)
cv_scores = cross_validate(classifier, x_train, y_train, scoring=scoring, cv=kf)
scoreline(cv_scores)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline 2: entropy-criterion decision tree. The model is fitted on the full
# training set, then scored with cross-validation on the shared `kf` folds.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)
cv_scores = cross_validate(classifier, x_train, y_train, scoring=scoring, cv=kf)
scoreline(cv_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 3: random forest with default hyper-parameters. The model is fitted
# on the full training set, then scored with cross-validation on the `kf` folds.
classifier = RandomForestClassifier(random_state=0).fit(x_train, y_train)
cv_scores = cross_validate(classifier, x_train, y_train, scoring=scoring, cv=kf)
scoreline(cv_scores)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search for the estimator currently bound to
# `classifier`, evaluated with 3-fold cross-validation.
param_grid={'bootstrap': [True, False],
 'max_depth': [1, 2, 12, None],
 # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is deprecated
 # and rejected by current scikit-learn releases.
 'max_features': ['sqrt'],
 'min_samples_leaf': [1, 2],
 'min_samples_split': [2, 5, 6],
 'n_estimators': [150, 180, 200, 220, 250]}

# Select on ROC AUC, the metric every other cell reports. Without `scoring`
# the search ranks candidates by accuracy, which is a poor criterion here
# because the target classes are imbalanced.
grid_search = GridSearchCV(estimator = classifier, param_grid = param_grid, scoring = 'roc_auc', cv = 3, n_jobs = -1)

# NOTE(review): the two parameter sets below were bare dict expressions pasted
# into this cell. They look like best parameters recorded from earlier
# searches; confirm before relying on them.
# {'bootstrap': True,
#  'max_depth': 12,
#  'max_features': 'auto',
#  'min_samples_leaf': 1,
#  'min_samples_split': 6,
#  'n_estimators': 200}

# Randomized-search alternative, kept for reference (not executed):
# random_grid={'bootstrap': [True, False],
#  'max_depth': [10, 50, 100, None],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [100, 200, 500, 1000, 1500]}
# rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 20, cv = 3, random_state=0)
grid_search.fit(x_train, y_train)

# {'n_estimators': 200,
#  'min_samples_split': 5,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': 10,
#  'bootstrap': False}

In [None]:
# Parameter combination that scored best in the grid search.
grid_search.best_params_

In [None]:
# Rebind `classifier` to the estimator the grid search selected.
classifier=grid_search.best_estimator_

In [None]:
# Cross-validate the current `classifier` on the `kf` folds and print its ROC AUC summary.
scoreline(cross_validate(classifier, x_train, y_train, scoring=scoring, cv = kf))

In [None]:
# Horizontal bar chart of `classifier`'s feature importances. argsort is
# ascending, so the least important feature ends up at the bottom.
features = x_train.columns
importances = classifier.feature_importances_
indices = np.argsort(importances)

bar_positions = range(len(indices))
sorted_names = [features[i] for i in indices]

plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, sorted_names)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

In [None]:
#xgBoost
from xgboost import XGBClassifier
# scale_pos_weight is meant to be (number of negatives) / (number of positives).
# Compute it from the target instead of hard-coding 86, which is far larger
# than that ratio for this dataset.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
classifier=XGBClassifier(scale_pos_weight=neg_pos_ratio, use_label_encoder=False)
classifier.fit(x_train, y_train)
scoreline(cross_validate(classifier, x_train, y_train, scoring=scoring, cv = kf))

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.ensemble import RandomForestClassifier

# SMOTE sits inside the pipeline, ahead of the random forest, and the whole
# pipeline is what gets cross-validated on the `kf` folds.
sm_rfc = make_pipeline(
    SMOTE(random_state=0),
    RandomForestClassifier(random_state=0),
)
sm_rfc_scores = cross_validate(sm_rfc, x_train, y_train, scoring=scoring, cv=kf)
scoreline(sm_rfc_scores)

In [None]:
#xgBoost
from xgboost import XGBClassifier

# SMOTE sits inside the pipeline, ahead of XGBoost, and the whole pipeline is
# what gets cross-validated on the `kf` folds.
sm_xg = make_pipeline(
    SMOTE(random_state=0),
    XGBClassifier(use_label_encoder=False),
)
sm_xg_scores = cross_validate(sm_xg, x_train, y_train, scoring=scoring, cv=kf)
scoreline(sm_xg_scores)

# SMOTEENN

In [None]:
from imblearn.combine import SMOTEENN

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the resampler, as the SMOTE pipelines do; an unseeded SMOTEENN gives a
# different resampled training set, and so a different score, on every run.
smt_rf = make_pipeline(SMOTEENN(random_state=0), RandomForestClassifier(random_state = 0))
scoreline(cross_validate(smt_rf, x_train, y_train, scoring=scoring, cv = kf))

In [None]:
#xgBoost
from xgboost import XGBClassifier
# Seed the resampler, as the SMOTE pipelines do; an unseeded SMOTEENN gives a
# different resampled training set, and so a different score, on every run.
smt_xg = make_pipeline(SMOTEENN(random_state=0), XGBClassifier(use_label_encoder=False))
scoreline(cross_validate(smt_xg, x_train, y_train, scoring=scoring, cv = kf))