# Health Insurance Lead Prediction

### Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,precision_score, recall_score,f1_score
from sklearn.metrics import roc_auc_score, roc_curve


### Import Train dataset and Test dataset

In [None]:
# Read the training split from the Kaggle input directory.
train_df =pd.read_csv("/kaggle/input/jobathon-analytics-vidhya/train.csv")

In [None]:
# Read the test split from the same Kaggle dataset directory.
test_df = pd.read_csv("/kaggle/input/jobathon-analytics-vidhya/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Print the (rows, columns) shape of each frame.
print("Train Dataset_size :" ,train_df.shape)
print("Test Dataset_size  :",test_df.shape)

## EDA

### Data Cleaning

In [None]:
# Number of missing values in each training column.
train_df.isnull().sum()

In [None]:
# Number of missing values in each test column.
test_df.isnull().sum()

In [None]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test_df.info()

In [None]:
# Percentage of null values per training column; only columns that actually
# contain nulls are kept, ordered from the smallest share to the largest.

train_null_counts = train_df.isnull().sum()
train_miss = train_null_counts / len(train_df) * 100
train_miss = train_miss.loc[train_miss > 0].sort_values()
train_miss

In [None]:
# Heatmap of the boolean null mask of the training frame (one cell per value).
sns.heatmap(train_df.isnull())

In [None]:
# Bar chart of the per-column null percentages computed in `train_miss`.
train_miss.plot(kind = 'bar')

In [None]:
# Percentage of null values per test column; only columns that actually
# contain nulls are kept, ordered from the smallest share to the largest.

test_null_counts = test_df.isnull().sum()
test_miss = test_null_counts / len(test_df) * 100
test_miss = test_miss.loc[test_miss > 0].sort_values()
test_miss

In [None]:
# Heatmap of the boolean null mask of the test frame (one cell per value).
sns.heatmap(test_df.isnull())

In [None]:
# Bar chart of the per-column null percentages computed in `test_miss`.
test_miss.plot(kind = "bar")

### Summary of the train data

In [None]:
summary = train_df.describe()
summary = summary.transpose()
summary

In [None]:
summary_test = test_df.describe()
summary_test = summary_test.transpose()
summary_test

## Response

In [None]:
train_df['Response'].value_counts()

In [None]:
sns.countplot(train_df['Response'])

###  Label Encoder

In [None]:
# Encode City_Code with ONE mapping shared by both frames. Fitting a separate
# encoder per frame assigns integers by each frame's own sorted set of codes,
# so the same city could receive different integers in train and test whenever
# the two sets differ.
labelEncoder= LabelEncoder()
labelEncoder.fit(pd.concat([train_df['City_Code'], test_df['City_Code']], ignore_index=True))
train_df['City_Code'] = labelEncoder.transform(train_df['City_Code'])
test_df['City_Code'] = labelEncoder.transform(test_df['City_Code'])

## Holding_Policy_Duration

In [None]:
train_df['Holding_Policy_Duration'].value_counts()

In [None]:
sns.countplot('Holding_Policy_Duration', data = train_df, hue = 'Response')

In [None]:
# Remove every '+' character and convert the column to numbers; anything that
# still cannot be parsed becomes NaN (errors='coerce').
# - The pattern is a raw string: '\+' in a normal string literal is an invalid
#   escape sequence.
# - The cleaned Series is assigned back instead of using `inplace=True` on a
#   column selection, which does not update the parent frame under pandas
#   Copy-on-Write.
for frame in (train_df, test_df):
    duration = frame['Holding_Policy_Duration'].replace(to_replace=r'\+', value='', regex=True)
    frame['Holding_Policy_Duration'] = pd.to_numeric(duration, errors='coerce')

## Accomodation_Type

In [None]:
train_df['Accomodation_Type'].value_counts()

In [None]:
sns.countplot('Accomodation_Type', data = train_df, hue = 'Response')

In [None]:
# Encode Accomodation_Type as numbers: Rented -> 0, Owned -> 1.
# The previous version wrote the strings '0'/'1', leaving an object column,
# and relied on chained `inplace=True`, which does not update the parent frame
# under pandas Copy-on-Write.
# Values not in the mapping are passed through unchanged to to_numeric, so the
# cell can be re-run safely; anything non-numeric ends up as NaN.
accomodation_codes = {'Rented': 0, 'Owned': 1}
for frame in (train_df, test_df):
    recoded = frame['Accomodation_Type'].map(lambda value: accomodation_codes.get(value, value))
    frame['Accomodation_Type'] = pd.to_numeric(recoded, errors='coerce')

## Reco_Insurance_Type

In [None]:
train_df['Reco_Insurance_Type'].value_counts()

In [None]:
sns.countplot('Reco_Insurance_Type', data = train_df, hue = 'Response')

In [None]:
# Encode Reco_Insurance_Type as numbers: Individual -> 0, Joint -> 1.
# The previous version wrote the strings '0'/'1', leaving an object column,
# and relied on chained `inplace=True`, which does not update the parent frame
# under pandas Copy-on-Write.
# Values not in the mapping are passed through unchanged to to_numeric, so the
# cell can be re-run safely; anything non-numeric ends up as NaN.
insurance_type_codes = {'Individual': 0, 'Joint': 1}
for frame in (train_df, test_df):
    recoded = frame['Reco_Insurance_Type'].map(lambda value: insurance_type_codes.get(value, value))
    frame['Reco_Insurance_Type'] = pd.to_numeric(recoded, errors='coerce')

## Is_Spouse

In [None]:
train_df['Is_Spouse'].value_counts()

In [None]:
sns.countplot('Is_Spouse', data = train_df, hue = 'Response')

In [None]:
# Encode Is_Spouse as numbers, keeping the notebook's existing code
# assignment: Yes -> 0, No -> 1.
# The previous version wrote the strings '0'/'1', leaving an object column,
# and relied on chained `inplace=True`, which does not update the parent frame
# under pandas Copy-on-Write.
# Values not in the mapping are passed through unchanged to to_numeric, so the
# cell can be re-run safely; anything non-numeric ends up as NaN.
spouse_codes = {'Yes': 0, 'No': 1}
for frame in (train_df, test_df):
    recoded = frame['Is_Spouse'].map(lambda value: spouse_codes.get(value, value))
    frame['Is_Spouse'] = pd.to_numeric(recoded, errors='coerce')

## Health Indicator

In [None]:
# Frequency of each Health Indicator category in the training frame.
train_df['Health Indicator'].value_counts()

In [None]:
# Frequency of each Health Indicator category in the test frame.
test_df['Health Indicator'].value_counts()

In [None]:
train_df['Health Indicator'].replace(to_replace='X1', value='0', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X2', value='1', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X3', value='2', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X4', value='3', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X5', value='4', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X6', value='5', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X7', value='6', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X8', value='7', regex=True, inplace=True)
train_df['Health Indicator'].replace(to_replace='X9', value='8', regex=True, inplace=True)

In [None]:
test_df['Health Indicator'].replace(to_replace='X1', value='0', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X2', value='1', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X3', value='2', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X4', value='3', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X5', value='4', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X6', value='5', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X7', value='6', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X8', value='7', regex=True, inplace=True)
test_df['Health Indicator'].replace(to_replace='X9', value='8', regex=True, inplace=True)

### Holding_Policy_Type

In [None]:
train_df['Holding_Policy_Type'].value_counts()

In [None]:
test_df['Holding_Policy_Type'].value_counts()

In [None]:
sns.countplot('Holding_Policy_Type', data = train_df, hue = 'Response')

###  Upper_Age

In [None]:
plt.hist(train_df['Upper_Age'], bins = 20)


In [None]:
plt.hist(test_df['Upper_Age'], bins = 20)

In [None]:
train_df["Mean_Age"] = (train_df.Upper_Age + train_df.Lower_Age)/2

### Lower_Age

In [None]:
plt.hist(train_df['Lower_Age'], bins = 20)

In [None]:
plt.hist(test_df['Lower_Age'], bins = 20)

In [None]:
test_df["Mean_Age"] = (test_df.Upper_Age + test_df.Lower_Age)/2

### Handling Null Values 

In [None]:
# Fill missing values in both frames with the median of the TRAINING column.
cols = ['Holding_Policy_Duration','Holding_Policy_Type','Health Indicator']
for col in cols:
    print('Imputation with Median: %s' % (col))
    # Coerce to numbers first: a column that still holds numeric strings makes
    # median() fail on recent pandas.
    train_values = pd.to_numeric(train_df[col], errors='coerce')
    # Compute the fill value once so both frames receive exactly the same number.
    fill_value = train_values.median()
    # Assign the filled Series back: `fillna(..., inplace=True)` on a column
    # selection does not update the parent frame under pandas Copy-on-Write.
    train_df[col] = train_values.fillna(fill_value)
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce').fillna(fill_value)

In [None]:
# Re-count missing values per training column after imputation.
train_df.isnull().sum()

In [None]:
# Re-count missing values per test column after imputation.
test_df.isnull().sum()

In [None]:
# Remove the two raw age-bound columns from both frames.
age_bound_columns = ['Upper_Age', 'Lower_Age']
train_df = train_df.drop(columns=age_bound_columns)
test_df = test_df.drop(columns=age_bound_columns)

In [None]:
# Column labels of the training frame after the drops above.
train_df.columns

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Column labels of the test frame after the drops above.
test_df.columns

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Preview the prepared training rows.
train_df.head()

In [None]:
# Preview the prepared test rows.
test_df.head()

### Splitting X, Y, Train data and Test data

In [None]:
X = train_df.drop('Response', axis = 1)
Y = train_df.Response

In [None]:
randomsample=  RandomOverSampler()
x_new,y_new=randomsample.fit_resample(X,Y)

In [None]:
sc_data= StandardScaler().fit_transform(x_new.values)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(np.array(sc_data), np.array(y_new), test_size=0.30)
eval_set=[(X_test, y_test)]

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the evaluation target vector.
y_test.shape

## XG Boost Model

In [None]:
XG_model = XGBClassifier(objective="binary:logistic", learning_rate=0.05, seed=9616, 
                                       max_depth=20, gamma=10, n_estimators=500)

In [None]:
xg_model = XG_model.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="auc", eval_set=eval_set, verbose=True)
xg_model

In [None]:
# Mean accuracy of the fitted XGBoost model on each partition.
xgb_partitions = (('Train accuracy', X_train, y_train),
                  ('Test accuracy', X_test, y_test))
for caption, features, target in xgb_partitions:
    print(caption, xg_model.score(features, target))

## Model Accuracy Score

In [None]:
xg_pred = xg_model.predict(X_test)

In [None]:
xg_cm = confusion_matrix(y_test, xg_pred)
print(xg_cm)

In [None]:
xg_classification_report = classification_report(y_test, xg_pred)
print(xg_classification_report)

###  XG Boost Train accuracy

In [None]:
xg_pred = pd.DataFrame( { 'actual':  y_train,   'predicted': XG_model.predict( X_train ) } )
xg_pred

In [None]:
xg_cm1 = metrics.confusion_matrix( xg_pred.actual,  xg_pred.predicted)
xg_cm1

In [None]:
xg_clf = classification_report(xg_pred.actual,  xg_pred.predicted)
print(xg_clf)

In [None]:
print("accuracy_score is  :", accuracy_score(xg_pred.actual,  xg_pred.predicted))
print("precision score is :",precision_score(xg_pred.actual,  xg_pred.predicted))
print("recall is          :",recall_score(xg_pred.actual,  xg_pred.predicted))
print("F1_score is        :",f1_score(xg_pred.actual,   xg_pred.predicted))

In [None]:


# ROC analysis needs a continuous score per row. Feeding hard 0/1 predictions
# (as before) collapses the curve to a single operating point and misreports
# the AUC, so the positive-class probability is used instead.
train_scores = XG_model.predict_proba(X_train)[:, 1]
print("ROC_AUC Score :", roc_auc_score(xg_pred.actual, train_scores))

fpr,tpr,thresholds=roc_curve(xg_pred.actual, train_scores)
plt.figure(figsize=(9,6))
plt.plot(fpr,tpr,color='blue',label='ROC')
# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0,1],[0,1],color='green',linestyle='--')
plt.title('ROC Curve of  XG Boost Model')
plt.xlabel('FalsePositiveRate')
plt.ylabel('TruePositiveRate')
plt.grid()
plt.legend()
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

### XG Boost Test accuracy

In [None]:
xg_test_pred = pd.DataFrame( { 'actual':  y_test, 'predicted': XG_model.predict( X_test ) } )
xg_test_pred

In [None]:
xg_cm1 = metrics.confusion_matrix( xg_test_pred.actual,   xg_test_pred.predicted)
print(xg_cm1)

In [None]:
print("accuracy_score is  :", accuracy_score(xg_test_pred.actual,  xg_test_pred.predicted))
print("precision score is :",precision_score(xg_test_pred.actual,  xg_test_pred.predicted))
print("recall is          :",recall_score(xg_test_pred.actual,    xg_test_pred.predicted))
print("F1_score is        :",f1_score(xg_test_pred.actual,  xg_test_pred.predicted))

In [None]:


# ROC analysis needs a continuous score per row. Feeding hard 0/1 predictions
# (as before) collapses the curve to a single operating point and misreports
# the AUC, so the positive-class probability is used instead.
test_scores = XG_model.predict_proba(X_test)[:, 1]
print("ROC_AUC Score :", roc_auc_score(xg_test_pred.actual, test_scores))

fpr,tpr,thresholds=roc_curve(xg_test_pred.actual, test_scores)
plt.figure(figsize=(9,6))
plt.plot(fpr,tpr,color='blue',label='ROC')
# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0,1],[0,1],color='green',linestyle='--')
plt.title('ROC Curve of  XG Boost Model')
plt.xlabel('FalsePositiveRate')
plt.ylabel('TruePositiveRate')
plt.grid()
plt.legend()
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

## Random Forest Model

In [None]:
# Random forest with 10 trees. Two arguments from the old parameter dump were
# changed so the constructor works on current scikit-learn:
#   - `min_impurity_split` is dropped (removed in scikit-learn 1.0);
#   - `max_features='auto'` becomes 'sqrt', which is what 'auto' meant for
#     classifiers before the alias was removed in scikit-learn 1.3.
rf_clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
# fit() returns the estimator itself, so rf_model and rf_clf are the same object.
rf_model = rf_clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted random forest on each partition.
rf_partitions = (('Train accuracy :', X_train, y_train),
                 ('Test accuracy  :', X_test, y_test))
for caption, features, target in rf_partitions:
    print(caption, rf_model.score(features, target))

###  Model Accuracy score

In [None]:
ran_forest_pred = rf_model.predict(X_test)
print(ran_forest_pred)

In [None]:
ran_forest_cm = confusion_matrix(y_test, ran_forest_pred)
print(ran_forest_cm)

In [None]:
random_forest_classification_report = classification_report(y_test, ran_forest_pred)
print(random_forest_classification_report)

In [None]:
print("accuracy_score is  :", accuracy_score(y_test, ran_forest_pred))
print("precision score is :",precision_score(y_test, ran_forest_pred))
print("recall is          :",recall_score(y_test, ran_forest_pred))
print("F1_score is        :",f1_score(y_test, ran_forest_pred))

In [None]:


# ROC analysis needs a continuous score per row. Feeding hard 0/1 predictions
# (as before) collapses the curve to a single operating point and misreports
# the AUC, so the positive-class probability is used instead.
rf_scores = rf_model.predict_proba(X_test)[:, 1]
print("ROC_AUC Score :",roc_auc_score(y_test, rf_scores))

fpr,tpr,thresholds=roc_curve(y_test, rf_scores)
plt.figure(figsize=(9,6))
plt.plot(fpr,tpr,color='blue',label='ROC')
# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0,1],[0,1],color='green',linestyle='--')
plt.title('ROC Curve of  Random Forest Model')
plt.xlabel('FalsePositiveRate')
plt.ylabel('TruePositiveRate')
plt.grid()
plt.legend()
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

## KNN Model

In [None]:
# Misclassification rate on X_test for every K from 1 to 19.
# NOTE: `knn` is left bound to the last model of the sweep (K=19) once the
# loop ends.
error_rate = []

for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    y_pred_i = knn.predict(X_test)
    mismatches = y_pred_i != y_test
    error_rate.append(np.mean(mismatches))

In [None]:
# Error rate recorded by the sweep, plotted against K (1..19).
k_values = range(1, 20)
plt.figure(figsize=(10, 5))
plt.plot(k_values, error_rate, color='blue', linestyle='--',
         marker='.', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
knn_model = KNeighborsClassifier(n_neighbors = 5, p=2)
knn_model.fit(X_train,y_train)

In [None]:
print('Train accuracy :',knn_model.score(X_train, y_train))
print('Test accuracy  :',knn_model.score(X_test, y_test))

### KNN Model Accuracy score

In [None]:
# Predict with the final model (`knn_model`, K=5). The previous code used
# `knn`, the loop variable left over from the K sweep (its last value, K=19),
# so the report below described a different model from the one scored above.
knn_pred = knn_model.predict(X_test)

In [None]:
# Confusion matrix followed by the per-class report for the KNN predictions.
knn_confusion = confusion_matrix(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)
print("Confusion_Matrix :")
print(knn_confusion)
print("Classifisction_report :")
print('\n')
print(knn_report)