In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Introduction

## 1.1. Libraries 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from scipy import stats
import warnings
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, GroupShuffleSplit, KFold, train_test_split
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LassoCV, LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



warnings.filterwarnings("ignore")

## 1.2. Datasets

In [None]:
# Read the three competition CSVs: training data, test data and the submission template.
train = pd.read_csv(
    "/kaggle/input/health-insurance-cross-sell-prediction/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/health-insurance-cross-sell-prediction/test.csv"
)
sample_submission = pd.read_csv(
    "/kaggle/input/health-insurance-cross-sell-prediction/sample_submission.csv"
)

## 1.3. Superficial Examination

![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQplV4m4ZEr1HaVCGtNeMFB9azlaH7OOYmuzPMTma6Z0Q&usqp=CAU&ec=45702844)

In [None]:
# Report row/column counts of both frames; DataFrame.shape is a (rows, columns) tuple.
print("Test Row : %d | Test Column : %d" % test.shape)
print("*" * 40)
print("Train Row : %d | Train Column : %d" % train.shape)

In [None]:
train.head()

In [None]:
train.info()

In [None]:
test.info()

In [None]:
train.describe().T

In [None]:
test.describe().T

I can already say that Age and Annual_Premium are skewed, but I have to look more deeply.

In [None]:
#skew and kurtosis function
def skewANDkurtosis(data,column):
    """Print the skewness and kurtosis of one column on a single line.

    Parameters
    ----------
    data : DataFrame-like object indexed by column name.
    column : str, name of the column to summarise.

    The values come from the column's own ``.skew()`` and ``.kurtosis()``
    methods; nothing is returned.
    """
    skew_text = str(data[column].skew())
    kurt_text = str(data[column].kurtosis())
    left = column + " - Skewness : " + skew_text
    right = column + " - Kurtosis : " + kurt_text
    print(left + " | " + right)

In [None]:
# Print skewness/kurtosis of the two continuous columns, train first, then test,
# with a spacer line after each dataset.
for dt in (train, test):
    for col in ("Age", "Annual_Premium"):
        skewANDkurtosis(dt, col)
    print(" ")

### Situations of Skew

1. A symmetrical distribution will have a skewness of 0.
1. If the skewness is between -0.5 and 0.5, the data are fairly symmetrical.
1. If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed.
1. If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed.

### Situations of Kurtosis

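* **Mesokurtic (Kurtosis = 3)** : The normal distribution, which has a kurtosis of three, is the reference point. Note that pandas' `.kurtosis()` (used above) reports *excess* kurtosis (Fisher's definition), where the normal distribution scores 0, so the printed values should be compared against 0 rather than 3.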
* **Leptokurtic(Kurtosis > 3)** : Distribution is longer, tails are fatter. The peak is higher and sharper than Mesokurtic, which means that data are heavy-tailed or profusion of outliers.
* **Platykurtic(Kurtosis < 3)** :  Distribution is shorter, tails are thinner than the normal distribution. The peak is lower and broader than Mesokurtic, which means that data are light-tailed or lack of outliers.

#### Comment : Age and Annual_Premium have very high kurtosis, so their distributions have a much higher and sharper peak in each dataset.

In [None]:
#---------------LAYOUT-----------------------
fig, ax = plt.subplots(7,2,figsize=(20,50))
#---------------GRAPHS-----------------------
sns.countplot(x="Gender", data=train, ax=ax[0,0])
sns.countplot(x="Gender", data=test, ax=ax[0,1])
sns.distplot(train["Age"], ax=ax[1,0])
sns.distplot(test["Age"], ax=ax[1,1])
sns.countplot("Driving_License",data = train, ax=ax[2,0])
sns.countplot("Driving_License",data  = test, ax=ax[2,1])
sns.countplot("Previously_Insured",data = train,ax=ax[3,0])
sns.countplot("Previously_Insured",data = test,ax=ax[3,1])
sns.countplot("Vehicle_Age",data = train,ax=ax[4,0])
sns.countplot("Vehicle_Age",data = test,ax=ax[4,1])
sns.countplot("Vehicle_Damage",data = train,ax=ax[5,0])
sns.countplot("Vehicle_Damage",data = test,ax=ax[5,1])
sns.distplot(train["Annual_Premium"], ax=ax[6,0])
sns.distplot(test["Annual_Premium"], ax=ax[6,1])


#--------TITLES-----------------------------
ax[0,0].set_title("Train - Gender")
ax[0,1].set_title("Test - Gender")
ax[1,0].set_title("Train - Age")
ax[1,1].set_title("Test - Age")
ax[2,0].set_title("Train - Driving_License")
ax[2,1].set_title("Test - Driving_License")
ax[3,0].set_title("Train - Previously_Insured")
ax[3,1].set_title("Test - Previously_Insured")
ax[4,0].set_title("Train - Vehicle_Age")
ax[4,1].set_title("Test - Vehicle_Age")
ax[5,0].set_title("Train - Vehicle_Damage")
ax[5,1].set_title("Test - Vehicle_Damage")
ax[6,0].set_title("Train - Annual_Premium")
ax[6,1].set_title("Test - Annual_Premium");

In [None]:
# Frequency of every Region_Code in train (top) and test (bottom).
fig, ax = plt.subplots(2,1,figsize = (25,15))
# The column is passed as x=...; seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x="Region_Code",data = train,ax=ax[0])
sns.countplot(x="Region_Code",data = test,ax=ax[1])
ax[0].set_title("Train - Region_Code")
ax[1].set_title("Test - Region_Code");

In [None]:
train.Response.value_counts().plot.bar()

# 2. Exploratory Data Analysis (EDA)

## 2.2. Bivariate Analysis

In [None]:
train.head()

**a-) Qualitative data**
* Gender
* Driving_License 
* Region_Code
* Previously_Insured
* Vehicle_Damage
* Policy_Sales_Channel
* Vehicle_Age
* Response(train)

**b-) Quantitative data**
* Age
* Annual_Premium
* Vintage


### 2.2.1. Gender 

In [None]:
# Gender counts broken down by three other categorical columns, one panel per hue.
fig, ax = plt.subplots(3, 1, figsize=(10, 7))
for axis, hue_col in zip(ax, ["Vehicle_Damage", "Response", "Vehicle_Age"]):
    sns.countplot(x="Gender", hue=hue_col, data=train, ax=axis)

### 2.2.2. Age

In [None]:
# Age density curves, one per Response value, on a single wide facet starting at 0.
age_upper = train['Age'].max()
Age = (
    sns.FacetGrid(data=train, hue='Response', aspect=6)
    .map(sns.kdeplot, 'Age', shade=True)
    .set(xlim=(0, age_upper))
)
Age.add_legend()

In [None]:
# Age density curves, one per Gender value, on a single wide facet starting at 0.
age_upper = train['Age'].max()
Age = (
    sns.FacetGrid(data=train, hue='Gender', aspect=6)
    .map(sns.kdeplot, 'Age', shade=True)
    .set(xlim=(0, age_upper))
)
Age.add_legend()

In [None]:
# Age density curves, one per Vehicle_Damage value, on a single wide facet starting at 0.
age_upper = train['Age'].max()
Age = (
    sns.FacetGrid(data=train, hue='Vehicle_Damage', aspect=6)
    .map(sns.kdeplot, 'Age', shade=True)
    .set(xlim=(0, age_upper))
)
Age.add_legend()

### 2.2.3. Driving_License

In [None]:
# Driving_License counts broken down by four other categorical columns, one panel per hue.
fig, ax = plt.subplots(4, 1, figsize=(10, 7))
for axis, hue_col in zip(ax, ["Vehicle_Damage", "Response", "Vehicle_Age", "Gender"]):
    sns.countplot(x="Driving_License", hue=hue_col, data=train, ax=axis)

In [None]:
# Remove the Driving_License column from both frames.
train, test = (frame.drop("Driving_License", axis=1) for frame in (train, test))

### 2.2.4. Region_Code

In [None]:
# Response counts for the customers whose Region_Code is 28.
is_region_28 = train["Region_Code"] == 28.0
region = train.loc[is_region_28]
sns.countplot(x="Region_Code", hue="Response", data=region)

In [None]:
# Number of customers per Region_Code, counted separately for each outcome.
# "id" is used only as a column to count rows with.
# NOTE(review): these counts are derived from the target over the whole training
# frame, before the train/test split later in the notebook, so a feature built
# from them carries label information into the validation rows.
yes = (
    train[train.Response == 1.0]
    .groupby("Region_Code")["id"]
    .count()
    .sort_values()
    .to_frame("YES")
)
no = (
    train[train.Response == 0.0]
    .groupby("Region_Code")["id"]
    .count()
    .sort_values()
    .to_frame("NO")
)

# Outer join on the Region_Code index: a region that has only one of the two
# outcomes keeps its row with a count of 0 instead of disappearing, so the later
# merge back onto `train` cannot silently drop that region's customers.
region_Code = yes.join(no, how="outer").fillna(0).astype(int)

def change(value):
    """Map a count to a coarse level label.

    Returns "High" when value >= 4000, "Medium" when 1000 < value < 4000,
    and "Low" otherwise (which includes exactly 1000).

    Note: a second function named ``change`` with different thresholds is
    defined later in this notebook and replaces this one once that cell runs.
    """
    if value >= 4000:
        return "High"
    return "Medium" if 1000 < value < 4000 else "Low"

# Label every region from its YES count, reduce the lookup table to
# (Region_Code, Region_St) and attach the label to each training row.
region_Code["Region_St"] = region_Code["YES"].map(change)

region_Code = region_Code.reset_index(level=["Region_Code"])
region_Code = region_Code.drop(columns=["YES", "NO"])

train = train.merge(region_Code, how="inner", on="Region_Code")


In [None]:
train.head()

In [None]:
sns.countplot(x = "Region_St", hue = "Response", data = train);

In [None]:
# Region_Code has been replaced by Region_St, so the raw code column is removed.
train = train.drop(columns="Region_Code")

### 2.2.5. Previously_Insured

In [None]:
# Previously_Insured counts broken down by four other categorical columns, one panel per hue.
fig, ax = plt.subplots(4, 1, figsize=(10, 7))
for axis, hue_col in zip(ax, ["Vehicle_Damage", "Response", "Vehicle_Age", "Gender"]):
    sns.countplot(x="Previously_Insured", hue=hue_col, data=train, ax=axis)

In [None]:
# Overlay the Age distribution of the two Previously_Insured groups on one axes:
# red for Previously_Insured == 0, green for Previously_Insured == 1.
for flag, colour in [(0, 'r'), (1, 'g')]:
    ages = train.loc[train['Previously_Insured'] == flag, 'Age']
    sns.distplot(ages, kde=True, color=colour, bins=5)

### 2.2.6 Policy_Sales_Channel	

In [None]:
# Number of customers per Policy_Sales_Channel, counted separately for each outcome.
# "id" is used only as a column to count rows with.
# NOTE(review): these counts are derived from the target over the whole training
# frame, before the train/test split later in the notebook, so a feature built
# from them carries label information into the validation rows.
yes = (
    train[train.Response == 1.0]
    .groupby("Policy_Sales_Channel")["id"]
    .count()
    .sort_values()
    .to_frame("YES")
)
no = (
    train[train.Response == 0.0]
    .groupby("Policy_Sales_Channel")["id"]
    .count()
    .sort_values()
    .to_frame("NO")
)

# Outer join on the channel index: a channel that has only one of the two
# outcomes keeps its row with a count of 0 instead of disappearing, so the later
# merge back onto `train` cannot silently drop that channel's customers.
Policy_Sales_Channel = yes.join(no, how="outer").fillna(0).astype(int)

# Ratio of positive to negative responses per channel (odds, not a share of the total).
# A channel with YES == 0 gets 0.0; a channel with NO == 0 gets inf.
Policy_Sales_Channel["RATE"] = Policy_Sales_Channel.YES / Policy_Sales_Channel.NO

def change(value):
    """Map a ratio to a coarse level label.

    Returns "High" for value >= 0.40, "Middle" for 0.25 <= value < 0.40,
    "Low Middle" for 0.1 <= value < 0.25 and "Low" for anything else.

    Note: this redefines the ``change`` function declared earlier in the
    notebook (which uses count thresholds).
    """
    if value >= 0.40:
        return "High"
    if value >= 0.25:
        return "Middle"
    if value >= 0.1:
        return "Low Middle"
    return "Low"
        
# Label every channel from its RATE, reduce the lookup table to
# (Policy_Sales_Channel, Policy_Channel_St) and attach the label to each training row.
Policy_Sales_Channel["Policy_Channel_St"] = Policy_Sales_Channel["RATE"].map(change)

Policy_Sales_Channel = Policy_Sales_Channel.reset_index(level=["Policy_Sales_Channel"])
Policy_Sales_Channel = Policy_Sales_Channel.drop(columns=["YES", "NO", "RATE"])

train = train.merge(Policy_Sales_Channel, how="inner", on="Policy_Sales_Channel")

In [None]:
# Policy_Sales_Channel has been replaced by Policy_Channel_St, so the raw column is removed.
train = train.drop(columns="Policy_Sales_Channel")

## 2.3 Outlier 

In [None]:
# Cap outliers of the continuous columns with the 1.5 * IQR rule, computed
# separately inside each Response class.
# NOTE(review): the caps depend on the target, so this step cannot be repeated
# on data without a Response column — confirm this is intended.
index_list = []  # not used by this cell
for column in ["Age", "Annual_Premium","Vintage"]:
    # The caps are generally fractional, so make the column float up front
    # instead of relying on an implicit upcast when a float is written into an
    # integer column (deprecated in recent pandas versions).
    train[column] = train[column].astype(float)
    for resp in train.Response.unique():
        in_class = train["Response"] == resp

        Q1, Q3 = np.percentile(train.loc[in_class, column], [25, 75])
        IQR = Q3 - Q1
        STEP = 1.5 * IQR
        MAX_BORDER = Q3 + STEP
        MIN_BORDER = Q1 - STEP

        # Values beyond a border are replaced by that border.
        train.loc[in_class, column] = train.loc[in_class, column].clip(
            lower=MIN_BORDER, upper=MAX_BORDER
        )
        

In [None]:
train.head()

## 2.4. MODEL SELECTION

### 2.4.1. ENCODING

In [None]:
# Integer-encode the two string columns. LabelEncoder expects a 1-D array of
# labels, so the Series is passed rather than a one-column DataFrame (which
# triggers scikit-learn's column-vector DataConversionWarning).
train["Gender"] = LabelEncoder().fit_transform(train["Gender"])
train["Vehicle_Damage"] = LabelEncoder().fit_transform(train["Vehicle_Damage"])
# One-hot encode the remaining multi-level categorical columns.
train = pd.get_dummies(data=train, columns= ["Vehicle_Age","Region_St", "Policy_Channel_St"])

In [None]:
train.head()

### 2.4.2. Standardization

In [None]:
train.head()

In [None]:
# The row identifier is not a feature; remove it before modelling.
train = train.drop(columns="id")

In [None]:
# Target vector and feature matrix (every column except Response).
y = train["Response"]
X = train.drop(columns="Response")

### 2.4.3. Splitting 

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1845,
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### 2.4.3 Logistic Regression

In [None]:
# fit() returns the estimator itself, so `log` and `log_model` are the same fitted object.
log_model = log = LogisticRegression(solver="liblinear").fit(X_train, y_train)
log_model

In [None]:
log_model.intercept_

In [None]:
log_model.coef_

#### 2.4.3.3. Prediction And Model Tuning

In [None]:
y_pred = log_model.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
# precision = TP / (TP + FP), for the positive class (Response == 1).
# Computed from the confusion matrix of the current predictions rather than from
# hard-coded counts, so it stays correct when the notebook is re-run.
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]].
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
precision

In [None]:
# recall = TP / (TP + FN), for the positive class (Response == 1).
# Computed from the confusion matrix of the current predictions rather than from
# hard-coded counts, so it stays correct when the notebook is re-run.
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]].
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
recall

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# Score the positive class once and use the same probabilities for the AUC value
# and for the curve. Computing AUC from hard predict() labels would summarise a
# single threshold only and would not match the curve drawn below.
log_proba = log_model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, log_proba)
fpr, tpr, thresholds = roc_curve(y_test, log_proba)

plt.figure()
plt.plot(fpr, tpr, label = "AUC (area = %0.2f)"% logit_roc_auc)
plt.plot([0,1],[0,1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend(loc="lower right", fontsize=16)
plt.show()

### 2.4.4. Gaussian Naive Bayes

In [None]:
# fit() returns the estimator itself, so `nb` and `nb_model` are the same fitted object.
nb_model = nb = GaussianNB().fit(X_train, y_train)
nb_model

In [None]:
y_pred = nb_model.predict(X_test)

In [None]:
nb_model.predict_proba(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
cross_val_score(nb_model, X_train, y_train, cv=10).mean()

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# Score the positive class once and use the same probabilities for the AUC value
# and for the curve. Computing AUC from hard predict() labels would summarise a
# single threshold only and would not match the curve drawn below.
nb_proba = nb_model.predict_proba(X_test)[:,1]
nb_roc_auc = roc_auc_score(y_test, nb_proba)
fpr, tpr, thresholds = roc_curve(y_test, nb_proba)

plt.figure()
plt.plot(fpr, tpr, label = "AUC (area = %0.2f)"% nb_roc_auc)
plt.plot([0,1],[0,1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend(loc="lower right", fontsize=16)
plt.show()

### 2.4.5 KNN

In [None]:
# fit() returns the estimator itself, so `knn` and `knn_model` are the same fitted object.
knn_model = knn = KNeighborsClassifier().fit(X_train, y_train)
knn_model

In [None]:
y_predict = knn_model.predict(X_test)

In [None]:
cross_val_score(knn_model,X_train, y_train, cv = 10).mean()

In [None]:
accuracy_score(y_test,y_predict)

In [None]:
# Score the positive class once and use the same probabilities for the AUC value
# and for the curve. Computing AUC from hard predict() labels would summarise a
# single threshold only and would not match the curve drawn below.
knn_proba = knn_model.predict_proba(X_test)[:,1]
knn_roc_auc = roc_auc_score(y_test, knn_proba)
fpr, tpr, thresholds = roc_curve(y_test, knn_proba)

plt.figure()
plt.plot(fpr, tpr, label = "AUC (area = %0.2f)"% knn_roc_auc)
plt.plot([0,1],[0,1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend(loc="lower right", fontsize=16)
plt.show()

### 2.4.6 Neural Networks

In [None]:
# Weight initialisation and batch shuffling are random; a fixed seed (the same
# one used for the train/test split) makes the reported accuracy repeatable.
mlpc = MLPClassifier(random_state=1845).fit(X_train,y_train)

In [None]:
y_pred = mlpc.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

### 2.4.7 XGBoost

In [None]:
# Demonstration: fit on the original column names. XGBoost versions that validate
# feature names raise a ValueError here because a dummy column name contains "<".
# The error is caught and shown so that "Run All" continues to the fix below.
try:
    xgb_model = XGBClassifier().fit(X_train, y_train)
except ValueError as err:
    print("XGBoost rejected the feature names: " + str(err))

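### XGBoost is strict about feature names, so I deliberately did not change the column names above in order to show it. If your dataset has column names that are not strings, or that contain `[`, `]` or `<` (as the dummy column `Vehicle_Age_< 1 Year` does), XGBoost will give an error.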

In [None]:
X_train.head()

In [None]:
# Replace the Vehicle_Age dummy names (which contain "<", ">", "-" and spaces)
# with plain identifiers in both the training and the evaluation features.
columnss = {
    "Vehicle_Age_1-2 Year": "Vehicle_Age_1_2",
    "Vehicle_Age_< 1 Year": "Vehicle_Age_1",
    "Vehicle_Age_> 2 Years": "Vehicle_Age_2",
}
X_train, X_test = (part.rename(columns=columnss) for part in (X_train, X_test))

In [None]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))