In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Problem Statement-**
Our client is an insurance company that has provided Health Insurance to its customers. They now need your help in building a model to predict whether the policyholders (customers) from the past year will also be interested in the Vehicle Insurance provided by the company.

An insurance policy is an arrangement by which a company undertakes to provide a guarantee of compensation for specified loss, damage, illness, or death in return for the payment of a specified premium. A premium is a sum of money that the customer needs to pay regularly to an insurance company for this guarantee.

For example, you may pay a premium of Rs. 5000 each year for a health insurance cover of Rs. 200,000/- so that if, God forbid, you fall ill and need to be hospitalised in that year, the insurance provider company will bear the cost of hospitalisation etc. for up to Rs. 200,000. Now if you are wondering how the company can bear such a high hospitalisation cost when it charges a premium of only Rs. 5000/-, that is where the concept of probabilities comes into the picture. For example, like you, there may be 100 customers who would be paying a premium of Rs. 5000 every year, but only a few of them (say 2-3) would get hospitalised that year and not everyone. This way everyone shares the risk of everyone else.

Just like medical insurance, there is vehicle insurance where every year customer needs to pay a premium of certain amount to insurance provider company so that in case of unfortunate accident by the vehicle, the insurance provider company will provide a compensation (called ‘sum assured’) to the customer.

Building a model to predict whether a customer would be interested in Vehicle Insurance is extremely helpful for the company because it can then accordingly plan its communication strategy to reach out to those customers and optimise its business model and revenue.

Now, in order to predict, whether the customer would be interested in Vehicle insurance, you have information about demographics (gender, age, region code type), Vehicles (Vehicle Age, Damage), Policy (Premium, sourcing channel) etc.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training CSV of the cross-sell dataset and preview its first rows.
train_csv = "../input/health-insurance-cross-sell-prediction/train.csv"
df = pd.read_csv(train_csv)
df.head()

# **Basic Data Cleaning**

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of rows per value of the target column `Response`.
df['Response'].value_counts()

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Total number of missing cells in the whole frame (per-column null counts summed again).
df.isnull().sum().sum() #There are no null values.

In [None]:
# Remove the `id` column; it is not used as a feature anywhere below.
# errors="ignore" makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns="id", errors="ignore")

In [None]:
# Store the 0/1 flag columns and the target as object dtype so that the
# dtype-based split below places them with the categorical columns.
df = df.astype({
    'Driving_License': 'object',
    'Previously_Insured': 'object',
    'Response': 'object',
})

In [None]:
# Partition the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
df_cat = df.select_dtypes(include=['object'])
df_num = df.select_dtypes(exclude=['object'])

In [None]:
# Skewness and distribution of every numerical column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# documented replacement and draws the same histogram-plus-KDE view.
for i in df_num.columns:
    print(i)
    print(df_num[i].skew())
    sns.histplot(df_num[i], kde=True, stat="density")
    plt.show()

# **Exploratory Data Analysis**

In [None]:
# Columns currently held in the categorical (object dtype) frame.
df_cat.columns

In [None]:
# Columns currently held in the numerical frame.
df_num.columns

In [None]:
# Class balance of the target.
# The Series is passed as x= because seaborn >= 0.12 accepts only `data`
# positionally, so the bare positional call no longer plots the value counts.
sns.countplot(x=df_cat['Response'])
plt.show() #There is a huge data imbalance

In [None]:
# Count plot of every categorical predictor, split by the target.
# The slice skips the last column of df_cat, which is `Response` itself.
for feature in df_cat.columns[:-1]:
    sns.countplot(data=df_cat, x=feature, hue='Response')
    plt.show()

Conclusion-
1. Slightly more males than females are interested in vehicle insurance.
2. Customers who do not have a driving license are not interested in vehicle insurance.
3. Customers who already have vehicle insurance are not interested in it.
4. The largest group of customers interested in vehicle insurance have a vehicle that is between 1 and 2 years old.
5. Of all the customers who are interested in vehicle insurance, almost all have a history of vehicle damage.

In [None]:
# Box plot of every numerical column, grouped by the target.
for feature in df_num.columns:
    sns.boxplot(data=df, x='Response', y=feature)
    plt.show()

Conclusion-
(The centre line of a box plot is the median, so the comparisons below are between medians.)
1. The median age is higher for customers who are interested in vehicle insurance than for those who are not.
2. The median region code is the same for both responses.
3. The median annual premium is the same for both responses.
4. The median policy sales channel differs between the two responses.
5. The median vintage (in days) is the same for both responses.

In [None]:
# Annual premium against vintage, coloured by the target.
sns.scatterplot(data=df, x='Annual_Premium', y='Vintage', hue='Response')
plt.show()

There is no relationship between a customer's vintage (in days) and annual premium. Very few customers pay a very high premium, and the customers who are interested in vehicle insurance have low annual premiums.

In [None]:
# Age distribution per response, split by vehicle-damage history.
sns.boxplot(data=df, x='Response', y='Age', hue='Vehicle_Damage')
plt.show()

Customers who have a history of vehicle damage tend to be older (their median age is higher), irrespective of their response to vehicle insurance.

In [None]:
# Annual premium per response, split by whether the customer was previously insured.
plt.figure(figsize=(14, 10))
sns.boxplot(data=df, x='Response', y='Annual_Premium', hue='Previously_Insured')
plt.show()

If we compare the annual premium with the previously-insured status and the response, we can observe that the annual premium is higher for customers who do not have previous insurance, and that there are customers who already have insurance and are still interested in it. The annual premium is the same for customers who are not interested in vehicle insurance, irrespective of the status of their previous insurance.

# **Statistical Test**

In [None]:
#As we observed earlier in the EDA, some numerical columns have the same central value for both
#responses, so we now perform statistical tests to look for evidence to drop them.

In [None]:
# Import the public scipy.stats package. `scipy.stats.stats` is a deprecated
# private namespace (deprecated since SciPy 1.8, scheduled for removal).
from scipy import stats

In [None]:
# Compare every numerical column between Response == 1 and Response == 0 with
# an independent two-sample t-test and a Mann-Whitney U test, printing the
# column name followed by the two p-values (t-test first).
# The class masks do not depend on the column, so they are built once instead
# of re-filtering the whole frame on every iteration.
is_interested = df['Response'] == 1
is_not_interested = df['Response'] == 0
for i in df_num.columns:
    df_1 = df.loc[is_interested, i]
    df_0 = df.loc[is_not_interested, i]
    print(i)
    t_stat, pval = stats.ttest_ind(df_1, df_0)
    print(pval)
    # `alternative` is explicit: SciPy releases before 1.7 default to a one-sided
    # p-value, which is not comparable with the two-sided t-test above.
    u_stat, pval = stats.mannwhitneyu(df_1, df_0, alternative='two-sided')
    print(pval)

In [None]:
#As we can see, the Vintage column passes both tests, so at a significance level of 5%
#we fail to reject H0 for Vintage, which means the mean number of vintage days is the same
#for both responses; hence we can drop the column.

In [None]:
#We can also drop the columns Region_Code and Policy_Sales_Channel, assuming they will not have any
#effect on the response of the customer because these values are just the means of communication.

In [None]:
# Keep only the numerical columns that go into the model.
dropped_numeric = ['Vintage', 'Policy_Sales_Channel', 'Region_Code']
df_num = df_num.drop(columns=dropped_numeric)

In [None]:
# Categorical columns at this point of the notebook.
df_cat.columns

In [None]:
# Numerical columns remaining after the drop above.
df_num.columns

In [None]:
from sklearn.preprocessing import PowerTransformer

# Power-transform the numerical columns with PowerTransformer's default settings.
# The result is wrapped with df_num's own column labels AND index: fit_transform
# returns a bare array, and without the index the later column-wise concat with
# the dummy frame would align correctly only while df keeps a default RangeIndex.
# NOTE(review): the transformer is fitted on all rows, before the train/test
# split further down, so test rows influence the fitted parameters.
pt = PowerTransformer()
df_num_pt = pd.DataFrame(
    pt.fit_transform(df_num),
    columns=df_num.columns,
    index=df_num.index,
)
df_num_pt.head()

In [None]:
# Remove the target so that only predictors are one-hot encoded.
# errors='ignore' lets the cell be re-run after Response has already been dropped.
df_cat = df_cat.drop(columns='Response', errors='ignore')
# dtype='uint8' pins the 0/1 integer encoding. pandas >= 2.0 emits bool dummies by
# default, and a frame mixing bool and float columns yields an object array from
# X.values, which the numeric routines further down (e.g. the VIF step) cannot use.
df_cat_dum = pd.get_dummies(
    df_cat,
    columns=list(df_cat.columns),
    drop_first=True,
    dtype='uint8',
)
df_cat_dum.head()

In [None]:
# Feature matrix: dummy columns first, then the transformed numerical columns.
feature_frames = [df_cat_dum, df_num_pt]
X = pd.concat(feature_frames, axis=1)
# Target taken from the full frame (still object dtype at this point).
y = df['Response']

# **Multicollinearity**

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Variance inflation factor of every feature.
# * X is cast to float so the design matrix is numeric even when the dummy
#   columns are bool (a mixed bool/float frame gives an object array).
# * An intercept column is prepended: each VIF comes from an auxiliary regression
#   of one feature on the others, and without a constant the result is an
#   uncentered VIF, which overstates collinearity for 0/1 dummies.
# The constant sits at position 0 and is skipped when collecting the factors.
exog = add_constant(X.astype(float)).values
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(exog, i) for i in range(1, exog.shape[1])]
vif['feature'] = X.columns
vif.sort_values('VIF', ascending=False)
#Multicollinearity is in acceptable range (re-check this reading after re-running the cell).

In [None]:
# Class counts of the target before any resampling.
y.value_counts() #There is a huge data imbalance so we will have to treat that.

# **Data Imbalance Treatment using smote NC**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The target was stored as object dtype for the EDA; convert it to 64-bit integers for modelling.
y = y.astype(np.int64)

In [None]:
# 70/30 split, stratified on y so both parts keep the class ratio.
# random_state is fixed so the split, and every metric computed from it below,
# is the same on each Restart & Run All (7 matches the seed used for the CV folds).
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=7
)

In [None]:
from imblearn.over_sampling import SMOTENC

In [None]:
# Preview the feature matrix; the column positions seen here are what SMOTENC's
# categorical-feature indices below refer to.
X.head()

In [None]:
# Class counts in the training split before oversampling.
ytrain.value_counts()

In [None]:
# Oversample the training split with SMOTE-NC.
# The positions of the categorical (dummy) columns are looked up by name instead
# of being hard-coded as [0..5], so they stay correct if the encoding changes.
# random_state makes the synthetic samples reproducible.
categorical_positions = [xtrain.columns.get_loc(col) for col in df_cat_dum.columns]
smotenc = SMOTENC(categorical_features=categorical_positions, random_state=7)
X_oversample, y_oversample = smotenc.fit_resample(xtrain, ytrain)

In [None]:
# Class counts of the training target after SMOTE-NC resampling.
y_oversample.value_counts()

In [None]:
# Last rows of the resampled training features.
X_oversample.tail()

# **Model Building**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the oversampled training data.
log = LogisticRegression()
log.fit(X=X_oversample, y=y_oversample)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

try:
    # Available up to scikit-learn 1.1.
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2. Provide a drop-in stand-in
    # so every `plot_roc_curve(model, xtest, ytest)` call below keeps working.
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Plot the ROC curve of a fitted classifier.

        Thin wrapper around ``RocCurveDisplay.from_estimator`` with the same
        leading parameters as the removed ``sklearn.metrics.plot_roc_curve``.

        Parameters
        ----------
        estimator : fitted classifier
        X : feature matrix to score
        y : true labels for ``X``
        **kwargs : forwarded unchanged to ``RocCurveDisplay.from_estimator``

        Returns
        -------
        The ``RocCurveDisplay`` produced by ``from_estimator``.
        """
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
ypred = log.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of the logistic regression on the test split.
plot_roc_curve(estimator=log, X=xtest, y=ytest)
plt.show()

In [None]:
from sklearn.model_selection import KFold,cross_val_score

# 5-fold shuffled cross-validation of the logistic regression, scored by ROC AUC.
# It is run on the full X, y, so each fold is fitted on data that has not been oversampled.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(log, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the oversampled training data.
NB = GaussianNB()
NB.fit(X=X_oversample, y=y_oversample)

In [None]:
# Per-class precision / recall / F1 of naive Bayes on the test split.
ypred = NB.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of naive Bayes on the test split.
plot_roc_curve(estimator=NB, X=xtest, y=ytest)
plt.show()

In [None]:
# 5-fold shuffled cross-validation of naive Bayes on the full X, y, scored by ROC AUC.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(NB, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings, fitted on the oversampled training data.
KNN = KNeighborsClassifier()
KNN.fit(X=X_oversample, y=y_oversample)

In [None]:
# Per-class precision / recall / F1 of the default KNN on the test split.
ypred = KNN.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of the default KNN on the test split.
plot_roc_curve(estimator=KNN, X=xtest, y=ytest)
plt.show()

In [None]:
# 5-fold shuffled cross-validation of the default KNN on the full X, y, scored by ROC AUC.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(KNN, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

In [None]:
# KNN with 96 uniformly weighted neighbours, fitted on the oversampled training data.
# The search that produced n_neighbors=96 is not part of this notebook.
knn_params = {'n_neighbors': 96, 'weights': 'uniform'}
knn_tuned = KNeighborsClassifier(**knn_params)
knn_tuned.fit(X=X_oversample, y=y_oversample)

In [None]:
# Per-class precision / recall / F1 of the tuned KNN on the test split.
ypred = knn_tuned.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of the tuned KNN on the test split.
plot_roc_curve(estimator=knn_tuned, X=xtest, y=ytest)
plt.show()

In [None]:
# 5-fold shuffled cross-validation of the tuned KNN on the full X, y, scored by ROC AUC.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(knn_tuned, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree (max depth 110) fitted on the oversampled training data.
# random_state is fixed: the tree permutes features when searching for splits, so
# without a seed the fitted tree and its reported metrics can change between runs.
dt_tuned = DecisionTreeClassifier(max_depth=110, criterion='entropy', random_state=7)
dt_tuned.fit(X_oversample, y_oversample)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the test split.
ypred = dt_tuned.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of the decision tree on the test split.
plot_roc_curve(estimator=dt_tuned, X=xtest, y=ytest)
plt.show()

In [None]:
# 5-fold shuffled cross-validation of the decision tree on the full X, y, scored by ROC AUC.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(dt_tuned, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees fitted on the oversampled training data.
# random_state is fixed: bootstrapping and per-split feature sampling are random,
# so without a seed the reported metrics change on every run.
rf = RandomForestClassifier(n_estimators=200, random_state=7)
rf.fit(X_oversample, y_oversample)

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
ypred = rf.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of the random forest on the test split.
plot_roc_curve(estimator=rf, X=xtest, y=ytest)
plt.show()

In [None]:
# 5-fold shuffled cross-validation of the random forest on the full X, y, scored by ROC AUC.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(rf, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

In [None]:
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier

# Gradient boosting with 100 stages fitted on the oversampled training data.
# This is the model selected in the conclusion, so random_state is fixed to make
# the reported recall, ROC AUC and confusion matrix reproducible across runs.
GBoost = GradientBoostingClassifier(n_estimators=100, random_state=7)
GBoost.fit(X_oversample, y_oversample)

In [None]:
# Per-class precision / recall / F1 of gradient boosting on the test split.
ypred = GBoost.predict(xtest)
report = classification_report(y_true=ytest, y_pred=ypred)
print(report)

In [None]:
# ROC curve of gradient boosting on the test split.
plot_roc_curve(estimator=GBoost, X=xtest, y=ytest)
plt.show()

In [None]:
# 5-fold shuffled cross-validation of gradient boosting on the full X, y, scored by ROC AUC.
# "bias" is reported as the mean of (1 - AUC), "variance" as the sample std of the fold AUCs.
kf = KFold(n_splits=5, shuffle=True, random_state=7)
score = cross_val_score(GBoost, X, y, cv=kf, scoring='roc_auc')
bias1 = (1 - score).mean()
variance1 = score.std(ddof=1)
print(bias1, variance1)

# **Conclusion-**
1. As per our problem statement, we want to predict whether customers are interested in vehicle insurance, so we need a high recall. Recall is the ratio TP / (TP + FN), and we want the false negatives (FN) to be minimal for class 1 (customers who are interested in insurance).
2. Among all the models built above, two give the best results: tuned KNN and Gradient Boosting.
3. We choose Gradient Boosting because its recall is slightly better than that of tuned KNN, with a slight increase in ROC AUC score.

# **Evaluation of Final Model**

In [None]:
# Predictions of the selected gradient-boosting model on both splits.
# xtrain is the original training split, not the oversampled one the model was fitted on.
ypred_test = GBoost.predict(xtest)
ypred_train = GBoost.predict(xtrain)

In [None]:
# Accuracy of the selected model on the original (not oversampled) training split.
accuracy_score(ytrain , ypred_train)

In [None]:
# Accuracy of the selected model on the test split.
accuracy_score(ytest , ypred_test)

In [None]:
#If we compare the train and test accuracy above with the bias and variance errors calculated
#previously, we can say that the model is not overfit.

In [None]:
# Confusion matrix on the test split (rows are true classes, columns are predicted classes).
print(confusion_matrix(ytest,ypred_test))

In [None]:
# Class counts in the test split, for reading the confusion matrix above.
ytest.value_counts()

From the results above, the model classifies about 70% of the test customers correctly. Our ROC AUC score is 85%, which means that the model is able to distinguish between the negative and positive classes. The confusion matrix shows that the number of false negatives is very low, which addresses our problem statement. In the test data set there are 14013 customers who are interested in vehicle insurance, and 12845 of them have been classified correctly (a recall of about 92% for the interested class).