In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report, roc_curve, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the read-only Kaggle input
# directory (walk order is preserved), then print one path per line.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV of the health-insurance cross-sell dataset from the Kaggle input directory.
ds = pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/train.csv')

In [None]:
# Preview the first rows of the training frame.
ds.head()

In [None]:
# Discard the `id` column; rebinding `ds` avoids mutating the frame in place.
ds = ds.drop(columns=['id'])

In [None]:
# Print column dtypes, non-null counts and memory usage.
ds.info()

In [None]:
# Share of each Response class, in percent -- the dataset is highly imbalanced.
response_share = ds['Response'].value_counts(normalize=True)
response_share * 100

In [None]:
# Response counts split by gender.
sns.countplot(data=ds, x='Gender', hue='Response')

From the above graph we can see that men have more positive responses than women.

In [None]:
# Response counts split by whether the customer was previously insured.
sns.countplot(data=ds, x='Previously_Insured', hue='Response')

The above plot shows that, regardless of whether they already have insurance, customers are less likely to get another one.

In [None]:
# Response counts split by whether the customer holds a driving license.
sns.countplot(data=ds, x='Driving_License', hue='Response')

The above plot clearly shows that only those who have a driving license are going to get the insurance. Among them, the number who actually opt for it is much smaller than the number who do not.

In [None]:
# Kernel density estimate of Age, one shaded curve per Response class.
age_by_response = sns.FacetGrid(data=ds, hue='Response', aspect=5)
age_by_response.map(sns.kdeplot, 'Age', shade=True)
age_by_response.add_legend()

From the above plot, we can conclude that people aged 40-50 are the most likely to buy the insurance, whereas people aged 20-30 are the least likely to buy it.

In [None]:
# Pie chart of how customers are distributed across the Vehicle_Age categories.
# The previous `y=ds['Response']` argument was dropped: Series.plot.pie ignores
# `y`, so it only suggested (wrongly) that the chart is broken down by Response.
ds['Vehicle_Age'].value_counts().plot.pie(autopct="%0.1f%%")

From the above pie chart, we can see that people whose vehicle age is between 1-2 years have the maximum chances of getting the insurance, followed by people whose vehicle age is less than 1 year; on the other hand, people whose vehicle age is more than 2 years have very low chances of getting the insurance.

In [None]:
# Response counts by gender, one panel per Vehicle_Damage value.
sns.catplot(
    data=ds,
    kind='count',
    x='Gender',
    hue='Response',
    col='Vehicle_Damage',
)

In the above catplot we can see that the number of males whose vehicle was damaged and who are going to get insurance is close to 30000, whereas for females it is less than 20000; overall, most males and females have a low chance of responding positively.
Also, the number of males whose vehicle is not damaged and who are going to get insurance is close to 1000; for females it is even lower.

In [None]:
# Response counts by gender, one panel per Vehicle_Age category.
sns.catplot(
    data=ds,
    kind='count',
    x='Gender',
    hue='Response',
    col='Vehicle_Age',
)

In the above catplot we can see that males and females whose vehicle age is between 1-2 years have a higher chance of getting the insurance, whereas those whose vehicle age is greater than 2 years or less than 1 year have the lowest chance of getting the insurance.

In [None]:
# Response counts by gender, one panel per Driving_License value.
sns.catplot(
    data=ds,
    kind='count',
    x='Gender',
    hue='Response',
    col='Driving_License',
)

From the above plot we can simply say that only people who have a driving license are going to get the insurance.

In [None]:
# Response counts by gender, one panel per Previously_Insured value.
sns.catplot(
    data=ds,
    kind='count',
    x='Gender',
    hue='Response',
    col='Previously_Insured',
)

From the above plot we can observe that those who have already taken insurance previously will not opt for insurance again.

f = sns.FacetGrid(ds, hue = 'Response', aspect = 5)
f.map(sns.kdeplot, "Annual_Premium", shade = True)
f.add_legend()

In [None]:
# Kernel density estimate of Annual_Premium, one shaded curve per Response class.
premium_by_response = sns.FacetGrid(data=ds, hue='Response', aspect=5)
premium_by_response.map(sns.kdeplot, 'Annual_Premium', shade=True)
premium_by_response.add_legend()

From the above plot we can see that, for those who are getting insurance, the annual premium ranges between 10000 and 80000.

In [None]:
# Kernel density estimate of Vintage, one shaded curve per Response class.
vintage_by_response = sns.FacetGrid(data=ds, hue='Response', aspect=5)
vintage_by_response.map(sns.kdeplot, 'Vintage', shade=True)
vintage_by_response.add_legend()

The above plot shows that, whether or not they got the insurance, customers have been associated with the company for at least 300 days.

In [None]:
# Kernel density estimate of Age, one shaded curve per Gender.
age_by_gender = sns.FacetGrid(data=ds, hue='Gender', aspect=5)
age_by_gender.map(sns.kdeplot, 'Age', shade=True)
age_by_gender.add_legend()

The above plot shows that the maximum concentration of ages is in the range 20-30, and for females the concentration is between 20-35.

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The frame still holds text columns at this point; recent pandas versions
# raise on DataFrame.corr() when non-numeric columns are present, so the
# numeric columns are selected explicitly (works on old and new pandas alike).
plt.figure(figsize=(20, 20))
sns.heatmap(ds.select_dtypes(include='number').corr(), annot=True)

The correlation heatmap shows that there is no significant correlation between the independent features, and some features are negatively correlated with each other.

In [None]:
## Feature Scaling and Feature Selection.

In [None]:
# Work on a copy so the frame `ds` used for the plots above is not modified by the preprocessing below.
ds1 = ds.copy()

In [None]:
# Convert the two flag columns to strings.
for flag_col in ('Driving_License', 'Previously_Insured'):
    ds1[flag_col] = ds1[flag_col].astype(str)

In [None]:
# Re-check the dtypes after the string conversion of the two flag columns.
ds1.info()

In [None]:
# Preview the working copy.
ds1.head()

In [None]:
# Fit an extra-trees classifier to score the features.
# The text-valued columns (Gender, Vehicle_Age, Vehicle_Damage) and the target
# are left out of the feature matrix. A fixed random_state makes the importance
# scores reproducible across runs, and the target is selected by name rather
# than by column position.
skr = ExtraTreesClassifier(random_state=42)
score = skr.fit(
    ds1.drop(['Gender', 'Vehicle_Age', 'Response', 'Vehicle_Damage'], axis=1),
    ds1['Response'],
)

In [None]:
# Columns excluded from the importance ranking: the same four names dropped in the ExtraTrees fit above
# (three features that are still text at this point, plus the target).
co = ['Gender','Vehicle_Age', 'Response', 'Vehicle_Damage']

In [None]:
# Names of the columns that were passed to the classifier: every column of ds1 not listed in `co`,
# kept in the frame's original column order.
columns = [x for x in ds1.columns if x not in co]
columns

In [None]:
# Pair each feature-importance score with its column name.
ser = pd.Series(score.feature_importances_, index = columns)

In [None]:
# Plot the (up to) ten largest importance scores as a horizontal bar chart.
# Author's reading of the chart: features like Driving License and
# Policy Sales Channel have the lowest scores.
# NOTE(review): the drop cell further down removes Driving_License and
# Previously_Insured -- confirm that choice against this chart.
ser.nlargest(10).plot(kind = 'barh')

In [None]:
# Preview the frame before dropping features.
ds1.head()

In [None]:
# Remove the features that are not carried into the model.
ds1 = ds1.drop(columns=['Driving_License', 'Previously_Insured'])

In [None]:
# Ordinal codes for the Vehicle_Age categories: an older vehicle gets a larger number.
va = {
    '> 2 Years':2,
    '1-2 Year': 1.5,
    '< 1 Year': 1
}

In [None]:
# Replace the Vehicle_Age category strings with the numeric codes from `va`.
# Any value that is not a key of `va` becomes NaN, so re-running this cell on
# the already-encoded column turns every value into NaN.
ds1['Vehicle_Age'] = ds1['Vehicle_Age'].map(va)

In [None]:
# Encoder for the categorical columns and scaler for the feature matrix; both are fitted in later cells.
le = LabelEncoder()
sc = StandardScaler()

In [None]:
# Preview the frame after encoding Vehicle_Age.
ds1.head()

In [None]:
# Integer-encode the remaining categorical features. The single encoder `le`
# is refitted for each column in turn, so afterwards it only remembers the
# classes of the last column processed (Vehicle_Damage).
for cat_col in ('Gender', 'Vehicle_Damage'):
    ds1[cat_col] = le.fit_transform(ds1[cat_col])

In [None]:
## Model Building.

In [None]:
# Preview the fully encoded frame before splitting into features and target.
ds1.head()

In [None]:
# Separate the data into features (x) and target (y) by column position.
# Assumes the first eight columns are the features and the last column is
# Response -- TODO confirm against the ds1.head() output above.
x = ds1.iloc[:,:8]
y = ds1.iloc[:,-1]

In [None]:
# Standardise the features: learn the scaling from x, then apply it to x.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the scaling statistics.
x = sc.fit(x).transform(x)

In [None]:
# Hold out 20% of the rows for testing.
# stratify=y keeps the (highly imbalanced) class ratio identical in both parts,
# and a fixed random_state makes the split reproducible across runs.
x_tr, x_te, y_tr, y_te = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Oversample the minority class with SMOTE.
# Only the training split is resampled: oversampling the full data set would
# put the held-out rows (and synthetic neighbours of them) into the training
# data and inflate every test metric. `fit_resample` is the supported method
# name; the old `fit_sample` alias is gone from recent imbalanced-learn releases.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(x_tr, y_tr)

In [None]:
#print(X_sm.shape),print(y_sm.shape)
# Class counts of the training target before oversampling.
print('before oversampling',y_tr.value_counts())

In [None]:
# Class counts of the oversampled target returned by SMOTE.
print('after oversampling',y_sm.value_counts())

In [None]:
# Random forest with default hyper-parameters; the fixed random_state makes
# training (and therefore every reported metric) reproducible across runs.
mod = RandomForestClassifier(random_state=42)

In [None]:
# Train the random forest on the oversampled data.
mod.fit(X_sm,y_sm)

In [None]:
# Predict class labels for the held-out test features.
y_hat = mod.predict(x_te)

In [None]:
# Fraction of test rows predicted correctly.
# NOTE(review): on a highly imbalanced target, accuracy alone can look high
# even for a weak model -- read it together with the per-class report below.
accuracy_score(y_te, y_hat)

In [None]:
# Confusion matrix of true vs. predicted labels.
# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format shows large counts in scientific notation (e.g. 6.6e+04).
sns.heatmap(confusion_matrix(y_te, y_hat), annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 for the test predictions.
print(classification_report(y_te, y_hat))

In [None]:
# ROC AUC on the test split.
# The metric ranks rows by a continuous score, so it is fed the predicted
# probability of the positive class (column 1 of predict_proba) instead of the
# hard 0/1 labels, which would reduce the curve to a single operating point.
roc_auc_score(y_te, mod.predict_proba(x_te)[:, 1])

In [None]:
# Plot the ROC curve for the test split.
# The curve is built from the predicted probability of the positive class so
# that every threshold is represented; hard 0/1 predictions would yield only
# three points.
y_score = mod.predict_proba(x_te)[:, 1]
fpr, tpr, _ = roc_curve(y_te, y_score)

plt.title('ROC curve')
plt.xlabel('FPR ')
plt.ylabel('TPR ')

plt.plot(fpr, tpr)
# Dashed diagonal: the reference line of a random classifier.
plt.plot((0, 1), ls='dashed', color='black')
plt.show()

In [None]:
## For Test Data Submission.

In [None]:
# Load the test CSV of the same dataset from the Kaggle input directory.
te = pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/test.csv')

In [None]:
# Preview the raw test frame.
te.head()

In [None]:
# Encode the test features with the mappings learned from the TRAINING data.
# Refitting an encoder on the test frame would silently change the codes
# whenever the test set is missing a category or contains a new one; fitting
# on the untouched training columns in `ds` keeps both sets consistent and
# raises on labels never seen in training.
te['Gender'] = LabelEncoder().fit(ds['Gender']).transform(te['Gender'])
te['Vehicle_Damage'] = LabelEncoder().fit(ds['Vehicle_Damage']).transform(te['Vehicle_Damage'])
# Same ordinal codes as used for the training frame.
te['Vehicle_Age'] = te['Vehicle_Age'].map(va)

In [None]:
# Preview the test frame after encoding.
te.head()

In [None]:
# Remove the same two features that were removed from the training frame.
te = te.drop(columns=['Driving_License', 'Previously_Insured'])

In [None]:
# Test feature matrix: every column except the first.
# Assumes the first column is `id` -- TODO confirm against te.head() above.
x1 = te.iloc[:,1:]
x1

In [None]:
# Scale the test features with the statistics learned from the training data.
# Calling fit_transform here would refit the scaler on the test set, so the
# model would receive features on a different scale than it was trained on.
x1 = sc.transform(x1)

In [None]:
# Predict class labels for the test feature matrix.
final_op = mod.predict(x1)

In [None]:
# Assemble the submission table: one row per test id with its predicted label.
df = pd.DataFrame({'id': te['id'], 'response': final_op})

In [None]:
# Write the submission table to submission.csv in the working directory, without the index column.
df.to_csv('submission.csv',index = False)