In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

This data is used to predict whether a person will buy insurance, so we will build a model accordingly!

# 1.Import Libraries

In [None]:
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)


from sklearn.linear_model import LinearRegression,SGDClassifier, RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder,MinMaxScaler , StandardScaler

# 2.Import Data

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/health-insurance-cross-sell-prediction/train.csv",
        "/kaggle/input/health-insurance-cross-sell-prediction/test.csv",
    )
)

# Preview the first rows of the training data.
df_train.head()

In [None]:
# Report how many missing (NaN) values each training column contains.
for column_name, missing_count in df_train.isna().sum().items():
    print(column_name + ": " + str(missing_count))

# 3. Preprocessing Data

In [None]:
# Build the working frame `df` from the loaded training data, leaving out the
# columns that are not used as features in this analysis.
# It must start from df_train: `df` does not exist yet at this point, so
# dropping from `df` itself raises NameError on a fresh kernel.
df = df_train.drop(columns=['id', 'Region_Code', 'Policy_Sales_Channel'])

In [None]:
# Encode the two two-level categorical features as a single 0/1 indicator each.
# drop_first=True removes the first category level, leaving one indicator column;
# .iloc[:, 0] selects it so that a Series (not a one-column DataFrame) is assigned,
# and dtype=int keeps the values as integers 0/1 instead of booleans on newer pandas.
df["Gender"] = pd.get_dummies(df["Gender"], drop_first=True, dtype=int).iloc[:, 0]
df["Vehicle_Damage"] = pd.get_dummies(df["Vehicle_Damage"], drop_first=True, dtype=int).iloc[:, 0]

In [None]:
# Ordinal-encode vehicle age so that the integer codes follow the real age order.
# LabelEncoder numbers classes in sorted-string order, which places "1-2 Year"
# before "< 1 Year" and therefore does not give an ordinal scale.
# NOTE(review): the three labels below are assumed from the Kaggle dataset;
# confirm them with df_train['Vehicle_Age'].unique().
vehicle_age_order = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}

# Fail loudly on an unexpected label instead of silently mapping it to NaN.
unknown_ages = set(df['Vehicle_Age'].dropna().unique()) - set(vehicle_age_order)
if unknown_ages:
    raise ValueError(
        "Unexpected Vehicle_Age labels: {}".format(sorted(map(str, unknown_ages)))
    )

df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_order)

df.head()

# 4. Visualization

In [None]:
# Mean of Vehicle_Damage (share of customers with past damage) for each gender.
# Author's reading of the chart: men have vehicle damage more often than women.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=df, x="Gender", y="Vehicle_Damage", palette='icefire_r', ax=ax);

In [None]:
# Mean of Vehicle_Damage split by whether the customer was previously insured.
# Author's reading of the chart: people without insurance have more vehicle
# damage than people who already had insurance.
plt.figure(figsize=(15, 8))
ax = sns.barplot(
    data=df,
    x="Previously_Insured",
    y="Vehicle_Damage",
    palette='RdPu_r',
)

In [None]:
# Mean of Vehicle_Damage for each vehicle-age group.
# Author's reading of the chart: vehicle age matters when it comes to an
# accident or damage.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=df, x="Vehicle_Age", y="Vehicle_Damage", ax=ax);

In [None]:
# Mean of Vehicle_Damage for each Response value.
# Author's reading of the chart: most people who had vehicle damage went on
# to take the insurance.
plt.figure(figsize=(15, 8))
ax = sns.barplot(
    data=df,
    x="Response",
    y="Vehicle_Damage",
)

In [None]:
# Distribution of Vintage (days the person has been associated with the company)
# for each Response value: a light-grey violin with the raw points drawn on top.
# Author's reading: Vintage has no effect on the response; the plot is evenly split.
ax = sns.violinplot(data=df, x="Response", y="Vintage", color=".8", inner=None)
sns.stripplot(data=df, x="Response", y="Vintage", ax=ax);

In [None]:
# Annual premium plotted against Response.
# Author's reading: this scatter plot cannot tell whether a person who pays a
# higher or lower premium will buy insurance, so more visualisation is needed.
fig, ax = plt.subplots(figsize=(16, 8))  # figure size: width by height
# Red default markers, semi-transparent so that overlapping points stay visible.
ax.scatter(df['Response'], df['Annual_Premium'], c='r', alpha=0.2)
# Axis labels and title.
ax.set_xlabel('Response')
ax.set_ylabel('Annual_Premium')
ax.set_title('Response vs Annual_Premium')

In [None]:
# Annual premium plotted against Vintage.
# Author's note kept from the original: people in the 20-80 age range pay annual
# premiums reaching roughly the 100000-150000 range.
# NOTE(review): that note talks about age, while this chart has Vintage on the
# x-axis; it may belong to the Age vs Annual_Premium plot - confirm.
fig, ax = plt.subplots(figsize=(16, 8))  # figure size: width by height
# Red default markers.
ax.scatter(df['Vintage'], df['Annual_Premium'], c='r')
# Axis labels and title.
ax.set_xlabel('Vintage')
ax.set_ylabel('Annual_Premium')
ax.set_title('Vintage vs Annual_Premium')

In [None]:
# Annual premium plotted against customer age.
fig, ax = plt.subplots(figsize=(16, 8))  # figure size: width by height
# Red default markers.
ax.scatter(df['Age'], df['Annual_Premium'], c='r')
# Axis labels and title.
ax.set_xlabel('Age')
ax.set_ylabel('Annual_Premium')
ax.set_title('Age vs Annual_Premium')

In [None]:
# Box plot of Annual_Premium to look at the outliers seen in the scatter plots.
# Plan: because the outliers may distort the result, premiums up to 60000 are
# analysed on their own and the remaining samples get a separate analysis.
ax = sns.boxplot(data=df, x="Annual_Premium")

In [None]:
# Separate the non-outlier rows: keep premiums of at most 60000.
premium_within_limit = df["Annual_Premium"] <= 60000
mod_premium = df[premium_within_limit]
print(mod_premium.shape)

# Box plot of the filtered premiums.
ax = sns.boxplot(data=mod_premium, x="Annual_Premium")

In [None]:
# Independent variables are every column except the last one;
# the dependent variable is the last column.
x, y = mod_premium.iloc[:, :-1], mod_premium.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing (only the training file is used here).
# The split happens BEFORE scaling so that the scaler never sees the test rows:
# fitting it on the full data would leak test-set statistics into training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

# StandardScaler puts all features on the same scale (zero mean, unit variance).
# It is fitted on the training rows only and then applied unchanged to the test rows.
standard = StandardScaler()
x_train = standard.fit_transform(x_train)
x_test = standard.transform(x_test)

print("Training data:{}".format(x_train.shape))
print("Test data:{}".format(x_test.shape))

In [None]:
# Collects the score of each model run, in the order the runs are executed.
results = list()

# 5. Perform Model

In [None]:
# Fit a logistic-regression classifier on the current training split,
# predict the held-out rows and record the score on them.
clf = LogisticRegression().fit(x_train, y_train)
y_predicted = clf.predict(x_test)
score = clf.score(x_test, y_test)  # mean accuracy on the test split

print(score)
results.append(score)

In [None]:
# Second analysis: the outlier rows, i.e. premiums above 60000, to see how the
# model performs on them.
# The comparison is strictly greater-than so that rows at exactly 60000, which
# already belong to the `<= 60000` subset, are not analysed twice.
mod_premium_outlier = df[df["Annual_Premium"] > 60000]
print(mod_premium_outlier.shape)

ax = sns.boxplot(x="Annual_Premium", data=mod_premium_outlier)

In [None]:
# Features are all columns except the last one; the last column is the target.
target_position = mod_premium_outlier.shape[1] - 1
x = mod_premium_outlier.iloc[:, :target_position]
y = mod_premium_outlier.iloc[:, target_position]

In [None]:
# Hold out 20% of the rows for testing. The split happens BEFORE scaling so that
# the scaler never sees the test rows: fitting it on the full data would leak
# test-set statistics into training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=40)

# Fit the scaler on the training rows only, then apply it unchanged to the test rows.
standard = StandardScaler()
x_train = standard.fit_transform(x_train)
x_test = standard.transform(x_test)

print("Training data:{}".format(x_train.shape))
print("Test data:{}".format(x_test.shape))

In [None]:
# Train a fresh logistic-regression model on the current training split and
# evaluate it on the matching test split.
clf = LogisticRegression()
clf.fit(x_train, y_train)

y_predicted = clf.predict(x_test)
score = clf.score(x_test, y_test)  # mean accuracy on the test split

print(score)
results.append(score)

# 6. Confusion Matrix

In [None]:
# Confusion matrix of the most recent predictions: at this point in the notebook
# y_test / y_predicted hold the values from the last model that was fitted.
np.set_printoptions(precision=2)  # print NumPy floats with two decimals
cnf_matrix = confusion_matrix(y_test, y_predicted)
cnf_matrix

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current matplotlib figure; nothing is returned.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes.
    classes : sequence
        Tick labels for both axes, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. A row whose sum is zero is shown as zeros rather than NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colour map used for the heat map.
    """
    # Accept plain nested lists as well as NumPy arrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; rows for a class with no
        # true samples keep the zeros from `out` instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as integers; any float matrix (normalized or
    # passed in as floats) is printed with two decimals, since 'd' would fail on it.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Class labels for the confusion-matrix axes (read later through classes.index).
# value_counts() orders by frequency, whereas confusion_matrix orders its rows and
# columns by sorted label; sort_index() puts the labels in that same sorted order
# so the tick labels always line up with the right cells.
classes = df['Response'].value_counts().sort_index()

In [None]:
# Draw the confusion matrix twice, each on its own figure:
# first as raw counts, then normalized per true-label row.
for cm_title, as_fraction in (
    ('Confusion matrix, without normalization', False),
    ('Normalized confusion matrix', True),
):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=classes.index,
                          normalize=as_fraction, title=cm_title)

plt.show()

# 7. Results

In [None]:
# Summary table: one row per run, labelled with the data subset it was trained on.
subset_names = ['mod_premium', 'mod_premium_outlier']
result_df = pd.DataFrame({
    "ML Models": subset_names,
    "Score": results,
})
result_df

Based on the results above, logistic regression performs well on the data without outliers, so we can use it to target people who pay a premium of less than 60000. We also fitted the model on the outliers; the results are pretty good as well, so with that model we can target people who pay a premium of more than 60000.