In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### 1. Load Libraries

In [None]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy.stats import skew



import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


#### 2. Data mining & Inspection

* **1. id:**	Unique ID for the customer


* **2. Gender:**	Gender of the customer


* **3. Age:**	Age of the customer


* **4. Driving_License:**	   **1** : Customer already has DL, **0**: Customer does not have DL


* **5. Region_Code:**	Unique code for the region of the customer


* **6. Previously_Insured:**	  **1** : Customer already has Vehicle Insurance, **0** : Customer doesn't have Vehicle Insurance


* **7. Vehicle_Age:**	Age of the Vehicle


* **8. Vehicle_Damage:**	**1** : Customer got his/her vehicle damaged in the past. **0** : Customer didn't get his/her vehicle                damaged in the past.


* **9. Annual_Premium:**	The amount customer needs to pay as premium in the year


* **10. PolicySalesChannel:**	Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over                Mail, Over Phone, In Person, etc.


* **11. Vintage:**	Number of Days, Customer has been associated with the company


* **12. Response:**	**1** : Customer is interested, **0** : Customer is not interested

In [None]:
# Both CSV files live in the same Kaggle input folder.
DATA_DIR = '../input/health-insurance-cross-sell-prediction'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# (rows, columns) of the training set.
train.shape

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
train.info()
print('***********************************************')
test.info()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
train.describe().T

In [None]:
# Percentage of missing values per column in the training set.
train.isnull().sum()/train.shape[0] *100

In [None]:
# Percentage of missing values per column in the test set
# (normalised by the test row count, not the training row count).
test.isnull().sum()/test.shape[0] *100

In [None]:
# Absolute number of missing values per column, training set first, then test set.
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()
print(train_missing)
print('\n', test_missing)

* **Conclusion -** We can see that there are no missing values in our dataset. Thus we do not need to impute any values and can proceed to further analysis.


#### 3. Exploratory Data Analysis

In [None]:
# Distinct values present in the Gender column.
train['Gender'].unique()

In [None]:
# Number of rows per Gender value.
train['Gender'].value_counts()

In [None]:
# Counts per gender; the pie labels are taken from this Series' index so a
# label can never be attached to the wrong slice.
gender_counts = train['Gender'].value_counts()

plt.figure(figsize=(10,5))

plt.subplot(1,2,1)  #Univariate Analysis
plt.title('Count of Male and Female')
# Pass the column as the keyword `x`: recent seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x=train['Gender'])

plt.subplot(1,2,2)
plt.pie(gender_counts, explode=[0.05,0] ,autopct='%.1f%%', labels=gender_counts.index, labeldistance=1.1)
plt.title('Percentage of Male and Female')

plt.legend()
plt.show()

* **Conclusion -** The plots above show that there are more male than female customers in the dataset. 
  We will check below whether gender also plays an important role in the response and hence in model building.



In [None]:
# Mean customer age.
print(train['Age'].mean())

In [None]:
# Most frequent customer age(s); mode() returns a Series.
print(train['Age'].mode())

In [None]:
# Distribution of customer age: density-normalised histogram with a KDE overlay.
# histplot replaces the deprecated distplot.
sns.histplot(x=train['Age'], kde=True, stat='density')
plt.xlabel('Age')
plt.show()

In [None]:
# Sample skewness of Age (scipy.stats.skew); a positive value means a longer right tail.
print(skew(train['Age']))


* The Age column has a moderate positive skew, which is acceptable for this analysis.

In [None]:
# Box plot of Age. The column is passed as the keyword `x` because recent
# seaborn releases treat the first positional argument as `data`.
sns.boxplot(x=train['Age'])
plt.show()

In [None]:
# Look at the first rows again before moving on to the next column.
train.head()

In [None]:
# Number of customers with (1) and without (0) a driving licence.
train['Driving_License'].value_counts()

* This shows that almost all customers have a driving license, which is good.

In [None]:
# Customers per gender, split by whether they hold a driving licence.
plt.figure(figsize=(6,4))
# `x` must be given by keyword: positional column names are rejected or
# misinterpreted by recent seaborn releases.
sns.countplot(x='Gender', hue='Driving_License', data=train, palette='twilight')
plt.show()

In [None]:
# Counts of Previously_Insured and human-readable labels derived from the
# actual category codes, so the labels follow the data whatever the
# frequency order turns out to be.
insured_counts = train['Previously_Insured'].value_counts()
insured_labels = insured_counts.index.map({0: 'No', 1: 'Yes'})

plt.figure(figsize=(10,5))

plt.subplot(1,2,1)
sns.countplot(x = train["Previously_Insured"], palette='summer')

plt.subplot(1,2,2)
plt.pie(insured_counts, explode=[0.025,0] ,autopct='%.1f%%', labels=insured_labels,
        labeldistance=1.1)
plt.title('Percentage of customers who were Previously Insured')

plt.show()

* **Conclusion -** Here we can see that most of the customers were not previously insured.

In [None]:
# Share of each Vehicle_Age category; counts are computed once and reused
# for both the wedge sizes and their labels.
vehicle_age_counts = train['Vehicle_Age'].value_counts()
plt.pie(
    vehicle_age_counts,
    explode=[0.025, 0.05, 0],
    colors=['blue', 'green', 'orange'],
    autopct='%.1f%%',
    labels=list(vehicle_age_counts.index),
    labeldistance=1.1,
)
plt.title('Overall Percentage of Vehicle age')
plt.show()

In [None]:
# Vehicle age split by response (bivariate view).
# `x` is passed by keyword because recent seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x=train['Vehicle_Age'],hue=train['Response'],palette='viridis')
plt.title('Vehicle Age with Response Comparison',fontsize=15)
# The y axis shows the number of customers, not the response value itself.
plt.ylabel('Count')
plt.show()

* **Conclusion -** We can see that customers whose vehicles are between 1 and 2 years old are more interested in buying insurance than those whose vehicles are less than 1 year or more than 2 years old.

In [None]:
# Box plot of Annual_Premium to inspect outliers.
plt.figure(figsize = (15,5))
# Keyword `x`: recent seaborn releases treat the first positional argument as `data`.
sns.boxplot(x=train['Annual_Premium'])
plt.title('Box Plot')
plt.show()

* **Conclusion -**
* The question here is whether we should remove these outliers in the Annual_Premium data. A person paying more than Rs 61892 annually may simply be a wealthy customer who is able to pay much more than others.
* 10392 customers are outliers; if we remove them we lose that data from the table.
* So the conclusion is that we won't change anything in the Annual_Premium data.

In [None]:
# Annual premium of every customer, grouped along the x axis by gender.
sns.scatterplot(data=train, x='Gender', y='Annual_Premium')
plt.show()

In [None]:
# Distribution of Vintage (days the customer has been associated with the company).
plt.figure(figsize=(12,5))

# histplot with a KDE overlay replaces the deprecated distplot.
sns.histplot(x=train['Vintage'], kde=True, stat='density')

plt.show()

* **Conclusion -** This column is uniformly distributed, so there is not much we can do with it.



In [None]:
# Number of customers per Response value (target class balance).
train['Response'].value_counts()

In [None]:
# Class counts of the target and labels derived from the actual class codes,
# so the labels follow the data whatever the frequency order turns out to be.
response_counts = train['Response'].value_counts()
response_labels = response_counts.index.map({0: 'No', 1: 'Yes'})

plt.figure(figsize=(12,5))

plt.subplot(1,2,1)
plt.title('Count of Responses')
# `x` must be given by keyword in recent seaborn releases.
sns.countplot(x='Response', data=train, palette='ocean')

plt.subplot(1,2,2)
plt.pie(response_counts, explode=[0.05,0] , colors=[ 'lightskyblue', 'orange'] ,autopct='%.1f%%', labels=response_labels, labeldistance=1.1)
plt.title('Percentage of Response class')

plt.show()


* **Conclusion-** From the above graph and percentages we can clearly see that negative response is more than positive response in terms of buying insurance

In [None]:
# Customers per gender, split by response.
plt.figure(figsize=(5,5))

# Keyword `x`: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train['Gender'], hue=train['Response'], palette='Paired')
plt.title('Gender with respect to Response')

plt.show()


In [None]:
# Separate male and female customers so each group can be analysed against
# the overall customer base.
male = train[train['Gender'] == 'Male']
female = train[train['Gender'] == 'Female']

# Three slices used by the overall pie chart, in this order:
#   1. all customers who did not respond,
#   2. male customers who responded,
#   3. female customers who responded.
count_response = [
    train[train['Response'] == 0]['Response'].count(),
    male[male['Response'] == 1]['Response'].count(),
    female[female['Response'] == 1]['Response'].count(),
]

count_response


In [None]:
# Three pies side by side: response split for males, for females, and the
# overall split into "No" / male "Yes" / female "Yes".
fig, (ax_male, ax_female, ax_overall) = plt.subplots(1, 3, figsize=(16, 7))

per_gender_panels = [
    (ax_male, male, 'Percentage of Males interested'),
    (ax_female, female, 'Percentage of Females interested'),
]
for ax, group, heading in per_gender_panels:
    ax.pie(group['Response'].value_counts(), explode=[0.025, 0], autopct='%.1f%%',
           labels=['No', 'Yes'], labeldistance=1.1)
    ax.set_title(heading)

ax_overall.pie(count_response, explode=[0.05, 0.05, 0], colors=['grey', 'green', 'orange'],
               autopct='%.1f%%', labels=['No', 'Male_Yes', 'Female_Yes'], labeldistance=1.1)
ax_overall.set_title('Overall Percentage of Males and Females interested')

plt.show()


* **Conclusion -**  From these graphs we can see that male customers respond positively to the insurance offer more often than female customers. Thus the most receptive customers are male rather than female, so we need to focus on strengthening the offer towards female customers.



In [None]:
# Split the customers by response so that each panel really shows the group
# named in its title (plotting `train` twice would draw two identical charts).
responded = train[train['Response'] == 1]
not_responded = train[train['Response'] == 0]

plt.figure(figsize=(12,6))
plt.subplots_adjust(wspace=0.5)

plt.subplot(1,2,1)
sns.histplot(data=responded, x='Age', hue='Gender', binwidth=5)
plt.title('Count of Male and Female who have responded')

plt.subplot(1,2,2)
sns.histplot(data=not_responded, x='Age', hue='Gender', binwidth=5)
plt.title('Count of Male and Female who have not responded')
plt.show()


In [None]:
# Pairwise correlation of the numeric columns.
plt.figure(figsize = (10,10))
plt.title("Correlation Plot")
# Gender, Vehicle_Age and Vehicle_Damage are still text at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# frame is restricted to numeric dtypes first.
numeric_train = train.select_dtypes(include='number')
sns.heatmap(numeric_train.corr(), linewidths = 3, annot = True, square = True,  cmap="YlGnBu")
plt.show()

#### 4. Data Preprocessing

In [None]:
# Reminder of the raw columns before encoding.
train.head()

In [None]:
# Distinct Vehicle_Age categories that will be label-encoded.
train['Vehicle_Age'].unique()

In [None]:
# Distinct Policy_Sales_Channel codes.
train['Policy_Sales_Channel'].unique()

In [None]:

from sklearn.preprocessing import LabelEncoder
col = ['Gender', 'Vehicle_Age','Vehicle_Damage']
le = LabelEncoder()
for LE in col:
    train[LE] = le.fit_transform(train[LE])


In [None]:

from sklearn.preprocessing import LabelEncoder
col = ['Gender', 'Vehicle_Age','Vehicle_Damage']
le = LabelEncoder()
for LE in col:
    test[LE] = le.fit_transform(test[LE])


In [None]:
# Training data after label encoding.
train.head()

In [None]:
# Test data after label encoding.
test.head()

#### 5. Data Splitting & Data Scaling

In [None]:
#dependent and independent variables

y = train['Response']                    #dependent variable (target)

X = train.drop(columns=['Response'])     #independent variables (features)
# NOTE(review): X still contains the `id` column, which is only a row
# identifier; consider excluding it from the features here and from the
# test features as well.

X.head()

In [None]:
# Distinct target values.
train['Response'].unique()

In [None]:
# Absolute class counts of the target.
train['Response'].value_counts()

In [None]:
# Bar chart of the target class shares, expressed in percent.
response_pct = train['Response'].value_counts(normalize=True) * 100
response_pct.plot(kind='bar')
plt.show()


* **Conclusion -**
* We can see that data is imbalanced with approx 85% of zero class and 15% of the other class.

* If such data is fed to the model as input, the classifier may make it biased w.r.t majority class as it was not provided with enough data of minority class to learn.

* So, to deal with this situation, we will apply one of the sampling techniques i.e SMOTE(Synthetic Minority Over-Sampling Technique) and check the results with imbalanced data.



In [None]:
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic minority samples, and every score computed
# from them, are reproducible across runs.
oversample = SMOTE(random_state=1)
# NOTE(review): the data is resampled before the train/test split performed
# in a later cell, so synthetic points can be built from rows that end up in
# the hold-out set. Consider splitting first and oversampling only the
# training part.
X_smo, y_smo = oversample.fit_resample(X, y)


In [None]:
# Class counts after oversampling. Keyword `x`: recent seaborn releases
# treat the first positional argument as `data`.
sns.countplot(x=y_smo)
plt.show()

In [None]:
# Class counts of the resampled target.
y_smo.value_counts()

In [None]:
#Scaling the independent variable to bring data in one range 

from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the resampled training features in the next cell.
SS=StandardScaler()

In [None]:
# Fit the scaler on the resampled features and scale them in one step;
# the result is displayed as the cell output.
X1=SS.fit_transform(X_smo)
X1

In [None]:
#Scaling of the test dataset

# Apply the mean and standard deviation already learned from the training
# features. Re-fitting the scaler on the test set would put the two sets on
# different scales and make the model's predictions on X2 inconsistent.
X2 = SS.transform(test)


In [None]:
#splitting the data into train(75%) and test(25%) for model

from sklearn.model_selection import train_test_split

# Stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y_smo,
    test_size=0.25,
    random_state=1,
    stratify=y_smo,
)

#### 6. Machine Learning Model

* Machine Learning Libraries

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import f1_score, recall_score, accuracy_score, roc_auc_score, precision_score, auc, roc_curve,classification_report,confusion_matrix


In [None]:
# Candidate classifiers, keyed by display name. Every estimator with a
# stochastic component gets a fixed seed so the comparison table is
# reproducible (KNeighborsClassifier takes no seed).
model = {
    "Logistic Regressor" : LogisticRegression(max_iter=15000, random_state=0),
    "DecisionTree Classifier" : DecisionTreeClassifier(random_state=0), 
    "SGD Classifier" : SGDClassifier(random_state=0), 
    "RandomForest Classifier" : RandomForestClassifier(random_state=0), 
    "Gradient Boosting" : GradientBoostingClassifier(random_state=0),
    "KNeighbor Classifier" : KNeighborsClassifier(n_neighbors=3),
    "AdaBoost Classifier" : AdaBoostClassifier(random_state=0),
    "Bagging Classifier" : BaggingClassifier(random_state=0)

        }


In [None]:
# Fit every candidate model and collect its hold-out metrics.
#   scores     : one row per model -> [name, accuracy, F1, precision, recall, ROC AUC]
#   prob_score : positive-class probabilities for the models that provide them
scores = []
prob_score = {}
for mod, classifier in model.items():
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)

    # ROC AUC needs a continuous score. Choose the source explicitly rather
    # than catching every exception, so genuine errors are not hidden and a
    # model without probabilities is not reported with a fake AUC of 0.
    if hasattr(classifier, "predict_proba"):
        score = classifier.predict_proba(X_test)[:, 1]
        prob_score[mod] = score
        roc = roc_auc_score(y_test, score)
    elif hasattr(classifier, "decision_function"):
        # e.g. SGDClassifier with hinge loss: the signed margin ranks samples
        # just as well as a probability for the purpose of ROC AUC.
        roc = roc_auc_score(y_test, classifier.decision_function(X_test))
    else:
        roc = np.nan

    scores.append([
        mod,
        accuracy_score(y_test, pred),
        f1_score(y_test, pred, average='weighted'),
        precision_score(y_test, pred, average='weighted'),
        recall_score(y_test, pred, average='weighted'),
        roc
    ])


In [None]:
def highlight_max(s):
    """Return one CSS string per element of `s`, marking the maximum value(s).

    Elements equal to ``s.max()`` get a yellow background; all other
    elements get an empty style. Intended for ``DataFrame.style.apply``.
    """
    peak = s.max()
    styles = []
    for value in s:
        styles.append('background-color: yellow' if value == peak else '')
    return styles

# Turn the collected score rows into a table: one row per model (indexed by
# the model name), one column per metric, with the best value of every
# column highlighted.
metric_names = ['Accuracy', 'F1 Score', 'Precision', 'Recall', 'ROC AUC']
scores_df = pd.DataFrame(
    [row[1:] for row in scores],
    index=[row[0] for row in scores],
    columns=metric_names,
)
scores_df.style.apply(highlight_max)


* **Conclusion -** As we can see, the Bagging classifier gives us high accuracy.

***Bagging Classifier***

In [None]:
from sklearn import tree
# Final model: bagged decision trees. The ensemble itself is seeded too
# (bootstrap sampling is random), otherwise every run yields different
# predictions and a different submission.
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1), random_state=0)


In [None]:
# Train the bagging ensemble on the training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the hold-out split.
pred1 = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the hold-out split.
print("Classification Report \n")
print(classification_report(y_test,pred1))


In [None]:
# Raw confusion matrix: rows are the actual classes, columns the predicted ones.
print("Confusion Matrix\n")
print(confusion_matrix(y_test,pred1))


In [None]:
# Confusion matrix as an annotated heatmap.
plt.figure(figsize=(8,5))
# fmt='d' prints the cell counts as plain integers; a '.2g' format would
# round them to two significant digits and switch to scientific notation
# for large counts.
sns.heatmap(confusion_matrix(y_test,pred1),cmap = 'viridis',annot = True,linewidths = 2,linecolor = 'white',fmt = 'd')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Class predictions for the scaled competition test set; shown as cell output.
pred2 = model.predict(X2)
pred2

In [None]:
from sklearn.metrics import roc_curve

In [None]:
score = roc_auc_score(y_test, pred1)
score

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, pred1)

In [None]:
# ROC curve of the model against the chance diagonal.
roc_label = 'ROC Curve (area=%0.2f)' % score
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], 'k--')  # chance level
plt.legend()
plt.show()

* **Overall Conclusion -** 
* 1. People whose vehicles are more than 2 years old have to pay a higher annual premium, and that has led to a higher number of people from that category not taking insurance. We need to adjust the amount a little so that people from that category do not skip taking insurance.


* 2. people having Vehicle Damage tend to buy insurance as compared to the ones who do not have any damage.


* 3. Annual Premium does not depend on how many days people are associated with company. So we can modify the premium policy so that insurance company can attract more customers.


* 4. Overall there are many more negative responses than positive responses from the customers. We can therefore assume that most of the product offerings from the insurance company have degraded, or the after-sales response to customers is not good, or the product offerings do not cover today's customer needs, or the insurance company lacks a marketing strategy.


* 5. Thus, We can conclude that the insurance company overall need to improve in terms of all offerings and also increasing the marketability of the product so that those negative responses can be converted more positive responses.


* **submission -**

In [None]:
# Re-read the raw test file to get the original customer ids for the submission.
df1 = pd.read_csv('../input/health-insurance-cross-sell-prediction/test.csv')
df1.head()

In [None]:
# Submission frame; starts with the id column only.
df2 = pd.DataFrame(df1['id'])
df2.head()

In [None]:
# Attach the predictions as the second column. Plain assignment is
# idempotent: re-running the cell overwrites the column instead of adding a
# duplicate 'Response' column.
df2['Response'] = pred2

In [None]:
# Check the id / Response pairs before writing the file.
df2.head()

In [None]:
# (rows, columns) of the submission frame.
df2.shape

In [None]:
# Write only the id and Response columns; index=False keeps the positional
# row index out of the file, which would otherwise appear as an extra
# unnamed first column.
df2.to_csv("Submission.csv", index = False)