In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

An insurance company that provides Health Insurance to its customers wants to predict which of them may be interested in the Vehicle Insurance it also offers. We have some information about these customers, and our goal is to use that data to predict the potential buyers.

In this kernel, we'll use some of the widely used supervised learning algorithms.

<font color = 'blue'>
Content:
    
1. [Load and Check Data](#1)
1. [Variable Description](#2)
1. [Take a Look Data](#3)
1. [Feature Engineering](#4)
    * [Drop ID](#5)
1. [Modeling - Supervised Learning](#6)
    * [Normalization](#7)
    * [Train Test Split](#8)
    * [Simple Logistic Regression](#9)
    * [Hyperparameter Tuning - Grid Search - Cross Validation](#10)
    * [Comparison of Accuracy](#11)
1. [Prediction and Submission](#12)
1. [Conclusion](#13) 

<a id = '1'></a><br>
# Load and Check Data

In [None]:
# Both CSV files live in the same Kaggle input folder.
DATA_DIR = '../input/health-insurance-cross-sell-prediction'
train_df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')

train_df.columns

In [None]:
# Preview the first ten rows of the training data.
train_df.head(10)

In [None]:
# Summary statistics (pandas `describe`) for the training frame.
train_df.describe()

<a id = '2'></a><br>
# Variable Description

- **id:** Unique ID for the customer 
- **Gender:** Gender of the customer
- **Age:** Age of the customer
- **Driving_License:** 0 = Customer does not have DL, 1 = Customer already has DL
- **Region_Code:** Unique code for the region of the customer
- **Previously_Insured:** 1 = Customer already has Vehicle Insurance, 0 = Customer doesn't have Vehicle Insurance
- **Vehicle_Age:** Age of the Vehicle
- **Vehicle_Damage:** 1 = Customer got his/her vehicle damaged in the past. 0 = Customer didn't get his/her vehicle damaged in the past.
- **Annual_Premium:** The amount customer needs to pay as premium in the year
- **Policy_Sales_Channel:** Anonymised Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
- **Vintage:** Number of Days, Customer has been associated with the company
- **Response:** 1 = Customer is interested, 0 = Customer is not interested

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Count the missing values in each column.
train_df.isnull().sum()

As the counts above show, there is no missing data.

<a id = '3'></a><br>
# Take a Look Data

In [None]:
# libraries for Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Bar chart of how many customers of each gender there are at every age.
age_by_gender = pd.crosstab(train_df.Age, train_df.Gender)
ax = age_by_gender.plot(kind="bar", figsize=(30, 8))
ax.set_title('Age Frequency for Genders')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Age distribution (density histogram with a KDE overlay), one panel per Response value.
# `sns.distplot` is deprecated; `sns.histplot(..., stat="density", kde=True)` is its
# documented replacement. The stray no-op `plt.subplot` reference was removed.
g = sns.FacetGrid(train_df, col="Response", height=6)
g.map(sns.histplot, "Age", bins=50, stat="density", kde=True)
plt.show()

In [None]:
def bar_plot(variable):
    """
    Show the value shares of one categorical column of train_df as a pie chart.

    Despite its name, this helper draws a pie chart, not a bar chart.

        input: variable -- column name in train_df, ex: "Vehicle_Age"
        output: None; the figure is displayed with plt.show()
    """
    # Occurrences of each distinct value in the column.
    counts = train_df[variable].value_counts()

    # Fixed palette handed to the pie wedges.
    palette = ['#2C4447','#F3EC86','#679B75','red','green','brown']

    plt.figure(figsize=(6, 6))
    plt.pie(counts, labels=counts.index, colors=palette, autopct='%1.1f%%')
    plt.ylabel("Rate")
    plt.title(variable)
    plt.show()
    

In [None]:
# Draw one chart per categorical column.
category1 = ["Gender", "Vehicle_Age", "Vehicle_Damage"]
for column_name in category1:
    bar_plot(column_name)

<a id = '4'></a><br>
# Feature Engineering

In [None]:
# Show the first ten training rows again before the columns are encoded.
train_df.head(10)

In [None]:
# Show the first five rows of the test data.
test_df.head()

We will apply every processing step to the test data at the same time as to the training data.

In [None]:
# One-hot encode Driving_License the same way in the training and test frames.
train_df, test_df = (
    pd.get_dummies(frame, columns=["Driving_License"])
    for frame in (train_df, test_df)
)
train_df.head()

In [None]:
# Number of training rows per region code.
fig, ax = plt.subplots(figsize=(30, 8))
sns.countplot(x="Region_Code", data=train_df, ax=ax)
ax.tick_params(axis="x", labelrotation=60)
plt.show()

There are 52 different Region Codes. We will not group them; every code is kept and one-hot encoded as it is.

In [None]:
# One-hot encode Region_Code in both frames; the new columns are prefixed "RC".
train_df, test_df = (
    pd.get_dummies(frame, columns=["Region_Code"], prefix="RC")
    for frame in (train_df, test_df)
)
train_df.head()

In [None]:
# Number of training rows per policy sales channel (before grouping).
fig, ax = plt.subplots(figsize=(30, 8))
sns.countplot(x="Policy_Sales_Channel", data=train_df, ax=ax)
ax.tick_params(axis="x", labelrotation=60)
plt.show()

In [None]:
# Row counts of the ten most frequent sales channels.
train_df.Policy_Sales_Channel.value_counts().head(10)

There is a big difference among the Sales Channels here, so we will keep the top 8 Sales Channels that sold the most insurance policies and group all the others into a single category (code 200).

In [None]:
# Keep the eight selected sales channels and collapse every other code into 200.
# The channel list is defined once and applied with a vectorised isin/where instead of
# repeating the same eight-way `or` chain in a Python loop for each frame.
TOP_SALES_CHANNELS = [152.0, 26.0, 124.0, 160.0, 156.0, 122.0, 157.0, 154.0]
OTHER_SALES_CHANNEL = 200

train_df["Policy_Sales_Channel"] = train_df["Policy_Sales_Channel"].where(
    train_df["Policy_Sales_Channel"].isin(TOP_SALES_CHANNELS), OTHER_SALES_CHANNEL
)
test_df["Policy_Sales_Channel"] = test_df["Policy_Sales_Channel"].where(
    test_df["Policy_Sales_Channel"].isin(TOP_SALES_CHANNELS), OTHER_SALES_CHANNEL
)
train_df.Policy_Sales_Channel.value_counts().head(10)

In [None]:
# Number of training rows per policy sales channel after grouping.
fig, ax = plt.subplots(figsize=(30, 8))
sns.countplot(x="Policy_Sales_Channel", data=train_df, ax=ax)
ax.tick_params(axis="x", labelrotation=60)
plt.show()

Now we have fewer categories for Policy Sales Channel, but this should still work.

In [None]:
# One-hot encode Policy_Sales_Channel in both frames; the new columns are prefixed "SC".
train_df, test_df = (
    pd.get_dummies(frame, columns=["Policy_Sales_Channel"], prefix="SC")
    for frame in (train_df, test_df)
)
train_df.head()

In [None]:
# One-hot encode Vehicle_Damage in both frames; the new columns are prefixed "VD".
train_df, test_df = (
    pd.get_dummies(frame, columns=["Vehicle_Damage"], prefix="VD")
    for frame in (train_df, test_df)
)
train_df.head()

In [None]:
# One-hot encode Vehicle_Age in both frames; the new columns are prefixed "VA".
train_df, test_df = (
    pd.get_dummies(frame, columns=["Vehicle_Age"], prefix="VA")
    for frame in (train_df, test_df)
)
train_df.head()

In [None]:
# One-hot encode Gender in both frames; the new columns are prefixed "G".
train_df, test_df = (
    pd.get_dummies(frame, columns=["Gender"], prefix="G")
    for frame in (train_df, test_df)
)
train_df.head()

In [None]:
# Inspect the training frame's columns and dtypes after the encoding steps.
train_df.info()

In [None]:
# Inspect the test frame's columns and dtypes after the encoding steps.
test_df.info()

<a id = '5'></a><br>
## Drop ID

In [None]:
# Remove the "id" column from the training frame.
train_df = train_df.drop(columns=["id"])

In [None]:
# List the columns that remain in the training frame.
train_df.columns

<a id = '6'></a><br>
# Modeling - Supervised Learning

In this section, we'll use some of the widely used supervised learning classification algorithms. But first we'll prepare data.

<a id = '7'></a><br>
## Normalization 

In [None]:
# Target vector and raw feature frame.
y = train_df.Response.values
x_data = train_df.drop(["Response"],axis=1)

# Min-max normalisation, computed per column.
# DataFrame.min()/max() are used instead of np.min/np.max: on pandas >= 2.0 the NumPy
# wrappers reduce over both axes and return a single scalar for the whole frame.
# The cast to float lets boolean dummy columns (newer pandas) take part in the arithmetic.
x_numeric = x_data.astype(float)
col_min = x_numeric.min()
# A column with a single distinct value has range 0; divide by 1 there so it maps to 0, not NaN.
col_range = (x_numeric.max() - col_min).replace(0, 1)
x = (x_numeric - col_min) / col_range

<a id = '8'></a><br>
## Train Test Split

In [None]:
# %% split data
from sklearn.model_selection import train_test_split

# Hold out 10% of the normalised rows as a validation set; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(x,y,test_size = 0.1, random_state = 42) # validation data = 0.1 data

In [None]:
# Report the number of rows in each split and in the test frame.
split_sizes = (
    ("X_train", x_train),
    ("x_val", x_val),
    ("y_train", y_train),
    ("y_val", y_val),
)
for label, part in split_sizes:
    print(label, len(part))

print("test", len(test_df))


<a id = '9'></a><br>
# Simple Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings, scored on the training and validation splits.
logreg = LogisticRegression().fit(x_train, y_train)
acc_log_train = round(logreg.score(x_train, y_train) * 100, 2)
acc_log_val = round(logreg.score(x_val, y_val) * 100, 2)
print(f"Training Accuracy: % {acc_log_train}")
print(f"Testing Accuracy: % {acc_log_val}")

<a id = '10'></a><br>
# Hyperparameter Tuning - Grid Search - Cross Validation

We will compare 6 ML classifiers and evaluate the mean accuracy of each of them with stratified cross-validation. To keep the grid search fast, we will run it on a small subset of the training data.

* Decision Tree
* SVM
* Random Forest
* KNN
* Logistic Regression
* Naive Bayes Classification

In [None]:
# import models
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [None]:
# %% split data
# Carve a 1% sample (x_ss, y_ss) out of the training split; x_train1 / y_train1 receive the other 99%.

x_train1, x_ss, y_train1, y_ss = train_test_split(x_train,y_train,test_size = 0.01, random_state = 42) # subset data = 0.01 training data

In [None]:
random_state = 42

# Candidate models. Their order must match `classifier_param` below, because the two
# lists are paired by position.
classifier = [DecisionTreeClassifier(random_state = random_state),
             SVC(random_state = random_state),
             RandomForestClassifier(random_state = random_state),
             # liblinear supports both the "l1" and "l2" penalties searched below; the
             # default lbfgs solver rejects "l1", so half of the grid would fail to fit.
             LogisticRegression(random_state = random_state, solver = "liblinear"),
             KNeighborsClassifier(),
             GaussianNB()]

# Hyperparameter grids, one per model.
dt_param_grid = {"min_samples_split" : range(10,100,20),
                "max_depth": range(1,20,4)}

svc_param_grid = {"kernel" : ["rbf"],
                 "gamma": [ 0.01, 0.1, 1],
                 "C": [1,10,50,100,500]}

rf_param_grid = {"max_features": [1,3,10],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                "criterion":["gini"]}

logreg_param_grid = {"C":np.logspace(-3,3,7),
                    "penalty": ["l1","l2"]}

# Odd neighbour counts from 1 to 19.
knn_param_grid = {"n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
                 "weights": ["uniform","distance"],
                 "metric":["euclidean","manhattan"]}

# Empty grid: GaussianNB is evaluated with its default settings only.
naive_param_grid = {}

classifier_param = [dt_param_grid,
                   svc_param_grid,
                   rf_param_grid,
                   logreg_param_grid,
                   knn_param_grid,
                   naive_param_grid]

In [None]:
# Best cross-validated accuracy and best fitted estimator per model, in `classifier` order.
cv_result = []
best_estimators = []

# Grid-search every model on the small subset with 10-fold stratified cross-validation.
folds = StratifiedKFold(n_splits=10)
for estimator, grid in zip(classifier, classifier_param):
    clf = GridSearchCV(estimator, param_grid=grid, cv=folds,
                       scoring="accuracy", n_jobs=-1, verbose=1)
    clf.fit(x_ss, y_ss)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)

<a id = '11'></a><br>
# Comparison of Accuracy

In [None]:
# Pair each model's best cross-validation score with its name (same order as `classifier`).
cv_results = pd.DataFrame({"Cross Validation Means":cv_result, "ML Models":["DecisionTreeClassifier", "SVM","RandomForestClassifier",
             "LogisticRegression",
             "KNeighborsClassifier","GaussianNB"]})

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g = sns.barplot(x = "Cross Validation Means", y = "ML Models", data = cv_results)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross Validation Scores")
plt.show()

I will choose 3 ML algorithms for the ensemble.

In [None]:
# Soft-voting ensemble of the tuned decision tree, random forest and logistic regression
# (positions 0, 2 and 3 of best_estimators).
voting_members = [
    ("dt", best_estimators[0]),
    ("rfc", best_estimators[2]),
    ("lr", best_estimators[3]),
]
votingC = VotingClassifier(estimators=voting_members, voting="soft", n_jobs=-1)

# Train on the full training split and report accuracy on the validation split.
votingC.fit(x_train, y_train)
val_predictions = votingC.predict(x_val)
print(accuracy_score(val_predictions, y_val))

<a id = '12'></a><br>
# Prediction and Submission
Actually, we've achieved almost the same score as with Logistic Regression alone. Nevertheless, I will use the last model (the voting classifier) to make the predictions.

In [None]:
# Take the "id" column out of the test frame and keep it for the submission file.
test_df_id = test_df.pop("id")

In [None]:
# The model was trained on min-max-scaled features, so the test features must get the
# same treatment before prediction: same columns in the same order, scaled with the
# TRAINING minima and ranges (assumes x was scaled per column from x_data).
# Dummy columns that exist only in train are added as 0; columns unknown to the model are dropped.
test_features = test_df.reindex(columns=x_data.columns, fill_value=0).astype(float)
train_numeric = x_data.astype(float)
train_min = train_numeric.min()
# A zero range would divide by zero; use 1 there so such columns stay finite.
train_range = (train_numeric.max() - train_min).replace(0, 1)
test_scaled = (test_features - train_min) / train_range

test_df_response = pd.Series(votingC.predict(test_scaled), name = "Response").astype(int)

# Submission file: one row per test id with its predicted Response.
results = pd.concat([test_df_id, test_df_response],axis = 1)

results.to_csv("cross_sell_prediction.csv", index = False)

<a id = '13'></a><br>
## Conclusion
Please let me know if you have any suggestions or ideas on how to improve the model and results. Thanks for reading.