In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Install a blanket "ignore" filter: no category or module is specified, so every
# warning raised after this cell is silenced.
# NOTE(review): this also hides library notices (e.g. deprecation or convergence
# warnings) — confirm that is intended.
warnings.filterwarnings("ignore")

## Welcome to my first kernel!

I will try to examine the dataset of JantaHackathon: Cross-sell Prediction. Please give me your feedback; I will be glad to read it! If you like the kernel, please vote up.

 If you are interested in more information about the dataset, check out the link: https://www.kaggle.com/kbambardekar/av-jantahackathon/tasks?taskId=2063

## Content

 1.  Introduction
     * Info about the dataset  
    
    
 2. Reading the data and knowing a little about it  
 
    
 3. Data analysis
     * Exploratory data analysis
     * Correlation
 
 
 4. Models
     * Setting X and y
     * Naive Bayes, Decision Tree, Random Forest, Logistic Regression
     * Stratified Nested Cross Validation
     * ROC AUC score

### 1. Introduction

#### Context

Each entry of the dataset represents a person who was offered a vehicle insurance.
The idea is to predict whether a customer would be interested in vehicle insurance or not, based on some characteristics. This information consists of demographics (gender, age, region code type), vehicles (vehicle age, damage), policy (premium, sourcing channel), and current insurance (previously insured, vintage).


#### Columns definitions

**Id** (numeric) - Unique ID for the customer

**Gender** (text: Male, Female) - Gender of the customer

**Age** (numeric) - Age of the customer

**Driving license** (numeric: 0, 1) - Customer does not have DL / customer has DL

**RegionCode** (numeric) - Unique code for the region of the customer

**PreviouslyInsured** (numeric: 0, 1) - Customer doesn't have vehicle insurance / customer already has vehicle insurance.

**VehicleAge** (text, e.g. "< 1 Year") - Age bracket of the vehicle

**VehicleDamage** (text: No, Yes) - Customer didn't get his/her vehicle damaged in the past / customer got his/her vehicle damaged in the past

**AnnualPremium** (numeric) - The amount customer needs to pay as premium in the year

**PolicySalesChannel** (numeric) - Anonymised code for the channel of outreaching to the customer ie. Different agents, over mail, over phone, in person, etc.

**Vintage** (numeric) - Number of days customer has been associated with the company

**Response** (numeric: 0, 1) - Customer is not interested in vehicle insurance / customer is interested in vehicle insurance.

### 2. Reading the data and knowing a little about it

In [None]:
#math and working with data.
import pandas as pd 
import numpy as np 

#plots
import seaborn as sns 
import matplotlib.pyplot as plt 

# Load the training CSV from the Kaggle input directory into a DataFrame.
train = pd.read_csv("/kaggle/input/av-jantahackathon/train.csv")

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
train.shape

In [None]:
# What does the data look like? Preview the first five rows.
train.head()

In [None]:
# The "id" column is only a unique customer identifier, so it is dropped before
# the analysis. errors="ignore" keeps the cell re-runnable: a second execution
# finds no "id" column and leaves the frame untouched instead of raising KeyError.
train = train.drop(columns=["id"], errors="ignore")

In [None]:
# Remove fully duplicated rows. drop_duplicates() returns a new frame rather than
# modifying `train`, so the result has to be assigned back for the removal to
# take effect; the index is rebuilt so it stays contiguous afterwards.
train = train.drop_duplicates().reset_index(drop=True)

# Show the remaining (rows, columns) so the effect of the de-duplication is visible.
train.shape

In [None]:
# Data description: column names, dtypes and non-null counts.
train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

In [None]:
# Number of missing (NA) values in each column.
train.isna().sum()

### 3. Data analysis

#### Target variable

In [None]:
# Class balance of the target: number of rows per Response value.
fig, ax = plt.subplots()
sns.countplot(x=train["Response"], ax=ax)
ax.set_title("Target distribution")
plt.show()

#### Gender

In [None]:
# Response counts split by gender. The title names the split so this figure is
# not confused with the plain target-distribution plot.
fig, ax = plt.subplots()
sns.countplot(data=train, x="Response", hue="Gender", ax=ax)
ax.set_title("Target distribution by gender")
plt.show()

#### Age 

In [None]:
# Ages as plain Python lists: every customer, then split by the target value.
ages_all = train["Age"].tolist()
ages_interested = train.loc[train["Response"] == 1, "Age"].tolist()
ages_not_interested = train.loc[train["Response"] == 0, "Age"].tolist()

# One wide density on the top row, the two per-class densities side by side below.
fig = plt.figure()
age_panels = [
    ((2, 1, 1), ages_all, {}, "General age distribution"),
    ((2, 2, 3), ages_interested, {"color": "green"}, "Took insurance"),
    ((2, 2, 4), ages_not_interested, {"color": "orange"}, "Did not take insurance"),
]
for grid_position, ages, style, panel_title in age_panels:
    ax = fig.add_subplot(*grid_position)
    sns.kdeplot(x=ages, fill=True, ax=ax, **style)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

#### I will  create a category column to work with age.

In [None]:
# Bucket age into three classes. pd.cut builds right-closed bins by default, i.e.
# (20, 30], (30, 55], (55, 90]; include_lowest=True also closes the first bin on
# the left so that customers aged exactly 20 land in "Young" instead of NaN.
interval = (20, 30, 55, 90)
cats = ['Young', 'Adult', 'Senior']
train["AgeCat"] = pd.cut(train["Age"], interval, labels=cats, include_lowest=True)

# Age spread inside each class, split by the target.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train, x="AgeCat", y="Age", hue="Response", ax=ax)
plt.show()

#### Driving License

In [None]:
# Response counts split by whether the customer holds a driving license. The
# title names the split so the figure stands on its own.
fig, ax = plt.subplots()
sns.countplot(data=train, x="Response", hue="Driving_License", ax=ax)
ax.set_title("Target distribution by driving license")
plt.show()

It seems like people that don't have a DL don't want to take a vehicle insurance. In fact, at least here in Brazil, people that don't have DL can buy a vehicle insurance, but the insurance company will not provide financial protection against traffic collisions or any other accident that occurs.

#### Region Code

I will try to find the 5 regions where people are most interested in buying vehicle insurance and the 5 regions where they are least interested.

In [None]:
# Share of interested customers per region (mean of the 0/1 target). Grouping on
# the codes that actually occur in the data covers every region — a fixed
# range(1, 53) would miss any code outside 1..52, such as 0 — and never divides
# by zero for a code that has no customers.
region_yes_rate = train.groupby("Region_Code")["Response"].mean()

# The five regions with the highest and the five with the lowest interest rate.
# The "no" share is 1 - yes rate, so the lowest yes rates are the top "no" regions.
tops_yes = region_yes_rate.nlargest(5).index
tops_no = region_yes_rate.nsmallest(5).index

# 0/1 flags marking the rows that belong to those regions. "Yes" takes precedence
# should a region ever appear in both lists (only possible with fewer than ten
# distinct regions).
in_top_yes = train["Region_Code"].isin(tops_yes)
in_top_no = train["Region_Code"].isin(tops_no) & ~in_top_yes

train["Top_Yes_Region"] = in_top_yes.astype(int)
train["Top_No_Region"] = in_top_no.astype(int)

In [None]:
# Rows that belong to the most / least interested regions.
top_yes_regions = train.loc[train["Top_Yes_Region"] == 1]
top_no_regions = train.loc[train["Top_No_Region"] == 1]

# Each panel reads both x and hue from the same filtered frame. Taking hue from
# the full `train` frame would rely on implicit index alignment between two
# Series of different length.
fig, (ax_yes, ax_no) = plt.subplots(2, 1)

sns.countplot(data=top_yes_regions, x="Region_Code", hue="Response", ax=ax_yes)
ax_yes.set_title("Regions most interested in insurance")

sns.countplot(data=top_no_regions, x="Region_Code", hue="Response", ax=ax_no)
ax_no.set_title("Regions less interested in insurance")

fig.tight_layout()
plt.show()

#### Previously Insured

In [None]:
# Response counts for customers with and without an existing vehicle insurance.
fig, ax = plt.subplots()
sns.countplot(x=train["Previously_Insured"], hue=train["Response"], ax=ax)
plt.show()

People that already have an insurance don't want to buy another or change it...

In [None]:
# Number of already-insured vs. uninsured customers inside each age class.
fig, ax = plt.subplots()
sns.countplot(x=train["AgeCat"], hue=train["Previously_Insured"], ax=ax)
plt.show()

The "Young" category is the only one that has more people that already have an insurance. 
That can help explain why the majority of them reject taking insurance.

#### Vehicle Age

In [None]:
# Response counts per vehicle-age bracket.
fig, ax = plt.subplots()
sns.countplot(x=train["Vehicle_Age"], hue=train["Response"], ax=ax)
plt.show()

#### Vehicle Damage

In [None]:
# Response counts for vehicles with and without past damage.
fig, ax = plt.subplots()
sns.countplot(x=train["Vehicle_Damage"], hue=train["Response"], ax=ax)
plt.show()

Let's create a new feature "New_NoDamage" that flags cars that are less than 1 year old and have no damage.

In [None]:
# 1 when the vehicle is in the "< 1 Year" bracket and has no recorded damage, else 0.
is_undamaged = train["Vehicle_Damage"].eq("No")
is_under_one_year = train["Vehicle_Age"].eq("< 1 Year")
train['New_NoDamage'] = (is_undamaged & is_under_one_year).astype(int)

#### Annual Premium

In [None]:
# Annual premium as plain Python lists: all customers, then one list per target value.
premium_all = train["Annual_Premium"].tolist()
interested_mask = train["Response"] == 1
premium_interested = train.loc[interested_mask, "Annual_Premium"].tolist()
premium_not_interested = train.loc[train["Response"] == 0, "Annual_Premium"].tolist()

fig = plt.figure()

# Top row: overall density across the full width.
ax_overall = fig.add_subplot(2, 1, 1)
sns.kdeplot(x=premium_all, fill=True, ax=ax_overall)
ax_overall.set_title("General annual premium distribution")

# Bottom row, left: customers who responded 1.
ax_interested = fig.add_subplot(2, 2, 3)
sns.kdeplot(x=premium_interested, fill=True, color="green", ax=ax_interested)
ax_interested.set_title("Took insurance")

# Bottom row, right: customers who responded 0.
ax_not_interested = fig.add_subplot(2, 2, 4)
sns.kdeplot(x=premium_not_interested, fill=True, color="orange", ax=ax_not_interested)
ax_not_interested.set_title("Did not take insurance")

fig.tight_layout()
plt.show()

The distributions don't seem to be very different.

#### Policy Sales Channel

Let's look at the top channels in terms of the percentage of people that do/don't take insurance.

In [None]:
# Per-channel counts of interested ("yes") customers and of all customers,
# computed for the channel codes that actually occur in the data. Building the
# table from the observed codes instead of a fixed range(1, 163) includes every
# channel (a range ending at 163 stops at 162) and keeps codes with no customers
# out of the ranking.
channel_stats = train.groupby("Policy_Sales_Channel")["Response"].agg(yes="sum", total="count")

# Smoothed interest rate: one extra "no" is added to every channel so that a
# channel with only a handful of customers cannot reach a rate of 100%.
channel_stats["yes_rate"] = channel_stats["yes"] / (channel_stats["total"] + 1)

# Three channels with the highest smoothed interest rate.
tops_yes = channel_stats["yes_rate"].nlargest(3).index

# Five channels with the lowest rate. Every channel without a single interested
# customer ties at 0, so ties are broken in favour of the channels with the most
# customers, i.e. the ones backed by the most evidence.
tops_no = (
    channel_stats
    .sort_values(["yes_rate", "total"], ascending=[True, False])
    .index[:5]
)

# 0/1 flags marking the rows that belong to those channels; "yes" takes
# precedence should a channel ever appear in both lists.
in_top_yes = train["Policy_Sales_Channel"].isin(tops_yes)
in_top_no = train["Policy_Sales_Channel"].isin(tops_no) & ~in_top_yes

train["Top_Yes_Channel"] = in_top_yes.astype(int)
train["Top_No_Channel"] = in_top_no.astype(int)

In [None]:
# Rows from the channels with the highest / lowest share of interested customers.
top_yes_channel = train.loc[train["Top_Yes_Channel"] == 1]
top_no_channel = train.loc[train["Top_No_Channel"] == 1]

# Each panel reads both x and hue from the same filtered frame. Taking hue from
# the full `train` frame would rely on implicit index alignment between two
# Series of different length.
fig, (ax_yes, ax_no) = plt.subplots(1, 2)

sns.countplot(data=top_yes_channel, x="Policy_Sales_Channel", hue="Response", ax=ax_yes)
ax_yes.set_title("Most effective channel")

sns.countplot(data=top_no_channel, x="Policy_Sales_Channel", hue="Response", ax=ax_no)
ax_no.set_title("Less effective channel")

fig.tight_layout()
plt.show()

#### Vintage

In [None]:
# Spread of Vintage for each value of the target.
fig, ax = plt.subplots()
sns.boxplot(x=train["Response"], y=train["Vintage"], ax=ax)
plt.show()

I made some other visualizations and all of them look like this one. They don't seem to carry much information.

#### What does the dataset look like right now?

In [None]:
# Preview the first five rows again, now including the engineered columns.
train.head()

In [None]:
# (rows, columns) after the feature-engineering cells have added their columns.
train.shape

I will use the one hot encoding to encode the categorical features.

In [None]:
# Replace every object/category column with one indicator column per level;
# the remaining columns pass through unchanged.
train = pd.get_dummies(train)  #One Hot Encoding

#### Data correlation

In [None]:
# Pairwise correlation of every column (cast to float first), drawn as an
# annotated heatmap with square cells.
correlations = train.astype(float).corr()

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    correlations,
    annot=True,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
    ax=ax,
)
plt.show()

### 4. Models

#### Libraries

In [None]:
#Split data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

#Scale data
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

#Metrics used in the competition to check model's answers
from sklearn.metrics import roc_auc_score

#Models
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

#Reproducibility of results
# scikit-learn draws from NumPy's global random state whenever an estimator or
# splitter is created without an explicit random_state, so NumPy has to be
# seeded too; seeding only Python's `random` module leaves shuffled folds and
# random forests different on every run.
import random
random.seed(2)
np.random.seed(2)

#### Function to create models and measure their efficiency

I'll use the following models:
    Naive Bayes, Decision tree, Random forest, Logistic regression
  
I'll use stratified nested cross validation because our data has a lot more people not interested in the insurance than people interested in it. Finally, the ROC AUC score will be the evaluation metric because it was the one used in the hackathon.

In [None]:
def classification_nested_crossvalidation(X, y, n_folds,k_folds, model, hp_grid, random_state=None):
    """
    Estimate a binary classifier's ROC AUC with stratified nested cross-validation.

    The outer loop splits the data into ``n_folds`` stratified folds. For each
    outer split, a grid search with ``k_folds`` inner folds picks the best
    hyper-parameters on the outer training part only, and the refitted best
    model is then scored on the held-out outer fold.

    Input:
        X - independent variables, indexable by integer arrays (e.g. a NumPy array);
        y - dependent variable (binary target), indexable like X;
        n_folds - number of outer folds used to estimate the score;
        k_folds - number of inner folds used by the grid search;
        model - estimator (or pipeline) to be tuned and evaluated;
        hp_grid - hyper-parameter grid handed to GridSearchCV ({} keeps the defaults);
        random_state - optional seed for the shuffling of the outer folds.

    Output:
        (mean, std) of the ROC AUC obtained on the outer folds.
    """

    # External cross validation
    cv_outer = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    results = []

    for train_out, test_out in cv_outer.split(X, y):

        train_x, train_y = X[train_out], y[train_out]
        test_x, test_y = X[test_out], y[test_out]

        # Internal CV (training and validation sets). Candidates are ranked by
        # ROC AUC so that tuning optimises the same metric that is reported.
        grid_search_cv = GridSearchCV(model, hp_grid, cv=k_folds, scoring="roc_auc")

        # Find the best hyperparameters; the best model is refitted on the
        # whole outer training part.
        grid_search_cv.fit(train_x, train_y)
        best_model = grid_search_cv.best_estimator_

        # ROC AUC is a ranking metric and needs continuous scores: hard 0/1
        # predictions collapse the ROC curve to a single operating point.
        if hasattr(best_model, "predict_proba"):
            # Column 1 holds the probability of the larger class label.
            test_scores = best_model.predict_proba(test_x)[:, 1]
        else:
            test_scores = best_model.decision_function(test_x)

        results.append(roc_auc_score(test_y, test_scores))

    return np.mean(results), np.std(results)

In [None]:
# Feature matrix and target vector as NumPy arrays. The features are cast to
# float explicitly: the frame mixes numeric columns with one-hot indicator
# columns, and without a common numeric dtype to_numpy() may fall back to a slow
# object array.
X = train.drop(columns="Response").to_numpy(dtype=float)
y = train["Response"].to_numpy()

# Logistic regression is wrapped in a pipeline that standardises the features
# first. The solvers struggle to converge when columns live on very different
# scales, and keeping the scaler inside the pipeline means it is fitted on the
# training part of each fold only (no leakage into validation/test folds).
models = [GaussianNB(),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          make_pipeline(StandardScaler(), LogisticRegression())]
names = ["Naive Bayes", "Decision Tree", "Random Forest", "Logistic Regression"]

# One hyper-parameter grid per model, in the same order as `models`. Pipeline
# parameters are addressed as "<step name>__<parameter>"; make_pipeline names
# each step after its lower-cased class name.
hiperparams = [{},
               {'criterion':['gini', 'entropy']},
               {'criterion':['entropy', 'gini'],'n_estimators':[10, 20]},
               {'logisticregression__solver':['newton-cg', 'lbfgs'],
                'logisticregression__max_iter':[500]}]

In [None]:
# Fold counts handed to the nested cross-validation: n_folds for the outer
# split, k_folds for the inner grid search.
n_folds = 5
k_folds = 3

# Evaluate every model with its own hyper-parameter grid and report the result.
for name, model, params in zip(names, models, hiperparams):
    mean, std = classification_nested_crossvalidation(X, y, n_folds, k_folds, model, params)
    print("Model {}:\nMean: {}\tStd:{}\n\n".format(name, mean, std))

The logistic regression method did not converge... Any tips on what I should do here, other than increasing the parameter "max_iter"?