<img src="https://bradshiversinsurance.com/wp-content/uploads/sites/2194/2015/10/bgtest.jpg" alt="Insurance" width="1500">


# 1. Introduction

An insurance company that has provided Health Insurance to its customers now needs your help in building a model to predict whether last year's policyholders (customers) will also be interested in the Vehicle Insurance provided by the company.

The strategy is to reach out to those customers and optimise the company's business model and revenue by building a model that predicts whether a customer would be interested in Vehicle Insurance, using information about demographics (gender, age, region code type), vehicles (Vehicle Age, Damage), policy (Premium, sourcing channel), etc.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from time import time

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, cross_val_predict, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,  roc_curve, auc, accuracy_score, precision_score, classification_report, roc_auc_score

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.metrics import average_precision_score

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN

import warnings
warnings.filterwarnings('ignore')

# 2. Load Dataset

In [None]:
# Read both CSV files and discard the `id` column as part of loading
train = pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv').drop(columns='id')
test = pd.read_csv('../input/health-insurance-cross-sell-prediction/test.csv').drop(columns='id')

In [None]:
# Print the (rows, columns) shape of the training frame, then preview its first rows
print(train.shape)
train.head()

In [None]:
# Print the (rows, columns) shape of the test frame, then preview its first rows
print(test.shape)
test.head()

In [None]:
# Stack train and test into one frame (fresh 0..n-1 index) and count the
# missing entries in every column
dataset = pd.concat([train, test], ignore_index=True)

missing_per_column = dataset.isna().sum()
print(missing_per_column)
# sns.heatmap(dataset.isnull(), cbar=False, cmap='YlGnBu_r');

* We combine the train and test data and check for missing values.
* There are no missing values (`Response` is the target variable; the test dataset has no target variable).

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes
train.info()

 # 3. Exploratory Data Analysis

## 3.1 Univariate analysis

In [None]:
# Class balance of the target variable (`Response`); bars are ordered from the
# most frequent class to the least frequent one
response_order = train['Response'].value_counts().index
sns.countplot(data=train, x='Response', order=response_order);

The target variable `Response` is imbalanced. This is a problem because a bias in the training dataset can influence many machine learning algorithms, leading some to ignore the minority class entirely, and it is typically the minority class on which predictions are most important.

In [None]:
# Count plot for each categorical feature, laid out side by side in one row.
# The list and the loop variable use different names so the list is not
# overwritten by its own elements while iterating.
cat_features = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage']

plt.figure(figsize=(18, 4))

for position, feature in enumerate(cat_features, start=1):
    plt.subplot(1, len(cat_features), position)
    # Order the bars by descending frequency
    sns.countplot(x=feature, data=train, order=train[feature].value_counts().index)

plt.tight_layout(pad=1);

In [None]:
# Histogram (with KDE overlay) for every column that is neither one of the
# categorical features nor the target, drawn on a 2x3 grid.
plt.figure(figsize=(17, 10))

excluded = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'id', 'Response']
num_features = [c for c in train.columns if c not in excluded]

# Distinct names for the list and the loop variable: the list is no longer
# clobbered by the iteration.
for position, feature in enumerate(num_features, start=1):
    plt.subplot(2, 3, position)
    sns.histplot(data=train, x=feature, kde=True)

plt.tight_layout(pad=1)

## 3.2 Bivariate analysis

In [None]:
# For each categorical feature, draw its count plot split into one panel per
# `Response` value. The unused enumerate index was dropped and the loop
# variable no longer shadows the list being iterated.
cat_features = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage']

for feature in cat_features:
    sns.catplot(x=feature, col='Response', col_wrap=2, data=train, kind="count", height=2.5, aspect=1)

In [None]:
# Correlation heat map of the numeric columns
fig, ax = plt.subplots(figsize=(12, 10))
cmap = sns.diverging_palette(10, 240, n=9)

# Restrict to numeric columns explicitly: `DataFrame.corr()` raises on string
# columns in recent pandas releases instead of silently skipping them.
train_corr = train.select_dtypes(include='number').corr()
sns.heatmap(train_corr, annot=True, fmt=".2f", linewidths=2, cmap=cmap, vmin=-1, vmax=1,
            cbar_kws={"shrink": .9}, square=True, ax=ax);

No pair of independent variables is highly correlated, which means there is no multicollinearity problem.

In [None]:
# Pairwise scatter plots of every column that is neither a categorical
# feature nor the target
skipped = {'Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'id', 'Response'}
pair_columns = [name for name in train.columns if name not in skipped]

sns.pairplot(train[pair_columns]);

# 4. Data Preprocessing

In [None]:
# dataset = [train,test]

# for df in dataset:
    
# #     df.replace(['Male', 'Female'], [1, 0], inplace=True)

#     vehicle_age_map = {'< 1 Year':0, '1-2 Year':1, '> 2 Years':2}
#     df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map).astype('int')

# #     vehicle_damage_map = {'Yes':1, 'No':0}
# #     df['Vehicle_Damage'].replace(vehicle_damage_map, inplace=True)


We treat these independent variables as string type so that we can then apply one-hot encoding to transform the categorical data into numerical data.

In [None]:
# Cast the listed columns to `str` in both frames (in place) so that they are
# picked up as categorical (object dtype) columns afterwards.
dataset = [train, test]
str_columns = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage']

for df in dataset:
    # One vectorised cast per frame; avoids the former `for col in col` loop
    # whose variable overwrote the list it was iterating.
    df[str_columns] = df[str_columns].astype(str)

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so that re-running the notebook gives
# the same ordering (every other random step here is seeded as well).
train = shuffle(train, random_state=0)

# Features are every column except the last; the last column is the target.
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

class_counts = Counter(y)
print(class_counts)
# This is a plain ratio (count of class 0 per count of class 1), not a
# percentage, so no '%' suffix is printed.
print(f'Ratio 0:1 = {class_counts[0] / class_counts[1]:.4f}')

Python scikit-learn provides a Pipeline utility to help automate machine learning workflows. The goal is to ensure that all of the steps in the pipeline are constrained to the data available for the evaluation.

* Numerical dataset: we will scale data by using StandardScaler
* Categorical dataset: we transform the data to numerical data, using OrdinalEncoder for ordinal data such as `Vehicle_Age` and OneHotEncoder for non-ordinal data such as `Gender`, `Driving_License`, `Previously_Insured`, `Vehicle_Damage`

In [None]:
# Split the feature columns into three groups:
#   num_col - non-object columns, standardised
#   ori_col - `Vehicle_Age`, encoded as an ordered integer
#   cat_col - the remaining object columns, one-hot encoded
cat_col = [col for col in X.columns if X[col].dtype == 'object']
cat_col.remove('Vehicle_Age')

ori_col = ['Vehicle_Age']

num_col = [col for col in X.columns if X[col].dtype != 'object']

# Explicit category order for `Vehicle_Age`. Without it OrdinalEncoder sorts
# the labels as strings ('1-2 Year' < '< 1 Year' < '> 2 Years'), which does
# not follow the actual age order.
vehicle_age_order = ['< 1 Year', '1-2 Year', '> 2 Years']

# Numerical columns: standard scaling
numerical_transformer = make_pipeline(StandardScaler())

# Ordinal column: integer codes 0, 1, 2 in increasing vehicle age
categorical_transformer = make_pipeline(OrdinalEncoder(categories=[vehicle_age_order]))
# Nominal columns: dense one-hot encoding; unseen categories raise an error
categorical_transformer2 = make_pipeline(OneHotEncoder(handle_unknown='error', sparse=False))

# Combine the three transformers; the output columns follow this order
preprocess = ColumnTransformer(transformers=[
        ('num', numerical_transformer, num_col),
        ('ordi', categorical_transformer, ori_col),
        ('cat', categorical_transformer2, cat_col)])


In [None]:
# Fit the preprocessor once and keep the transformed array (previously the
# preprocessor was fitted twice and the first result was thrown away).
X_array = preprocess.fit_transform(X)

onehot = preprocess.named_transformers_['cat']['onehotencoder']
# `get_feature_names_out` replaces the deprecated `get_feature_names` in
# newer scikit-learn; use whichever the installed version provides.
feature_namer = getattr(onehot, 'get_feature_names_out', None) or onehot.get_feature_names
# Passing the input column names yields '<column>_<category>' labels such as
# 'Gender_Female', so no hand-written rename mapping is needed.
enc_cat_col = feature_namer(cat_col)

# Same order as the transformers in `preprocess`: numeric, ordinal, one-hot
labels = np.concatenate([num_col, ori_col, enc_cat_col])

X_transformed = pd.DataFrame(X_array, columns=labels)
X_transformed

In [None]:
# # Get categorical/numerical columns
# cat_col = [col for col in X.columns if X[col].dtype == 'object']

# num_col = [col for col in X.columns if X[col].dtype != 'object']


# # # Numerical preprocess missing value
# # numerical_transformer = make_pipeline(StandardScaler())

# # Categorical preprocess missing value
# categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='error',sparse=False))
                                        
# # Preprocess Numerical and Categorical variable
# preprocess = ColumnTransformer(transformers=[
# #         ('num', numerical_transformer, num_col),
#         ('cat', categorical_transformer, cat_col)])

# preprocess.fit_transform(X)

# enc_cat_col = preprocess.named_transformers_['cat']['onehotencoder'].get_feature_names()

# labels = np.concatenate([num_col, enc_cat_col])

# X_transformed = pd.DataFrame(preprocess.fit_transform(X), columns=enc_cat_col)
# X_transformed
# X_transformed.rename(columns={'x0_Female':'Gender_Female',
#                              'x0_Male':'Gender_Male',
#                               'x1_0':'Driving_License_0',
#                               'x1_1':'Driving_License_1',
#                               'x2_0':'Previously_Insured_0',
#                               'x2_1':'Previously_Insured_1',
#                               'x3_1-2 Year' : 'Vehicle_Age_1_2_Year', 
#                               'x3_< 1 Year' : 'Vehicle_Age_1_Year', 
#                               'x3_> 2 Years' : 'Vehicle_Age_2_Year', 
#                               'x4_No':'Vehicle_Damage_No',
#                               'x4_Yes':'Vehicle_Damage_Yes',}, inplace =True)

# X_transformed = pd.concat([X[num_col], X_transformed], axis =1)
# X_transformed

We will split data to train set 80% and test set 20%

In [None]:
# Stratified 80/20 split: both parts keep the class proportions of `y`
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.20, random_state=0, stratify=y)

# Report class counts and the class-0 : class-1 ratio of each part. The value
# is a ratio, not a percentage, so it is printed without a '%' suffix.
train_counts = Counter(y_train)
test_counts = Counter(y_test)

print('Train Dataset', train_counts)
print(f'Ratio 0:1 = {train_counts[0] / train_counts[1]:.4f}')
print('\n')
print('Test Dataset', test_counts)
print(f'Ratio 0:1 = {test_counts[0] / test_counts[1]:.4f}')

In [None]:
# skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# model = {'LogisticRegression': LogisticRegression(random_state=0),
#         'RidgeClassifier' : RidgeClassifier(random_state=0),
#         'LGBMClassifier' : LGBMClassifier(random_state=0),
#         'KNeighborsClassifier' : KNeighborsClassifier(),
#         'XGBClassifier' : XGBClassifier(random_state=0),
#         'RandomForestClassifier': RandomForestClassifier(random_state=0)}

# for name in model:
#     start = time()
#     score = cross_val_score(model[name], X_train, y_train, scoring = 'accuracy', cv = skf, n_jobs = -1)
#     end = time()
#     print(f'{name}\naccuracy score: {score.mean():.5f} ±{score.std():.4f} Time:{end-start:.1f} sec')

# 5. Baseline Original Dataset

In [None]:
# Baseline: LightGBM trained on the original (not resampled) training split
# and evaluated on the held-out test split.
model = LGBMClassifier(random_state=0)  # seeded like the other experiments

model.fit(X_train, y_train)

y_predict = model.predict(X_test)            # hard class labels
y_score = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Confusion matrix as a heat map (rows = actual, columns = predicted)
data = pd.DataFrame({'test': y_test, 'pred': y_predict})
matrix = pd.crosstab(data.test, data.pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(matrix, annot=True, fmt='g')
plt.title('Original Dataset')
plt.show()

# ROC curve. Title and legend name the model that is actually plotted, and
# the x axis is the false positive rate (which is not precision).
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('LGBMClassifier ROC curve: Original Dataset')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr, label='LGBMClassifier')
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.legend(loc='lower right')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))  # roc_auc_score(y_test, y_score)

print(f'precision : {precision_score(y_test, y_predict)}')
print('\n')
print(classification_report(y_test, y_predict))

# Average precision summarises the precision-recall curve, so it has to be
# computed from the continuous scores, not from the thresholded labels.
average_precision = average_precision_score(y_test, y_score)

disp = plot_precision_recall_curve(model, X_test, y_test)

disp.ax_.set_title('Precision-Recall curve: Average precision score={0:0.2f}'.format(average_precision))

# 6. Resampling Training Dataset

<img src="https://miro.medium.com/max/2400/1*ENvt_PTaH5v4BXZfd-3pMA.png" alt=" resample" width="700">

One approach to addressing the problem of class imbalance is to randomly resample the training dataset. 

The two main techniques for rebalancing the class distribution of an imbalanced dataset by random resampling are:


## **6.1. Oversampling**

* Random oversampling duplicates examples from the minority class in the training dataset and can result in overfitting for some models.


## **6.2. Undersampling**

* Random undersampling deletes examples from the majority class and can result in losing information invaluable to a model.

### 6.1.1 Naive random over-sampling

This makes them simple to implement and fast to execute, which is desirable for very large and complex datasets.

Both techniques can be used for two-class (binary) classification problems and multi-class classification problems with one or more majority or minority classes.

Generally, these naive methods can be effective, although that depends on the specifics of the dataset and models involved.


*Importantly, the change to the class distribution is only applied to the training dataset. The intent is to influence the fit of the models. The resampling is not applied to the test or holdout dataset used to evaluate the performance of a model.*

In [None]:
# Naive random over-sampling: resample the training split only, train
# LightGBM on it, and evaluate on the untouched test split.
ros = RandomOverSampler(random_state=0)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

resampled_counts = Counter(y_resampled)
print(resampled_counts)
# Plain ratio of class 0 to class 1, not a percentage
print(f'Ratio 0:1 = {resampled_counts[0] / resampled_counts[1]:.4f}')

model = LGBMClassifier(random_state=0)

model.fit(X_resampled, y_resampled)

y_predict = model.predict(X_test)            # hard class labels
y_score = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Confusion matrix as a heat map (rows = actual, columns = predicted)
data = pd.DataFrame({'test': y_test, 'pred': y_predict})
matrix = pd.crosstab(data.test, data.pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(matrix, annot=True, fmt='g')
plt.title('Naive random over-sampling')
plt.show()

# ROC curve. Title and legend name the model that is actually plotted, and
# the x axis is the false positive rate (which is not precision).
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('LGBMClassifier ROC curve: Naive random over-sampling')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr, label='LGBMClassifier')
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.legend(loc='lower right')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))  # roc_auc_score(y_test, y_score)

print(f'precision : {precision_score(y_test, y_predict)}')
print('\n')
print(classification_report(y_test, y_predict))

# Average precision summarises the precision-recall curve, so it has to be
# computed from the continuous scores, not from the thresholded labels.
average_precision = average_precision_score(y_test, y_score)

disp = plot_precision_recall_curve(model, X_test, y_test)

disp.ax_.set_title('Precision-Recall curve: Average precision score={0:0.2f}'.format(average_precision))

<img src="https://miro.medium.com/max/512/1*FcM03wUtW_dB2YGZXyVb7Q.png" alt=" resample" width="600">


### 6.1.2 Over-sampling to SMOTE

Synthetic Minority Over-sampling Technique (SMOTE). This method is considered a state-of-art technique and works well in various applications. This method generates synthetic data based on the feature space similarities between existing minority instances. In order to create a synthetic instance, it finds the K-nearest neighbors of each minority instance, randomly selects one of them, and then calculate linear interpolations to produce a new minority instance in the neighborhood.

In [None]:
# SMOTE: resample the training split only, train LightGBM on it, and
# evaluate on the untouched test split.
sm = SMOTE(random_state=0)

X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

resampled_counts = Counter(y_resampled)
print(resampled_counts)
# Plain ratio of class 0 to class 1, not a percentage
print(f'Ratio 0:1 = {resampled_counts[0] / resampled_counts[1]:.4f}')

model = LGBMClassifier(random_state=0)

model.fit(X_resampled, y_resampled)

y_predict = model.predict(X_test)            # hard class labels
y_score = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Confusion matrix as a heat map (rows = actual, columns = predicted)
data = pd.DataFrame({'test': y_test, 'pred': y_predict})
matrix = pd.crosstab(data.test, data.pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(matrix, annot=True, fmt='g')
plt.title('SMOTE')
plt.show()

# ROC curve. Title and legend name the model that is actually plotted, and
# the x axis is the false positive rate (which is not precision).
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('LGBMClassifier ROC curve: SMOTE')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr, label='LGBMClassifier')
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.legend(loc='lower right')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))  # roc_auc_score(y_test, y_score)

print(f'precision : {precision_score(y_test, y_predict)}')
print('\n')
print(classification_report(y_test, y_predict))

# Average precision summarises the precision-recall curve, so it has to be
# computed from the continuous scores, not from the thresholded labels.
average_precision = average_precision_score(y_test, y_score)

disp = plot_precision_recall_curve(model, X_test, y_test)

disp.ax_.set_title('Precision-Recall curve: Average precision score={0:0.2f}'.format(average_precision))

### 6.1.3 Over-sampling to ADASYN: Adaptive Synthetic Sampling

ADASYN generates samples of the minority class according to their density distributions. More synthetic data is generated for minority class samples that are harder to learn, compared to those minority samples that are easier to learn. It calculates the K-nearest neighbors of each minority instance, then gets the class ratio of the minority and majority instances to generate new samples. By repeating this process, it adaptively shifts the decision boundary to focus on those samples that are difficult to learn.

In [None]:
# ADASYN: resample the training split only, train LightGBM on it, and
# evaluate on the untouched test split.
adasyn = ADASYN(random_state=0)

X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

resampled_counts = Counter(y_resampled)
print(resampled_counts)
# Plain ratio of class 0 to class 1, not a percentage
print(f'Ratio 0:1 = {resampled_counts[0] / resampled_counts[1]:.4f}')

model = LGBMClassifier(random_state=0)

model.fit(X_resampled, y_resampled)

y_predict = model.predict(X_test)            # hard class labels
y_score = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Confusion matrix as a heat map (rows = actual, columns = predicted)
data = pd.DataFrame({'test': y_test, 'pred': y_predict})
matrix = pd.crosstab(data.test, data.pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(matrix, annot=True, fmt='g')
plt.title('ADASYN')
plt.show()

# ROC curve. Title and legend name the model that is actually plotted, and
# the x axis is the false positive rate (which is not precision).
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('LGBMClassifier ROC curve: ADASYN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr, label='LGBMClassifier')
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.legend(loc='lower right')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))  # roc_auc_score(y_test, y_score)

print(f'precision : {precision_score(y_test, y_predict)}')
print('\n')
print(classification_report(y_test, y_predict))

# Average precision summarises the precision-recall curve, so it has to be
# computed from the continuous scores, not from the thresholded labels.
average_precision = average_precision_score(y_test, y_score)

disp = plot_precision_recall_curve(model, X_test, y_test)

disp.ax_.set_title('Precision-Recall curve: Average precision score={0:0.2f}'.format(average_precision))

### 6.2.1 Random under-sampling
RandomUnderSampler is a fast and easy way to balance the data by randomly selecting a subset of data for the targeted classes:

In [None]:
# Random under-sampling: resample the training split only, train LightGBM on
# it, and evaluate on the untouched test split.
rus = RandomUnderSampler(random_state=0)

X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

resampled_counts = Counter(y_resampled)
print(resampled_counts)
# Plain ratio of class 0 to class 1, not a percentage
print(f'Ratio 0:1 = {resampled_counts[0] / resampled_counts[1]:.4f}')

model = LGBMClassifier(random_state=0)

model.fit(X_resampled, y_resampled)

y_predict = model.predict(X_test)            # hard class labels
y_score = model.predict_proba(X_test)[:, 1]  # predicted probability of class 1

# Confusion matrix as a heat map (rows = actual, columns = predicted)
data = pd.DataFrame({'test': y_test, 'pred': y_predict})
matrix = pd.crosstab(data.test, data.pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(matrix, annot=True, fmt='g')
plt.title('RandomUnderSampler')
plt.show()

# ROC curve. Title and legend name the model that is actually plotted, and
# the x axis is the false positive rate (which is not precision).
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('LGBMClassifier ROC curve: RandomUnderSampler')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr, label='LGBMClassifier')
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance-level diagonal
plt.legend(loc='lower right')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))  # roc_auc_score(y_test, y_score)

print(f'precision : {precision_score(y_test, y_predict)}')
print('\n')
print(classification_report(y_test, y_predict))

# Average precision summarises the precision-recall curve, so it has to be
# computed from the continuous scores, not from the thresholded labels.
average_precision = average_precision_score(y_test, y_score)

disp = plot_precision_recall_curve(model, X_test, y_test)

disp.ax_.set_title('Precision-Recall curve: Average precision score={0:0.2f}'.format(average_precision))

We will use the Naive random over-sampling technique to deal with the imbalanced data, because in this case we need to focus on the customers who might be interested in Vehicle insurance. That is why we choose the model with high recall (the share of actually relevant instances that were retrieved) and whose overall F1 score is the highest compared to the other over-sampling techniques, SMOTE and ADASYN. We do not use under-sampling in this case because it results in losing information invaluable to a model.

# 7. Model Selection

In [None]:
# In this case we use the RandomOverSampler technique to transform the
# training data (the test split is left untouched)
ros = RandomOverSampler(random_state=0)

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

resampled_counts = Counter(y_resampled)
print(resampled_counts)
# Plain ratio of class 0 to class 1, not a percentage
print(f'Ratio 0:1 = {resampled_counts[0] / resampled_counts[1]:.4f}')

In [None]:
from sklearn.metrics import f1_score, recall_score

# Not used for the hold-out scoring below; kept defined for any later cell
# that refers to it.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

models = {'LogisticRegression': LogisticRegression(random_state=0),
        'RidgeClassifier' : RidgeClassifier(random_state=0),
        'LGBMClassifier' : LGBMClassifier(random_state=0),
        'KNeighborsClassifier' : KNeighborsClassifier(),
        'XGBClassifier' : XGBClassifier(random_state=0,eval_metric = 'auc'),
        'RandomForestClassifier': RandomForestClassifier(random_state=0)}

# Train every candidate on the over-sampled training data and score that
# fitted model on the untouched test split.
# `cross_val_score(model, X_test, y_test, ...)` clones the estimator and
# refits it on folds of the test split, so it never measured the models
# trained on the resampled data.
results = []

for model_name, estimator in models.items():

    start = time()
    estimator.fit(X_resampled, y_resampled)
    end = time()

    test_pred = estimator.predict(X_test)
    # ROC AUC needs a continuous score; RidgeClassifier has no predict_proba,
    # so fall back to its decision function.
    if hasattr(estimator, 'predict_proba'):
        test_score = estimator.predict_proba(X_test)[:, 1]
    else:
        test_score = estimator.decision_function(X_test)

    results.append({'models': model_name,
                    'accuracy': accuracy_score(y_test, test_pred),
                    'precision': precision_score(y_test, test_pred),
                    'recall': recall_score(y_test, test_pred),
                    'f1': f1_score(y_test, test_pred),
                    'roc_auc': roc_auc_score(y_test, test_score),
                    'times': end - start})  # fit time in seconds

# One row per model, same columns as before
pd.DataFrame(results, columns=['models', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'times'])

In [None]:
# from scipy.stats import randint as sp_randint
# from scipy.stats import uniform as sp_uniform
# import time

# #Set hypermeter search
# fit_params={"early_stopping_rounds":30, 
#             "eval_metric" : 'auc', 
#             "eval_set" : [(X_test,y_test)],
#             'eval_names': ['valid'],
#             #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
#             'verbose': 100,
#             'categorical_feature': 'auto'}


# params ={'num_leaves': sp_randint(6, 50), 
#          'min_child_samples': sp_randint(100, 500), 
#          'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#          'subsample': sp_uniform(loc=0.2, scale=0.8), 
#          'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#          'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#          'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# n_HP_points_to_test = 100

# clf = LGBMClassifier(max_depth=-1, random_state=0, silent=True, metric='None', n_jobs=-1, n_estimators=5000)

# gs = RandomizedSearchCV(
#     estimator = clf,
#     param_distributions = params, 
#     n_iter = n_HP_points_to_test,
#     scoring = 'roc_auc',
#     cv = 3,
#     refit = True,
#     random_state = 314,
#     verbose = False)

# %time
# gs.fit(X_train, y_train, **fit_params)

# 8. Hyperparameter Tuning

In [None]:
# Base LightGBM configuration; `is_unbalance=True` asks LightGBM to
# compensate for the class imbalance itself.
model = LGBMClassifier(
    boosting_type="gbdt",
    is_unbalance=True,
    random_state=10,
    n_estimators=50,
    num_leaves=30,
    max_depth=8,
    feature_fraction=0.5,
    bagging_fraction=0.8,
    bagging_freq=15,
    learning_rate=0.01,
)

# Values the randomized search samples from
params_opt = {'n_estimators':range(200, 600, 80),
              'num_leaves':range(20,60,10)}

# The `iid` argument was removed from scikit-learn's search classes; passing
# it raises a TypeError on current releases, so it is omitted.
# `random_state` makes the sampled parameter combinations reproducible.
rc = RandomizedSearchCV(estimator = model,
    param_distributions = params_opt,
    scoring='roc_auc',
    n_jobs=4,
    verbose=1,
    cv=3,
    random_state=0)

In [None]:
%time
rc.fit(X_train, y_train)

In [None]:
# Show the best estimator found by the search, its parameters and its score
rc.best_estimator_, rc.best_params_, rc.best_score_

In [None]:
# Take the base model's parameters and overlay the best values found by
# RandomizedSearchCV, then build the final classifier from the merged set
final_params = {**model.get_params(), **rc.best_params_}
clf_final = LGBMClassifier(**final_params)

# Alternative: clf_final = LGBMClassifier(**rc.best_estimator_.get_params())
clf_final

In [None]:
# Fit the final classifier on the training split
clf_final.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score, recall_score

# Score the already fitted `clf_final` on the held-out test split.
# `cross_val_score(clf_final, X_test, y_test, ...)` clones the estimator and
# refits it on folds of the test split, so it did not evaluate the model that
# was trained on the training data.
final_pred = clf_final.predict(X_test)
final_score = clf_final.predict_proba(X_test)[:, 1]  # probability of class 1

accuracy_ = accuracy_score(y_test, final_pred)
precision_ = precision_score(y_test, final_pred)
recall_ = recall_score(y_test, final_pred)
f1_ = f1_score(y_test, final_pred)
roc_auc_ = roc_auc_score(y_test, final_score)  # ROC AUC uses the scores

print(f'accuracy: {accuracy_:.5f}')
print(f'precision: {precision_:.5f}')
print(f'recall: {recall_:.5f}')
print(f'f1: {f1_:.5f}')
print(f'roc_auc: {roc_auc_:.5f}')

In [None]:
# Pair each training column with the final model's importance value and rank
# them from most to least important
importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': clf_final.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 8))
sns.barplot(data=importance, x="importance", y="feature");

# 9. Prediction

In [None]:
# Apply the preprocessor that was fitted on the training features. Only
# `transform` is called: `fit_transform(test)` would refit the scaler and the
# encoders on the test data, so the test features would no longer be encoded
# the same way as the data the model was trained on.
test_array = preprocess.transform(test)

onehot = preprocess.named_transformers_['cat']['onehotencoder']
# `get_feature_names_out` replaces the deprecated `get_feature_names` in
# newer scikit-learn; use whichever the installed version provides.
feature_namer = getattr(onehot, 'get_feature_names_out', None) or onehot.get_feature_names
# Passing the input column names yields '<column>_<category>' labels such as
# 'Gender_Female', so no hand-written rename mapping is needed.
enc_cat_col = feature_namer(cat_col)

# Same order as the transformers in `preprocess`: numeric, ordinal, one-hot
labels = np.concatenate([num_col, ori_col, enc_cat_col])

test_transformed = pd.DataFrame(test_array, columns=labels)
test_transformed

In [None]:
# Predict a class label for every row of the transformed test features
Prediction = clf_final.predict(test_transformed)
Prediction

In [None]:
# Start from the sample submission, swap its `Response` column for the model
# predictions (column `Prediction`) and write the result to CSV
sub = pd.read_csv('../input/health-insurance-cross-sell-prediction/sample_submission.csv')

predicted = pd.DataFrame({'Prediction': Prediction})
submission = pd.concat([sub.drop(columns='Response'), predicted], axis=1)

# Save the submission file without the index column
submission.to_csv('vehicle_insurance_predicted.csv', index=False)
submission