In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from time import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so any deprecation or fit-related warning emitted by later cells is hidden.
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for root, _subdirs, files in os.walk("/kaggle/input"):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files into two DataFrames
train, test = map(
    pd.read_csv,
    (
        "/kaggle/input/health-insurance-cross-sell-prediction/train.csv",
        "/kaggle/input/health-insurance-cross-sell-prediction/test.csv",
    ),
)

Let's take a look at the datasets...

In [None]:
# Report the (rows, columns) shape of each freshly loaded frame
print("Train size dataset:", train.shape)
print("Test size dataset:", test.shape)

In [None]:
# Preview the first five rows of the train frame
train.head(5)

In [None]:
# Preview the first five rows of the test frame
test.head(5)

In [None]:
# Promote the "id" column to the row index of both frames

train, test = (frame.set_index("id") for frame in (train, test))

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for two columns of train
train[["Age", "Annual_Premium"]].describe()

In [None]:
# For each train column, show which isna() outcomes occur (a True in the output means missing values exist)
for column in train.columns:
    na_flags = train[column].isna().unique()
    print("Column:", column, "- NA value:", na_flags)

In [None]:
# For each test column, show which isna() outcomes occur (a True in the output means missing values exist)
for column in test.columns:
    na_flags = test[column].isna().unique()
    print("Column:", column, "- NA value:", na_flags)

In [None]:
# Number of train rows per Response value
train.groupby("Response").size()

In [None]:
# Skewness of the numeric columns only. Gender, Vehicle_Age and Vehicle_Damage
# are still text at this point, and recent pandas versions raise a TypeError
# when asked to compute skew over non-numeric columns.
train.skew(numeric_only=True)

# Viewing the data...

In [None]:
# Import my favorite visualization lib

import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Prefer the new name and fall back to the legacy one on older releases.
darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
plt.style.use(darkgrid_style)
import seaborn as sns

In [None]:
# Gender counts, one panel per Response value.
# - The column is passed as `x=` because seaborn's categorical functions take
#   `data` as their first positional parameter in recent releases.
# - A fixed `order` keeps bar positions identical in every panel; otherwise each
#   facet infers its own category order and shared tick labels can mislabel bars.
gender_order = sorted(train["Gender"].unique())
grid = sns.FacetGrid(train, col="Response", palette="tab20c", height=6)
grid.map_dataframe(sns.countplot, x="Gender", order=gender_order)
grid.add_legend()

In [None]:
# Age counts per (Gender, Vehicle_Age) panel, one colour per Response value.
# - The column is passed as `x=` because seaborn's categorical functions take
#   `data` as their first positional parameter in recent releases.
# - Every facet/hue subset is drawn by a separate countplot call; a shared `order`
#   guarantees that a given age occupies the same x position in all of them.
age_order = sorted(train["Age"].unique())
grid = sns.FacetGrid(train, col="Vehicle_Age", row="Gender", hue="Response", palette="tab20c", height=6, legend_out=True)
grid.map_dataframe(sns.countplot, x="Age", order=age_order)
grid.add_legend()

In [None]:
# Age counts per (Gender, Previously_Insured) panel, one colour per Response value.
# - The column is passed as `x=` because seaborn's categorical functions take
#   `data` as their first positional parameter in recent releases.
# - Every facet/hue subset is drawn by a separate countplot call; a shared `order`
#   guarantees that a given age occupies the same x position in all of them.
age_order = sorted(train["Age"].unique())
grid = sns.FacetGrid(train, col="Previously_Insured", row="Gender", hue="Response", palette="tab20c", height=6, legend_out=True)
grid.map_dataframe(sns.countplot, x="Age", order=age_order)
grid.add_legend()

In [None]:
# Scatter plot of Age against Annual_Premium with the marginal distribution of each on the sides
sns.jointplot(data=train, x="Age", y="Annual_Premium", kind="scatter")

# Data Preprocessing

**Feature transformation and selection:**

In [None]:
# Inspecting the distinct values of the text columns in the train dataset

for col in ["Gender", "Vehicle_Age", "Vehicle_Damage"]:
    items = train[col].unique()
    print("Column:",col, "\nItems number:", len(items), "\nItems:", items.tolist(), "\n")

In [None]:
# Inspecting the distinct values of the text columns in the test dataset

for col in ["Gender", "Vehicle_Age", "Vehicle_Damage"]:
    items = test[col].unique()
    print("Column:",col, "\nItems number:", len(items), "\nItems:", items.tolist(), "\n")

In [None]:
# Transforming string data into numeric data on train dataset.
# Explicit lookup tables replace the if/else fallthrough, so an unexpected or
# missing category raises an error instead of being silently encoded as a valid
# class, and re-running the cell does not corrupt already-encoded columns.

CATEGORY_CODES = {
    "Gender": {"Male": 1, "Female": 0},
    "Vehicle_Damage": {"Yes": 1, "No": 0},
    "Vehicle_Age": {"> 2 Years": 3, "1-2 Year": 2, "< 1 Year": 1},
}

for column, codes in CATEGORY_CODES.items():
    # Column already holds only the numeric codes (cell re-run): nothing to do.
    if train[column].isin(list(codes.values())).all():
        continue
    encoded = train[column].map(codes)
    # Labels missing from the lookup table come back as NaN from .map().
    unknown = train.loc[encoded.isna(), column].unique().tolist()
    if unknown:
        raise ValueError(f"Unexpected values in train column {column!r}: {unknown}")
    train[column] = encoded.astype(int)

train.head(5)

In [None]:
# Transforming string data into numeric data on test dataset.
# Explicit lookup tables replace the if/else fallthrough, so an unexpected or
# missing category raises an error instead of being silently encoded as a valid
# class, and re-running the cell does not corrupt already-encoded columns.

CATEGORY_CODES = {
    "Gender": {"Male": 1, "Female": 0},
    "Vehicle_Damage": {"Yes": 1, "No": 0},
    "Vehicle_Age": {"> 2 Years": 3, "1-2 Year": 2, "< 1 Year": 1},
}

for column, codes in CATEGORY_CODES.items():
    # Column already holds only the numeric codes (cell re-run): nothing to do.
    if test[column].isin(list(codes.values())).all():
        continue
    encoded = test[column].map(codes)
    # Labels missing from the lookup table come back as NaN from .map().
    unknown = test.loc[encoded.isna(), column].unique().tolist()
    if unknown:
        raise ValueError(f"Unexpected values in test column {column!r}: {unknown}")
    test[column] = encoded.astype(int)

test.head(5)

**Data Dictionary**

* Gender -  Male: 1 | Female: 0
* Vehicle_Damage -  Yes: 1 | No: 0
* Vehicle_Age -  > 2 Years: 3 | 1-2 Year: 2 | < 1 Year: 1

In [None]:
# Pairwise correlations between the train columns, drawn as an annotated heatmap
correlations_train = train.corr()

f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=correlations_train, annot=True, fmt=".2g", linewidths=.5, ax=ax)

In [None]:
# Normalization on train dataset

from sklearn.preprocessing import MinMaxScaler

selected_cols = ["Age", "Annual_Premium", "Vintage"]

normalize = MinMaxScaler(feature_range = (0, 1))

for col in selected_cols:
    train[col] = normalize.fit_transform(train[[col]])
    
train.head(5)

In [None]:
# Normalization on test dataset
for col in selected_cols:
    test[col] = normalize.fit_transform(test[[col]])
    
test.head(5)

In [None]:
# Feature importance: rank the candidate features with an ExtraTrees ensemble

from sklearn.ensemble import ExtraTreesClassifier

# The first ten columns are treated as features and the eleventh as the target.
features = train.iloc[:, 0:10]
target = train.iloc[:, 10]

# Fixed seed (101, the value used by the randomized searches in this notebook)
# so the importance scores are reproducible between runs.
feature_selection = ExtraTreesClassifier(random_state=101)
feature_selection.fit(features, target)

for name, score in zip(features.columns, feature_selection.feature_importances_):
    print("Feature:", name, "- Score:", score)

In [None]:
# Dropping the low-score column from both datasets

low_score_cols = ["Driving_License"]
train = train.drop(low_score_cols, axis=1)
test = test.drop(low_score_cols, axis=1)

**Cross-Validation approach and Randomized Search parameters**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Features are the first nine columns; the tenth train column is the target
n_features = 9
x_train, y_train = train.iloc[:, :n_features], train.iloc[:, n_features]

x_test = test.iloc[:, :n_features]

# Helper that prints the best-ranked candidates of a hyper-parameter search
def report(results, n_top=3):
    """Print rank, mean/std validation score and parameters of the top candidates.

    Parameters
    ----------
    results : mapping
        Must provide "rank_test_score", "mean_test_score", "std_test_score"
        and "params", indexable by candidate position (for example the
        ``cv_results_`` attribute of a fitted search object).
    n_top : int, default 3
        Ranks 1..n_top are printed; every candidate tied on a rank is shown.
    """
    ranks = results["rank_test_score"]
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            params = results["params"][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {params}")

In [None]:
# Setting up the cross-validation

from sklearn.model_selection import KFold

# 10 shuffled folds. The seed is fixed so that every model and search in this
# notebook is evaluated on the same splits and scores are comparable between runs.
kfold = KFold(10, shuffle=True, random_state=101)

# ML Models

**Logistic Regression Model**

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_regress_model = LogisticRegression()

# Cross-validated accuracy of the default model, one score per fold of `kfold`
lr_result_cross_val = cross_val_score(
    estimator=log_regress_model, X=x_train, y=y_train, scoring="accuracy", cv=kfold
)

lr_mean_accuracy = lr_result_cross_val.mean() * 100
print("Logistic Regression accuracy: %.2f%%" % lr_mean_accuracy)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameters that are valid for every solver/penalty combination.
lg_common_params = {
    "C":[0.1, 1, 2, 5, 10, 15, 25, 50, 75, 100, 150, 500, 1000],
    "class_weight":["balanced", None],
    "max_iter":[50, 100, 150, 500, 1000],
    # NOTE(review): `multi_class` is deprecated since scikit-learn 1.5; drop this
    # entry when upgrading.
    "multi_class":["ovr", "multinomial"]
}

# The search space is a list of grids so that only compatible combinations are
# sampled: newton-cg, lbfgs and sag support the l2 penalty only, while saga also
# supports l1. Sampling incompatible pairs makes those candidates fail to fit
# (NaN score) and wastes search iterations.
random_lg_params = [
    {**lg_common_params, "penalty":["l2"], "solver":["newton-cg", "lbfgs", "sag", "saga"]},
    {**lg_common_params, "penalty":["l1"], "solver":["saga"]},
]

log_regress_best_params = RandomizedSearchCV(estimator = log_regress_model, param_distributions = random_lg_params, n_iter = 10, cv = kfold, verbose= 1, random_state= 101, n_jobs = -1)

In [None]:
# Run the randomized search for logistic regression (n_iter=10 sampled candidates, scored with `kfold`)
log_regress_best_params.fit(x_train, y_train)

In [None]:
# Print the candidates ranked 1 to 3 (report's default n_top)
report(results=log_regress_best_params.cv_results_)

**KNN model**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

# Cross-validated accuracy of the default KNN model, one score per fold of `kfold`
knn_cross_val = cross_val_score(knn_model, x_train, y_train, cv = kfold, scoring = "accuracy")

# The value is a percentage, so print the "%" sign like the logistic regression cell does.
print("KNN accuracy: %.2f%%" % (knn_cross_val.mean() * 100))

In [None]:
# Candidate values sampled by the KNN randomized search
random_knn_params = dict(
    n_neighbors=[3, 4, 5, 8, 10, 15, 20, 30],
    weights=["uniform", "distance"],
    algorithm=["ball_tree", "kd_tree", "brute"],
)

knn_best_params = RandomizedSearchCV(
    knn_model,
    random_knn_params,
    n_iter=10,
    cv=kfold,
    verbose=1,
    random_state=101,
    n_jobs=-1,
)

In [None]:
# Run the randomized search for KNN (n_iter=10 sampled candidates, scored with `kfold`)
knn_best_params.fit(x_train, y_train)

In [None]:
# Print the candidates ranked 1 to 3 (report's default n_top)
report(results=knn_best_params.cv_results_)

**Random Forest Classifier model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator with library defaults; the randomized search below overrides its hyper-parameters
rfc_model = RandomForestClassifier()

In [None]:
# Search space for the random forest. Every value listed must be accepted by
# RandomForestClassifier, otherwise the sampled candidate fails to fit and is
# scored NaN (with warnings disabled this goes unnoticed).
random_rf_params = {
    "n_estimators":[5, 10, 100, 300, 500, 1000],
    "criterion":["gini", "entropy"],
    # "auto" was an alias of "sqrt" for classifiers and is rejected by recent scikit-learn releases
    "max_features":["sqrt", "log2", None],
    "min_samples_split":[2, 4, 6, 8, 10],
    "min_samples_leaf":[1, 2, 4, 6, 8],
    # a fraction of the total sample weight: valid range is [0.0, 0.5]
    "min_weight_fraction_leaf":[0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    # a tree needs at least two leaves
    "max_leaf_nodes":[2, 4, 8, 16, 32],
    "max_depth": [2,3,4,5,6,7,10]
}

rf_best_params = RandomizedSearchCV(estimator = rfc_model, param_distributions = random_rf_params, n_iter = 10, cv = kfold, verbose = 1, random_state = 101, n_jobs = -1)

In [None]:
# Run the randomized search for the random forest (n_iter=10 sampled candidates, scored with `kfold`)
rf_best_params.fit(x_train, y_train)

In [None]:
# Print the candidates ranked 1 to 3 (report's default n_top)
report(results=rf_best_params.cv_results_)

**XGBoostClassifier model**

*Work in progress here*

In [None]:
from xgboost import XGBClassifier

# Base estimator with library defaults; the randomized search below overrides its hyper-parameters
xgb_model = XGBClassifier()

In [None]:
# Search space for XGBoost. It has its own name (matching the other
# `random_*_params` grids) instead of sharing `xgb_best_params` with the search
# object, so the name never refers to two different kinds of object.
random_xgb_params = {
    "max_depth":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "n_estimators":[10, 50, 100, 250, 500, 850, 1000],
    # 0.1, 0.2, ..., 1.0 as exact one-decimal Python floats; linspace pins both
    # end points, unlike arange with a fractional step.
    "subsample":np.linspace(0.1, 1.0, 10).round(1).tolist(),
    "eta":[0.0001, 0.001, 0.01, 0.1, 1.0],
    "colsample_bytree":np.linspace(0.1, 1.0, 10).round(1).tolist()
}

xgb_best_params = RandomizedSearchCV(estimator = xgb_model, param_distributions = random_xgb_params, n_iter = 10, cv = kfold, verbose = 1, random_state = 101, n_jobs = -1)

In [None]:
# Run the randomized search for XGBoost (n_iter=10 sampled candidates, scored with `kfold`)
xgb_best_params.fit(x_train, y_train)

In [None]:
# Print the candidates ranked 1 to 3 (report's default n_top)
report(results=xgb_best_params.cv_results_)