In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import gc

from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Loading the data
# Both CSVs live in the same Kaggle input folder; keep the folder in one place.
DATA_DIR = "/kaggle/input/health-insurance-cross-sell-prediction"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Report the shape and show the first rows of each raw frame
for label, frame in (("Train Data", train), ("Test Data", test)):
    print(f"{label}: {frame.shape}")
    display(frame.head())

## Hypotheses:
1. Younger people tend to take Vehicle Insurance more than older people because the chance of getting a new vehicle in older age is less and also older people might already have insurance for their vehicles.
2. Response will be positive for people who haven't insured previously.
3. People with old vehicles(age) won't take vehicle insurance because of the high premium rates for older vehicles.
4. If the age of the vehicle is less and if it had a previous damage, then those people will buy the vehicle insurance.


These are some of the hypotheses I want to check. You can come up with more hypotheses and validate them in the same way.

First let's change the datatype of some of the features to categorical since it is wrongly inferred as integer type.

In [None]:
# Dataset info: column dtypes and non-null counts of the raw training frame,
# used to spot columns whose inferred dtype should really be categorical
train.info()

In [None]:
# Work on copies so the raw `train` / `test` frames stay untouched for later sections
trainData, testData = train.copy(), test.copy()

In [None]:
# Converting the datatypes
# To avoid repeating every conversion for the test set, tag each frame with its origin,
# stack them, convert once, and split them apart again afterwards.
for frame, origin in ((trainData, "train"), (testData, "test")):
    frame["type"] = origin

combined = pd.concat([trainData, testData])
print(combined.shape)
combined.head()

In [None]:
# Columns that hold codes/labels rather than quantities and should be categorical
categorical_columns = [
    "Region_Code",
    "Policy_Sales_Channel",
    "Driving_License",
    "Previously_Insured",
    "Vehicle_Damage",
    "Vehicle_Age",
    "Response",
    "Gender",
]

# Convert each one to the pandas `category` dtype
for column in categorical_columns:
    combined[column] = combined[column].astype("category")

In [None]:
# Check that the converted columns now report the `category` dtype
combined.info()

In [None]:
# Splitting it back to train and test using the origin tag added before the concat
is_train = combined["type"] == "train"

trainData = combined[is_train].drop(columns="type")
# `Response` only exists for the training rows, so it is dropped from the test frame
testData = combined[~is_train].drop(columns=["type", "Response"])

print(f"Train: {trainData.shape}")
print(f"Test: {testData.shape}")

First check for the null values in the data set

In [None]:
# Number of missing values per column in the train data
trainData.isna().sum()

In [None]:
# Number of missing values per column in the test data
testData.isna().sum()

Exploring the Numerical Data:

In [None]:
# Summary statistics of the training frame (with mixed dtypes, `describe()` reports the numeric columns)
trainData.describe()

In [None]:
# Distribution of Age: histogram with the mean marked in red, next to a box plot
age_values = trainData["Age"]
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

hist_ax.hist(age_values, bins=20)
hist_ax.axvline(age_values.mean(), color="r")
box_ax.boxplot(age_values)

fig.suptitle("Distribution of Age")


In [None]:
# Distribution of Annual Premium: histogram with the mean marked in red, next to a box plot
premium_values = trainData["Annual_Premium"]
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

hist_ax.hist(premium_values, bins=20)
hist_ax.axvline(premium_values.mean(), color="r")
box_ax.boxplot(premium_values)

fig.suptitle("Distribution of Annual Premium")

In [None]:
# Distribution of Vintage: histogram with the mean marked in red, next to a box plot
vintage_values = trainData["Vintage"]
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

hist_ax.hist(vintage_values, bins=20)
hist_ax.axvline(vintage_values.mean(), color="r")
box_ax.boxplot(vintage_values)

fig.suptitle("Distribution of Vintage")

Exploring the Categorical Data:

In [None]:
# Names of every column stored with the `category` dtype
cat_features = trainData.select_dtypes(include="category").columns
cat_features

Will check the different categories in each of the categorical features

In [None]:
# Distribution of Gender
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Gender"])
display(trainData["Gender"].value_counts())

In [None]:
# Distribution of Driving_License
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Driving_License"])
display(trainData["Driving_License"].value_counts())

In [None]:
# Distribution of Region_Code
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Region_Code"])

In [None]:
# Distribution of Previously_Insured
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Previously_Insured"])
display(trainData["Previously_Insured"].value_counts())

In [None]:
# Distribution of Vehicle_Age
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Vehicle_Age"])
display(trainData["Vehicle_Age"].value_counts())

In [None]:
# Distribution of Vehicle_Damage
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Vehicle_Damage"])
display(trainData["Vehicle_Damage"].value_counts())

In [None]:
# Distribution of Policy_Sales_Channel
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Policy_Sales_Channel"])

In [None]:
# Distribution of Response
# Pass the column through the `x` keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=trainData["Response"])
display(trainData["Response"].value_counts())

Some key observations from the EDA:
1. **The Target/Label is highly imbalanced**
2. Most people in the data hold a driving license.
3. Very few people in the data have a vehicle that is more than 2 years old.

# **Answering the hypothesis through the data:**

## **1. Are Younger people taking more vehicle insurance than the older ones?** 

In [None]:
# Age vs Response
# Bucket the ages first. pd.cut intervals are right-closed by default, so the buckets are
# (1, 20], (20, 40], (40, 60] and (60, 100].
age_bins = [1, 20, 40, 60, 100]
age_labels = ["children", "young", "middle-aged", "old"]
trainData["Age_bucket"] = pd.cut(trainData["Age"], bins=age_bins, labels=age_labels)

# Response counts per age bucket, drawn as grouped bars
pd.crosstab(trainData["Age_bucket"], trainData["Response"]).plot.bar()

## **2. Is there any positive response among people who haven't insured previously?** 

In [None]:
# Previously_Insured vs Response: response counts for each insurance status
insured_vs_response = pd.crosstab(trainData["Previously_Insured"], trainData["Response"])
insured_vs_response.plot.bar()

## **3. Does vehicle age affect the response rate?**

In [None]:
# Vehicle Age vs Response: response counts for each vehicle-age category
vehicle_age_vs_response = pd.crosstab(trainData["Vehicle_Age"], trainData["Response"])
vehicle_age_vs_response.plot.bar()

## **4. Do the age of the vehicle and any past damage affect the response?**

In [None]:
# Vehicle Age vs Vehicle Damage vs Response
# `Response` is categorical in trainData; cast it to int on a copy so it can be summed,
# which gives the number of positive responses per (Vehicle_Age, Vehicle_Damage) pair.
response_as_int = trainData.assign(Response=trainData["Response"].astype(int))
response_as_int.pivot_table(
    values="Response",
    index=["Vehicle_Age", "Vehicle_Damage"],
    aggfunc="sum",
)

# **Data Preparation:**

In [None]:
# Preview the training frame, which now also carries the Age_bucket column added in the Age vs Response cell
trainData.head()

In [None]:
# Splitting X and Y
# `Age` is dropped because Age_bucket is used in its place; `id` is only an identifier.
y = trainData["Response"]
X = trainData.drop(columns=["id", "Age", "Response"])

print(X.shape)
print(y.shape)
display(X.head())
display(y.head())

In [None]:
# Creating Train and Test set
from sklearn.model_selection import train_test_split, cross_val_score

# Stratify on the target so both parts keep the same class ratio, and fix the seed so the
# split (and every score computed from it) is reproducible on a fresh kernel. The seed
# matches the one used for the feature-engineered split later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)
print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

In [None]:
# Creating a preprocessing pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Column names split by dtype
cat_features_full = X.select_dtypes("category").columns.tolist()
num_features_full = X.select_dtypes(["int", "float"]).columns.tolist()

# Categorical columns are one-hot encoded (categories unseen during fit are ignored at
# transform time); numeric columns are min-max scaled.
categorical_step = (OneHotEncoder(handle_unknown="ignore"), cat_features_full)
numeric_step = (MinMaxScaler(), num_features_full)
preprocess = make_column_transformer(categorical_step, numeric_step)


In [None]:
# Default (unweighted) logistic-regression pipeline.
# NOTE(review): `logreg` and `pipe` are both reassigned in the later
# "Creating a Logistic Regression" cell, and this pipeline is not fitted anywhere in the
# visible cells before that happens, so this cell looks redundant — confirm before removing.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
pipe = make_pipeline(preprocess,logreg)

# **Model Building:**

In [None]:
# Function to calculate different metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

# Shared results table: one row per evaluated model, filled in by score_model
score_log = pd.DataFrame()


def score_model(model, name, x_train, y_train, x_test, y_test, position):
    """Fit `model`, evaluate it on the hold-out data and record the scores.

    Parameters
    ----------
    model : estimator or pipeline exposing fit / predict / predict_proba
    name : label written to the "Model" column
    x_train, y_train : data the model is fitted on
    x_test, y_test : data the metrics are computed on
    position : row label in `score_log` to write to (an existing row is overwritten)

    Returns
    -------
    The module-level `score_log` DataFrame, updated in place.
    """
    fitted = model.fit(x_train, y_train)
    hard_preds = fitted.predict(x_test)
    positive_proba = fitted.predict_proba(x_test)[:, 1]

    # "Roc-Auc" is computed from the hard class predictions, "Roc-Auc (Proba)" from the
    # predicted probability of the positive class. Every score is rounded to 3 decimals.
    row = {
        "Model": name,
        "Accuracy": round(accuracy_score(y_test, hard_preds), 3),
        "F1-Score": round(f1_score(y_test, hard_preds), 3),
        "Roc-Auc": round(roc_auc_score(y_test, hard_preds), 3),
        "Roc-Auc (Proba)": round(roc_auc_score(y_test, positive_proba), 3),
        "Precision": round(precision_score(y_test, hard_preds), 3),
        "Recall": round(recall_score(y_test, hard_preds), 3),
    }
    for column, value in row.items():
        score_log.loc[position, column] = value
    return score_log

In [None]:
# Function to test and create submission file
def submission(pipeline):
    """Score the raw test set with a fitted pipeline and build the submission table.

    Works on a copy of the module-level `test` frame: adds the same Age_bucket column
    that was built for the training data, drops `id` and `Age`, and predicts the
    probability of the positive class.

    Parameters
    ----------
    pipeline : fitted estimator/pipeline exposing predict_proba

    Returns
    -------
    DataFrame with the columns "id" and "Response" (positive-class probability).
    """
    features = test.copy()

    # Same bucket edges and labels as used for the training frame
    features["Age_bucket"] = pd.cut(
        features["Age"],
        bins=[1, 20, 40, 60, 100],
        labels=["children", "young", "middle-aged", "old"],
    )

    ids = features.id
    features = features.drop(columns=["id", "Age"])

    positive_proba = pipeline.predict_proba(features)[:, 1]

    return pd.DataFrame(zip(ids, positive_proba), columns=["id", "Response"])


In [None]:
# Creating a Logistic Regression
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" reweights the classes to compensate for the imbalanced target
logreg = LogisticRegression(class_weight="balanced")
pipe = make_pipeline(preprocess, logreg)
# print(cross_val_score(pipe, X, y,cv=StratifiedKFold(3), scoring="accuracy").mean())
score_model(
    model=pipe,
    name="Logistic Regression",
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    position=1,
)

In [None]:
# Creating a Decision Tree model
from sklearn.tree import DecisionTreeClassifier

# class_weight="balanced" compensates for the imbalanced target. random_state is fixed
# because the tree builder is randomised; without a seed the fitted tree, and therefore
# this row of the score table, can differ between runs.
dtree = DecisionTreeClassifier(class_weight="balanced", random_state=123)
pipe = make_pipeline(preprocess,dtree)
# print(cross_val_score(pipe, X, y,cv=StratifiedKFold(3), scoring="accuracy").mean())
score_model(pipe, "Decision Tree", X_train, y_train, X_test, y_test, 2)

In [None]:
# Force a garbage-collection pass before fitting the boosted models below
gc.collect()

In [None]:
# Calculating the weightage for the positive class to handle the imbalance:
# ratio of negative (0) to positive (1) responses.
# NOTE(review): int() discards the fractional part that round(..., 2) keeps, so the value
# passed as scale_pos_weight is a whole number — confirm the truncation is intended.
class_counts = y.value_counts()
weight = int(round(class_counts[0] / class_counts[1], 2))
weight

In [None]:
# Creating XGBoost model
import xgboost as xgb

# Hyperparameter values are arbitrary; scale_pos_weight carries the class-imbalance ratio
xgb_params = dict(
    n_estimators=1500,
    scale_pos_weight=weight,
    learning_rate=0.01,
    colsample_bytree=0.4,
    subsample=0.7,
    objective="binary:logistic",
    reg_lambda=0.4,
    max_depth=4,
    gamma=10,
    n_jobs=-1,
)
modelXGB = xgb.XGBClassifier(**xgb_params)

pipe = make_pipeline(preprocess, modelXGB)
# %time print(cross_val_score(pipe, X, y,cv=StratifiedKFold(3), scoring="accuracy").mean())
score_model(
    model=pipe,
    name="XGB Classifier",
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    position=4,
)

In [None]:
# Creating a Light GBM Model
import lightgbm as lgb

# Same imbalance weight as the XGBoost model; seeded for repeatable training
lgb_params = dict(
    objective="binary",
    n_estimators=1500,
    learning_rate=0.01,
    n_jobs=-1,
    seed=123,
    max_depth=4,
    subsample=0.7,
    reg_lambda=0.3,
    colsample_bytree=0.4,
    scale_pos_weight=weight,
    num_leaves=10,
)
modelLGB = lgb.LGBMClassifier(**lgb_params)

pipe = make_pipeline(preprocess, modelLGB)
score_model(
    model=pipe,
    name="LGBM Classifier",
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    position=5,
)

In [None]:
# Creating an ensemble
from sklearn.ensemble import VotingClassifier

# Soft voting averages the members' predicted probabilities, weighted 0.3 / 0.7
ensemble_members = [("XGBoost", modelXGB), ("LGBM", modelLGB)]
votingCLF = VotingClassifier(estimators=ensemble_members, voting="soft", weights=[0.3, 0.7])

pipe = make_pipeline(preprocess, votingCLF)
score_model(
    model=pipe,
    name="Voting Ensemble",
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    position=6,
)

In [None]:
# Build the submission table from `pipe`. When the cells run in order, `pipe` is the
# voting-ensemble pipeline that score_model fitted in the previous cell.
sub_log = submission(pipe)
# sub_log.to_csv("submission_ensemble1_withProba.csv", index=False)
sub_log.head()

Building the model with the given features gives an AUC score of 0.851, with a private leaderboard ranking of around 300, whereas the top score for the hackathon is around 0.863.

Now, let's perform the feature engineering and try building a new model on top of it to see whether adding new features impacts the performance of the model or not.

# **Feature Engineering**

In [None]:
# Start again from the raw frames and stack them so engineered features are built once.
# Test rows have no `Response`, so that column is NaN for them after the concat.
trainDataFeat, testDataFeat = train.copy(), test.copy()

combinedFeat = pd.concat([trainDataFeat, testDataFeat])
print(f"The size of the combined data: {combinedFeat.shape}")
combinedFeat.head()

In [None]:
# Dtypes and non-null counts of the combined frame before any encoding
combinedFeat.info()

In [None]:
# Encoding the categorical variables {Gender, Vehicle_Age, Vehicle_Damage} as integers.
# Series.map returns NaN for values missing from the mapping, so running this cell a
# second time on the already-encoded columns would blank them out.
integer_encodings = {
    "Gender": {"Male": 1, "Female": 0},
    "Vehicle_Age": {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2},
    "Vehicle_Damage": {"Yes": 1, "No": 0},
}

for column, mapping in integer_encodings.items():
    combinedFeat[column] = combinedFeat[column].map(mapping)

In [None]:
# Creating new features by combining and transforming the given set of features
#
# Every groupby(...)[col].transform(agg) below computes `agg` once per group and writes
# that group value back onto each row of the group, so the frame keeps its row count.
# The statistics are computed on combinedFeat, i.e. over train and test rows together.
# transform("std") uses the sample standard deviation, which is NaN for a group with a
# single row; those NaNs are dealt with when the frame is split back into train and test.

# Changing vintage to years from days
# (in-place rescale: re-running this cell would divide by 365 again)
combinedFeat["Vintage"] = combinedFeat["Vintage"]/365

# Policy channels per Region
combinedFeat["unique_policy_channel_per_Region"] = combinedFeat.groupby(["Region_Code"])["Policy_Sales_Channel"].transform("nunique")

# Average age per Region
combinedFeat["avg_age_per_Region"] = combinedFeat.groupby(["Region_Code"])["Age"].transform("mean")

# Total & Average Driving license per Region
combinedFeat["sum_license_per_Region"] = combinedFeat.groupby(["Region_Code"])["Driving_License"].transform("sum")
combinedFeat["avg_license_per_Region"] = combinedFeat.groupby(["Region_Code"])["Driving_License"].transform("mean")

# Total & Average Insured persons per Region
combinedFeat["sum_insured_per_Region"] = combinedFeat.groupby(["Region_Code"])["Previously_Insured"].transform("sum")
combinedFeat["avg_insured_per_Region"] = combinedFeat.groupby(["Region_Code"])["Previously_Insured"].transform("mean")

# Total & Average vehicle damage per Region
combinedFeat["sum_vehicle_damage_per_Region"] = combinedFeat.groupby(["Region_Code"])["Vehicle_Damage"].transform("sum")
combinedFeat["avg_vehicle_damage_per_Region"] = combinedFeat.groupby(["Region_Code"])["Vehicle_Damage"].transform("mean")

# Average vintage per Region (Vintage is already expressed in years at this point)
combinedFeat["avg_vintage_per_Region"] = combinedFeat.groupby(["Region_Code"])["Vintage"].transform("mean")

# Total, Average & Standard deviation of Annual premium paid by customers per Region
combinedFeat["sum_premium_per_Region"] = combinedFeat.groupby(["Region_Code"])["Annual_Premium"].transform("sum")
combinedFeat["avg_premium_per_Region"] = combinedFeat.groupby(["Region_Code"])["Annual_Premium"].transform("mean")
combinedFeat["std_premium_per_Region"] = combinedFeat.groupby(["Region_Code"])["Annual_Premium"].transform("std")

# Previously not insured & have vehicle damage
# (1 when Previously_Insured is 0 and the encoded Vehicle_Damage is 1, else 0)
combinedFeat["not_insured_has_damage"] = np.where((combinedFeat["Previously_Insured"]== 0) & (combinedFeat["Vehicle_Damage"]== 1), 1, 0)

# Have Driving license & vehicle damage
# (1 when Driving_License is 1 and the encoded Vehicle_Damage is 1, else 0)
combinedFeat["license_has_damage"] = np.where((combinedFeat["Driving_License"] == 1) & (combinedFeat["Vehicle_Damage"] == 1), 1, 0)

# Total, Average & Standard deviation of Annual premium by vehicle age
combinedFeat["sum_premium_per_vehicle_age"] = combinedFeat.groupby(["Vehicle_Age"])["Annual_Premium"].transform("sum")
combinedFeat["avg_premium_per_vehicle_age"] = combinedFeat.groupby(["Vehicle_Age"])["Annual_Premium"].transform("mean")
combinedFeat["std_premium_per_vehicle_age"] = combinedFeat.groupby(["Vehicle_Age"])["Annual_Premium"].transform("std")

# Total, Average & Standard deviation of Annual premium by vehicle damage
combinedFeat["sum_premium_per_vehicle_damage"] = combinedFeat.groupby(["Vehicle_Damage"])["Annual_Premium"].transform("sum")
combinedFeat["avg_premium_per_vehicle_damage"] = combinedFeat.groupby(["Vehicle_Damage"])["Annual_Premium"].transform("mean")
combinedFeat["std_premium_per_vehicle_damage"] = combinedFeat.groupby(["Vehicle_Damage"])["Annual_Premium"].transform("std")

# Total, Average & Standard deviation of Annual premium by Policy channel
combinedFeat["sum_premium_by_policy_channel"] = combinedFeat.groupby(["Policy_Sales_Channel"])["Annual_Premium"].transform("sum")
combinedFeat["avg_premium_by_policy_channel"] = combinedFeat.groupby(["Policy_Sales_Channel"])["Annual_Premium"].transform("mean")
combinedFeat["std_premium_by_policy_channel"] = combinedFeat.groupby(["Policy_Sales_Channel"])["Annual_Premium"].transform("std")

# Total, Average & Standard deviation of Annual premium by vehicle damage & vehicle age
combinedFeat["sum_premium_per_vehicle_age_damage"] = combinedFeat.groupby(["Vehicle_Age", "Vehicle_Damage"])["Annual_Premium"].transform("sum")
combinedFeat["avg_premium_per_vehicle_age_damage"] = combinedFeat.groupby(["Vehicle_Age", "Vehicle_Damage"])["Annual_Premium"].transform("mean")
combinedFeat["std_premium_per_vehicle_age_damage"] = combinedFeat.groupby(["Vehicle_Age", "Vehicle_Damage"])["Annual_Premium"].transform("std")

In [None]:
# Inspect the first rows with the engineered columns appended
combinedFeat.head(10)

In [None]:
# Splitting the data into train and test: rows with a Response are training rows,
# rows without one came from the test file.
# NOTE(review): missing feature values are handled differently on the two sides —
# training rows containing any NaN are dropped, test rows get NaN replaced by 0.
has_response = combinedFeat["Response"].notnull()

trainDataFeat = combinedFeat[has_response].drop("id", axis=1).dropna()
testDataFeat = combinedFeat[~has_response].drop(["Response", "id"], axis=1).fillna(0)

print(f"Train Data: {trainDataFeat.shape}")
print(f"Test Data: {testDataFeat.shape}")

In [None]:
# Splitting the training data into train and validation
from sklearn.model_selection import train_test_split

feat_inputs = trainDataFeat.drop("Response", axis=1)
feat_target = trainDataFeat["Response"]

# Seeded, stratified 70/30 split
X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(
    feat_inputs,
    feat_target,
    test_size=0.3,
    random_state=123,
    stratify=feat_target,
)

print(f"Training set: {X_train_feat.shape}")
print(f"Testing set: {X_test_feat.shape}")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the training part only, then apply the same scaling
# to both parts. The scaled frames are rebuilt with the original column names.
scaler = MinMaxScaler().fit(X_train_feat)

feat_columns = X_train_feat.columns
X_train_feat_scaled = pd.DataFrame(scaler.transform(X_train_feat), columns=feat_columns)
X_test_feat_scaled = pd.DataFrame(scaler.transform(X_test_feat), columns=X_test_feat.columns)

In [None]:
# Preprocessing the test data
# Reuse the scaler that was fitted on X_train_feat. The models below are trained on data
# scaled by that scaler, so the test set must go through the same transformation;
# fitting a second scaler on the full training frame can yield different min/max values
# and therefore features on a different scale than the one the models learned.
testDataFeat_scaled = pd.DataFrame(scaler.transform(testDataFeat), columns = testDataFeat.columns)

In [None]:
# Creating a Logistic Regression Model on the engineered, scaled features
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(class_weight="balanced")
score_model(
    model=logreg,
    name="Logistic Regression (Features)",
    x_train=X_train_feat_scaled,
    y_train=y_train_feat,
    x_test=X_test_feat_scaled,
    y_test=y_test_feat,
    position=7,
)

In [None]:
#### Creating XGBoost model on the engineered, scaled features
import xgboost as xgb

# Same settings as the baseline XGBoost model except for fewer trees (1000)
xgb_feat_params = dict(
    n_estimators=1000,
    scale_pos_weight=weight,
    learning_rate=0.01,
    colsample_bytree=0.4,
    subsample=0.7,
    objective="binary:logistic",
    reg_lambda=0.4,
    max_depth=4,
    gamma=10,
    n_jobs=-1,
)
modelXGB = xgb.XGBClassifier(**xgb_feat_params)

score_model(
    model=modelXGB,
    name="XGB Classifier (features)",
    x_train=X_train_feat_scaled,
    y_train=y_train_feat,
    x_test=X_test_feat_scaled,
    y_test=y_test_feat,
    position=8,
)

In [None]:
# Creating a Light GBM Model on the engineered, scaled features
import lightgbm as lgb

# Compared with the baseline LightGBM model: fewer trees (1000) and reg_lambda of 2
lgb_feat_params = dict(
    objective="binary",
    n_estimators=1000,
    learning_rate=0.01,
    n_jobs=-1,
    seed=123,
    max_depth=4,
    subsample=0.7,
    reg_lambda=2,
    colsample_bytree=0.4,
    scale_pos_weight=weight,
    num_leaves=10,
)
modelLGB = lgb.LGBMClassifier(**lgb_feat_params)

score_model(
    model=modelLGB,
    name="LGBM Classifier (features)",
    x_train=X_train_feat_scaled,
    y_train=y_train_feat,
    x_test=X_test_feat_scaled,
    y_test=y_test_feat,
    position=9,
)

In [None]:
# Creating an ensemble of the two boosted models trained on the engineered features
from sklearn.ensemble import VotingClassifier

# Soft voting averages the members' predicted probabilities, weighted 0.4 / 0.6
feat_ensemble_members = [("XGBoost", modelXGB), ("LGBM", modelLGB)]
votingCLF = VotingClassifier(estimators=feat_ensemble_members, voting="soft", weights=[0.4, 0.6])

score_model(
    model=votingCLF,
    name="Voting Ensemble  (Features)",
    x_train=X_train_feat_scaled,
    y_train=y_train_feat,
    x_test=X_test_feat_scaled,
    y_test=y_test_feat,
    position=10,
)

In [None]:
# Predicting on the test set with the ensemble fitted by score_model above
preds = votingCLF.predict_proba(testDataFeat_scaled)
positive_proba = preds[:, 1]

# Pair every test id with its predicted probability of a positive response
submission_log = pd.DataFrame(zip(test["id"], positive_proba), columns=["id", "Response"])
submission_log.head()

In [None]:
# submission_log.to_csv("submission_voting_feat_prob.csv", index=False)

This final voting ensemble has an AUC score of 0.857 and private AUC score of 0.862 with the private leaderboard rank of 185.

We can still fine-tune the hyperparameters of the individual models and also add CatBoost to the ensemble to try to improve the score further. Try it out and let me know if you are able to improve the leaderboard score.

Happy Learning!!