In [None]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

**Task Details**

Your client is an insurance company that has provided Health Insurance to its customers. Now they need your help in building a model to predict whether the policyholders (customers) from the past year will also be interested in the Vehicle Insurance provided by the company.

For example, you may pay a premium of Rs. 5000 each year for a health insurance cover of Rs. 200,000/- so that if, God forbid, you fall ill and need to be hospitalised in that year, the insurance provider company will bear the cost of hospitalisation etc. for up to Rs. 200,000. Now if you are wondering how the company can bear such a high hospitalisation cost when it charges a premium of only Rs. 5000/-, that is where the concept of probabilities comes into the picture. For example, like you, there may be 100 customers who would be paying a premium of Rs. 5000 every year, but only a few of them (say 2-3) would get hospitalised that year and not everyone. This way everyone shares the risk of everyone else.

Just like medical insurance, there is vehicle insurance, where every year the customer needs to pay a premium of a certain amount to the insurance provider company so that, in case of an unfortunate accident involving the vehicle, the insurance provider company will provide a compensation (called ‘sum assured’) to the customer.

Building a model to predict whether a customer would be interested in Vehicle Insurance is extremely helpful for the company because it can then accordingly plan its communication strategy to reach out to those customers and optimise its business model and revenue.

Now, in order to predict, whether the customer would be interested in Vehicle insurance, you have information about demographics (gender, age, region code type), Vehicles (Vehicle Age, Damage), Policy (Premium, sourcing channel) etc.

**Evaluation Metric**

The evaluation metric for this hackathon is ROC_AUC score.

In [None]:
# Read the training set, the test set and the sample submission, in that order.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/health-insurance-cross-sell-prediction/train.csv',
        '../input/health-insurance-cross-sell-prediction/test.csv',
        '../input/health-insurance-cross-sell-prediction/sample_submission.csv',
    )
)

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train_df.head()

**Variables Definition**

id - Unique ID for the customer

Gender - Gender of the customer

Age - Age of the customer

Driving_License - 0 : Customer does not have DL, 1 : Customer already has DL

Region_Code - Unique code for the region of the customer

Previously_Insured - 1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance

Vehicle_Age - Age of the Vehicle

Vehicle_Damage - 1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past.

Annual_Premium - The amount customer needs to pay as premium in the year

PolicySalesChannel - Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.

Vintage - Number of Days, Customer has been associated with the company

Response - 1 : Customer is interested, 0 : Customer is not interested

In [None]:
# Print pandas' structural summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Summary statistics for the numeric columns of the training frame.
train_df.describe()

**Checking the number of rows and columns**

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

**We have 381109 instances and 12 columns in our dataset (the `id` column, 10 features and the target `Response`).**

**Checking for missing values**

In [None]:
# Count the missing values in each column of the training frame.
train_df.isnull().sum()

We can infer from above that there are no missing values.

In [None]:
# Number of rows for each value of the target column.
train_df['Response'].value_counts()

**Distribution of numeric independent variables.**

In [None]:
# Plot a histogram of every numeric independent variable to inspect its distribution.
# Note: hist() considers the numeric columns only, by default.
# The target variable is excluded with drop(); 'axis=1' drops the specified column.
#
# The figure size is passed to hist() directly: calling plt.figure(figsize=...)
# after hist() would open a second, empty figure and leave the histograms at
# the default size.
train_df.drop('Response', axis = 1).hist(figsize = (15, 10))
plt.tight_layout()
# display the plot
plt.show()

**Distribution of dependent variable.**

In [None]:
# Isolate the target column and count the rows in each class once.
df_target = train_df['Response'].copy()
class_counts = df_target.value_counts()

# plot the countplot of the target variable 'Response'
sns.countplot(x = df_target)

# Annotate every bar with the share of ITS OWN class. The previous version
# placed the class-0 percentage above the class-1 bar and vice versa.
# Assumes seaborn draws the numeric classes in ascending order (0 then 1),
# which is what the original hard-coded x positions (-0.05 and 0.95) relied on.
for bar_position, label in enumerate(sorted(class_counts.index)):
    share = round(class_counts[label] * 100 / len(df_target), 2)
    plt.text(x = bar_position - 0.05, y = class_counts[label] + 1, s = str(share) + '%')

plt.title('Count Plot for Target Variable (Response)', fontsize = 15)
plt.xlabel('Target Variable', fontsize = 15)
plt.ylabel('Count', fontsize = 15)

# to show the plot
plt.show()

**The above plot shows that there is imbalance in the target variable.**

**Encoding categorical, binary and ordinal features.**

In [None]:
def get_uniques(df, columns):
    """Map each requested column name to the list of its distinct values.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names present in ``df``.

    Returns:
        dict keyed by column name; each value is a list of that column's
        unique values in order of first appearance.
    """
    uniques_by_column = {}
    for name in columns:
        distinct_values = df[name].unique()
        uniques_by_column[name] = list(distinct_values)
    return uniques_by_column

In [None]:
# Columns treated as categorical in this notebook.
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

# Show the distinct values of each one so an encoding can be chosen.
get_uniques(train_df, categorical_features)

**Classify categorical features to be encoded**

In [None]:
# Columns that are encoded as 0/1 in the cells below.
binary_features = ['Gender', 'Vehicle_Damage']

# Column that is encoded by its position in an explicit ordering.
# NOTE(review): neither list is referenced again in the visible cells; the
# encoding calls name their columns directly.
ordinal_features = ['Vehicle_Age']

**Binary Encoding**

In [None]:
def binary_encode(df, column, positive_label):
    """Return a copy of ``df`` with ``column`` encoded as 1/0.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to encode.
        positive_label: Value that maps to 1. Every other value, including
            missing values, maps to 0.

    Returns:
        A new DataFrame in which ``column`` holds integer 1/0 flags.
    """
    encoded = df.copy()
    # A vectorised comparison replaces the row-by-row ``apply`` and gives the
    # same 1/0 result.
    encoded[column] = (encoded[column] == positive_label).astype(int)
    return encoded

In [None]:
# Encode both binary columns on the train and test frames: 'Male' -> 1 for
# Gender and 'Yes' -> 1 for Vehicle_Damage.
# Gotcha: re-running this cell on already-encoded frames turns every value
# into 0, because the columns no longer contain the string labels.
for binary_column, positive_label in (('Gender', 'Male'), ('Vehicle_Damage', 'Yes')):
    train_df = binary_encode(train_df, binary_column, positive_label)
    test_df = binary_encode(test_df, binary_column, positive_label)

**Ordinal Encoding**

In [None]:
def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by ordinal ranks.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to encode.
        ordering: List of the column's labels, lowest rank first.

    Returns:
        A new DataFrame in which every value of ``column`` is replaced by its
        position in ``ordering``.

    Raises:
        ValueError: If a value of ``column`` does not appear in ``ordering``.
    """
    def rank_of(label):
        # list.index gives the position of the label's first occurrence.
        return ordering.index(label)

    result = df.copy()
    result[column] = result[column].map(rank_of)
    return result

In [None]:
# Vehicle age labels, from newest to oldest; a label's position becomes its code.
age_ordering = ['< 1 Year', '1-2 Year', '> 2 Years']

# Apply the same ordinal encoding to the train and test frames.
train_df, test_df = (
    ordinal_encode(frame, 'Vehicle_Age', age_ordering)
    for frame in (train_df, test_df)
)

**Splitting and Scaling**

In [None]:
# Keep the test ids for the submission file before the column is removed.
test_ids = test_df['id'].tolist()

# The id column is dropped from both frames before modelling.
train_df, test_df = (frame.drop(columns='id') for frame in (train_df, test_df))

In [None]:
# Target vector, then the feature matrix (every column except the target).
y = train_df['Response']
X = train_df.drop(columns='Response')

**Let's check for outliers before choosing a scaler**

In [None]:
# Box plot of every feature on a log-scaled y axis.
X.plot.box(figsize=(20, 10), logy=True)

**Using the Min-Max Scaler to scale the independent features**

In [None]:
scaler = MinMaxScaler()

# Learn each feature's min/max from the training features only.
X = scaler.fit_transform(X)
# Re-use the fitted scaler on the test set. Calling fit_transform() here would
# re-fit on the test data, so the same raw value would map to different scaled
# values in train and test.
test_df = scaler.transform(test_df)
# NOTE(review): the scaler is fitted on all of X before the train/hold-out
# split below, so hold-out rows influence the scaling ranges.

**Splitting the dataset**

In [None]:
# Hold out 30% of the rows for evaluation.
# - random_state makes the split, and every metric reported below, reproducible.
# - stratify=y keeps the share of positive responses equal in both parts, which
#   matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

In [None]:
# Fraction of rows whose Response is 1 (the mean of a 0/1 target).
y.mean()

**Only about 12% of the customers in the dataset responded positively.**

In [None]:
# Shapes of the training features and training labels; their row counts should match.
X_train.shape , y_train.shape

**Building the Model**

Considering input shape to be 10 as we have 10 features.

In [None]:
# Feed-forward binary classifier: two hidden ReLU layers of 64 units and one
# sigmoid output unit that yields the probability of a positive response.
# The input width is read from the training matrix instead of being
# hard-coded, so the model stays valid if the set of features changes.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

**Compiling the model**

In [None]:
# Track AUC during training; the competition is scored on ROC AUC.
auc_metric = tf.keras.metrics.AUC(name='auc')

# Adam optimiser with binary cross-entropy loss for the single sigmoid output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_metric])

In [None]:
# Training hyper-parameters.
batch_size = 64
epochs = 25

# Learning-rate reduction callback, used with its default settings.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau()

# Fit on the training split, holding 20% of it back for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[lr_callback],
)

In [None]:
# Plot the training and validation loss recorded in the fit history, one
# point per epoch, to check for convergence and overfitting.
fig = px.line(
    history.history, y=['loss', 'val_loss'], 
    labels={'index': 'Epoch', 'value': 'Loss'}, 
    title='Training History')
fig.show()

**Training loss and validation loss decrease over time and seem to converge.**

In [None]:
# Evaluate on the hold-out split; reports the loss and the AUC metric set at compile time.
model.evaluate(X_test, y_test)

**We are getting an AUC of 0.85 and loss of 0.2689 which is pretty good.**

In [None]:
# Predict on the scaled competition test set; the sigmoid output gives one probability per row.
preds = model.predict(test_df)

**Considering a threshold of 0.5. If the probability of being positive is greater than 50% we will have 1 else 0**

In [None]:
# Convert the predicted probabilities to hard 0/1 labels with a 0.5 threshold.
# `np.int` was removed in NumPy 1.24 and raises AttributeError there, so the
# conversion uses a vectorised comparison with the builtin int instead.
# np.asarray() also keeps the cell safe to re-run once `preds` is already a list.
# NOTE(review): the stated evaluation metric is ROC AUC, which is normally
# better served by submitting the raw probabilities — confirm before submitting.
preds = (np.asarray(preds) >= 0.5).astype(int).ravel().tolist()

In [None]:
# Pair each saved test id with its predicted label; the dict keys become the
# column names of the submission frame.
submission = pd.concat(
    {'id': pd.Series(test_ids), 'Response': pd.Series(preds)},
    axis=1,
)

In [None]:
# Preview only the first rows of the submission instead of rendering the whole frame.
submission.head()

In [None]:
# Write the submission file. index=False stops pandas from writing the row
# index as an extra unnamed first column, so the file contains only the
# 'id' and 'Response' columns.
submission.to_csv('./submission.csv', index=False)