# Task for Today  

***

## Health Insurance Customer Response Prediction  

Given *data about a health insurance marketing campaign*, let's try to predict whether a given customer will **purchase** the insurance or not.

We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when displaying DataFrames. The fully-qualified key is
# required: the short form 'max_columns' also matches
# 'styler.render.max_columns' on newer pandas and raises OptionError.
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter when debugging.
warnings.filterwarnings(action='ignore')

In [None]:
# Load the campaign training data into a DataFrame (path is relative to the
# notebook's working directory).
data = pd.read_csv('../input/health-insurance-lead-prediction/train.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Print column dtypes and non-null counts to spot columns with missing values.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df, *, train_size=0.7, random_state=1):
    """Clean, encode, split and standardize the raw campaign data.

    Pipeline:
      1. Drop 'ID', the two 'Holding_Policy_*' columns and 'Region_Code'.
      2. Fill missing 'Health Indicator' values with the column's mode.
      3. Encode the three two-valued text columns as 0/1.
      4. One-hot encode 'City_Code', 'Health Indicator' and 'Reco_Policy_Cat'.
      5. Split into train/test sets.
      6. Standardize features with a scaler fitted on the training split only.

    Args:
        df: Raw DataFrame containing the columns referenced above plus the
            'Response' target column. The caller's frame is not modified.
        train_size: Fraction (or count) of rows placed in the training split;
            forwarded to ``train_test_split``.
        random_state: Seed for the shuffled split; forwarded to
            ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X frames are scaled
        DataFrames that keep their original row index and column names.

    Raises:
        ValueError: If one of the binary columns holds a non-null value that
            is not in its encoding table.
    """
    df = df.copy()

    # Drop the identifier, the columns with too many missing values,
    # and the high-cardinality region code.
    df = df.drop(
        ['ID', 'Holding_Policy_Duration', 'Holding_Policy_Type', 'Region_Code'],
        axis=1,
    )

    # Fill remaining missing values with the most frequent category.
    # NOTE: the mode is computed on the full dataset, i.e. before the split.
    health = df['Health Indicator']
    df['Health Indicator'] = health.fillna(health.mode()[0])

    # Binary encoding. map() yields numeric columns directly instead of
    # relying on replace() to infer a numeric dtype afterwards.
    binary_encodings = {
        'Accomodation_Type': {'Rented': 0, 'Owned': 1},
        'Reco_Insurance_Type': {'Individual': 0, 'Joint': 1},
        'Is_Spouse': {'No': 0, 'Yes': 1},
    }
    for column, mapping in binary_encodings.items():
        encoded = df[column].map(mapping)
        # Values that were present but have no encoding would otherwise turn
        # into NaN silently; fail early with a clear message instead.
        unexpected = df.loc[encoded.isna() & df[column].notna(), column].unique()
        if len(unexpected) > 0:
            raise ValueError(
                "Unexpected values in column {!r}: {}".format(column, list(unexpected))
            )
        df[column] = encoded

    # One-hot encoding. dtype=int keeps the indicator columns numeric
    # regardless of the pandas version's default dummy dtype.
    df = pd.get_dummies(
        df,
        columns=['City_Code', 'Health Indicator', 'Reco_Policy_Cat'],
        dtype=int,
    )

    # Split df into X and y
    y = df['Response']
    X = df.drop('Response', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: fit on the training split only, then apply to both splits so
    # no test-set statistics leak into the scaler.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing pipeline on the loaded data to obtain the scaled
# train/test feature frames and their matching target Series.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Display the scaled training features as the cell output.
X_train

In [None]:
# Count how many training rows fall into each 'Response' class.
y_train.value_counts()

# Training

In [None]:
# Logistic regression with per-class weights: errors on class 1 count 2.5x
# as much as errors on class 0 (presumably to offset class imbalance —
# compare with the value counts of y_train).
model = LogisticRegression(class_weight={0: 1.0, 1: 2.5})
model.fit(X_train, y_train)

# Results

In [None]:
# Evaluate on the held-out split: predicted labels plus overall accuracy.
y_pred = model.predict(X_test)
acc = model.score(X_test, y_test)
print(f"Test Accuracy: {acc * 100:.2f}%")

# Class ids and their display names, shared by the metrics and the plot.
class_ids = [0, 1]
class_names = ["Negative", "Positive"]
tick_positions = [0.5, 1.5]

cm = confusion_matrix(y_test, y_pred, labels=class_ids)
clr = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=class_names,
)

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
plt.xticks(ticks=tick_positions, labels=class_names)
plt.yticks(ticks=tick_positions, labels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

print("Classification Report:\n----------------------\n", clr)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/b1H452jCs00