In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, cohen_kappa_score

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
df = pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/train.csv')
df.head()

In [None]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

No null values present in the dataset

In [None]:
# Summary statistics of the numeric columns in the raw data.
df.describe()

### Exploratory Data Analysis

In [None]:
# Apply seaborn's 'whitegrid' style to all subsequent plots.
sns.set(style='whitegrid')

In [None]:
# Class balance of the target.
# The column is named via x=...: newer seaborn releases make plot variables
# keyword-only, so a positionally passed Series is no longer read as x.
sns.countplot(x='Response', data=df)

We can clearly see that the dataset is highly imbalanced. We will need to take appropriate steps to ensure that this imbalance does not bias our final model.

In [None]:
# Count of customers with / without a driving license.
# x= is given by keyword because newer seaborn no longer accepts the data
# vector as a positional argument.
sns.countplot(x='Driving_License', data=df)

In [None]:
# Summary statistics restricted to rows where Driving_License is 0.
df[df['Driving_License'] == 0].describe()

We see that it's mostly the retired citizens (age>65) who do not possess a driving license.

In [None]:
# Response counts split by whether the customer was previously insured.
# Variables are passed by keyword; positional data vectors are not accepted
# as x by newer seaborn releases.
sns.countplot(x='Response', hue='Previously_Insured', data=df)

Thus everyone who gave a positive (1) response was a new customer, i.e. was getting insured for the first time.

In [None]:
# Distribution of customer age.
# distplot is deprecated (and removed in recent seaborn); histplot with a KDE
# overlay on a density scale reproduces the same kind of figure.
sns.histplot(data=df, x='Age', kde=True, stat='density')

In [None]:
# Horizontal violin plot of customer age.
# x= is explicit: newer seaborn treats the first positional argument as
# `data`, which would change how the Series is drawn.
sns.violinplot(x='Age', data=df)

1. Most of the people buy a car from the age-group (20-30) and get their insurance done.
2. Again in age-group (40-50) people buy cars (After saving money, or a part of their retirement plan)

Taking a look at the 'Gender' column

In [None]:
# Number of customers per gender (x passed by keyword for compatibility with
# newer seaborn, where plot variables are keyword-only).
sns.countplot(x='Gender', data=df)

In [None]:
# Gender counts split by Response (variables passed by keyword; positional
# data vectors are not read as x by newer seaborn).
sns.countplot(x='Gender', hue='Response', data=df)

In [None]:
# Response counts split by Previously_Insured (variables passed by keyword;
# positional data vectors are not read as x by newer seaborn).
sns.countplot(x='Response', hue='Previously_Insured', data=df)

Most of the customers who gave a positive response were not previously insured. This can also be a good point to research on.

In [None]:
# Number of customers with / without past vehicle damage (x passed by keyword
# for compatibility with newer seaborn).
sns.countplot(x='Vehicle_Damage', data=df)

In [None]:
# Response counts split by Vehicle_Damage (variables passed by keyword;
# positional data vectors are not read as x by newer seaborn).
sns.countplot(x='Response', hue='Vehicle_Damage', data=df)

From this plot we can see that all customers who gave a positive response had **Vehicle Damage**

In [None]:
# Response counts split by Vehicle_Age (variables passed by keyword;
# positional data vectors are not read as x by newer seaborn).
sns.countplot(x='Response', hue='Vehicle_Age', data=df)

We cannot make any good guess from the 'Vehicle_Age' column.
#### Let's try looking at the numeric(continuous) features

In [None]:
# Preview the first rows again before looking at the continuous features.
df.head()

In [None]:
# Mean Response per Region_Code, sorted ascending: shows the 10 regions with
# the lowest positive-response rate.
df.groupby('Region_Code')['Response'].agg('mean').sort_values().head(10)

In [None]:
# Distribution of the annual premium.
# distplot is deprecated (and removed in recent seaborn); histplot with a KDE
# overlay on a density scale reproduces the same kind of figure.
sns.histplot(data=df, x='Annual_Premium', kde=True, stat='density')

In [None]:
# Horizontal box plot of the annual premium to expose outliers.
# x= is explicit: newer seaborn treats the first positional argument as
# `data`, which would change how the Series is drawn.
sns.boxplot(x='Annual_Premium', data=df)

The annual premium has a lot of outliers, let's check if they are interesting

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the annual premium.
df['Annual_Premium'].describe()

In [None]:
# Keep only the rows whose annual premium exceeds 39400 and summarise them.
# NOTE(review): 39400 is presumably taken from the describe() output above
# (upper quartile?) - confirm before reusing the threshold.
high_premium = df.loc[df['Annual_Premium'].gt(39400.00)]
high_premium.describe()

In [None]:
# Absolute number of negative / positive responses in the full dataset.
print(df['Response'].value_counts())

In [None]:
# Response counts among the high-premium customers, as numbers and as a plot.
print(high_premium['Response'].value_counts())
# x= is given by keyword because newer seaborn no longer accepts the data
# vector as a positional argument.
sns.countplot(x='Response', data=high_premium)

So, we can see that nearly 30% of the customers who gave a positive response are high-premium customers.
Thus we can add a feature that flags whether a customer pays a high premium.

In [None]:
# Annual premium against the binary response.
# x and y must be passed by keyword: newer seaborn rejects positional
# x/y vectors for scatterplot.
sns.scatterplot(x='Annual_Premium', y='Response', data=df)

### Data Preprocessing

In [None]:
# Encode Gender as an integer flag: Female -> 0, Male -> 1.
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'] = df['Gender'].map(gender_codes).astype('int')

In [None]:
# One-hot encode the remaining categorical columns; drop_first=True drops the
# first level of each so the dummies are not perfectly collinear.
df=pd.get_dummies(df, drop_first=True)

In [None]:
# Give the generated dummy columns shorter, identifier-friendly names.
column_aliases = {
    'Vehicle_Age_< 1 Year': 'AgeOneYear',
    'Vehicle_Age_> 2 Years': 'AgeTwoYears',
    'Vehicle_Damage_Yes': 'Vehicle_Damage',
}
df = df.rename(columns=column_aliases)

In [None]:
# Cast the renamed dummy columns to plain integers.
for dummy_col in ('AgeOneYear', 'AgeTwoYears', 'Vehicle_Damage'):
    df[dummy_col] = df[dummy_col].astype('int')

In [None]:
# Binary flag: 1 when the annual premium is above 39400, otherwise 0.
df['HighPremium'] = (df['Annual_Premium'] > 39400.00).astype(int)

In [None]:
# Summary statistics after encoding and feature engineering.
df.describe()

In [None]:
# Target is the Response column; features are everything except it and the
# row identifier.
y = df['Response']
X = df.drop(columns=['id', 'Response'])

# Stratified hold-out split (default test size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of each feature on the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = MinMaxScaler().fit(X_train)

X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

### Model Building

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

### 1. Logistic Regression

In [None]:
# Tune the same kind of estimator that is fitted afterwards: class weights are
# balanced because the target is highly imbalanced (an unweighted model scored
# with F1 tends to predict only the majority class), and max_iter is raised so
# the solvers have room to converge.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
c_values = [100, 10, 1.0, 0.1, 0.01]

# Only valid solver/penalty pairs are searched: 'newton-cg' and 'lbfgs' support
# the l2 penalty only, 'liblinear' supports both l1 and l2. Invalid pairs used
# to fail and be silently recorded with a score of 0.
grid = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
]

# 3-fold cross-validated search optimising F1 on the scaled training data.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=3, scoring='f1')
grid_result = grid_search.fit(X_train_sc, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Fit a class-weighted logistic regression (C=10) on the scaled training data.
lr = LogisticRegression(class_weight='balanced', C=10).fit(X_train_sc, y_train)

# Evaluate the hard predictions on the scaled hold-out set.
y_pred = lr.predict(X_test_sc)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split
# (rows: true class, columns: predicted class).
print(confusion_matrix(y_test, y_pred))

In [None]:
# Coefficients of the fitted logistic regression, one per input feature.
importance = lr.coef_[0]
feature_names = list(X_train.columns)

# Report each coefficient next to its column name rather than a bare index,
# so the listing can be read without cross-referencing the column order.
for name, score in zip(feature_names, importance):
    print('Feature: %s, Score: %.5f' % (name, score))

# Bar chart of the coefficients, labelled by feature.
plt.bar(feature_names, importance)
plt.xticks(rotation=90)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc


def plot_pr_curve(y_test, model_probs, label='Logistic'):
    """Plot a model's precision-recall curve against the no-skill baseline.

    Parameters
    ----------
    y_test : array-like of {0, 1}
        True binary labels.
    model_probs : array-like of float
        Predicted scores / probabilities for the positive class.
    label : str, default 'Logistic'
        Legend entry for the model curve. The default keeps the previous
        behaviour; pass another name when plotting a different model.
    """
    # The no-skill baseline is a horizontal line at the positive-class rate.
    no_skill = np.mean(np.asarray(y_test) == 1)
    plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

    # Precision/recall pairs over all decision thresholds of the model.
    precision, recall, _ = precision_recall_curve(y_test, model_probs)
    plt.plot(recall, precision, marker='.', label=label)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()

In [None]:
# Class probabilities from the logistic regression on the scaled test set.
yhat = lr.predict_proba(X_test_sc)
# Keep only the probability of the positive class (second column).
model_probs = yhat[:, 1]
# calculate the precision-recall auc
precision, recall, _ = precision_recall_curve(y_test, model_probs)
# Area under the precision-recall curve (recall on x, precision on y).
auc_score = auc(recall, precision)
print('Logistic PR AUC: %.3f' % auc_score)
# plot precision-recall curves
plot_pr_curve(y_test, model_probs)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the predicted positive-class probabilities. Passing the
# thresholded 0/1 predictions collapses the curve to a single operating point
# and under-reports the score.
print('Area under curve score for Logistic Regression is: ',
      roc_auc_score(y_test, lr.predict_proba(X_test_sc)[:, 1]))

### 2. Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Class-weighted random forest trained on the unscaled features (trees do not
# need feature scaling).
# - max_features='sqrt' is what the former 'auto' alias meant for classifiers;
#   'auto' has been removed from recent scikit-learn releases.
# - random_state is fixed so the forest is reproducible on Restart & Run All.
rf1 = RandomForestClassifier(n_estimators=300, max_depth=8, min_samples_split=4,
                             max_features='sqrt', bootstrap=True, min_samples_leaf=4,
                             class_weight='balanced_subsample', random_state=0)
rf1.fit(X_train, y_train)
y_pred = rf1.predict(X_test)

In [None]:
# Training-set performance of the random forest; predict once and reuse.
train_pred = rf1.predict(X_train)
print(confusion_matrix(y_train, train_pred))
print('Accuracy of our model is: ', accuracy_score(y_train, train_pred))

In [None]:
# Test-set confusion matrix and accuracy for the current y_pred (the random
# forest predictions when cells are run in order).
print(confusion_matrix(y_test, y_pred))
print('Accuracy of our model is: ', accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision / recall / F1 of the random-forest predictions.
print(classification_report(y_test, y_pred))
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not the thresholded 0/1 labels, which would reduce the curve to one point.
print('Area under curve score for Random Forests is: ',
      roc_auc_score(y_test, rf1.predict_proba(X_test)[:, 1]))
# Cohen's kappa compares hard labels, so y_pred is the right input here.
print('Kappa score for Random Forests',cohen_kappa_score(y_test, y_pred))

In [None]:
# Positive-class probabilities from the random forest on the test set.
yhat = rf1.predict_proba(X_test)
model_probs = yhat[:, 1]
# Area under the precision-recall curve.
precision, recall, _ = precision_recall_curve(y_test, model_probs)
auc_score = auc(recall, precision)
# This score belongs to the random forest; the message previously said
# 'Logistic' because the cell was copied from the logistic-regression section.
print('Random Forest PR AUC: %.3f' % auc_score)
# plot precision-recall curves
plot_pr_curve(y_test, model_probs)

In [None]:
# Random-forest feature importances, largest first, indexed by feature name.
features = (
    pd.DataFrame({'Feature': X_train.columns,
                  'Importance': rf1.feature_importances_})
    .sort_values(by=['Importance'], ascending=False)
    .set_index('Feature')
)
features.plot(kind='bar', figsize=(20, 10))

Plotting AUC-ROC curves for both Random Forest and Logistic Regression

In [None]:
# Class probabilities from both fitted models (logistic regression uses the
# scaled features, the random forest the raw ones).
pred_prob1 = lr.predict_proba(X_test_sc)
pred_prob2 = rf1.predict_proba(X_test)

# ROC curves built from the positive-class probabilities.
fpr1, tpr1, thresh1 = roc_curve(y_test, pred_prob1[:,1], pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, pred_prob2[:,1], pos_label=1)

# A constant score yields the diagonal (tpr = fpr) reference line.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

# Area under each ROC curve.
auc_score1 = roc_auc_score(y_test, pred_prob1[:,1])
auc_score2 = roc_auc_score(y_test, pred_prob2[:,1])

print('AUC for Logistic Regression', auc_score1, 
      'AUC for Random Forests', auc_score2)

# Draw both model curves and the reference diagonal.
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Random Forests')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

# Title and axis labels.
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

# Legend, then save the figure to disk before showing it.
plt.legend(loc='best')
plt.savefig('ROC',dpi=300)
plt.show();

#### Let's also try a simple Gradient Boost classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees on the unscaled features.
# - max_features='sqrt' is what the former 'auto' alias meant for the
#   classifier; 'auto' has been removed from recent scikit-learn releases.
# - random_state is fixed so the run is reproducible.
clf = GradientBoostingClassifier(n_estimators=200, min_samples_split=5, max_depth=6,
                                 max_features='sqrt', random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Hard-label metrics for the gradient-boosting predictions on the test split.
print(confusion_matrix(y_test, y_pred))
print('Accuracy of our model is: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not the thresholded 0/1 labels, which would reduce the curve to one point.
print('Area under curve score for GBM is: ',
      roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print('Kappa score for GBM',cohen_kappa_score(y_test, y_pred))

## The End

Please upvote if this notebook was of any use to you. Feel free to comment.