In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [None]:
# Read the cross-sell training CSV from the notebook's input directory.
train_csv_path = "../input/health-insurance-cross-sell-prediction/train.csv"
df = pd.read_csv(train_csv_path)

# Data description

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

Check for missing values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

# Visualization

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Distribution of the target column (Response), bars ordered by frequency.
fig_dims = (5, 5)
# fig_dims is now actually applied; it used to be defined and then ignored.
fig, ax = plt.subplots(figsize=fig_dims)
# `x` must be a keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the old positional call collides with data=df and raises TypeError.
sns.countplot(x='Response',
              data=df,
              order=df['Response'].value_counts().index,
              ax=ax)
ax.set(xlabel='Response', ylabel='Count')
plt.show()

In [None]:
# Distribution of Gender, Driving_License, Previously_Insured and Vehicle_Damage
fig, axarr = plt.subplots(2, 2, figsize=(10, 10))

# Column name -> (row, col) position of its pie chart in the 2x2 grid.
pie_panels = {
    'Gender': (0, 0),
    'Driving_License': (0, 1),
    'Previously_Insured': (1, 0),
    'Vehicle_Damage': (1, 1),
}
for column, (row, col) in pie_panels.items():
    panel = axarr[row][col]
    # Every panel orders its wedges by category value (sort_index) so the
    # four charts are built the same way; Driving_License used .head() before.
    df[column].value_counts().sort_index().plot.pie(ax=panel)
    panel.set_title(column, fontsize=18)

plt.show()

In [None]:
# Vehicle damage counts, split by gender.
fig = plt.figure(figsize=(5, 5))
gender_damage_ax = sns.countplot(data=df, x="Gender", hue="Vehicle_Damage")
gender_damage_ax.set_title("Vehicle Damage by Gender")

Observation: male customers appear more likely than female customers to have vehicle damage.

In [None]:
# Distribution of Age: one bar per distinct age value.
fig_dims = (15, 8)
fig, ax = plt.subplots(figsize=fig_dims)
# `x` must be a keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the old positional call collides with data=df and raises TypeError.
sns.countplot(x='Age',
              data=df,
              ax=ax)
ax.set(xlabel='Age', ylabel='Count')
plt.show()

In [None]:
# Preview the first rows again.
df.head()

In [None]:
# Represent the string-valued columns as integers: Gender and Vehicle_Damage
# become 0/1, Vehicle_Age becomes an ordinal 0..2.
#
# The encoded series is assigned back to the frame. The previous form,
# `df[col].replace(..., inplace=True)`, is a chained in-place update: pandas 2.x
# warns about it and, under copy-on-write, it leaves `df` unchanged.
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Vehicle_Damage': {'No': 0, 'Yes': 1},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
}
for column, codes in category_codes.items():
    # Values without a mapping pass through untouched (same as `replace`),
    # so re-running this cell on already-encoded data changes nothing.
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

In [None]:
# Summarise column dtypes and non-null counts again.
df.info()

In [None]:
# Preview the first rows once more.
df.head()

### Correlation Heatmap

In [None]:
# Correlation matrix of the frame, drawn as an annotated blue heatmap.
cor = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=cor, annot=True, cmap=plt.cm.Blues, ax=corr_ax)
plt.show()

Descriptive statistics

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

# Split into Train and Test set

In [None]:
# Remove the `id` column before modeling (presumably a row identifier with no
# predictive meaning -- confirm against the dataset description).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Separate the predictors (X) from the target column (y).
X = df.drop(columns=['Response'])
y = df['Response']

In [None]:
# Hold out 30% of the rows as a test set (70% train); the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Modeling

### Decision Tree

In [None]:
# Baseline decision tree with default hyperparameters.
# random_state is fixed (same seed as the train/test split) because the tree
# randomly permutes features at each split; without a seed, repeated runs can
# produce different trees and therefore different scores.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_predict = dt.predict(X_test)


In [None]:
# Per-class report plus overall accuracy of the baseline tree on the test split.
dt_accuracy = accuracy_score(y_test, dt_predict)
print(classification_report(y_test, dt_predict))
print("Accuracy of decision tree", dt_accuracy, sep=' : ')

In [None]:
 # 10-fold cross-validation of the decision tree on the full X / y: cv_scores
cv_scores = cross_val_score(estimator=dt, X=X, y=y, cv=10)

print(cv_scores)
print("Average 10-Fold CV Score: {}".format(cv_scores.mean()))

## Improve the decision tree model 

In [None]:
# Use GridSearchCV to evaluate every hyperparameter combination with 5-fold
# cross-validated accuracy on the training split, then keep the best one.
param_grid = {'max_depth': np.arange(3, 10),
              'criterion': ['gini', 'entropy'],
              'max_leaf_nodes': [5, 10, 50, 100],
              'min_samples_split': [2, 5, 10, 20]}
# The base estimator is seeded so repeated searches rank the candidates identically.
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42),
                         param_grid, cv=5, scoring='accuracy')
grid_tree.fit(X_train, y_train)

# Show the winning score as well as the winning estimator; the score used to be
# computed in a bare expression mid-cell and silently discarded.
print("Best CV accuracy: {}".format(grid_tree.best_score_))
print(grid_tree.best_estimator_)

In [None]:
# Refit a tree with the best-performing hyperparameter combination and score it
# on the test split.
# Only the tuned settings are passed; every other argument in the old call was
# the estimator default. That call also passed `presort` and
# `min_impurity_split`, which current scikit-learn releases no longer accept,
# so the cell failed with TypeError.
Tree = DecisionTreeClassifier(criterion='gini',
                              max_depth=9,
                              max_leaf_nodes=50,
                              min_samples_split=2)
Tree.fit(X_train, y_train)
predictions = Tree.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

## Plot ROC_AUC

In [None]:
# ROC curve of the baseline decision tree (`dt`) on the test split.
# NOTE(review): this scores `dt`, not the tuned `Tree` fitted in the previous cell.
# The second predict_proba column is used as the score for every threshold.
probs = dt.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve, then the chance diagonal, on an explicit Axes.
roc_fig, roc_ax = plt.subplots()
roc_ax.set_title('Receiver Operating Characteristic for Decision Tree')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.legend(loc='lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()


### Random Forest

In [None]:
# Random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) because the forest
# bootstraps rows and samples features at random; without a seed, every run
# gives different predictions and scores.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_Predict = rf.predict(X_test)

In [None]:
# Per-class report plus overall accuracy of the random forest on the test split.
rf_accuracy = accuracy_score(y_test, rf_Predict)
print(classification_report(y_test, rf_Predict))
print("Accuracy of rf", rf_accuracy, sep=' : ')

In [None]:
# 10-fold cross-validation of the random forest on the full X / y.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=10)

print(cv_scores)
print("Average 10-Fold CV Score: {}".format(cv_scores.mean()))

In [None]:
# ROC curve and AUC of the random forest on the test split.
# The second predict_proba column is used as the score for every threshold.
probs = rf.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve, then the chance diagonal, on an explicit Axes.
roc_fig, roc_ax = plt.subplots()
roc_ax.set_title('Receiver Operating Characteristic for Random Forest')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.legend(loc='lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()


### Logistic Regression

In [None]:
# Logistic regression on the raw (unscaled) features.
# max_iter is raised from the default of 100: with features on very different
# scales the solver may stop before converging and emit a ConvergenceWarning.
# If it already converges within 100 iterations, the fitted model is unchanged.
# LogisticRegression is imported in the notebook's first cell.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)

In [None]:
# Per-class report plus overall accuracy of logistic regression on the test split.
lr_accuracy = accuracy_score(y_test, lr_predict)
print(classification_report(y_test, lr_predict))
print("Accuracy of Logistic Regression", lr_accuracy, sep=' : ')

In [None]:
# ROC curve and AUC of the logistic regression model on the test split.
# The second predict_proba column is used as the score for every threshold.
probs = lr.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve, then the chance diagonal, on an explicit Axes.
roc_fig, roc_ax = plt.subplots()
roc_ax.set_title('Receiver Operating Characteristic for Logistic Regression')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.legend(loc='lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()


### KNN

In [None]:
# Fit a k-nearest-neighbours classifier with k = 10 and predict the test split.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
knn_predict = knn.predict(X_test)

In [None]:

# Per-class report plus overall accuracy of the KNN model on the test split.
# The printed label used to say "Logistic Regression" (copy-paste slip); the
# classification report is added so KNN is reported like the other models.
print(classification_report(y_test, knn_predict))
knn_accuracy = accuracy_score(y_test, knn_predict)
print("Accuracy of KNN" + ' : ' + str(knn_accuracy))

In [None]:
# ROC curve and AUC of the KNN model on the test split.
# The second predict_proba column is used as the score for every threshold.
probs = knn.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Draw the curve, then the chance diagonal, on an explicit Axes.
roc_fig, roc_ax = plt.subplots()
roc_ax.set_title('Receiver Operating Characteristic for KNN')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.legend(loc='lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()
