In [0]:
# Import packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
%matplotlib inline

In [0]:
# Read dataset
url= "https://raw.githubusercontent.com/shivckr/Insurance_claim/master/ClaimsData.csv"
insurance_data = pd.read_csv(url)

In [36]:
# Display first five rows 

insurance_data.head(15)

Unnamed: 0,age,alzheimers,arthritis,cancer,copd,depression,diabetes,heart.failure,ihd,kidney,osteoporosis,stroke,reimbursement2008,bucket2008,reimbursement2009,bucket2009
0,85,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,59,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,67,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,52,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,67,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
5,68,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
6,75,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
7,70,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
8,67,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
9,67,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [0]:
# Info of dataframe

insurance_data.info()

In [0]:
# Descriptive statistics of data

insurance_data.describe()

In [0]:
#Replace target values from 1 to 5 to 0 to 4

insurance_data [['bucket2008','bucket2009']]=insurance_data [['bucket2008','bucket2009']].apply(pd.Series.replace, to_replace=1, value=0)
insurance_data [['bucket2008','bucket2009']]=insurance_data [['bucket2008','bucket2009']].apply(pd.Series.replace, to_replace=2, value=1)
insurance_data [['bucket2008','bucket2009']]=insurance_data [['bucket2008','bucket2009']].apply(pd.Series.replace, to_replace=3, value=2)
insurance_data [['bucket2008','bucket2009']]=insurance_data [['bucket2008','bucket2009']].apply(pd.Series.replace, to_replace=4, value=3)
insurance_data [['bucket2008','bucket2009']]=insurance_data [['bucket2008','bucket2009']].apply(pd.Series.replace, to_replace=5, value=4)

### Visualizations

In [0]:
# Partition into age buckets 

def agetype_func(age):
    
    if age.item()<40:
        return 'Young'
    elif age.item()>=40 and age.item()<50:
        return 'Middle aged (40-50)' 
    elif age.item()>=50 and age.item()<60:
        return 'Middle aged(50-60)'
    else:
        return 'Old(>60)'
insurance_data['AgeType'] = insurance_data[['age']].apply(agetype_func, axis=1)

In [0]:
# Partition into buckets of insurance risk

def insurance_stat(yes):
    if yes.item()==0:
        return 'Low'
    elif yes.item()==1:
        return 'Emerging' 
    elif yes.item()==2:
        return 'Moderate' 
    elif yes.item()==3:
        return 'High' 
    else:
        return 'Very High'
insurance_data['bucket2009_type'] = insurance_data[['bucket2009']].apply(insurance_stat, axis=1)
insurance_data['bucket2008_type'] = insurance_data[['bucket2008']].apply(insurance_stat, axis=1)

In [0]:
# Percentage of patients in each insurance bucket by age?

mx = sns.factorplot(x="AgeType", hue='bucket2009_type',data=insurance_data, kind="count", size=6)
mx.set(ylabel='Number of Patients')
plt.title('Insurance risk factor by age');


In [0]:
# The amount of reimbursement for each age type and bucket type is shown in the year 2009
plt.figure(figsize=(10,5))
sns.boxplot('AgeType','reimbursement2009', data=insurance_data, hue="bucket2009_type")
plt.title('Insurance Reimbursement for each age and bucket type in 2009')

In [0]:
# The amount of reimbursement for each age type and bucket type is shown in the year 2008
plt.figure(figsize=(10,5))
sns.boxplot('AgeType','reimbursement2008', data=insurance_data, hue="bucket2008_type")
plt.title('Insurance Reimbursement for each age and bucket type in 2008 ')

In [0]:
# Insurance Reimbursement for people affected by depression
plt.figure(figsize=(10,5))
sns.boxplot('depression','reimbursement2009', data=insurance_data, hue="bucket2009_type")
plt.title('Insurance Reimbursement for people affected by depression ')

In [0]:
# Insurance Reimbursement for people affected by cancer in 2009
plt.figure(figsize=(10,5))
sns.boxplot('cancer','reimbursement2009', data=insurance_data, hue="bucket2009_type")
plt.title('Insurance Reimbursement for people affected by cancer in 2009 ')

In [0]:
# When the risk factor is high there are more chances of reimbursement.
sns.lmplot('age', # Horizontal axis
           'reimbursement2009', # Vertical axis
           data=insurance_data, # Data source
           fit_reg=False, # Don't fix a regression line
           hue="bucket2009_type", # Set color
           scatter_kws={"marker": "D", # Set marker style
                        "s": 100}) # S marker size

In [0]:
#How is the reimbursement with amongst ages with kidney problems

sns.factorplot(x="kidney", y="reimbursement2009", hue="AgeType", data=insurance_data, kind="bar");

In [0]:
##How is the reimbursement with amongst ages with stroke problems

plt.figure(figsize=(10,9))
sns.factorplot(x="stroke", y="reimbursement2009", hue="AgeType", data=insurance_data, kind="bar",col="bucket2009_type");

In [0]:
#How is the reimbursement with amongst ages with heart failure

plt.figure(figsize=(25,9))
sns.factorplot(x="heart.failure", y="reimbursement2009", hue="AgeType", data=insurance_data, kind="bar",col="bucket2009_type");

In [0]:
#Heatmap 

plt.figure(figsize=(16,9))
sns.heatmap(insurance_data.corr(),cmap="BrBG",annot=True)

In [0]:
#Let's remove highly correlated values

insurance_data.drop(['AgeType','bucket2009_type','bucket2008_type'],axis=1,inplace=True)

#### Prepare the data

In [34]:
insurance_data['bucket2008'].value_counts()

1    340202
2     63173
3     30847
4     19914
5      3869
Name: bucket2008, dtype: int64

In [0]:
# Split into train and test

X = insurance_data.drop(['bucket2009','reimbursement2009'],axis=1)

y = insurance_data.bucket2009

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=23)

In [0]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
# Decision tree classifier

dt=DecisionTreeClassifier(criterion="gini",min_samples_leaf=10,class_weight='balanced')
dt_model=dt.fit(X_train,y_train)

In [52]:
# Let's understand feature imporatnce

#dt_model.feature_importances_
list(zip(X.columns,dt_model.feature_importances_))

[('age', 0.23364278453195636),
 ('alzheimers', 0.024420221642040534),
 ('arthritis', 0.02035007282555606),
 ('cancer', 0.008536542517377432),
 ('copd', 0.01212446670168245),
 ('depression', 0.030686584167529257),
 ('diabetes', 0.02011819577897609),
 ('heart.failure', 0.02019915797190785),
 ('ihd', 0.01509427785122811),
 ('kidney', 0.014574617875845531),
 ('osteoporosis', 0.026981146102651496),
 ('stroke', 0.008624935959561604),
 ('reimbursement2008', 0.5642222599968564),
 ('bucket2008', 0.00042473607683071344)]

In [0]:
# Let's export insurance model tree

from sklearn import tree
with open("insurance_tree.dot", 'w') as f:
    f = tree.export_graphviz(dt_model, out_file=f, feature_names=X.columns)

In [0]:
# Confusion matrix

dt_prd=dt_model.predict(X_test)
print(metrics.accuracy_score(y_test,dt_prd))
df_confusion = metrics.confusion_matrix(y_test,dt_prd)
df_confusion

In [0]:
print(metrics.classification_report(y_test,dt_prd))

In [0]:
# calculate null accuracy (for multi-class classification problems)
y_test.value_counts().head(1) / len(y_test)

In [0]:
# Random Forest Classifier

rf_clf=RandomForestClassifier(min_samples_leaf=20)
rf_model=rf_clf.fit(X_train,y_train)
rf_prediction=rf_model.predict(X_test)

In [0]:
print(metrics.accuracy_score(y_test,rf_prediction))
print(metrics.confusion_matrix(y_test,rf_prediction))
print(metrics.classification_report(y_test,rf_prediction))

#### Although the accuracy increases the f1 score of the high risk factor case is 0 which is a little dangerous.
#### So let us balance the class weight and increase number of estimators

In [0]:
rf_clf=RandomForestClassifier(n_estimators=100,class_weight="balanced",min_samples_leaf=40)
rf_model=rf_clf.fit(X_train,y_train)
rf_prediction=rf_model.predict(X_test)

In [54]:
print(metrics.accuracy_score(y_test,rf_prediction))
print(metrics.confusion_matrix(y_test,rf_prediction))
print(metrics.classification_report(y_test,rf_prediction))

0.6024174250231437
[[56212 13532  4469  1568  1241]
 [ 3558  9737  3941  2696  1716]
 [ 1829  3803  1932  1534  1054]
 [  695  1500   875   903  1049]
 [   66   150    95   153   194]]
              precision    recall  f1-score   support

           1       0.90      0.73      0.81     77022
           2       0.34      0.45      0.39     21648
           3       0.17      0.19      0.18     10152
           4       0.13      0.18      0.15      5022
           5       0.04      0.29      0.07       658

    accuracy                           0.60    114502
   macro avg       0.32      0.37      0.32    114502
weighted avg       0.69      0.60      0.64    114502

