In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [57]:
df=pd.read_csv("travel insurance.csv")

### Link for dataset https://www.kaggle.com/mhdzahier/travel-insurance

### Dataset

In [58]:
df.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


In [59]:
df.shape

(63326, 11)

In [60]:
df.isnull().sum()

Agency                      0
Agency Type                 0
Distribution Channel        0
Product Name                0
Claim                       0
Duration                    0
Destination                 0
Net Sales                   0
Commision (in value)        0
Gender                  45107
Age                         0
dtype: int64

### Dealing with missing values

In [61]:
df.Gender.unique()

array(['F', nan, 'M'], dtype=object)

In [62]:
df.Gender=df.Gender.map({'F':0,'M':1})

In [63]:
df.Gender.fillna(-1,inplace=True)

In [64]:
df.Gender.unique()

array([ 0., -1.,  1.])

### Categorical Columns

In [65]:
Agency_Mapping=dict( enumerate(df.Agency.astype('category').cat.categories ) )
df.Agency=df.Agency.astype('category').cat.codes

Agency_Type_Mapping=dict( enumerate(df["Agency Type"].astype('category').cat.categories ) )
df["Agency Type"]=df["Agency Type"].astype('category').cat.codes

Distribution_Channel_Mapping=dict( enumerate(df["Distribution Channel"].astype('category').cat.categories ) )
df["Distribution Channel"]=df["Distribution Channel"].astype('category').cat.codes

Product_Name_Mapping=dict( enumerate(df["Product Name"].astype('category').cat.categories ) )
df["Product Name"]=df["Product Name"].astype('category').cat.codes

Claim_Mapping=dict( enumerate(df["Claim"].astype('category').cat.categories ) )
df["Claim"]=df["Claim"].astype('category').cat.codes

Destination_Mapping=dict( enumerate(df["Destination"].astype('category').cat.categories ) )
df["Destination"]=df["Destination"].astype('category').cat.codes

In [66]:
df.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,3,1,0,12,0,186,79,-29.0,9.57,0.0,81
1,3,1,0,12,0,186,79,-29.0,9.57,0.0,71
2,6,1,1,16,0,65,4,-49.5,29.7,-1.0,32
3,6,1,1,16,0,60,4,-39.6,23.76,-1.0,32
4,6,1,1,16,0,79,61,-19.8,11.88,-1.0,41


### Numeric Columns

In [67]:
min(df.Duration.unique())

-2

In [68]:
max(df.Age.unique())

118

In [69]:
df=df[df.Duration>0]

In [70]:
df=df[df.Age<100]

In [71]:
df.shape

(62305, 11)

In [72]:
df["Duration_Days"]=df.Duration.apply(lambda x: int(x/1440))
df["Duration_Hours"]=df.Duration.apply(lambda x: int((x%1440)/60))
df["Duration_Minutes"]=df.Duration.apply(lambda x: ((x%1440)%60))

df.drop("Duration",axis=1,inplace=True)

In [73]:
def age_bracket(age):
    if(age<12):
        return "Under 12"
    elif(age>=12 and age<=17):
        return "12-17"
    elif(age>=18 and age<=24):
        return "18-24"
    elif(age>=25 and age<=34):
        return "25-34"
    elif(age>=35 and age<=44):
        return "35-44"
    elif(age>=45 and age<=54):
        return "45-54"
    elif(age>=55 and age<=64):
        return "55-64"
    else:
        return "65+"
df.Age=df.Age.apply(lambda x: age_bracket(x))

Age_Mapping=dict( enumerate(df["Age"].astype('category').cat.categories ) )
df["Age"]=df["Age"].astype('category').cat.codes

### Normalizing the data

In [75]:
import pandas as pd
from sklearn import preprocessing
x = df.loc[:,df.columns!="Claim"] #returns a numpy array
columns=x.columns
x=x.values

y=df.loc[:,"Claim"]
scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x)
X = pd.DataFrame(x_scaled,columns=columns)

### Machine Learning Algorithms

In [81]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation



import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, tree
import xgboost
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import std
from numpy import mean

In [77]:
from sklearn.model_selection import train_test_split # Import train_test_split function

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [83]:
classifiers=[]
names=[]

model1 = xgboost.XGBClassifier()
classifiers.append(model1)
names.append('XGB Classifier')

model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)
names.append('Decision Tree')

model4 = RandomForestClassifier()
classifiers.append(model4)
names.append('Random Forest')

model5 = LogisticRegression()
classifiers.append(model5)
names.append('Logistic Regression')

model7 = KNeighborsClassifier()
classifiers.append(model7)
names.append('K-Nearest Neighbor')

model8 = MLPClassifier(hidden_layer_sizes=(32,32,32), activation='relu', solver='adam', max_iter=100)
classifiers.append(model8)
names.append('MLP Classifier')

In [85]:
kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
for i,clf in enumerate(classifiers):
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=kfold, n_jobs=-1)
    print(names[i])
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

XGB Classifier
Accuracy: 0.985 (0.000)
Decision Tree
Accuracy: 0.972 (0.001)
Random Forest
Accuracy: 0.982 (0.001)
Logistic Regression
Accuracy: 0.985 (0.000)
K-Nearest Neighbor
Accuracy: 0.985 (0.000)
MLP Classifier
Accuracy: 0.985 (0.000)


In [87]:
from sklearn.metrics import accuracy_score
for i,clf in enumerate(classifiers):
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of %s is %s"%(names[i], acc))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix of %s is %s"%(names[i], cm))

Accuracy of XGB Classifier is 0.9846993366145945
Confusion Matrix of XGB Classifier is [[18406     6]
 [  280     0]]
Accuracy of Decision Tree is 0.9742670661245453
Confusion Matrix of Decision Tree is [[18199   213]
 [  268    12]]
Accuracy of Random Forest is 0.9821848919323775
Confusion Matrix of Random Forest is [[18357    55]
 [  278     2]]
Accuracy of Logistic Regression is 0.984913331906698
Confusion Matrix of Logistic Regression is [[18410     2]
 [  280     0]]
Accuracy of K-Nearest Neighbor is 0.9846458377915686
Confusion Matrix of K-Nearest Neighbor is [[18404     8]
 [  279     1]]
Accuracy of MLP Classifier is 0.9849668307297239
Confusion Matrix of MLP Classifier is [[18411     1]
 [  280     0]]
