# <h1 style='background:#0B2265; border:#A71930; border-width: thick; border-style: solid; color:white'><center>Travel Insurance : EDA & Prediction</center></h1> 

# Loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

import matplotlib.pyplot as plt        
import seaborn as sns
import missingno

import warnings
warnings.filterwarnings("ignore")

# <h1 style='background:#0B2265; border:#A71930; border-width: thick; border-style: solid; color:white'><center> 1st Down : Null Check & Definition</center></h1> 

# Null Check

In [None]:
# Palette color set (hex codes)
NY_GIANTS_Color = ["#0B2265", "#A71930", "#A5ACAF"]

# Both encodings of the target (1/0 and "Yes"/"No") map to the same two colors,
# so plots keyed on either representation are colored consistently.
_buy_color, _no_buy_color = NY_GIANTS_Color[:2]
NY_GIANTS = {1: _buy_color, 0: _no_buy_color, "Yes": _buy_color, "No": _no_buy_color}

# Load the data, using the CSV's 'Unnamed: 0' column as the row index
DATA_PATH = '/kaggle/input/travel-insurance-prediction-data/TravelInsurancePrediction.csv'
df = pd.read_csv(DATA_PATH, index_col='Unnamed: 0')

# Missing-value check: draw the missingno matrix in a dark-blue RGB tone
missingno.matrix(df, color=(0.04, 0.13, 0.40))

No null values found. Great! I love this dataset.

# Definitions

* Age : Age of the customer
* Employment Type : The sector in which the customer is employed
* GraduateOrNot : Whether the customer is a college graduate or not
* AnnualIncome : The yearly income of the customer in Indian Rupees
* FamilyMembers : Number of members in the customer's family
* ChronicDisease : Whether the customer suffers from any major disease
* FrequentFlyer : The customer's history of booking air tickets
* EverTravelledAbroad : Whether the customer has ever travelled to a foreign country
* TravelInsurance : Whether the customer bought the travel insurance or not

In [None]:
# Human-readable label for the target; used as the hue in the plots below.
df["Travel_Insurance_Buy"] = df["TravelInsurance"].map({1: "Yes", 0: "No"})

# Numeric 1/0 copies of the Yes/No columns for correlation analysis and modelling.
# Series.map turns any value other than "Yes"/"No" into NaN, so unexpected
# entries stay visible as missing values instead of being silently encoded.
YES_NO_TO_NUM = {"Yes": 1, "No": 0}
for column in ("GraduateOrNot", "FrequentFlyer", "EverTravelledAbroad"):
    df[column + "_num"] = df[column].map(YES_NO_TO_NUM)

Indian Rupee to US Dollar Spot Exchange Rates for 2019
* Best exchange rate: 0.0146 USD on 16 Jul 2019.
* Average exchange rate in 2019: 0.0142 USD.
* Worst exchange rate: 0.0139 USD on 03 Dec 2019.

In [None]:
# Convert annual income from Indian Rupees to USD (two decimals) for easier
# reading; 0.0142 is the average 2019 INR -> USD rate quoted above.
Exchange_Rate = 0.0142
df['Annual_Income_USD'] = (df['AnnualIncome'] * Exchange_Rate).round(2)

# Overall View

In [None]:
# Class balance of the target: number of customers per "Yes"/"No" label
sns.countplot(
    data=df,
    x='Travel_Insurance_Buy',
    palette=NY_GIANTS,
)

# <h1 style='background:#0B2265; border:#A71930; border-width: thick; border-style: solid; color:white'><center> 2nd Down : Data Analysis</center></h1> 

# Features

In [None]:
# Keep a single representation of each variable for the pair plot: drop the raw
# columns that already have a label, numeric or USD counterpart in the frame.
redundant_columns = [
    'TravelInsurance',
    'GraduateOrNot',
    'FrequentFlyer',
    'EverTravelledAbroad',
    'AnnualIncome',
]
df_pairplot = df.drop(columns=redundant_columns)

sns.pairplot(data=df_pairplot, hue='Travel_Insurance_Buy', palette=NY_GIANTS)

It looks like
* "Age"
* "Annual_Income_USD"

show meaningfully different distributions between buyers and non-buyers.

Let's check further.

# Correlation Check

In [None]:
# Correlation input: replace the text label with the original 0/1 target column
df_corr = (
    df_pairplot
    .drop(columns='Travel_Insurance_Buy')
    .assign(TravelInsurance=df['TravelInsurance'])
)

In [None]:
sns.set_theme(style="white")

# Compute the correlation matrix over the numeric columns only. df_corr still
# carries the non-numeric 'Employment Type' column (it is one-hot encoded later),
# and DataFrame.corr() raises on non-numeric data from pandas 2.0 onwards.
corr = df_corr.select_dtypes(include="number").corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 15))

# Custom diverging colormap
cmap = sns.diverging_palette(10, 240, n=8)

# Draw the heatmap on the axes created above, with the mask and a square aspect
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, vmax=0.5, vmin=-0.1,
            annot=True, square=True, linewidths=0.5, cbar_kws={"shrink": .5},
            ax=ax)

It looks like
* "Frequent Flyer : Yes"
* "Ever Travelled Abroad : Yes"
* "Annual_Income_USD : Higher"

are meaningful.

In [None]:
# One stacked panel per feature, each split by the insurance label.
hist_columns = ['Annual_Income_USD', 'FamilyMembers', 'Age']

# plt.subplots keeps the Figure handle in `fig` and returns one Axes per panel
fig, axes = plt.subplots(nrows=len(hist_columns), ncols=1, figsize=(8, 15))

for ax, column in zip(axes, hist_columns):
    # Refer to the column by name since `data` is given, and draw on an explicit Axes
    sns.histplot(data=df_pairplot, x=column, hue="Travel_Insurance_Buy",
                 element="step", palette=NY_GIANTS, ax=ax)

# <h1 style='background:#0B2265; border:#A71930; border-width: thick; border-style: solid; color:white'><center> 3rd Down : Prediction Model</center></h1> 

# Models

In [None]:
# One-hot encode 'Employment Type'. The explicit integer dtype keeps the dummy
# columns numeric on every pandas version (pandas >= 2.0 defaults to bool, which
# mixes badly with float columns when the frame is converted to a NumPy array).
one_hot = pd.get_dummies(df_corr['Employment Type'], dtype='uint8')

# Swap the raw column for its dummies and keep the target as the LAST column,
# because the feature/target split that follows is positional.
last_col = df_corr['TravelInsurance']
df_ss = (
    df_corr
    .drop(columns=['Employment Type', 'TravelInsurance'])
    .join(one_hot)
    .join(last_col)
)

In [None]:
# Features: every column except the last, as a single float matrix. The explicit
# dtype avoids an object array if bool dummy columns sit next to numeric ones.
train_input = df_ss.iloc[:, :-1].to_numpy(dtype=float)

# Target: the last column as a 1-D array, the shape scikit-learn expects for y
# (a single-column DataFrame triggers a DataConversionWarning).
train_target = df_ss.iloc[:, -1].to_numpy()

In [None]:
from sklearn.model_selection import cross_validate

## LGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with default parameters, scored with cross_validate's default splitting
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(
    lgb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Average the per-fold scores
mean_train, mean_test = (np.mean(scores[key]) for key in ('train_score', 'test_score'))
print("Train Accuracy : ", mean_train)
print("Test Accuracy : ", mean_test)

## XG Boost

In [None]:
from xgboost import XGBClassifier

# XGBoost with default parameters, scored with cross_validate's default splitting
xgb = XGBClassifier(random_state=42)
scores = cross_validate(xgb, X=train_input, y=train_target, n_jobs=-1, return_train_score=True)

# Report the mean of the per-fold train and test scores
for label, key in (("Train Score : ", 'train_score'), ("Test Score : ", 'test_score')):
    print(label, np.mean(scores[key]))

## Hist Gradient Boosting

In [None]:
# HistGradientBoostingClassifier can be imported directly from sklearn.ensemble
# since scikit-learn 1.0. Older releases need the experimental enabling import
# first, so fall back to it only when the direct import fails.
try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

# Default-parameter model, scored with cross_validate's default splitting
hgb = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(hgb, train_input, train_target, return_train_score=True)

# Average the per-fold scores
print("Train Score : ",np.mean(scores['train_score']))
print("Test Score : ",np.mean(scores['test_score']))

# <h1 style='background:#0B2265; border:#A71930; border-width: thick; border-style: solid; color:white'><center> 4th Down : Conclusion</center></h1> 

# Conclusion

To find out who buys travel insurance and who does not:

You may want to focus on
* "Ever Travelled Abroad : Yes"
* "Annual_Income_USD : Higher"

With only vanilla (default) settings, about 81% accuracy was achieved using
* LGBM
* Hist Gradient Boosting

You may get higher accuracy by tuning the parameters.