# Predict the purchase of travel insurance

## Exploratory data analysis

In [None]:
# Packages imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import DataSet
df = pd.read_csv('../input/travel-insurance-prediction-data/TravelInsurancePrediction.csv')

In [None]:
# Head
df.head(10)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV - verify).
# drop() returns a new frame, so df itself is left untouched.
df_v2 = df.drop(columns = ['Unnamed: 0'])
df_v2.head(10)

In [None]:
# Check if exists missing values
df_v2.isna().any().any()

### Analysis of quantitative variables

In [None]:
# Build the quantitative-only dataset by removing every qualitative column
# and the TravelInsurance column from df_v2
non_quantitative = [
    'Employment Type',
    'GraduateOrNot',
    'ChronicDiseases',
    'FrequentFlyer',
    'EverTravelledAbroad',
    'TravelInsurance',
]
quant_variables = df_v2.drop(columns = non_quantitative)
quant_variables.head(10)

In [None]:
# Statistical summary of quantitative variables
quant_variables.describe()

In [None]:
# Checking the correlation between quantitative variables
quant_variables.corr()

In [None]:
# Checking the skew of each attribute
quant_variables.skew()

In [None]:
# Univariate Histogram
quant_variables.hist(layout = (2,2))
plt.show()

In [None]:
# Univariate Density Plot
quant_variables.plot(kind = 'density', subplots = True, layout = (2,2), sharex = False)
plt.show()

In [None]:
# Boxplot with vertical orientation
quant_variables.plot(kind = 'box', subplots = True, layout = (2,2), sharex = False, sharey = False)
plt.show()

In [None]:
# pairplot
sns.pairplot(quant_variables)

### Analysis of qualitative variables

In [None]:
# Build the qualitative-only dataset by removing the quantitative columns
# and the TravelInsurance column from df_v2
non_qualitative = ['Age', 'AnnualIncome', 'FamilyMembers', 'TravelInsurance']
quali_variables = df_v2.drop(columns = non_qualitative)
quali_variables.head(10)

In [None]:
# Distribution of qualitative variables
Distr = [quali_variables.groupby(variable).size() for variable in quali_variables]
print(Distr)

In [None]:
# One bar chart per qualitative variable, showing how many rows fall in each category
for variable in quali_variables.columns:
    category_counts = quali_variables.groupby(variable).size()
    category_counts.plot(kind = 'bar')
    plt.show()

## Transformation of Variables

In [None]:
# Packages imports
from sklearn.preprocessing import OrdinalEncoder

### Transformation of qualitative variables

In [None]:
# Qualitative variables that still need encoding: everything in quali_variables
# except ChronicDiseases, which is joined back unchanged when the final dataset is built
quali_variables_process = quali_variables.drop(columns = ['ChronicDiseases'])
quali_variables_process.head(10)

In [None]:
# OrdinalEncoder
ord_enc = OrdinalEncoder()

In [None]:
# Ordinal-encode every qualitative column into a new '<column>_1' column.
# ord_enc is refitted on each column in turn, so afterwards it only holds
# the fit for the last one (EverTravelledAbroad).
for column in ('Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad'):
    quali_variables_process[column + '_1'] = ord_enc.fit_transform(quali_variables_process[[column]])

# One single-column frame per encoded variable
df_Employ = quali_variables_process[['Employment Type_1']]
df_Gradu = quali_variables_process[['GraduateOrNot_1']]
df_Freq = quali_variables_process[['FrequentFlyer_1']]
df_Ever = quali_variables_process[['EverTravelledAbroad_1']]

In [None]:
# Put the four encoded qualitative columns side by side
frames_quali = [df_Employ, df_Gradu, df_Freq, df_Ever]

# The encoder returns floats; every column of the combined frame is cast to int
df_quali_num = pd.concat(frames_quali, axis = 1).astype(int)

# Head
df_quali_num.head(10)

### Transformation of quantitative variables

In [None]:
# Transforming quantitative variables into ordinal qualitative variables

# DataFrame Age
df_age = quant_variables[['Age']]
df_age.head(10)

In [None]:
# Making sure the Age column data is Int
df_age = df_age.astype({'Age': int})

In [None]:
# Taking Age values and putting them in an np array
values_age = df_age['Age'].values

In [None]:
# Creating Function for variable transformation
def prencAge(num):
    """Return the age-bracket label for ``num``.

    Brackets are closed on the right: up to 28 -> '25-28', up to 30 ->
    '29-30', up to 32 -> '31-32', anything else -> '>32'. Values below 25
    also fall in '25-28' because only the upper bounds are checked.
    """
    # Upper bounds are tested in ascending order; the first one that fits wins
    for upper_bound, label in ((28, '25-28'), (30, '29-30'), (32, '31-32')):
        if num <= upper_bound:
            return label
    return '>32'

prencAge(33)

In [None]:
# Check Formula
prencAge(df_age['Age'].loc[0])

In [None]:
# Map every age in the numpy array to its bracket label
age_ord = list(map(prencAge, values_age))
age_ord

In [None]:
# Turning the list of age-bracket labels into a data frame with a single 'Age' column
df_age_ord = pd.DataFrame(age_ord, columns = ['Age'])
df_age_ord.head(10)

In [None]:
# DataFrame AnnualIncome
df_income = quant_variables[['AnnualIncome']]
df_income.head(10)

In [None]:
# Making sure the AnnualIncome column data is float
df_income = df_income.astype({'AnnualIncome': float})

In [None]:
# Taking the values of AnnualIncome and putting it in an np array
values_income = df_income['AnnualIncome'].values

In [None]:
# Creating Function for variable transformation
def prencIncome(num):
    """Return the annual-income bracket label for ``num``.

    Brackets are closed on the right: up to 600000 -> '300.000-600.000',
    up to 900000 -> '600.000-900.000', up to 1250000 ->
    '900.000-1.250.000', anything else -> '>1.250.000'. Values below 300000
    also fall in the first bracket because only the upper bounds are checked.
    """
    brackets = (
        (600000, '300.000-600.000'),
        (900000, '600.000-900.000'),
        (1250000, '900.000-1.250.000'),
    )
    # Upper bounds are tested in ascending order; the first one that fits wins
    for upper_bound, label in brackets:
        if num <= upper_bound:
            return label
    return '>1.250.000'

prencIncome(730000)

In [None]:
# Map every income in the numpy array to its bracket label
income_ord = list(map(prencIncome, values_income))
income_ord

In [None]:
# Turning the list comprehension into a data frame
df_income_ord = pd.DataFrame(income_ord, columns = ['AnnualIncome'])
df_income_ord.head(10)

In [None]:
# DataFrame FamilyMembers
df_family = quant_variables[['FamilyMembers']]
df_family.head(10)

In [None]:
# Making sure the FamilyMembers column data is int
df_family = df_family.astype({'FamilyMembers': int})

In [None]:
# Taking the values of FamilyMembers and putting it in an np array
values_family = df_family['FamilyMembers'].values

In [None]:
# Creating Function for variable transformation
def prencFamily(num):
    """Return the family-size bracket label for ``num``.

    Brackets are closed on the right: up to 4 -> '2-4', up to 5 -> '4-5',
    up to 6 -> '5-6', anything else -> '>6'. For integer input the '4-5'
    and '5-6' brackets therefore contain only 5 and only 6 respectively.
    """
    # Upper bounds are tested in ascending order; the first one that fits wins
    for upper_bound, label in ((4, '2-4'), (5, '4-5'), (6, '5-6')):
        if num <= upper_bound:
            return label
    return '>6'

prencFamily(8)

In [None]:
# Map every family size in the numpy array to its bracket label
family_ord = list(map(prencFamily, values_family))
family_ord

In [None]:
# Turning the list comprehension into a data frame
df_family_ord = pd.DataFrame(family_ord, columns = ['FamilyMembers'])
df_family_ord.head(10)

In [None]:
# Ordinal-encode the bracket labels of each bucketed variable into a numeric column.
# NOTE(review): with its default settings OrdinalEncoder numbers categories in
# sorted label order; the bracket labels used here sort in ascending bracket
# order, so the codes should follow the brackets - confirm if labels change.
for frame, label_column, code_column in (
    (df_age_ord, 'Age', 'Age_quant'),
    (df_income_ord, 'AnnualIncome', 'Income_quant'),
    (df_family_ord, 'FamilyMembers', 'Family_quant'),
):
    frame[code_column] = ord_enc.fit_transform(frame[[label_column]])

# One single-column frame per encoded variable
df_age_final = df_age_ord[['Age_quant']]
df_income_final = df_income_ord[['Income_quant']]
df_family_final = df_family_ord[['Family_quant']]

In [None]:
# Put the three encoded bracket columns side by side
frames_ord = [df_age_final, df_income_final, df_family_final]

# The encoder returns floats; every column of the combined frame is cast to int
df_ord_num = pd.concat(frames_ord, axis = 1).astype(int)

# Head
df_ord_num.head(10)

In [None]:
# Dataset Transform

# Join the dfs side by side: the 4 encoded qualitative columns, the 3 encoded
# bracket columns, then ChronicDiseases and TravelInsurance taken unchanged from
# df_v2. That gives 9 columns with TravelInsurance last, which is the layout the
# positional slicing in the feature-selection cells relies on.
# NOTE(review): concat(axis = 1) aligns rows on the index, so this assumes all
# three frames share the same index - confirm if rows are ever filtered or
# reordered upstream.
frames = [df_quali_num, df_ord_num, df_v2[['ChronicDiseases', 'TravelInsurance']]]
df_transform = pd.concat(frames, axis = 1)

# First rows of the combined dataset
df_transform.head(10)

## Feature Selection

In [None]:
# Import packages
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

### Univariate Selection Method

In [None]:
# Dataset values as a plain array
array = df_transform.values

# Predictors are the first 8 columns, the target is the 9th
X = array[:,0:8]
Y = array[:,8]

# Univariate selector: keeps the 4 predictors with the highest chi-squared score
best_var = SelectKBest(score_func = chi2, k = 4)

# Score every predictor against the target ...
fit = best_var.fit(X, Y)

# ... and keep only the selected columns of X
features = fit.transform(X)

# Results
print(f'\nOriginal number of features: {X.shape[1]}')
print(f'\nReduced number of features: {features.shape[1]}')
print(f'\nFeatures (Selected Variables): \n\n {features}')

- The selected variables were FrequentFlyer_1, EverTravelledAbroad_1, Age_quant and Income_quant

### Attribute Recursive Elimination Method

In [None]:
# Creating the array with dataset values
array = df_transform.values

# Separating the array into input (first 8 columns) and output (9th column) components
X = array[:,0:8]
Y = array[:,8]

# Model whose coefficients drive the elimination
modelo = LogisticRegression()

# RFE keeping the 4 best predictors. n_features_to_select must be passed by
# keyword: it is keyword-only in scikit-learn >= 1.0, where RFE(modelo, 4)
# raises a TypeError.
rfe = RFE(modelo, n_features_to_select = 4)
fit = rfe.fit(X, Y)

# Results
predictor_names = df_transform.columns[0:8]
print("Predictive Variables:", predictor_names)
print("Selected Variables: %s" % fit.support_)
print("Attribute Ranking: %s" % fit.ranking_)
print("Number of Best Attributes: %d" % fit.n_features_)

# Names of the kept predictors, so the boolean mask does not have to be decoded by hand
print("Selected Variable Names:", list(predictor_names[fit.support_]))

- The selected variables were FrequentFlyer_1, EverTravelledAbroad_1, Income_quant and Family_quant

### Ensemble Method for Variable Selection

In [None]:
# Dataset values as a plain array, split into predictors (first 8 columns)
# and target (9th column)
array = df_transform.values
X, Y = array[:,0:8], array[:,8]

# Fit an extra-trees ensemble to obtain one importance score per predictor.
# No random_state is passed, so the scores can differ between runs.
modelo = ExtraTreesClassifier()
modelo.fit(X, Y)

# Predictor names followed by their importances, in the same column order
print(df_transform.columns[0:8])
print(modelo.feature_importances_)

- The selected variables were EverTravelledAbroad_1, Age_quant, Income_quant and Family_quant

Summarizing Feature Selection (number of the three methods above that selected each variable):

- Employment Type_1: 0
- GraduateOrNot_1: 0    
- FrequentFlyer_1: 2    
- EverTravelledAbroad_1: 3
- Age_quant: 2
- Income_quant: 3    
- Family_quant: 2
- ChronicDiseases: 0

Use the 5 variables selected by at least two of the three methods: EverTravelledAbroad_1, Income_quant, FrequentFlyer_1, Age_quant and Family_quant

## Choosing the algorithm

In [None]:
# Import packages
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Creating the dataframe with the variables selected in feature selection
df_feature = df_transform[['EverTravelledAbroad_1', 'Income_quant', 'FrequentFlyer_1', 'Age_quant', 'Family_quant', 'TravelInsurance']]
df_feature.head(10)

In [None]:
# Selected features and target as a plain array
array = df_feature.values

# Predictors are the first 5 columns, the target is the 6th
X = array[:,0:5]
Y = array[:,5]

# Number of cross-validation folds
num_folds = 10

# Candidate classifiers, all with their default hyperparameters
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC()),
]

# Unshuffled folds are identical for every model, so the splitter is built once
kfold = KFold(n_splits = num_folds)

# Cross-validated accuracy per model: per-fold scores are kept for the boxplot,
# mean and standard deviation are printed
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv = kfold, scoring = 'accuracy')
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(msg)

# Boxplot to compare algorithms (one box of per-fold accuracies per model)
fig, ax = plt.subplots()
fig.suptitle('Comparison of Classification Algorithms')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

- SVM had the best mean cross-validated accuracy


## Using Gradient Boost

In [None]:
# Import
from sklearn.ensemble import GradientBoostingClassifier

# Transforming dataframe values into an array
array = df_feature.values

# Separating the array into input (first 5 columns) and output (6th column) components
X = array[:,0:5]
Y = array[:,5]

# Setting the values for the number of folds
num_folds = 10

# Defining the number of trees
num_trees = 100

# Fixed seed so the shuffled folds, and therefore the reported score, are reproducible
random_seed = 42

# Separating the data into shuffled folds. shuffle must be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where KFold(num_folds, True)
# raises a TypeError.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = random_seed)

# Creating the model
modelo = GradientBoostingClassifier(n_estimators = num_trees, random_state = random_seed)

# Cross Validation (no scoring given, so the classifier's own score, accuracy, is used)
resultado = cross_val_score(modelo, X, Y, cv = kfold)

# Print result (mean accuracy over the folds, as a percentage)
print("Accuracy: %.3f" % (resultado.mean() * 100))

- Gradient Boosting showed better accuracy than SVM

## Optimizing Gradient Boost hyperparameters and applying SMOTE

In [None]:
# Import
from sklearn.model_selection import GridSearchCV

# Selected features and target as a plain array
array = df_feature.values

# Predictors are the first 5 columns, the target is the 6th
X = array[:,0:5]
Y = array[:,5]

# Hyperparameter grid: 6 learning rates x 8 ensemble sizes = 48 combinations
valores_grid = {
    'learning_rate': [0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500, 1750],
}

# Base model whose hyperparameters are searched
modelo = GradientBoostingClassifier()

# Exhaustive search over the grid with GridSearchCV's default cross-validation and scoring
grid = GridSearchCV(estimator = modelo, param_grid = valores_grid)
grid.fit(X, Y)

# Best mean cross-validated score (as a percentage) and the estimator that reached it
print(f"Accuracy: {grid.best_score_ * 100:.3f}")
print("Best Model Parameters:\n", grid.best_estimator_)

In [None]:
# install package
!pip install imblearn --user

In [None]:
# Imports
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Transforming dataframe values into an array
array = df_feature.values

# Separating the array into input (first 5 columns) and output (6th column) components
X = array[:,0:5]
Y = array[:,5]

# Setting the values for the number of folds
num_folds = 10

# Defining the learning rate and the number of trees
lea_rate = 0.01
num_trees = 250

# Fixed seed so folds, oversampling and model are reproducible from run to run
random_seed = 42

# Separating the data into shuffled folds. shuffle must be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where KFold(num_folds, True)
# raises a TypeError.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = random_seed)

# Creating the pipeline: SMOTE oversampling followed by gradient boosting.
# imblearn's Pipeline applies the sampler only while fitting, so every
# validation fold is scored on data that was not resampled.
steps = [
    ('over', SMOTE(random_state = random_seed)),
    ('model', GradientBoostingClassifier(learning_rate = lea_rate, n_estimators = num_trees, random_state = random_seed)),
]
pipeline = Pipeline(steps = steps)

# Cross Validation (no scoring given, so the classifier's own score, accuracy, is used)
resultado = cross_val_score(pipeline, X, Y, cv = kfold)

# Print result (mean accuracy over the folds, as a percentage)
print("Accuracy: %.3f" % (resultado.mean() * 100))

## Evaluating the Model

In [None]:
# AUC

# Transforming dataframe values into an array
array = df_feature.values

# Separating the array into input (first 5 columns) and output (6th column) components
X = array[:,0:5]
Y = array[:,5]

# Setting the values for the number of folds
num_folds = 10

# Defining the learning rate and the number of trees
lea_rate = 0.01
num_trees = 250

# Fixed seed so folds, oversampling and model are reproducible from run to run
random_seed = 42

# Separating the data into shuffled folds. shuffle must be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where KFold(num_folds, True)
# raises a TypeError.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = random_seed)

# Creating the pipeline: SMOTE oversampling followed by gradient boosting
steps = [
    ('over', SMOTE(random_state = random_seed)),
    ('model', GradientBoostingClassifier(learning_rate = lea_rate, n_estimators = num_trees, random_state = random_seed)),
]
pipeline = Pipeline(steps = steps)

# Cross Validation scored with the area under the ROC curve
resultado = cross_val_score(pipeline, X, Y, cv = kfold, scoring = 'roc_auc')

# Print result (mean over the folds, as a percentage)
print("AUC: %.3f" % (resultado.mean() * 100))

In [None]:
# Precision

# Transforming dataframe values into an array
array = df_feature.values

# Separating the array into input (first 5 columns) and output (6th column) components
X = array[:,0:5]
Y = array[:,5]

# Setting the values for the number of folds
num_folds = 10

# Defining the learning rate and the number of trees
lea_rate = 0.01
num_trees = 250

# Fixed seed so folds, oversampling and model are reproducible from run to run
random_seed = 42

# Separating the data into shuffled folds. shuffle must be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where KFold(num_folds, True)
# raises a TypeError.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = random_seed)

# Creating the pipeline: SMOTE oversampling followed by gradient boosting
steps = [
    ('over', SMOTE(random_state = random_seed)),
    ('model', GradientBoostingClassifier(learning_rate = lea_rate, n_estimators = num_trees, random_state = random_seed)),
]
pipeline = Pipeline(steps = steps)

# Cross Validation scored with precision
resultado = cross_val_score(pipeline, X, Y, cv = kfold, scoring = 'precision')

# Print result (mean over the folds, as a percentage)
print("Precision: %.3f" % (resultado.mean() * 100))

In [None]:
# Recall

# Transforming dataframe values into an array
array = df_feature.values

# Separating the array into input (first 5 columns) and output (6th column) components
X = array[:,0:5]
Y = array[:,5]

# Setting the values for the number of folds
num_folds = 10

# Defining the learning rate and the number of trees
lea_rate = 0.01
num_trees = 250

# Fixed seed so folds, oversampling and model are reproducible from run to run
random_seed = 42

# Separating the data into shuffled folds. shuffle must be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where KFold(num_folds, True)
# raises a TypeError.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = random_seed)

# Creating the pipeline: SMOTE oversampling followed by gradient boosting
steps = [
    ('over', SMOTE(random_state = random_seed)),
    ('model', GradientBoostingClassifier(learning_rate = lea_rate, n_estimators = num_trees, random_state = random_seed)),
]
pipeline = Pipeline(steps = steps)

# Cross Validation scored with recall
resultado = cross_val_score(pipeline, X, Y, cv = kfold, scoring = 'recall')

# Print result (mean over the folds, as a percentage)
print("Recall: %.3f" % (resultado.mean() * 100))

In [None]:
# F1

# Transforming dataframe values into an array
array = df_feature.values

# Separating the array into input (first 5 columns) and output (6th column) components
X = array[:,0:5]
Y = array[:,5]

# Setting the values for the number of folds
num_folds = 10

# Defining the learning rate and the number of trees
lea_rate = 0.01
num_trees = 250

# Fixed seed so folds, oversampling and model are reproducible from run to run
random_seed = 42

# Separating the data into shuffled folds. shuffle must be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where KFold(num_folds, True)
# raises a TypeError.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = random_seed)

# Creating the pipeline: SMOTE oversampling followed by gradient boosting
steps = [
    ('over', SMOTE(random_state = random_seed)),
    ('model', GradientBoostingClassifier(learning_rate = lea_rate, n_estimators = num_trees, random_state = random_seed)),
]
pipeline = Pipeline(steps = steps)

# Cross Validation scored with F1
resultado = cross_val_score(pipeline, X, Y, cv = kfold, scoring = 'f1')

# Print result (mean over the folds, as a percentage)
print("F1: %.3f" % (resultado.mean() * 100))

###### Summary of the cross-validated metrics (%)

- Accuracy: 83.444
- AUC: 77.887
- Precision: 89.991
- Recall: 60.590
- F1-Score: 71.824