In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Turn off pandas' chained-assignment warning: some columns are overwritten
# on a derived frame further down in the notebook.
pd.set_option("mode.chained_assignment", None)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys

# Show which Python interpreter executed this notebook.
python_version = sys.version
print(python_version)

**Context**

A Tour & Travels Company Is Offering Travel Insurance Package To Their Customers.
The New Insurance Package Also Includes Covid Cover.
The Company Needs To Know Which Customers Would Be Interested In Buying It, Based On Its Database History.

The Insurance Was Offered To Some Of The Customers In 2019 And The Given Data Has Been Extracted From The Performance/Sales Of The Package During That Period.
The Data Is Provided For Almost 2000 Of Its Previous Customers And You Are Required To Build An Intelligent Model That Can Predict If The Customer Will Be Interested To Buy The Travel Insurance Package Based On Certain Parameters Given Below.

**Content**

*Age*- Age Of The Customer

*Employment Type*- The Sector In Which Customer Is Employed

*GraduateOrNot*- Whether The Customer Is College Graduate Or Not

*AnnualIncome*- The Yearly Income Of The Customer In Indian Rupees[Rounded To Nearest 50 Thousand Rupees]

*FamilyMembers*- Number Of Members In Customer's Family

*ChronicDisease*- Whether The Customer Suffers From Any Major Disease Or Conditions Like Diabetes/High BP Or Asthma, etc.

*FrequentFlyer*- Derived Data Based On Customer's History Of Booking Air Tickets On At Least 4 Different Instances In The Last 2 Years[2017-2019].

*EverTravelledAbroad*- Has The Customer Ever Travelled To A Foreign Country[Not Necessarily Using The Company's Services]

*TravelInsurance*- Did The Customer Buy Travel Insurance Package During Introductory Offering Held In The Year 2019.

### Libraries

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from scipy.stats import levene
import seaborn as sns
from scipy.stats import shapiro
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

### Importing the dataset

In [None]:
# Read the raw customer records from the Kaggle dataset mount.
csv_path = "../input/travel-insurance-prediction-data/TravelInsurancePrediction.csv"
dataset_df = pd.read_csv(csv_path)

In [None]:
# Preview the first five raw rows.
dataset_df.head(n=5)

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
dataset_df.info()

Drop the leftover `Unnamed` index column.

In [None]:
# Keep every column whose name does not start with "Unnamed" (the leftover
# index column). The explicit .copy() makes `dataset` an independent frame:
# later cells assign new values into its columns, and without the copy those
# writes target a slice of dataset_df, which pandas flags as chained assignment.
dataset = dataset_df.loc[:, ~dataset_df.columns.str.contains('^Unnamed')].copy()

In [None]:
# describe() summarises only the numeric columns by default, so its column
# index shows which features are already numeric.
numeric_summary = dataset.describe()
numeric_summary.columns

### Exploratory Data Analysis

In [None]:
# Replace the text-valued columns with integer codes and put AnnualIncome on a
# log10 scale.
#
# Source values are always read from the untouched dataset_df (dataset is a
# column subset of it, so rows line up). That keeps the cell idempotent:
# running it a second time gives the same frame instead of applying log10 to
# an already log-transformed column.
le = LabelEncoder()
to_encode = ['Employment Type','GraduateOrNot','FrequentFlyer', 'EverTravelledAbroad']
for column in to_encode:
    # The encoder is refitted per column, so after the loop `le` only holds the
    # classes of the last column in to_encode.
    dataset[column] = le.fit_transform(dataset_df[column])
dataset['AnnualIncome'] = np.log10(dataset_df['AnnualIncome'])

### **CORRELATION, HOMOGENEITY AND NORMALITY**

In [None]:
# Pairwise correlation of all columns, once with Pearson's coefficient and
# once with Spearman's rank coefficient.
corrPearson, corrSpearman = (
    dataset.corr(method=kind) for kind in ("pearson", "spearman")
)

#### Pearson Correlation

In [None]:
# Annotated heatmap of the Pearson matrix; the colour scale is pinned to
# [-1, +1] so it is directly comparable with the Spearman plot.
figure, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrPearson, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
ax.set(title="PEARSON", xlabel="COLUMNS", ylabel="COLUMNS")
plt.show()

#### Spearman Correlation

In [None]:
# Annotated heatmap of the Spearman matrix; the colour scale is pinned to
# [-1, +1] so it is directly comparable with the Pearson plot.
figure, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrSpearman, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
ax.set(title="SPEARMAN", xlabel="COLUMNS", ylabel="COLUMNS")
plt.show()

#### HOMOGENEITY

In [None]:
# Levene's test for equal variances, with every column treated as one sample.
# Prints "<statistic> - <p-value>".
levene_columns = [
    "Age", "Employment Type", "GraduateOrNot", "AnnualIncome", "FamilyMembers",
    "ChronicDiseases", "FrequentFlyer", "EverTravelledAbroad", "TravelInsurance",
]
statistic, p_value = levene(*(dataset[name] for name in levene_columns))
print("%.4f - %.4f " % (statistic, p_value))

#### NORMALITY

In [None]:
# Shapiro-Wilk normality test per column.
# Each result line reads "<statistic> - <p-value> <column name>".
separator = "-----"*10
for column in dataset.columns:
    print(separator)
    statistic, p_value = shapiro(dataset[column])
    print("%.3f - %.3f" % (statistic, p_value), column)

### Dataset Visual Analysis

In [None]:
# Histogram grid with one panel per column of the encoded dataset.
histogram_axes = dataset.hist(figsize=(25, 15))
plt.show()

### Barplots

In [None]:
# Mean of TravelInsurance for each Employment Type code (seaborn's barplot
# aggregates with the mean and draws an error bar per category).
figure, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=dataset, x="Employment Type", y="TravelInsurance", ax=ax)
plt.show()

In [None]:
# Mean of TravelInsurance for each GraduateOrNot code (seaborn's barplot
# aggregates with the mean and draws an error bar per category).
figure, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=dataset, x="GraduateOrNot", y="TravelInsurance", ax=ax)
plt.show()

In [None]:
# Mean of TravelInsurance for each ChronicDiseases value (seaborn's barplot
# aggregates with the mean and draws an error bar per category).
figure, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=dataset, x="ChronicDiseases", y="TravelInsurance", ax=ax)
plt.show()

In [None]:
# Mean of TravelInsurance for each FrequentFlyer code (seaborn's barplot
# aggregates with the mean and draws an error bar per category).
figure, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=dataset, x="FrequentFlyer", y="TravelInsurance", ax=ax)
plt.show()

In [None]:
# Mean of TravelInsurance for each EverTravelledAbroad code (seaborn's barplot
# aggregates with the mean and draws an error bar per category).
figure, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=dataset, x="EverTravelledAbroad", y="TravelInsurance", ax=ax)
plt.show()

#### Boxplots

In [None]:
# Plotting copy of the encoded data in which the discrete columns below are
# stored with pandas' categorical dtype; `dataset` itself is left untouched.
categorical_columns = [
    "Employment Type",
    "GraduateOrNot",
    "FrequentFlyer",
    "EverTravelledAbroad",
    "TravelInsurance",
]
datasetV = dataset.astype(dict.fromkeys(categorical_columns, "category"))

In [None]:
# One horizontal boxplot per numeric feature, split by the categorical
# TravelInsurance column. whis=[0, 100] stretches the whiskers to the minimum
# and maximum, so no point is drawn as an outlier.
to_boxplot = ["Age", "AnnualIncome", "FamilyMembers"]
for feature in to_boxplot:
    figure, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(
        data=datasetV,
        x=feature,
        y="TravelInsurance",
        whis=[0, 100],
        width=.6,
        palette="vlag",
        ax=ax,
    )
plt.show()


#### LinePlots

In [None]:
# One line plot per feature column, with the target on the x axis.
#
# The previous version counted columns by hand and stopped after the seventh,
# which silently dropped EverTravelledAbroad (the frame has eight features plus
# the target). Selecting "every column except the target" plots all features
# and keeps working if the column set changes.
feature_columns = dataset.columns.drop("TravelInsurance")
for column in feature_columns:
    figure = plt.figure(figsize=(10,6))
    sns.lineplot(x="TravelInsurance",y=column,data=dataset)
    plt.show()

#### X and y and split for models

In [None]:
# Feature matrix = every column but the last; target = the last column.
# 25% of the rows are held out for testing, with a fixed seed for repeatability.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values
y = target_series.values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=188,
)

### Prediction

In [None]:
# Baseline classifiers, configured first and then fitted on the raw
# (unscaled) training split in the same order as before.
cKNN = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
cDT = DecisionTreeClassifier(criterion='entropy', random_state=0)
cRF = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
cNB = GaussianNB()
cLoR = LogisticRegression(solver='liblinear', random_state=0)
cSVM = SVC(kernel='rbf', random_state=0)

for classifier in (cKNN, cDT, cRF, cNB, cLoR, cSVM):
    classifier.fit(X_train, y_train)



#### Comparing the models

In [None]:
# Report three numbers for every baseline model:
#   1. accuracy of the already-fitted model on the held-out test split,
#   2. mean 10-fold cross-validated accuracy,
#   3. square root of the mean 10-fold cross-validated MSE.
#
# cross_val_score clones the estimator and refits it on each fold, so the data
# passed to it is used for training. It therefore has to be the training
# split: the previous version passed X_test / y_test, which trained on the
# hold-out data and never evaluated the fitted model at all.
model_lst = [cKNN, cDT, cRF, cLoR, cNB, cSVM]

for model in model_lst:
    model_name = model.__class__.__name__
    predict = model.predict(X_test)
    # The default scoring for a classifier is accuracy (not R^2).
    cv_accuracy = cross_val_score(model, X_train, y_train, cv = 10, verbose = False).mean()
    # NOTE(review): MSE on class labels; presumably the labels are 0/1, in
    # which case this equals the misclassification rate. Confirm it is wanted.
    cv_mse = -cross_val_score(model, X_train, y_train, cv = 10, scoring = 'neg_mean_squared_error',verbose = False).mean()
    print(model_name + "--> ")
    print("*" * 10)
    print(accuracy_score(y_test, predict))
    print(cv_accuracy)
    print(np.sqrt(cv_mse))
    print('*' * 30)

## Improving the Model

#### Feature Scaling

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

### Feature Extraction

#### Principal Component Analysis

In [None]:
# Project the scaled features onto three principal components: fitted on the
# training split, then applied unchanged to the test split.
pca = PCA(n_components=3)
X_train_pca, X_test_pca = (
    pca.fit_transform(X_train_scaled),
    pca.transform(X_test_scaled),
)

#### Fitting

In [None]:
# Same six classifiers as the baseline, configured first and then fitted on
# the PCA-projected training split in the same order as before.
cKNN_pca = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
cDT_pca = DecisionTreeClassifier(criterion='entropy', random_state=0)
cRF_pca = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
cNB_pca = GaussianNB()
cLoR_pca = LogisticRegression(solver='liblinear', random_state=0)
cSVM_pca = SVC(kernel='rbf', random_state=0)

for classifier in (cKNN_pca, cDT_pca, cRF_pca, cNB_pca, cLoR_pca, cSVM_pca):
    classifier.fit(X_train_pca, y_train)

#### Comparing models with extracted features

In [None]:
# Report, for every model trained on the PCA features: hold-out accuracy,
# mean 10-fold CV accuracy, and the square root of the mean 10-fold CV MSE.
#
# cross_val_score clones and refits the estimator on each fold, so it must be
# given the training split. The previous version passed the test split, which
# trained on hold-out data and ignored the fitted model.
# NOTE(review): the scaler and PCA were fitted on the whole training split
# before cross-validation, so fold scores may be slightly optimistic; wrapping
# scaler + PCA + classifier in a Pipeline would remove that.
model_lst_pca = [cKNN_pca, cDT_pca, cRF_pca, cLoR_pca, cNB_pca, cSVM_pca]
for model in model_lst_pca:
    model_name = model.__class__.__name__
    predict = model.predict(X_test_pca)
    # The default scoring for a classifier is accuracy (not R^2).
    cv_accuracy = cross_val_score(model, X_train_pca, y_train, cv = 10, verbose = False).mean()
    cv_mse = -cross_val_score(model, X_train_pca, y_train, cv = 10, scoring = 'neg_mean_squared_error',verbose = False).mean()
    print(model_name + "--> ")
    print("*" * 10)
    print(accuracy_score(y_test, predict))
    print(cv_accuracy)
    print(np.sqrt(cv_mse))
    print('*' * 30)

#### PCA with Kernel

In [None]:
# Non-linear projection onto three components using an RBF (radial basis
# function) kernel: fitted on the training split, then applied to the test split.
kpca = KernelPCA(n_components=3, kernel='rbf')
X_train_kpca, X_test_kpca = (
    kpca.fit_transform(X_train_scaled),
    kpca.transform(X_test_scaled),
)

#### Fitting

In [None]:
# Same six classifiers as the baseline, configured first and then fitted on
# the kernel-PCA-projected training split in the same order as before.
cKNN_kpca = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
cDT_kpca = DecisionTreeClassifier(criterion='entropy', random_state=0)
cRF_kpca = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
cNB_kpca = GaussianNB()
cLoR_kpca = LogisticRegression(solver='liblinear', random_state=0)
cSVM_kpca = SVC(kernel='rbf', random_state=0)

for classifier in (cKNN_kpca, cDT_kpca, cRF_kpca, cNB_kpca, cLoR_kpca, cSVM_kpca):
    classifier.fit(X_train_kpca, y_train)

#### Comparing models with extracted features


In [None]:
# Report, for every model trained on the kernel-PCA features: hold-out
# accuracy, mean 10-fold CV accuracy, and the square root of the mean 10-fold
# CV MSE.
#
# cross_val_score clones and refits the estimator on each fold, so it must be
# given the training split. The previous version passed the test split, which
# trained on hold-out data and ignored the fitted model.
# NOTE(review): the scaler and KernelPCA were fitted on the whole training
# split before cross-validation, so fold scores may be slightly optimistic;
# wrapping the steps in a Pipeline would remove that.
model_lst_kpca = [cKNN_kpca, cDT_kpca, cRF_kpca, cLoR_kpca, cNB_kpca, cSVM_kpca]
for model in model_lst_kpca:
    model_name = model.__class__.__name__
    predict = model.predict(X_test_kpca)
    # The default scoring for a classifier is accuracy (not R^2).
    cv_accuracy = cross_val_score(model, X_train_kpca, y_train, cv = 10, verbose = False).mean()
    cv_mse = -cross_val_score(model, X_train_kpca, y_train, cv = 10, scoring = 'neg_mean_squared_error',verbose = False).mean()
    print(model_name + "--> ")
    print("*" * 10)
    print(accuracy_score(y_test, predict))
    print(cv_accuracy)
    print(np.sqrt(cv_mse))
    print('*' * 30)

# Conclusion

**BEST SCORE: KNN, with a test-set accuracy of 0.8651911468812877, without scaling or feature extraction.**

In [None]:
# Bar chart of each baseline model's mean 10-fold cross-validated accuracy,
# expressed as a percentage.
#
# Changes from the previous version:
#   * DataFrame.append was removed in pandas 2.0, so the rows are collected in
#     a list and the frame is built once. This also gives a numeric "R2CV"
#     column and a unique index.
#   * cross_val_score refits a clone of the model on the data it is given, so
#     it now receives the training split instead of the hold-out test split.
# The column name "R2CV" is kept for the plot, although the value is accuracy.
records = []
for model in model_lst:
    name = model.__class__.__name__
    cv_accuracy = cross_val_score(model,X_train,y_train,cv=10,verbose=False).mean()
    records.append({"MODELS": name, "R2CV": cv_accuracy*100})
r = pd.DataFrame(records, columns=["MODELS","R2CV"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="R2CV",y="MODELS",data=r,color="k")
plt.xlabel("R2CV")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL ACCURACY COMPARISON")
plt.show()