In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the travel-insurance CSV into a DataFrame.
DATA_PATH = "../input/travel-insurance-prediction-data/TravelInsurancePrediction.csv"
df = pd.read_csv(DATA_PATH)

Let's take a first look at the data:

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Remove the "Unnamed: 0" column (the name pandas assigns to a CSV column whose header is empty).
# Reassigning instead of using inplace=True, together with errors="ignore", keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the already-dropped column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# EDA

In [None]:
# Number of missing values in each column.
df.isna().sum()

Ok, no missing values.

In [None]:
# Class distribution of the target column.
df['TravelInsurance'].value_counts()

The dataset is somewhat imbalanced, so we will apply oversampling later.

In [None]:
# Insurance uptake for each AnnualIncome value, drawn on a wide 40x10 figure.
# `data` and `x` are passed by keyword: seaborn >= 0.12 no longer accepts the x variable
# as the first positional argument (it would be interpreted as `data`).
plt.figure(figsize=(40, 10))
sns.countplot(data=df, x='AnnualIncome', hue='TravelInsurance')
plt.show()

We can see that people with an annual income greater than 1350000 have a strong tendency to take travel insurance. We will use this information later for feature engineering.

In [None]:
# Insurance uptake split by FrequentFlyer status.
# Keyword arguments are required: seaborn >= 0.12 rejects a positional x variable.
sns.countplot(data=df, x='FrequentFlyer', hue='TravelInsurance')
plt.show()

Frequent flyers are more likely to take travel insurance.

In [None]:
# Insurance uptake split by the ChronicDiseases flag.
# Keyword arguments are required: seaborn >= 0.12 rejects a positional x variable.
sns.countplot(data=df, x='ChronicDiseases', hue='TravelInsurance')
plt.show()

"ChronicDiseases" doesn't seem to have much impact on the target.

In [None]:
# Insurance uptake split by EverTravelledAbroad.
# Keyword arguments are required: seaborn >= 0.12 rejects a positional x variable.
sns.countplot(data=df, x='EverTravelledAbroad', hue='TravelInsurance')
plt.show()

People who have travelled abroad are more likely to take travel insurance.

In [None]:
# Insurance uptake for each Age value.
# Keyword arguments are required: seaborn >= 0.12 rejects a positional x variable.
sns.countplot(data=df, x='Age', hue='TravelInsurance')
plt.show()

People older than 33 usually take travel insurance. We will use that for feature engineering.

In [None]:
# Insurance uptake for each FamilyMembers value.
# Keyword arguments are required: seaborn >= 0.12 rejects a positional x variable.
sns.countplot(data=df, x='FamilyMembers', hue='TravelInsurance')
plt.show()

In [None]:
# Insurance uptake split by GraduateOrNot.
# Keyword arguments are required: seaborn >= 0.12 rejects a positional x variable.
sns.countplot(data=df, x='GraduateOrNot', hue='TravelInsurance')
plt.show()

# Feature engineering

We will use np.select to create categorical features.

In [None]:
# Binary age flag: 1 when Age is above the threshold, 0 otherwise.
AGE_THRESHOLD = 32
df['AgeCategory'] = np.where(df['Age'].to_numpy() > AGE_THRESHOLD, 1, 0)

In [None]:
# Binary income flag: 1 when AnnualIncome is above the threshold, 0 otherwise.
INCOME_THRESHOLD = 1300000
df['AnnualCategory'] = np.where(df['AnnualIncome'].to_numpy() > INCOME_THRESHOLD, 1, 0)

Replace words with 0/1 categories.

In [None]:
# Encode the two-valued text columns as 0/1 integers.
#
# The result is assigned back to the column instead of calling
# df[col].replace(..., inplace=True): that chained in-place form acts on a temporary
# object under pandas Copy-on-Write and leaves `df` unchanged.
binary_encodings = {
    'Employment Type': {'Private Sector/Self Employed': 0, 'Government Sector': 1},
    'GraduateOrNot': {'No': 0, 'Yes': 1},
    'FrequentFlyer': {'No': 0, 'Yes': 1},
    'EverTravelledAbroad': {'No': 0, 'Yes': 1},
}
for column, mapping in binary_encodings.items():
    # astype pins an integer dtype however replace() infers the result dtype, and it
    # raises if a value outside the mapping is left in the column. Re-running the cell
    # is harmless because already-encoded integers are not touched by the mapping.
    df[column] = df[column].replace(mapping).astype('int64')

We no longer need these columns:

In [None]:
# Working copy of the frame without the raw AnnualIncome and Age columns.
df_use = df.drop(["AnnualIncome", "Age"], axis="columns")

In [None]:
# Preview the first five rows of the modelling frame.
df_use.head(n=5)

# Data preprocessing

In [None]:
# Separate the target column (y) from the predictor columns (X).
target_column = "TravelInsurance"
y = df_use[target_column]
X = df_use.drop(target_column, axis="columns")

Let's do oversampling to create a balanced dataset.

In [None]:
X_imbalanced = np.vstack((X[y == 1], X[y == 0][:30]))
y_imbalanced = np.hstack((y[y == 1], y[y == 0][:30]))

In [None]:
from sklearn.utils import resample

# Create oversampled training data set for minority class
X_oversampled, y_oversampled = resample(X_imbalanced[y_imbalanced == 0],
                                        y_imbalanced[y_imbalanced == 0],
                                        replace=True,
                                        n_samples=X_imbalanced[y_imbalanced == 1].shape[0],
                                        random_state=123)

# Append the oversampled minority class to training data and related labels
X_balanced = np.vstack((X[y == 1], X_oversampled))
y_balanced = np.hstack((y[y == 1], y_oversampled))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.33, random_state=42)

# Model learning

In [None]:
from xgboost import XGBClassifier

# Grid search over two candidate values each for alpha, max_depth and learning_rate,
# using GridSearchCV's default cross-validation and scoring settings.
parameters = {
    'alpha': (0.1, 10),
    'max_depth': (5, 10),
    'learning_rate': (0.1, 1),
}
clf = GridSearchCV(estimator=XGBClassifier(), param_grid=parameters)
clf.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the grid search.
clf.best_params_

In [None]:
from sklearn.metrics import accuracy_score, plot_confusion_matrix, f1_score, precision_score, recall_score

In [None]:
xgb = clf.best_estimator_
y_pred = xgb.predict(X_test)
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print('XGBoost model F1 score: {0:0.4f}'. format(f1_score(y_test, y_pred)))
print('XGBoost model precision score: {0:0.4f}'. format(precision_score(y_test, y_pred)))
print('XGBoost model recall score: {0:0.4f}'. format(recall_score(y_test, y_pred)))

In [None]:
plot_confusion_matrix(xgb, X_test, y_test)  
plt.show() 