In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the travel-insurance CSV into the working frame.
df = pd.read_csv(
    '/kaggle/input/travel-insurance-prediction-data/TravelInsurancePrediction.csv'
)

# (rows, columns) first, then the first rows as the cell's rich output.
print(df.shape)
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index saved with the CSV).
# Rebinding `df` with errors = 'ignore' keeps this cell safe to re-run: an
# in-place drop raises KeyError once the column is already gone.
df = df.drop(columns = 'Unnamed: 0', errors = 'ignore')

# Recode the flag as text (1 -> 'Yes', anything else -> 'No') so the column is
# handled as a categorical feature. An existing 'Yes' is matched as well, so
# running the cell a second time does not turn every row into 'No'.
has_chronic_disease = (df['ChronicDiseases'] == 1) | (df['ChronicDiseases'] == 'Yes')
df['ChronicDiseases'] = np.where(has_chronic_disease, 'Yes', 'No')

# EDA

## Target Variable

In [None]:
# Name of the label column; later cells group by it and split it off as `y`.
target = 'TravelInsurance'

In [None]:
# Bar chart of how many rows fall in each target class
# (counted via the non-null 'Age' entries of each group).
class_sizes = df.groupby(target)['Age'].count()
ax = class_sizes.plot.bar()
ax.set_xlabel(target)
ax.set_ylabel('count')
plt.show()

In [None]:
# Percentage of rows in each target class (label 0 printed as N, label 1 as Y).
class_counts = df[target].value_counts()
n_rows = len(df)

print('N: {}%'.format((class_counts[0] / n_rows) * 100))
print('Y: {}%'.format((class_counts[1] / n_rows) * 100))

## Missing Values

In [None]:
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

## Numeric Features

In [None]:
# Numeric predictors: every numeric-dtype column except the target.
# select_dtypes is used instead of a `dtype != 'O'` test because that test
# also lets through non-object, non-numeric columns (pandas string dtype,
# category, datetime), which would then be plotted and correlated as numbers.
num_features = [feature for feature in df.select_dtypes(include = 'number').columns
                if feature != target]

df[num_features].head()

### Discrete Features

In [None]:
# Numeric columns with fewer than 20 distinct values are treated as discrete.
dis_features = []
for feature in num_features:
    n_distinct = len(df[feature].unique())
    if n_distinct < 20:
        dis_features.append(feature)

print(dis_features)

#### Distribution

In [None]:
# One bar chart per discrete feature: number of rows at each feature value.
for feature in dis_features:
    rows_per_value = df.groupby(feature)[target].count()
    ax = rows_per_value.plot.bar()
    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

#### vs Target Variable

In [None]:
# One bar chart per discrete feature: mean of the target at each feature value.
for feature in dis_features:
    df.groupby(feature)[target].mean().plot.bar()

    plt.xlabel(feature)
    # The bars show the mean of the target, not a row count, so label the axis
    # with the target name (same labelling as the categorical plots).
    plt.ylabel(target)
    plt.show()

### Continuous Features

In [None]:
# Numeric columns that were not classed as discrete are treated as continuous.
con_features = []
for feature in num_features:
    if feature in dis_features:
        continue
    con_features.append(feature)

print(con_features)

#### Distribution

In [None]:
# Histogram (25 bins) of every continuous feature.
continuous_part = df[con_features]
continuous_part.hist(bins=25)
plt.show()

## Categorical Features

In [None]:
# Categorical predictors: every column that is neither numeric nor the target.
cat_features = []
for column in df.columns:
    if column in num_features or column == target:
        continue
    cat_features.append(column)

df[cat_features].head()

In [None]:
# Report how many distinct values each categorical feature takes.
for feature in cat_features:
    n_categories = len(df[feature].unique())
    print('{}: {} categories'.format(feature, n_categories))

### Distribution

In [None]:
# One bar chart per categorical feature: number of rows in each category.
for feature in cat_features:
    rows_per_category = df.groupby(feature)[target].count()
    ax = rows_per_category.plot.bar()
    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

### vs Target Variable

In [None]:
# One bar chart per categorical feature: mean of the target within each category.
for feature in cat_features:
    target_rate = df.groupby(feature)[target].mean()
    ax = target_rate.plot.bar()
    ax.set_xlabel(feature)
    ax.set_ylabel(target)
    plt.show()

### Observations
1. Frequent fliers are more likely to purchase insurance
2. People who have travelled abroad are more likely to purchase insurance

# Feature Engineering

In [None]:
# One-hot encode the non-numeric columns, dropping the first level of each
# so a k-level feature becomes k-1 indicator columns.
dummy_df = pd.get_dummies(data=df, drop_first=True)

dummy_df.head()

# Feature Selection

In [None]:
# Separate the encoded frame into the label vector and the predictor matrix.
y = dummy_df[target]
X = dummy_df.drop(columns=target)

In [None]:
# Pairwise correlation between the numeric predictors, drawn as an annotated heatmap.
numeric_predictors = X[num_features]
cor = numeric_predictors.corr()

sns.heatmap(data=cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

# Models

## Scaling

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit the min-max scaler on the training split only.
scaler = MinMaxScaler().fit(X_train)

# Show the fitted scaler as the cell output.
scaler

In [None]:
# Apply the already-fitted scaler to both splits (training first, then test).
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

## Linear Models

### Logistic Regression

In [None]:
# Logistic regression with default settings, trained on the scaled features.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Show the fitted estimator as the cell output.
model

In [None]:
# Predict labels for the scaled test split with the logistic regression held in `model`.
y_pred_lr = model.predict(X_test_scaled)

In [None]:
# Logistic regression on the test split: confusion matrix, per-class report, accuracy.
cm_lr = confusion_matrix(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr, digits=4)
acc_lr = accuracy_score(y_test, y_pred_lr)

print(cm_lr)
print(report_lr)
print('score: ', acc_lr)

### KNN

In [None]:
# Candidate neighbourhood sizes; every entry is evaluated.
neighbors = np.arange(5, 13)

# Choose k by 5-fold cross-validation on the training split only. Scoring the
# candidates on the test split would tune the model on the same data that is
# later used to report its accuracy, making that accuracy optimistic.
knn_search = GridSearchCV(estimator = KNeighborsClassifier(),
                          param_grid = {'n_neighbors': neighbors},
                          cv = 5)
knn_search.fit(X_train_scaled, y_train)

# Mean cross-validated accuracy per candidate, in the same order as `neighbors`.
scores = list(knn_search.cv_results_['mean_test_score'])

best_neighbors = knn_search.best_params_['n_neighbors']
print(best_neighbors)

In [None]:
# Refit k-nearest-neighbours on the scaled training split with the selected k.
model = KNeighborsClassifier(n_neighbors=best_neighbors).fit(X_train_scaled, y_train)

# Show the fitted estimator as the cell output.
model

In [None]:
# Predict labels for the scaled test split with the KNN classifier held in `model`.
y_pred_knn = model.predict(X_test_scaled)

In [None]:
# KNN on the test split: confusion matrix, per-class report, accuracy.
cm_knn = confusion_matrix(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn, digits=4)
acc_knn = accuracy_score(y_test, y_pred_knn)

print(cm_knn)
print(report_knn)
print('score: ', acc_knn)

### SVM

In [None]:
# Support-vector classifier with a degree-4 polynomial kernel on the scaled features.
model = SVC(kernel='poly', degree=4).fit(X_train_scaled, y_train)

# Show the fitted estimator as the cell output.
model

In [None]:
# Predict labels for the scaled test split with the SVM held in `model`.
y_pred_svm = model.predict(X_test_scaled)

In [None]:
# SVM on the test split: confusion matrix, per-class report, accuracy.
cm_svm = confusion_matrix(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm, digits=4)
acc_svm = accuracy_score(y_test, y_pred_svm)

print(cm_svm)
print(report_svm)
print('score: ', acc_svm)

## Ensemble Models

In [None]:
# Rebuild the 80/20 split with the same seed; the ensemble models below are
# trained on these unscaled features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Random Forest

In [None]:
# Random forest with default hyperparameters on the unscaled features.
# The seed fixes the forest's internal randomness so the metrics reported
# below are the same on every run of the notebook.
model = RandomForestClassifier(random_state = 0)
model.fit(X_train, y_train)

In [None]:
# Predict labels for the unscaled test split with the random forest held in `model`.
y_pred_rf = model.predict(X_test)

In [None]:
# Random forest on the test split: confusion matrix, per-class report, accuracy.
cm_rf = confusion_matrix(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf, digits=4)
acc_rf = accuracy_score(y_test, y_pred_rf)

print(cm_rf)
print(report_rf)
print('score: ', acc_rf)

### XGBoost

In [None]:
# Gradient-boosted trees (XGBoost) on the unscaled features.
model = XGBClassifier(use_label_encoder=False).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

In [None]:
# Predict labels for the unscaled test split with the XGBoost classifier held in `model`.
y_pred_xgb = model.predict(X_test)

In [None]:
# XGBoost on the test split: confusion matrix, per-class report, accuracy.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb, digits=4)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print(cm_xgb)
print(report_xgb)
print('score: ', acc_xgb)

# Hyperparameter Tuning

## RandomForest

In [None]:
# Search space for the randomized random-forest search.
# 'auto' is deliberately not listed under max_features: for classifiers it was
# only an alias of 'sqrt', and current scikit-learn releases reject it, which
# makes every sampled candidate that uses it fail to fit.
random_grid = {'n_estimators': [50, 100, 200, 400, 500, 700, 900],
               'criterion': ['gini', 'entropy'],
               'max_features': ['sqrt', 'log2'],
               'max_depth': [None],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

In [None]:
# Seed the base forest as well as the search: with an unseeded estimator the
# cross-validation scores, and so the selected parameters, change between runs
# even though the sampled candidates are fixed by the search's random_state.
model = RandomForestClassifier(random_state = 0)

# Randomized search: 100 sampled parameter settings, 3-fold CV, all cores.
rf_random = RandomizedSearchCV(estimator = model, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

rf_random.best_params_

In [None]:
# Exhaustive grid for the follow-up random-forest search: only n_estimators,
# min_samples_split and min_samples_leaf vary; the other settings are pinned.
param_grid = dict(
    n_estimators=[800, 900, 1000, 1100],
    criterion=['gini'],
    max_features=['log2'],
    max_depth=[None],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[3, 4, 5, 6],
    bootstrap=[True],
)

In [None]:
# Seed the base forest so the grid search scores every candidate with the same
# randomness and returns the same best parameters on every run.
model = RandomForestClassifier(random_state = 0)

# Exhaustive search over param_grid with 3-fold CV on all cores.
rf_grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

rf_grid.fit(X_train, y_train)

rf_grid.best_params_

In [None]:
# Final random forest with hard-coded hyperparameters.
# NOTE(review): these values are presumably the rf_grid.best_params_ from an
# earlier run - re-check them whenever the search cells are re-executed.
# The seed makes the fitted model, and the metrics reported for it, repeatable.
model = RandomForestClassifier(n_estimators = 800,
                               min_samples_split = 2,
                               min_samples_leaf = 4,
                               max_features = 'log2',
                               max_depth = None,
                               criterion = 'gini',
                               bootstrap = True,
                               random_state = 0)

model.fit(X_train, y_train)

In [None]:
# Predict labels for the unscaled test split with the tuned random forest held in `model`.
y_pred_hp_rf = model.predict(X_test)

In [None]:
# Tuned random forest on the test split: confusion matrix, per-class report, accuracy.
cm_hp_rf = confusion_matrix(y_test, y_pred_hp_rf)
report_hp_rf = classification_report(y_test, y_pred_hp_rf, digits=4)
acc_hp_rf = accuracy_score(y_test, y_pred_hp_rf)

print(cm_hp_rf)
print(report_hp_rf)
print('score: ', acc_hp_rf)