In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Travel Insurance Prediction

**Load Data**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the travel-insurance CSV from the Kaggle input directory and preview
# its first five rows.
csv_path = "/kaggle/input/travel-insurance-prediction-data/TravelInsurancePrediction.csv"
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Print pandas' summary report for `df` (the output of DataFrame.info()).
df.info()

# Exploratory Data Analysis

In [None]:
# Number of missing values in each column of df.
df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to convert to integer codes. Each name appears exactly once: a
# repeated name makes `df[col]` select the same column twice, which is
# redundant and can make the assignment below fail.
col = ['Employment Type', 'FrequentFlyer', 'EverTravelledAbroad', 'GraduateOrNot']

# The single encoder is re-fitted for every column by `apply`, so after this
# cell `le` only remembers the mapping of the last column in `col`.
le = LabelEncoder()
df[col] = df[col].apply(le.fit_transform)
df.head(n=5)

# Data insights


In [None]:
# Descriptive statistics for df as produced by DataFrame.describe().
df.describe()

In [None]:
# Number of rows that DataFrame.duplicated() flags as repeats of an earlier row.
# NOTE(review): df still contains the 'Unnamed: 0' column here; if that column
# is a unique row id, no row can be a full duplicate — confirm whether it
# should be excluded from this check.
df.duplicated().sum()

**Visualisation**

In [None]:
# Row count for each distinct value of the TravelInsurance column.
sns.countplot(data=df, x='TravelInsurance')
plt.show()

In [None]:
# Row count for each distinct value of the Employment Type column.
sns.countplot(data=df, x='Employment Type')
plt.show()

In [None]:
# Row count for each distinct value of the GraduateOrNot column.
sns.countplot(data=df, x='GraduateOrNot')
plt.show()

In [None]:
# Row count for each distinct value of the ChronicDiseases column.
sns.countplot(data=df, x='ChronicDiseases')
plt.show()

In [None]:
# Row count for each distinct value of the FrequentFlyer column.
sns.countplot(data=df, x='FrequentFlyer')
plt.show()

In [None]:
# Row count for each distinct value of the EverTravelledAbroad column.
sns.countplot(data=df, x='EverTravelledAbroad')
plt.show()

In [None]:
# Kernel density estimate of the Age column.
# `sns.distplot` is deprecated in seaborn; called with hist=False it drew only
# the KDE curve, which `sns.kdeplot` produces directly.
sns.kdeplot(data=df, x='Age')
plt.show()

In [None]:
# Kernel density estimate of the AnnualIncome column.
# `sns.distplot` is deprecated in seaborn; called with hist=False it drew only
# the KDE curve, which `sns.kdeplot` produces directly.
sns.kdeplot(data=df, x='AnnualIncome')
plt.show()

In [None]:
# Distribution plot (seaborn displot with its default settings) of FamilyMembers.
family_members = df['FamilyMembers']
sns.displot(family_members)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between df's columns.
plt.figure(figsize=(10, 6))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm')

**Outlier Treatment**

In [None]:
# Horizontal box plots of df's columns drawn on one 10x8 figure.
box_figure = plt.figure(figsize=(10, 8))
sns.boxplot(orient='h', data=df)

In [None]:
# Feature matrix: every column except 'Unnamed: 0' (presumably the row index
# saved into the CSV — confirm) and the target itself.
# `columns=` names the axis explicitly; the original `axis=True` only worked
# because True == 1.
x = df.drop(columns=['Unnamed: 0', 'TravelInsurance'])

# Target vector: the TravelInsurance column.
y = df['TravelInsurance']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(x, y, random_state=101, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

**Normalisation**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to the [0, 1] range. The scaler is fitted on the training
# split only and then applied unchanged to the test split, so no information
# from the test rows leaks into the scaling.
mscale = MinMaxScaler()

# The scaled values are stored back into x_train / x_test. Previously the
# transformed arrays were computed and thrown away, so every model below was
# trained on unscaled data. Wrapping the result in a DataFrame keeps the column
# names and row index that later cells rely on.
x_train = pd.DataFrame(
    mscale.fit_transform(x_train), columns=x_train.columns, index=x_train.index
)
x_test = pd.DataFrame(
    mscale.transform(x_test), columns=x_test.columns, index=x_test.index
)

# Preview the scaled test features instead of dumping the whole array.
x_test.head()

# Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
# Candidate classifiers keyed by the label printed in the comparison loop.
# The tree-based estimators are seeded (same seed as the train/test split) so
# that a Restart & Run All reproduces the same scores; the other estimators
# are left at their defaults.
models = {
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(random_state=101),
    'Randomforest': RandomForestClassifier(random_state=101),
    'supportvector': SVC(),
    'Gaussian': GaussianNB(),
}
print(models)

In [None]:
# Fit each candidate on the training split and report its accuracy on both
# splits. `score` already predicts internally, so no separate `predict` call
# is needed (the original computed one and never used it).
for name, model in models.items():
    model.fit(x_train, y_train)
    train_acc = model.score(x_train, y_train)
    test_acc = model.score(x_test, y_test)
    print(name, 'Training acc:', train_acc, 'Test acc:', test_acc)

* The decision tree looks overfitted, so the next sections tune and prune it.

**Grid search**

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the decision tree.
params = {
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16],
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3],
}

# Seed the base tree so that the search (and the estimator it selects) is
# reproducible across kernel restarts.
clf = DecisionTreeClassifier(random_state=0)
gcv = GridSearchCV(estimator=clf, param_grid=params)
gcv.fit(x_train, y_train)

In [None]:
# GridSearchCV was built with its default refit behaviour, so `best_estimator_`
# has already been fitted on the full training split; fitting it again here
# would only repeat that work.
model = gcv.best_estimator_
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# accuracy_score takes (y_true, y_pred); keep that order so the calls read
# consistently with the other metric calls.
print(f'Train score {accuracy_score(y_train, y_train_pred)}')
print(f'Test score {accuracy_score(y_test, y_test_pred)}')

**Pre-pruning**

In [None]:
from sklearn import tree

# Draw the tree selected by the grid search.
plt.figure(figsize=(10, 10))

# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
features = list(x.columns)
classes = ['0', '1']
tree.plot_tree(model, feature_names=features, class_names=classes, filled=True)
plt.title('After Pre Pruning')
plt.show()

**Post Pruning**

In [None]:
# Cost-complexity pruning path of a decision tree on the training split.
# A dedicated, seeded estimator is used (same random_state as the trees fitted
# for each alpha) so the result does not depend on whichever object the
# notebook-level name `clf` happens to refer to when this cell is run.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
# Fit one seeded tree per candidate alpha from the pruning path.
# The trees are built in a comprehension so the notebook-level name `clf` is
# not rebound as a side effect of this cell (`fit` returns the estimator).
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train, y_train)
    for alpha in ccp_alphas
]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))

In [None]:
# Final post-pruned tree.
# NOTE(review): ccp_alpha=0.013 is hard-coded; the notebook does not show how
# this value was selected from `ccp_alphas` — confirm.
pruned_tree_params = {'random_state': 0, 'ccp_alpha': 0.013}
clf = DecisionTreeClassifier(**pruned_tree_params)
clf.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the post-pruned tree on the training and test splits.
pred = clf.predict(x_test)
train_accuracy = clf.score(x_train, y_train)
test_accuracy = accuracy_score(y_test, pred)
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

In [None]:
from sklearn import tree

# Draw the post-pruned tree.
plt.figure(figsize=(10, 10))

# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
features = list(x.columns)
classes = ['0', '1']
tree.plot_tree(clf, feature_names=features, class_names=classes, filled=True)
plt.title('After Post Pruning')
plt.show()

In [None]:
# Confusion matrix of the post-pruned tree on the test split.
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The original call passed the arguments in the
# opposite order, which transposes the matrix.
confusion_matrix(y_test, pred)

# Done