In [None]:

import numpy as np 
import pandas as pd
import missingno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
import sklearn.metrics as metrics
import scipy.stats as ss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

import os
# Show the contents of ../input so the reader can see which data files are available.
print(os.listdir("../input"))


In [None]:
# Load the raw travel-insurance data set.
df = pd.read_csv("../input/travel insurance.csv")
# Keep an independent snapshot of the raw data. A plain `df1 = df` only aliases
# the same object, so any in-place edit of `df` would silently change `df1` too.
df1 = df.copy()
# Preview the first rows.
df.head(7)

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

So, there are 4 numerical columns and 7 categorical columns

Now, lets check which columns have the null values

In [None]:
# Nullity matrix: one column per feature, showing where values are present or missing.
missingno.matrix(df)

"Black" in the data depicts the column is fill with data and "White" depicts they have null values in that particular area

So we can conclude that only "Gender" has null values, and quite a lot of them.

Let's see how many null values there are in the "Gender" column.

In [None]:
# Number of missing entries in the Gender column.
df['Gender'].isna().sum()

45107 of the 63326 values are null, i.e. nearly 71.2% of the column is missing.

I have removed the Gender feature, since less than 30% of its data exists.

In [None]:
# Gender is mostly missing, so remove the column entirely.
df = df.drop(columns=['Gender'])

In [None]:
# Missing-value count per remaining column.
df.isna().sum()

There are no null values now.

In [None]:

# Pairwise relationships between the numeric columns, coloured by the Claim label.
sns.set_style("whitegrid")
sns.pairplot(data=df, hue="Claim")
plt.show()

From the above we can come to below conclusions:

1. Commision and Net Sales are clearly correlated and their distributions look alike (the relationship is roughly linear).
2. So we can drop either of the two features, 'Commision' or 'Net Sales', and this should not affect the model.
3. Net Sales should not contain negative values. These might be outliers and should be removed.
4. We observe that ~20% of the insured are aged 35-40.

Let's remove the Commision feature.

In [None]:
# Drop the commission column (spelled 'Commision' in the data set).
df = df.drop(columns=['Commision (in value)'])

In [None]:
# Print the distinct values of three of the categorical columns, one array per column.
for column in ('Agency Type', 'Product Name', 'Distribution Channel'):
    print(df[column].unique())


**Lets check the data in "Duration" column**

In [None]:
# Summary statistics for Duration; the minimum reveals whether negative values exist.
df['Duration'].describe()

We have negative values in the Duration column, but can a duration be negative? **YES** (based on domain knowledge).

This is due to time zone differences. For example: you take a one-way flight departing at 12:10 AM on 18/3/2021 and arrive in another country where the local time is 11:50 PM on 17/3/2021.

In this case the duration may be stored as -1 in the database.





**Lets see how many negative values we have in Duration column**

In [None]:
# Select every negative Duration in one vectorised pass. The previous row-by-row
# loop mixed label-based lookup (`[i]`) with positional lookup (`.iloc[i]`), which
# only agree while the frame still has its default 0..n-1 index.
negative_durations = df.loc[df['Duration'] < 0, 'Duration']
print('Number of negative Duration values:', len(negative_durations))
for value in negative_durations:
    print(value)


In [None]:
# Overwrite every negative Duration with 1.
negative_duration = df['Duration'] < 0
df.loc[negative_duration, 'Duration'] = 1

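Net sales can't be negative, so we remove those records. Note that the filter below keeps only rows with `Net Sales > 0`, so rows with zero net sales are removed as well.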

In [None]:
# Keep only rows with strictly positive Net Sales (zero-sales rows are dropped too).
# `.copy()` makes the result an independent frame, so the later `.loc` assignments
# on `df` do not raise SettingWithCopyWarning or write into a temporary slice.
df = df[df['Net Sales'] > 0].copy()

df['Net Sales'].describe()

In [None]:
# Keep only the numeric columns of df and inspect them.
# NOTE(review): `_get_numeric_data` is a private pandas method (leading underscore);
# consider `select_dtypes` if this ever breaks on a pandas upgrade.
df_numerical=df._get_numeric_data()
df_numerical.info()

In [None]:
# --- Count plots for three categorical features (2x2 grid, 4th panel unused) ---
# Each Series is passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument of countplot as `data`, not as the variable to count.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

sns.countplot(x=df['Agency Type'], ax=axes[0, 0])
axes[0, 0].set_title('Agency Type')

sns.countplot(x=df['Distribution Channel'], ax=axes[0, 1])
axes[0, 1].set_title('Distribution Channel')

sns.countplot(x=df['Agency'], ax=axes[1, 0])
axes[1, 0].tick_params(axis='x', rotation=90)
axes[1, 0].set_title('Agency')

axes[1, 1].axis('off')


# --- Distribution of destinations ---
# The pie is crowded because there are many destinations; the small slices can be
# read as "others".
# Wedge sizes and labels must come from the SAME value_counts() result:
# value_counts() is ordered by frequency while unique() is ordered by first
# appearance, so pairing them attaches the wrong country name to each wedge.
destination_counts = df['Destination'].value_counts()

plt.figure(figsize=(15, 10))
wedges, texts = plt.pie(destination_counts,
                        labels=destination_counts.index,
                        shadow=True,
                        textprops=dict(color="magenta"))
plt.title('Destination')
plt.show()

Keeping only the top 25 destinations and grouping all remaining destinations as "Others".

In [None]:
# The 25 most frequent destinations keep their name; everything else becomes 'Others'.
top_destinations = df['Destination'].value_counts().index[:25]
df.loc[~df['Destination'].isin(top_destinations), 'Destination'] = 'Others'
# Show the resulting frequency table.
dest = df['Destination'].value_counts()
print(dest)

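Converting the Agency Type and Distribution Channel features from categorical to numeric using one-hot encoding. Note that Agency, Product Name and Destination are not encoded by the code below; they stay as text columns and are therefore left out when only the numeric columns are selected.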

In [None]:
# One-hot encode the two low-cardinality categorical columns.
# get_dummies returns a new frame, so `df` itself is left untouched.
df_onehot = pd.get_dummies(df, columns=['Agency Type', 'Distribution Channel'])
print(df_onehot.shape)

In [None]:
# Keep only the numeric columns of the one-hot encoded frame and inspect them.
# NOTE(review): `_get_numeric_data` is a private pandas method (leading underscore);
# consider `select_dtypes` if this ever breaks on a pandas upgrade.
df_numerical=df_onehot._get_numeric_data()
df_numerical.info()

Updating the Claim column to 1 or 0.
Here, '1' means 'Claimed' ("Yes") and '0' means 'Not Claimed' ("No").

In [None]:
# Class balance before encoding. Printed explicitly, because a notebook cell only
# displays its LAST bare expression.
print(df['Claim'].value_counts())

# Map the labels to integers and force an integer dtype. Masked `.loc` assignments
# can leave the column as object dtype, which scikit-learn rejects as a target.
# The 1/0 keys make the cell idempotent: re-running it leaves the column unchanged.
# Any unexpected label becomes NaN and makes `astype(int)` fail loudly.
claim_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df = df.assign(Claim=df['Claim'].map(claim_codes).astype(int))

# Class balance after encoding.
df['Claim'].value_counts()

In [None]:
# Rebuild the one-hot encoded frame now that Claim has been converted to 1/0.
df_onehot = pd.get_dummies(df, columns=['Agency Type', 'Distribution Channel'])
print(df_onehot.head())

# Numeric-only view of the encoded frame; this is what the model cell consumes.
df_numerical = df_onehot._get_numeric_data()
df_numerical.info()

# Claim was already converted to 1/0 in the previous cell, so the repeated
# 'Yes'/'No' replacement that used to live here was a no-op and has been removed.
df['Claim'].value_counts()

Here we can see that the data set is heavily imbalanced.

I have tried the methods below to get the best model:
1. Oversampling
2. Undersampling
3. SMOTE
4. Giving weighted sums (class weights)
5. Changing models: logistic regression, SVM, decision tree, random forest and boosting algorithms.

Fortunately, random forest with class weights balanced according to the dataset gave me the best metrics.

In [None]:

# Target: 1 = claimed, 0 = not claimed. Cast explicitly so scikit-learn receives
# integer class labels that match the keys of `class_weight` below.
Y = df['Claim'].astype(int)

# Features: every numeric column EXCEPT the target. If `Claim` is numeric it is
# part of `df_numerical`; training on it leaks the answer into the features and
# produces a meaningless perfect score. `errors='ignore'` keeps the cell working
# when `Claim` is not in `df_numerical` in the first place.
X = df_numerical.drop(columns=['Claim'], errors='ignore')
print(X.shape)
print(Y.shape)

# Stratified 70/30 split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100, stratify=Y)

# Random forest with the positive (claim) class up-weighted to counter the imbalance.
clf = RandomForestClassifier(n_estimators=100, random_state=0,
                             class_weight={0: 1, 1: 98.5})
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
print('score on test set:', clf.score(X_test, y_test))
print(metrics.classification_report(y_true=y_test, y_pred=pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns are the predicted classes.
mat = confusion_matrix(y_test, pred)
# `cbar` expects a boolean; the string 'True' only worked because any non-empty
# string is truthy.
ax = sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=True, cmap=plt.cm.Greens)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
# Class-probability estimates for every test row (one column per class in clf.classes_).
scores = clf.predict_proba(X_test)
print(scores)

In [None]:
# Class balance inside the held-out test split.
y_test.value_counts()

In [None]:
# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point.
try:
    # Column of predict_proba that corresponds to the positive class (label 1).
    positive_column = list(clf.classes_).index(1)
    positive_scores = clf.predict_proba(X_test)[:, positive_column]
    print(roc_auc_score(y_test, positive_scores))
except ValueError as err:
    # Raised, for example, when only one class is present; report it instead of hiding it.
    print('ROC AUC could not be computed:', err)


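Finally, my model predicts mostly correctly.
The F1-score reported for the test data was 1.0. Caveat: a perfect score on such an imbalanced data set usually points to target leakage rather than to a good model, so check that `Claim` is not one of the columns of the feature matrix `X` before trusting this number.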
