In [None]:
#Importing the libraries needed

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")


In [None]:
from imblearn.over_sampling import RandomOverSampler
sns.set(style='whitegrid')

In [None]:
# Read the train and test CSVs from the shared input directory.
INPUT_DIR = "../input/health-insurance-cross-sell-prediction"
train = pd.read_csv(INPUT_DIR + "/train.csv")
test = pd.read_csv(INPUT_DIR + "/test.csv")

In [None]:
train.head(2)

In [None]:
test.head(2)

In [None]:
print("Shape of train dataset ",train.shape )
print("Shape of test dataset ",test.shape )

In [None]:
#checking for null values

train.isnull().sum()

#we found that there are no null values in the dataset

In [None]:
train.isnull().values.any()

In [None]:
train = train.dropna(how='any',axis=0)

In [None]:
train.info()

In [None]:
#Concatenating both training and testing files for analysis

data= pd.concat([train,test],axis=0,sort=False)

In [None]:
data.nunique()

In [None]:
#Code to find numeric data
numeric_data = train.select_dtypes(include = np.number)
numeric_col = numeric_data.columns
numeric_data.head(1)

In [None]:
#code to find categorical data
categorical_data = train.select_dtypes(exclude= np.number)
categorical_col = categorical_data.columns
categorical_data.head(1)

In [None]:
# Share of each target class (in percent) followed by the raw level counts
# of the three string-valued features.
a = train['Response'].value_counts() / len(train) * 100
b, c, d = (
    train[column].value_counts()
    for column in ('Gender', 'Vehicle_Age', 'Vehicle_Damage')
)

# One print call; the three-newline separator reproduces the blank lines that
# separate the summaries.
print(a, b, c, d, sep='\n\n\n')




In [None]:
#Outlier Analysis

def outliers(df, variable, distance):
    """Return IQR-based outlier fences for one column.

    Parameters
    ----------
    df : DataFrame holding the column.
    variable : name of the column to analyse.
    distance : multiple of the inter-quartile range added beyond the quartiles.

    Returns
    -------
    (upper_boundary, lower_boundary) -- note the order: upper fence first.
    """
    first_quartile = df[variable].quantile(0.25)
    third_quartile = df[variable].quantile(0.75)
    IQR = third_quartile - first_quartile

    margin = IQR * distance
    return third_quartile + margin, first_quartile - margin
    
upper_limit, lower_limit = outliers(train, 'Annual_Premium', 1.5)
upper_limit, lower_limit  

In [None]:
# Cap Annual_Premium at the fences stored in lower_limit / upper_limit;
# values already inside the range are left unchanged.
# NOTE(review): only the combined `data` frame is capped here -- `train` and
# `test` themselves are not modified by this cell.
data['Annual_Premium'] = np.clip(data['Annual_Premium'].to_numpy(), lower_limit, upper_limit)

In [None]:
plt.hist(data['Annual_Premium'],bins=30);

**Exploratory** **data analysis**

In [None]:
train.columns

In [None]:
#Split the data set into numerical and categorical variables
numerical_columns = ['Age', 'Region_Code', 'Annual_Premium', 'Vintage']
categorical_columns =['Gender', 'Driving_License', 'Previously_Insured','Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Response' ] 

In [None]:
train[numerical_columns].describe()

#The maximum age given is 85
#50% people are at the age of 36
#mean is 38.8
#standard deviation is 15.5

In [None]:
col = train.columns.tolist()
col.remove('id')
train[col].describe(percentiles = [.25,.50,.75,.95,.99])

In [None]:
# In Annual_Premium the 99th percentile is 72544 while the max is 540165, so this column contains extreme outliers.

In [None]:
#Here Response is the dependent (target) variable

In [None]:
# Target distribution: bar chart of class counts next to a pie chart of class shares.
plt.subplot(1, 2, 1)
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, which no longer draws one bar per class.
sns.countplot(x=train['Response'], palette='cool')
plt.title("Target variable count")

plt.subplot(1, 2, 2)
count = train['Response'].value_counts()
count.plot.pie(autopct='%1.1f%%', colors=['pink', 'green'], figsize=(10, 7),
               explode=[0, 0.1], title="pie chart of percentage of target class")

From the diagram we read that it is an imbalanced binary problem
Only 12.3 % customers are interested

In [None]:
print( "Percentage of target class\n")
print(train['Response'].value_counts()/len(train)*100)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
train.hist(bins= 50, figsize= (10,10))
plt.show()

Plot on Gender

In [None]:
# Gender: overall counts (left) and counts split by Response (right).
plt.figure(figsize=(9, 4))
plt.subplot(1, 2, 1)
# Columns are passed by keyword; a positional Series is read as `data` by
# recent seaborn releases and cannot be combined with `hue`.
sns.countplot(x='Gender', data=train)
plt.title("Count of male and female")

plt.subplot(1, 2, 2)
sns.countplot(x='Gender', hue='Response', data=train, palette="rocket_r")
plt.title("Response in male and female category")
plt.show()


In [None]:
train['Gender'].value_counts()

In [None]:
#There are 30619 males and 26223 females

#The chance that males would buy the insurance is higher than for females

EDA on Age

In [None]:
# Number of customers at each age.
plt.figure(figsize=(20, 4))
# Keyword `x=` keeps one bar per age on seaborn versions whose first
# positional parameter is `data`.
sns.countplot(x='Age', data=train, palette='hsv')
plt.title("count on age")
plt.show()

In [None]:
# Age: density estimate (left) and box plot (right).
plt.figure(figsize=(15, 4))
plt.subplot(1, 2, 1)
sns.distplot(train['Age'])

plt.subplot(1, 2, 2)
# `x=` keeps the horizontal box; a positional Series would be interpreted as
# wide-form `data` by recent seaborn releases.
sns.boxplot(x=train['Age'], palette='Accent')

EDA on Driving License

In [None]:
# Driving_License: overall counts (left) and counts split by Response (right),
# followed by the percentage of each level.
plt.figure(figsize=(9, 4))
plt.subplot(1, 2, 1)
# Columns are named by keyword so seaborn does not read a positional Series as `data`.
sns.countplot(x='Driving_License', data=train)

plt.subplot(1, 2, 2)
sns.countplot(x='Driving_License', hue='Response', data=train, palette="rocket_r")

print("Percentage of  Driving_License feature\n ")
print(train['Driving_License'].value_counts() / len(train) * 100)

In [None]:
#Customers who have the DL are 99%
#Among customers who have a DL, those who want insurance are fewer than those who do not

Region distribution

In [None]:
# Region_Code: counts per region, density estimate, and box plot stacked vertically.
plt.figure(figsize=(26, 20))
plt.subplot(3, 1, 1)
# Keyword `x=` instead of a positional Series, which recent seaborn releases
# would take as the `data` argument (conflicting with data=train).
sns.countplot(x='Region_Code', data=train, palette='hsv')

plt.subplot(3, 1, 2)
sns.distplot(train['Region_Code'])
plt.title('Distribution of Region code')

plt.subplot(3, 1, 3)
sns.boxplot(x=train['Region_Code'])


In [None]:
#Customers from Region Code 28 have more chance of buying insurance

EDA on Previously Insured

In [None]:
# Previously_Insured: overall counts (left) and counts split by Response (right),
# followed by the percentage of each level.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
# Columns are named by keyword so seaborn does not read a positional Series as `data`.
sns.countplot(x='Previously_Insured', data=train)

plt.subplot(1, 2, 2)
sns.countplot(x='Previously_Insured', hue='Response', data=train, palette='hsv')

print("Percentage of Previously_Insured feature ")
print(train['Previously_Insured'].value_counts() / len(train) * 100)

In [None]:
#Customers who were not previously insured are likely to be interested

Vehicle Age analysis

In [None]:
# Vehicle_Age: outlined (unfilled) bars of the overall counts (left) and of the
# counts split by Response (right).
plt.figure(figsize=(9, 4))
plt.subplot(1, 2, 1)
# `x=` / `hue=` are given by keyword; a positional Series would be taken as
# `data` by recent seaborn releases.
ax = sns.countplot(x='Vehicle_Age', data=train,
                   facecolor=(0, 0, 0, 0),
                   linewidth=5,
                   edgecolor=sns.color_palette("dark", 7))

plt.subplot(1, 2, 2)
sns.countplot(x='Vehicle_Age', hue='Response', data=train,
              facecolor=(0, 0, 0, 0),
              linewidth=5,
              edgecolor=sns.color_palette("dark", 7))

In [None]:
#There are less number of customers with vehicle age greater than 2 years
#customers with vehicle age 1-2 years are more likely to buy the insurance compared to other 

In [None]:
plt.figure(figsize=(7,7))
train['Vehicle_Age'].value_counts().plot.pie(autopct='%1.1f%%', colors = ['r', 'b', 'g'])

In [None]:
#Damage Vehicle and Response
# Overall Vehicle_Damage counts (left) and counts split by Response (right).
plt.figure(figsize=(9, 4))
plt.subplot(1, 2, 1)
# Columns are named by keyword so seaborn does not read a positional Series as `data`.
sns.countplot(x='Vehicle_Damage', data=train)

plt.subplot(1, 2, 2)
sns.countplot(x='Vehicle_Damage', hue='Response', data=train, palette='binary_r')

In [None]:
#Customers with vehicle damage are equally distributed
#Customers with vehicle damage are more interested in the insurance

Annual Premiums

In [None]:
# Annual_Premium: density estimate (top) and box plot (bottom) on one figure.
plt.figure(figsize=(11, 6))
plt.subplot(2, 1, 1)
sns.distplot(train['Annual_Premium'])

# No plt.show() between the panels: showing the figure here would finish it and
# push the box plot onto a new figure without the requested size.
plt.subplot(2, 1, 2)
# `x=` keeps the horizontal box on seaborn versions whose first positional
# parameter is `data`.
sns.boxplot(x=train['Annual_Premium'])
plt.show()

In [None]:
#There are more outliers in the annual premium feature

In [None]:
# Pairwise correlation of the numeric columns.
plt.figure(figsize=(10, 10))
plt.title("Correlation plot")
# At this point `train` still holds string columns (e.g. Gender); newer pandas
# raises when corr() meets them, so select the numeric columns explicitly.
sns.heatmap(train.select_dtypes(include=np.number).corr(), linewidths=5, annot=True,
            square=True, annot_kws={'size': 10}, cmap='YlGnBu')

In [None]:
#policy sales channel has slightly low correlation with Age variable

In [None]:
train.head(5)

In [None]:
#Age vs Annual Premium

In [None]:
sns.scatterplot(x= train['Age'], y= train['Annual_Premium'])

In [None]:
# Count of non-null Driving_License entries per gender, flattened to a
# two-column frame (Gender, Driving_License).
# NOTE(review): this counts rows, not licence holders -- confirm count() rather
# than sum() is intended.
df = train.groupby(['Gender'])['Driving_License'].count().reset_index()
df

In [None]:
sns.catplot(x='Gender', y='Driving_License', data=df, kind='bar')

In [None]:
#Vintage= Number of Days, Customer has been associated with the company

In [None]:
sns.distplot(train.Vintage)

# Data Preprocessing

In [None]:
train.columns

In [None]:
train.head(5)

In [None]:
train.columns

In [None]:
train['Gender'] = train['Gender'].map( {'Female': 0, 'Male': 1} ).astype(int)

In [None]:
train=pd.get_dummies(train,drop_first=True)

In [None]:
train.head(5)

In [None]:
#Renaming the column name

In [None]:
# Replace the '<' / '>' characters in the dummy column names and use the same
# space-free spelling that the test frame gets, so both frames share one
# feature schema.
train = train.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year",
                              "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years"})

In [None]:
train.head(5)

In [None]:
num_feat = ['Age','Vintage']

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#Apply Standardscaler for Age and Vintage
ss = StandardScaler()
train[num_feat]= ss.fit_transform(train[num_feat])

#Apply minmaxscaler for annual premium
mm=MinMaxScaler()
train['Annual_Premium']= mm.fit_transform(train[['Annual_Premium']])

In [None]:
train.head(5)

For Test Data

In [None]:
# Encode the test frame: binary Gender, one-hot dummies (first level dropped),
# '<' / '>' removed from the dummy names, and the dummy columns cast to int.
test['Gender'] = test['Gender'].map({'Female': 0, 'Male': 1}).astype(int)
test = pd.get_dummies(test, drop_first=True)
test = test.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year",
                            "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years"})
test = test.astype({
    dummy: 'int'
    for dummy in ('Vehicle_Age_lt_1_Year', 'Vehicle_Age_gt_2_Years', 'Vehicle_Damage_Yes')
})

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler

# Apply the scalers that were fitted on the training data (`ss` for Age/Vintage,
# `mm` for Annual_Premium). Fitting fresh scalers on the test set would put the
# two frames on different scales, so a model trained on `train` would see
# shifted inputs.
test[num_feat] = ss.transform(test[num_feat])
test[['Annual_Premium']] = mm.transform(test[['Annual_Premium']])

In [None]:
train.head()

In [None]:
test.head()

Splitting Training and Testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target from the features.
train_target = train['Response']
# 'id' is only a row identifier; leaving it in the feature matrix lets the
# models split on a meaningless number, so drop it together with the target.
train = train.drop(['id', 'Response'], axis=1)

# Default 75/25 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(train, train_target, random_state=0)

Modelling

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train,train_target, test_size=0.3, random_state= 124)

In [None]:
print(X_train.shape, X_test.shape)

In [None]:
# Fixed seed so the fitted tree (and the metrics below) are reproducible across runs.
model = DecisionTreeClassifier(random_state=42)

In [None]:
model.fit(X_train, y_train)

In [None]:
prediction = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print (confusion_matrix(y_test, prediction))

In [None]:
print (classification_report (y_test, prediction))

Random Forest Classifier

In [None]:
# Import from the public package: the private `sklearn.ensemble.forest` module
# is no longer importable in current scikit-learn releases.
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 150-tree forest; the seed makes the bootstrap samples and feature draws repeatable.
rf_model = RandomForestClassifier(n_estimators=150, random_state=42)

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
rf_prediction = rf_model.predict(X_test)

In [None]:
print(classification_report(y_test, rf_prediction))

In [None]:
print(confusion_matrix(y_test, rf_prediction))

In [None]:
#Feature Importance

In [None]:
feat_importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(15).plot(kind='barh')
#feat_importances.nsmallest(20).plot(kind='barh')
plt.show()

In [None]:
#Hyper parameter tuning

In [None]:
#LGBM CLASSIFIER

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.metrics import roc_auc_score

In [None]:
model=LGBMClassifier(colsample_bytree=0.5, learning_rate=0.03,
                     n_estimators=600, objective='binary', reg_alpha=0.1,
                     random_state=101,reg_lambda=0.8)

model.fit(X_train,y_train)

In [None]:
# Five random 70/30 resamples scored by ROC AUC. Cross-validate on the training
# split so the hold-out rows (X_test / y_test) stay unseen for the final check.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
scores.mean()

In [None]:
val_pred= model.predict_proba(X_test)[:,1]

In [None]:
val_pred

In [None]:
print(roc_auc_score(y_test,val_pred))

LOGISTIC REGRESSION

In [None]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model


In [None]:
model.fit (X_train, y_train)

In [None]:
prediction = model.predict(X_test)

In [None]:
prediction

In [None]:
from sklearn.metrics import classification_report

In [None]:
print (classification_report (y_test, prediction))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
cm = confusion_matrix(y_test, prediction)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':11}, cmap = 'PuBu',fmt=".1f");

In [None]:
#Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb_model = nb.fit(X_train, y_train)
nb_model

In [None]:
y_pred_nb = nb_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_nb)

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_nb)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
print(classification_report(y_test, y_pred_nb))

In [None]:
#KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_model = knn.fit(X_train, y_train)
knn_model

In [None]:
y_pred_knn = knn_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_knn)

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_knn)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
print(classification_report(y_test, y_pred_knn))

In [None]:
#Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Default-parameter forest; seeded so accuracy and feature importances are
# reproducible across runs.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)


In [None]:
y_pred_rf = rf_model.predict(X_test)
accuracy_score(y_test, y_pred_rf)

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_rf)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
print(classification_report(y_test, y_pred_rf))

In [None]:
Importance = pd.DataFrame({"Importance": rf_model.feature_importances_*100},
                         index = X_train.columns)

In [None]:
Importance.sort_values(by = "Importance", 
                       axis = 0, 
                       ascending = True).plot(kind ="barh", color = "r")

plt.xlabel("Variable Significance Levels")

In [None]:
#Gradient Boosting Machines

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
y_pred_gbm = gbm_model.predict(X_test)
accuracy_score(y_test, y_pred_gbm)

In [None]:
print(classification_report(y_test, y_pred_gbm))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_gbm)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
#XG Boost

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
y_pred_xgb_model = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred_xgb_model)

In [None]:
print(classification_report(y_test, y_pred_xgb_model))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_xgb_model)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
from xgboost import XGBClassifier
# Model Tuning
XGB_model = XGBClassifier(random_state = 42, max_depth = 8, n_estimators = 3000, 
                          reg_lambda = 1.2, reg_alpha = 1.2, 
                          min_child_weight = 1,objective = 'binary:logistic',
                         learning_rate = 0.15, gamma = 0.3, colsample_bytree = 0.5,
                          eval_metric = 'auc').fit(X_train, y_train)

In [None]:
y_pred_XGB_model = XGB_model.predict(X_test)
accuracy_score(y_test, y_pred_XGB_model)

In [None]:
print(classification_report(y_test, y_pred_XGB_model))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_XGB_model)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, annot_kws = {'size':15}, cmap = 'PuBu',fmt=".1f")

In [None]:
# Hold-out accuracy of every fitted model, one short report per model.
# NOTE(review): xgb_model and XGB_model share a class name, so their two
# reports carry the same heading.
models = [knn_model, nb_model, gbm_model, xgb_model, XGB_model, rf_model]

for model in models:
    names = type(model).__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("-" * 28, names + ":", "Accuracy: {:.4%}".format(accuracy), sep="\n")

In [None]:
# Create submission file
submmission = pd.DataFrame()
submmission['id'] = test['id'].values
submmission['Response'] = y_test
submmission.to_csv('cat_submitfinal.csv', float_format='%.6f', index=False)

submmission.head()