In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()

from scipy import stats

from sklearn.model_selection import train_test_split

#Import (Z-Scaler) StandardScaler
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [None]:
# Load the training file of the Kaggle "health-insurance-cross-sell-prediction" dataset
TRAIN_CSV = '/kaggle/input/health-insurance-cross-sell-prediction/train.csv'

df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
# These three columns are categorical flags that were read as integers.
# Cast them to object so that dtype-based column selection treats them as categorical.
flag_cols = ['Driving_License', 'Previously_Insured', 'Response']

df[flag_cols] = df[flag_cols].astype('object')

In [None]:
df.info()

In [None]:
#Shape
df.shape

In [None]:
df_copy = df.copy(deep=True)

## Remove insignificant variable id

In [None]:
# Drop the id column, which this analysis treats as insignificant
df = df.drop(columns=['id'])

df.head(1)

## Duplicate records 

In [None]:
# Percentage of rows that repeat an earlier row (True) versus rows that do not (False).
# Nothing is removed here; this cell only measures the duplicates.
dup_share = df.duplicated(keep='first').value_counts(normalize=True)
dup_share * 100

#There are totally 0.07% of duplicate records

In [None]:
df.duplicated(keep='first').value_counts()

In [None]:
# Keep the first occurrence of every duplicated record and discard the repeats
df = df.drop_duplicates(keep='first')

# Shape after removing duplicates
df.shape

In [None]:
df.head(1)

## Outlier treatment

In [None]:
# As seen in EDA, only Annual_Premium has a significant number of outliers.
# Flag every value lying more than 1.5 * IQR outside the quartiles.
premium = df['Annual_Premium']

q1, q3 = premium.quantile([0.25, 0.75])
iqr = q3 - q1

ll = q1 - 1.5 * iqr   # lower limit
ul = q3 + 1.5 * iqr   # upper limit

# (rows, columns) of the records outside the limits
df[premium.lt(ll) | premium.gt(ul)].shape

In [None]:
# Share of Annual_Premium outliers, derived from the data instead of a hard-coded count.
# The original cell divided the literal 10331 by len(df_copy), i.e. by the row count
# *before* duplicates were dropped, while the outliers are counted on the de-duplicated df.
# (The original run reported 10331 outliers, about 2.71%.)
outlier_mask = (df['Annual_Premium'] < ll) | (df['Annual_Premium'] > ul)

print('Number of outliers :', int(outlier_mask.sum()))

# Percentage of outliers among the de-duplicated records
outlier_mask.mean() * 100

In [None]:
# Treat the Annual_Premium outliers by power-transforming the feature in place.
# NOTE(review): the transformer is fitted on the whole dataset, before the train/test
# split performed later in this notebook -- confirm that this is acceptable.

skew_before = df['Annual_Premium'].skew()
print('Skewness of Annual_premium variable before Power transformation :', skew_before)

df['Annual_Premium'] = pt.fit_transform(df[['Annual_Premium']])

skew_after = df['Annual_Premium'].skew()
print('\nSkewness of Annual_premium variable after Power transformation :', skew_after)

In [None]:
# Skewness is reduced after applying the power transformation.
# Compare the untouched copy (df_copy) with the transformed frame (df), one panel each.
fig, (ax_before, ax_after) = plt.subplots(2, 1, figsize=(12, 8))
fig.subplots_adjust(hspace=0.3)

sns.distplot(df_copy['Annual_Premium'], ax=ax_before)
ax_before.set_title('Distribution before transformation')

sns.distplot(df['Annual_Premium'], ax=ax_after)
ax_after.set_title('Distribution after transformation')

plt.show()

# Statistical Analysis for feature importance

In [None]:
df.info()

In [None]:
# Split the records by target value:
#   res_0 -> customers who have not responded (Response == 0)
#   res_1 -> customers who have responded     (Response == 1)
res_0, res_1 = (df[df['Response'] == flag] for flag in (0, 1))

## Statistical analysis for Numerical columns

In [None]:
# Names of the columns whose dtype is numeric
num_cols = df.select_dtypes(include='number').columns.tolist()

num_cols

#### Test of Normality for numerical data (Shapiro test)

In [None]:
# Test of normality (Shapiro-Wilk), run separately for each response group
# Ho : the sample is drawn from a normal distribution
# Ha : the sample is not drawn from a normal distribution
# NOTE(review): SciPy documents that the Shapiro p-value may be inaccurate for N > 5000;
# confirm the group sizes before relying on the p-values alone.

groups = (('Response = 0 :', res_0), ('Response = 1 :', res_1))

for col in num_cols:
    print(f'\nShapiro test for {col} feature :')
    for label, group in groups:
        print(label, stats.shapiro(group[col]))

For all the numerical features<br>
>pval = 0<br>
sig lvl = 0.05<br>
pval < sig lvl<br>
We reject Null hypothesis<br>
None of the data is normally distributed

#### Test for equality of variances (levene test)

In [None]:
# Levene test for equality of variances between the two response groups
# Ho : the variances are equal
# Ha : the variances are not equal

for feature in num_cols:
    print(f'\nLevene test for {feature} feature :')
    not_responded, responded = res_0[feature], res_1[feature]
    print(stats.levene(not_responded, responded))

For features ==> 'Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'
>   pval = 0<br>
    sig lvl = 0.05<br>
    pval < sig lvl<br>
    We reject Null hypothesis<br>
    Population variances are not equal<br>

For 'Vintage' Feature :
>pval = 0.89<br>
sig lvl = 0.05<br>
pval > sig lvl<br>
We fail to reject Null hypothesis <br>
Population variances are equal

#### Mannwhitneyu (non-parametric ttest)

In [None]:
# None of the features is normally distributed, so a parametric test is not appropriate.
# Use the non-parametric Mann-Whitney U test for two independent samples instead.

# Ho : both response groups follow the same distribution (no relation with Response)
# Ha : the distributions of the two groups differ (relation with Response)

for feature in num_cols:
    print(f'\nNon-parametric 2-sample Unpaired test for {feature} feature and Response feature :')
    not_responded, responded = res_0[feature], res_1[feature]
    print(stats.mannwhitneyu(not_responded, responded))

For features :  'Age', 'Annual_Premium', 'Policy_Sales_Channel'<br>
>pval = 0<br>
sig lvl = 0.05<br>
pval < sig lvl<br>
We reject Null hypothesis
###### There is a relation between ('Age', 'Annual_Premium', 'Policy_Sales_Channel') and 'Response'

For features : 'Region_Code', 'Vintage'
>pval = 0.22, 0.26 (Region_Code and Vintage respectively)<br>
sig lvl = 0.05<br>
pval > sig lvl<br>
We fail to reject Null hypothesis<br>
###### There is no relation between ('Region_Code', 'Vintage') and 'Response'

## Statistical analysis for Categorical columns

In [None]:
# Chi-square test of independence (non-parametric)
# H0 : the feature and the target are independent
# H1 : the feature and the target are dependent

# Every non-numeric column except the target itself
cat_cols = [col for col in df.select_dtypes(exclude='number').columns if col != 'Response']

cat_cols

In [None]:
# Run chi2_contingency for every categorical feature against the Response target.
# The test is computed once per feature and its result reused for both printouts
# (the original evaluated the same contingency table twice).
for col in cat_cols:
    print(f'{col} vs Response :')
    obs = pd.crosstab(index=df['Response'], columns=df[col]) # cross-tab of target vs feature
    print('Observed values :\n',obs )
    chi2_result = stats.chi2_contingency(obs)
    print(chi2_result)
    print('Pvalue =', chi2_result[1])   # index 1 of the result is the p-value
    print('\n\n')

For features : 'Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage'
>pval = 0<br>
sig lvl = 0.05<br>
pval < sig lvl<br>
We reject Null hypothesis<br>

There is relationship between these ('Gender',  'Driving_License',  'Previously_Insured', 'Vehicle_Age',  'Vehicle_Damage') and 'RESPONSE' variable<br>
##### Response (target variable) is dependent on all the categorical variable.

#Except Region_Code and Vintage, Response is dependent on all other variables

## Label encoding / One hot encoding

In [None]:
df.head(1)

In [None]:
# Driving_License, Previously_Insured and Response already hold encoded values,
# but their dtype is object -- cast them back to int.
encoded_cols = ['Driving_License', 'Previously_Insured', 'Response']

df[encoded_cols] = df[encoded_cols].astype('int')

In [None]:
df.info()

In [None]:
# One-hot encode the remaining non-numeric columns; the first level of each
# feature is dropped after encoding.
cat_cols = df.select_dtypes(exclude='number').columns.tolist()

df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df.head(1)

In [None]:
df.info()

In [None]:
df.shape

## Train-Test split

In [None]:
# Separate the features (X) from the target (y)
X = df.drop(columns='Response')
y = df['Response']

# Hold out 30% of the rows as the test set
# NOTE(review): the split is not stratified on the target -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=4)

for label, part in (('X train Shape :', X_train), ('X test Shape :', X_test)):
    print(label, part.shape)

### Model evaluation metrics

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve, precision_score, recall_score

# Empty score card: one row per model, the same five metrics for train and for test
metric_names = ['Accuracy_score', 'roc_auc_score', 'f1_score', 'precision_score', 'recall_score']
score_columns = [f'{metric}_{split}' for split in ('train', 'test') for metric in metric_names]

result_df = pd.DataFrame(columns=['Model_Name'] + score_columns)

result_df

In [None]:
#Defining a function to append metrics in dataframe

def model_score_card(algo,  name, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """Fit `algo`, score it on train and test data and add one row to `result_df`.

    Parameters
    ----------
    algo : estimator exposing fit, predict and predict_proba
    name : label stored in the 'Model_Name' column
    X_train, X_test, y_train, y_test : data used for fitting and scoring;
        the defaults are the objects bound to these names when the function is defined.

    Returns
    -------
    The updated global `result_df` (the new row is appended at the end).
    """
    global result_df

    algo.fit(X_train, y_train)

    row = {'Model_Name': name}
    for split, features, target in (('train', X_train, y_train), ('test', X_test, y_test)):
        pred = algo.predict(features)
        proba = algo.predict_proba(features)[:, 1]   # probability of the positive class

        row[f'Accuracy_score_{split}'] = accuracy_score(target, pred)
        row[f'roc_auc_score_{split}'] = roc_auc_score(target, proba)
        row[f'f1_score_{split}'] = f1_score(target, pred)
        row[f'precision_score_{split}'] = precision_score(target, pred)
        row[f'recall_score_{split}'] = recall_score(target, pred)

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concatenate instead.
    new_row = pd.DataFrame([row], columns=result_df.columns)
    if result_df.empty:
        # nothing to concatenate with yet; also avoids concatenating an empty frame
        result_df = new_row
    else:
        result_df = pd.concat([result_df, new_row], ignore_index=True)

    return result_df

In [None]:
#Defining a function to get evaluation metrics

def model_eval(algo,  X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """Fit `algo` on the training data and print its metrics for train and test.

    For each split the confusion matrix, accuracy, ROC AUC, F1, precision and
    recall are printed. Nothing is returned.
    """
    algo.fit(X_train, y_train)

    def _report(header, features, target):
        # Predict on one split and print the whole metric block for it
        pred = algo.predict(features)
        proba = algo.predict_proba(features)[:, 1]

        print(header)
        print('Confusion matrix :\n', confusion_matrix(target, pred))
        print('Accuracy :', accuracy_score(target, pred))
        print('AUC score :', roc_auc_score(target, proba))
        print('F1-score :', f1_score(target, pred))
        print('Precision score :', precision_score(target, pred))
        print('Recall score :', recall_score(target, pred))

    _report('Train dataset :', X_train, y_train)
    _report('\n\nTest dataset :', X_test, y_test)

# Model building

## 1.Logistic Regression as a base model

In [None]:
from sklearn.linear_model import LogisticRegression

lor = LogisticRegression(solver='liblinear',random_state=4)

model_eval(lor)

In [None]:
## Appending the evaluation metrics in a DataFrame for further reference

lor = LogisticRegression(solver='liblinear',random_state=4)

model_score_card(lor, 'Logistic Regression')

In [None]:
# Refit the base Logistic Regression and draw its ROC curve for the train and test splits
lor = LogisticRegression(solver='liblinear', random_state=4)
lor.fit(X_train, y_train)

# class predictions and positive-class probabilities for both splits
y_train_pred, y_test_pred = lor.predict(X_train), lor.predict(X_test)
y_train_proba = lor.predict_proba(X_train)[:, 1]
y_test_proba = lor.predict_proba(X_test)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_train_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print('Logistic Regression Base model :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

## 2. Improving the base model (Logistic Regression):
>2.1 Data Transformation (Power transformation / Standard Scaler)<br>
2.2 Feature selection (Recursive Feature Elimination)<br>
2.3 SMOTE analysis

> ### 2.1 Data Transformation (Power transformation / Standard Scaler)

In [None]:
#First we check for skewness & then transform the data to reduce the skewness 
#We are checking skewness for numerical columns & only for Train data and transform test data to avoid Data-leakage

X_train[num_cols].skew()

In [None]:
# Age is right skewed, so apply a power transformation to reduce the skewness.
# The transformer is fitted on the training rows only and then applied to the test rows.

age_skew_before = X_train['Age'].skew()
print('Skewness before transformation :', age_skew_before)

X_train['Age'] = pt.fit_transform(X_train[['Age']])
X_test['Age'] = pt.transform(X_test[['Age']])

age_skew_after = X_train['Age'].skew()
print('\nSkewness after Power transformation :', age_skew_after)

In [None]:
#Skewness of Region Code is -0.113, which is almost equal to 0
#So we will avoid transformation for Region_code feature

In [None]:
# Policy_Sales_Channel is left skewed, so the PowerTransformer does not work well on it.
# After trial and error, raising the values to the 6th power gave the best result.
SALES_CHANNEL_EXPONENT = 6

print('Skewness before transformation :', X_train['Policy_Sales_Channel'].skew())

for part in (X_train, X_test):
    part['Policy_Sales_Channel'] = part['Policy_Sales_Channel'] ** SALES_CHANNEL_EXPONENT

print('\nSkewness after Power transformation :', X_train['Policy_Sales_Channel'].skew())

In [None]:
# Standardise the numeric columns with StandardScaler.
# The scaler learns its statistics from the training rows only and is then
# applied to both splits, so no test information is used for fitting.
ss.fit(X_train[num_cols])

for part in (X_train, X_test):
    part[num_cols] = ss.transform(part[num_cols])

In [None]:
#Building the Logistic regression on transformed data to check the improvement

lor = LogisticRegression(solver='liblinear',random_state=4)

model_eval(lor)

In [None]:
##### There is no change in base model after data transformation

# Target vector after transformation: training rows first, followed by the test rows
y_full = pd.concat((y_train, y_test))
y_full.shape

In [None]:
# Feature matrix after transformation: training rows first, followed by the test rows
X_full = pd.concat((X_train, X_test))
X_full.shape

>### 2.2 Feature selection (Recursive Feature Elimination)

In [None]:
from sklearn.feature_selection import RFE, RFECV

# Estimator whose coefficients drive the recursive elimination
lor = LogisticRegression(solver='liblinear',random_state=4)

# Run RFECV to find the best number of features to keep.
# The selector is fitted on the training split only: the original fitted it on
# train + test combined, which lets the hold-out rows influence which features are
# kept and makes every later test-set score optimistic.
rfe_n = RFECV(estimator=lor, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)
rfe_n.fit(X_train, y_train)

In [None]:
#Number
print('Number of features selected :', rfe_n.n_features_)

In [None]:
#Selected features
selected = list(X_full.columns[rfe_n.support_])
print('\nSelected features :',selected)

In [None]:
# Keep only the RFE-selected columns in both the train and the test features
X_train_sel, X_test_sel = X_train[selected], X_test[selected]

In [None]:
lor = LogisticRegression(solver='liblinear',random_state=4)

model_eval(lor, X_train_sel, X_test_sel, y_train, y_test)

##### There is no change in base model after Feature selection (RFE)

>### 2.3 SMOTE analysis

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority rows -- and every metric computed on
# models trained with them -- are identical on each "Restart & Run All".
smote = SMOTE(sampling_strategy='minority', random_state=4)

# Over-sample the training split only; the test split is left untouched
X_train_sm, y_train_sm = smote.fit_resample(X_train_sel, y_train)

print('Shape of X train', X_train_sm.shape)

print('\nCount of target variable :')
print(y_train_sm.value_counts())
# after SMOTE the two target classes are equally represented

In [None]:
#building LogisticRegression on smote analysed data
lor = LogisticRegression(solver='liblinear',random_state=4)

#Model evaluation
model_eval(lor, X_train_sm,X_test_sel, y_train_sm, y_test)

Even though there is a huge change in Accuracy,<br>
Recall, Precision and F1-score have increased.<br>
However, the model is over-fitting.

In [None]:
## Appending the evaluation metrics in a DataFrame for further reference
##This is the final base model
model_score_card(lor, 'Logistic Regression Final base_model', X_train_sm,X_test_sel, y_train_sm, y_test)

In [None]:
# Refit Logistic Regression on the SMOTE-resampled, RFE-selected training data
# and draw its ROC curve for the train and test splits
lor = LogisticRegression(solver='liblinear', random_state=4)
lor.fit(X_train_sm, y_train_sm)

# class predictions and positive-class probabilities for both splits
y_train_sm_pred, y_test_pred = lor.predict(X_train_sm), lor.predict(X_test_sel)
y_train_sm_proba = lor.predict_proba(X_train_sm)[:, 1]
y_test_proba = lor.predict_proba(X_test_sel)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train_sm, y_train_sm_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print('Logistic Regression Final-Base_model :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

## 3. Decision Tree Classifier
>  3.1 Decision Tree Classifier<br>
>  3.2 Decision Tree Classifier with Hyper-parameter tuning

>### 3.1 Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Build the baseline DTC model on the RFE-selected, SMOTE-resampled training data.
# random_state is fixed (as for every other tree model in this notebook) so that
# the printed metrics do not change between runs.

dtc = DecisionTreeClassifier(random_state=4)

model_eval(dtc, X_train_sm,X_test_sel, y_train_sm, y_test)

>### 3.2 Decision Tree Classifier with Hyper-parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
from scipy.stats import randint as sp_randint

In [None]:
# GridSearchCV & RandomizedSearchCV results were almost similar
# We are considering RandomizedSearchCV for Hyper-parameter tuning

dtc = DecisionTreeClassifier(random_state=4)

params = {'max_depth' : sp_randint(2,10),
         'min_samples_leaf' : sp_randint(1,12),
         'criterion' : ['gini', 'entropy']}

rsearch = RandomizedSearchCV(dtc, param_distributions=params, n_iter=100, n_jobs=-1, 
                             cv=3, scoring='roc_auc', random_state=4)

# Tune on the training split with the RFE-selected features only.
# The original searched on train + test with all features, which (a) leaks the hold-out
# rows into model selection and (b) tunes on a different feature set than the one the
# tuned model is later fitted on.
rsearch.fit(X_train_sel, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
dtc = DecisionTreeClassifier(**rsearch.best_params_, random_state=4)

model_eval(dtc, X_train_sm, X_test_sel, y_train_sm, y_test)

##### The Decision Tree Classifier gives better results after hyper-parameter tuning. It is a better-fit model compared to the previous model.
##### Recall, Precision, F1-score and AUC score have increased compared to the previous model.

In [None]:
## Appending the evaluation metrics in a DataFrame for further reference

dtc = DecisionTreeClassifier(**rsearch.best_params_, random_state=4)

model_score_card(dtc, 'DecisionTreeClassifier HyperParameter-tuning', X_train_sm,X_test_sel, y_train_sm, y_test)

In [None]:
# Refit the tuned Decision Tree on the SMOTE-resampled training data
# and draw its ROC curve for the train and test splits
dtc = DecisionTreeClassifier(**rsearch.best_params_, random_state=4)
dtc.fit(X_train_sm, y_train_sm)

# class predictions and positive-class probabilities for both splits
y_train_sm_pred, y_test_pred = dtc.predict(X_train_sm), dtc.predict(X_test_sel)
y_train_sm_proba = dtc.predict_proba(X_train_sm)[:, 1]
y_test_proba = dtc.predict_proba(X_test_sel)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train_sm, y_train_sm_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print('DecisionTreeClassifier HyperParameter-tuning :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

## 4. Random forest Classifier
>  4.1 Random forest Classifier<br>
>  4.2 Random forest Classifier with Hyper-parameter tuning

>### 4.1 Random forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Build RFC model on Dataset which are : RFE selected features, smote analysed

rfc = RandomForestClassifier(random_state=4)

#model evaluation
model_eval(rfc, X_train_sm, X_test_sel, y_train_sm, y_test)

>### 4.2 Random forest Classifier with Hyper-parameter tuning

In [None]:
# GridSearchCV & RandomizedSearchCV results were almost similar
# We are considering RandomizedSearchCV for Hyper-parameter tuning

rfc = RandomForestClassifier(random_state=4)

# max_features is capped at the number of RFE-selected columns: the tuned forest is
# later fitted on those columns only, so a larger value could not be used there.
n_selected = X_train_sel.shape[1]

params = {'n_estimators': sp_randint(50,200),
         'max_features': sp_randint(1, min(15, n_selected + 1)),
         'min_samples_leaf' : sp_randint(1,25),
          'max_depth' : sp_randint(1,10),
         'criterion' : ['gini', 'entropy']}

rsearch = RandomizedSearchCV(rfc, param_distributions=params, cv=3, n_iter=10,verbose=2, 
                             scoring='roc_auc', random_state=4, n_jobs=-1)

# Tune on the training split with the selected features only; searching on
# train + test (as the original did) leaks the hold-out rows into model selection.
rsearch.fit(X_train_sel, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
rfc = RandomForestClassifier(**rsearch.best_params_, random_state=4)

model_eval(rfc, X_train_sm, X_test_sel, y_train_sm, y_test)

##### The Random Forest Classifier gives better results after hyper-parameter tuning. It is a better-fit model compared to the previous model.
##### Recall, F1-score and AUC score have increased compared to the previous model.

In [None]:
## Appending the evaluation metrics in a DataFrame for further reference

rfc = RandomForestClassifier(**rsearch.best_params_, random_state=4)

model_score_card(rfc, 'RandomForestClassifier HyperParameter-tuning', X_train_sm,X_test_sel, y_train_sm, y_test)

In [None]:
# Refit the tuned Random Forest on the SMOTE-resampled training data
# and draw its ROC curve for the train and test splits
rfc = RandomForestClassifier(**rsearch.best_params_, random_state=4)
rfc.fit(X_train_sm, y_train_sm)

# class predictions and positive-class probabilities for both splits
y_train_sm_pred, y_test_pred = rfc.predict(X_train_sm), rfc.predict(X_test_sel)
y_train_sm_proba = rfc.predict_proba(X_train_sm)[:, 1]
y_test_proba = rfc.predict_proba(X_test_sel)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train_sm, y_train_sm_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print('RandomForestClassifier HyperParameter-tuning :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

## 5. LGBMClassifier
>  5.1 LGBMClassifier<br>
>  5.2 LGBMClassifier with Hyper-parameter tuning

>### 5.1 LGBMClassifier

In [None]:
import lightgbm as lgb

In [None]:
# Build the baseline LGBM model on the RFE-selected, SMOTE-resampled training data.
# random_state matches the seed used for the LGBM score-card and ROC cells, so the
# metrics printed here describe the same model.

lgbc = lgb.LGBMClassifier(random_state=4)

#model evaluation
model_eval(lgbc, X_train_sm, X_test_sel, y_train_sm, y_test)

>### 5.2 LGBMClassifier with Hyper-parameter tuning

In [None]:
from scipy.stats import uniform as sp_uniform

In [None]:
# Seeded estimator so the cross-validated scores are repeatable
lgbc = lgb.LGBMClassifier(random_state=4)

params = {'n_estimators':sp_randint(50,250),
         'max_depth' : sp_randint(1,50),
         'learning_rate' : sp_uniform(0,0.5)}

rsearch = RandomizedSearchCV(lgbc, param_distributions=params, scoring='roc_auc', cv=3, n_iter=10,
                             n_jobs=-1, random_state=4)

# Tune on the training split with the selected features only; searching on
# train + test (as the original did) leaks the hold-out rows into model selection.
rsearch.fit(X_train_sel, y_train)

In [None]:
#Best parameters
print(rsearch.best_params_)

In [None]:
lgbc = lgb.LGBMClassifier(**rsearch.best_params_, random_state=4)

model_eval(lgbc,  X_train_sm, X_test_sel, y_train_sm, y_test)

##### The LGBMClassifier results before and after hyper-parameter tuning are the same; the model is slightly over-fit

In [None]:
## Appending the evaluation metrics in a DataFrame for further reference

lgbc = lgb.LGBMClassifier(random_state=4)

model_score_card(lgbc, 'LGBMClassifier', X_train_sm,X_test_sel, y_train_sm, y_test)

In [None]:
# Refit the default LGBM classifier on the SMOTE-resampled training data
# and draw its ROC curve for the train and test splits
lgbc = lgb.LGBMClassifier(random_state=4)
lgbc.fit(X_train_sm, y_train_sm)

# class predictions and positive-class probabilities for both splits
y_train_sm_pred, y_test_pred = lgbc.predict(X_train_sm), lgbc.predict(X_test_sel)
y_train_sm_proba = lgbc.predict_proba(X_train_sm)[:, 1]
y_test_proba = lgbc.predict_proba(X_test_sel)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train_sm, y_train_sm_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print('LGBMClassifier :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

## 6. Naive Bayes
>  6.1 GaussianNB<br>
>  6.2 GaussianNB SMOTE analysis<br>
>  6.3 GaussianNB Hyper-parameter Tuning<br>

In [None]:
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import roc_curve

>###  6.1 GaussianNB<br>

In [None]:
#Model built on transformed data
nb = GaussianNB()

model_eval(nb, X_train_sel, X_test_sel, y_train, y_test)

In [None]:
# ROC curves for the GaussianNB model of this section, i.e. the model trained on the
# RFE-selected data WITHOUT SMOTE (the same data passed to model_eval just above).
# The original cell fitted on the SMOTE-resampled data, so the plot showed a different
# model than the one whose metrics are printed in this section.
nb = GaussianNB()
nb.fit(X_train_sel, y_train)

# positive-class probabilities for both splits
y_train_proba = nb.predict_proba(X_train_sel)[:, 1]
y_test_proba = nb.predict_proba(X_test_sel)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_train_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print(' GaussianNB :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

> ###  6.2 GaussianNB SMOTE analysis

In [None]:
nb = GaussianNB()

model_eval(nb, X_train_sm, X_test_sel, y_train_sm, y_test)

#Recall is increased but the model is over fitted, so we cant consider this model

> ### 6.3 GaussianNB Hyper-parameter Tuning

In [None]:
nb_classifier = GaussianNB()

#default var_smoothing is 1e-09
#We search 200 log-spaced values between 1e-15 and 1e-5
params_NB = {'var_smoothing': np.logspace(-5, -15, num=200)}

# scoring is 'roc_auc', consistent with the other hyper-parameter searches in this
# notebook (the original used 'accuracy' for this search only).
gs_NB = GridSearchCV(estimator=nb_classifier, 
                 param_grid=params_NB, 
                 cv=3,   # use any cross validation technique 
                 verbose=1, 
                 scoring='roc_auc') 

# Tune on the training split with the selected features -- the data the tuned model is
# fitted on afterwards. Searching on train + test (as the original did) leaks the
# hold-out rows into model selection.
gs_NB.fit(X_train_sel, y_train)

In [None]:
#Best parameters
print(gs_NB.best_params_)

In [None]:
gnb = GaussianNB(**gs_NB.best_params_)

#As we got better result on GaussianNB for data without SMOTE analysis, we will use the same dataset
model_eval(gnb,  X_train_sel, X_test_sel, y_train, y_test)

##### The GaussianNB results before and after hyper-parameter tuning are the same. The model is a better fit than the rest of the models

In [None]:
## Appending the evaluation metrics in a DataFrame for further reference
## We can consider GaussianNB before hyper-parameter tuning

gnb = GaussianNB()

model_score_card(gnb, 'GaussianNB', X_train_sel, X_test_sel, y_train, y_test)

## Overall results of all the models built

In [None]:
result_df

# Final model selected

In [None]:
result_df.iloc[5]

In [None]:
# ROC curves for the selected final model: GaussianNB trained on the RFE-selected data
# WITHOUT SMOTE (the configuration recorded as 'GaussianNB' in result_df).
# The original cell fitted on the SMOTE-resampled data instead, so the plot did not
# describe the model that was actually selected.
nb = GaussianNB()
nb.fit(X_train_sel, y_train)

# positive-class probabilities for both splits
y_train_proba = nb.predict_proba(X_train_sel)[:, 1]
y_test_proba = nb.predict_proba(X_test_sel)[:, 1]

fpr_train, tpr_train, threshold_train = roc_curve(y_train, y_train_proba)
fpr_test, tpr_test, threshold_test = roc_curve(y_test, y_test_proba)

print(' GaussianNB :')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (('Train Data', fpr_train, tpr_train), ('Test Data', fpr_test, tpr_test))
for ax, (title, fpr, tpr) in zip(axes, panels):
    ax.plot(fpr, fpr)   # diagonal reference line (TPR == FPR)
    ax.plot(fpr, tpr)   # ROC curve
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
plt.show()

##### This model is better fit compared to other models

### Feature importance

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Permutation importance of the fitted GaussianNB (gnb was fitted by the
# model_score_card call) measured on the test split.
# random_state fixes the shuffles so the ranking is the same on every run.
imps = permutation_importance(gnb, X_test_sel, y_test, random_state=4)
print(imps.importances_mean)

# Column names in the same order as the importance values
features = list(X_test_sel.columns)

In [None]:
# Rank the features from most to least important (mean permutation importance)
importances = imps.importances_mean
std = imps.importances_std
indices = np.argsort(importances)[::-1]   # positions sorted by descending importance

print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, features[idx], importances[idx]))

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the permutation importances, most important feature first.
# The tick labels are derived from the actual number of selected features; the original
# hard-coded range(9), which fails (label/tick count mismatch) whenever RFE keeps a
# different number of features.
n_features = X_test_sel.shape[1]
positions = range(n_features)

plt.figure(figsize=(19, 8))
plt.title("Feature importances")
plt.bar(positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(positions, [features[i] for i in indices])
plt.xlim([-1, n_features])
plt.show()

In [None]:
#If we remove the less important features, then recall and AUC_score will be reduced

#                                                     Thank you

#### I have tried other models also and have updated only the model with best performances
##### Other models tried :
>Logistic Regression<br>
Logistic Regression Data_transformed<br>
Logistic Regression SMOTE<br>
Logistic Regression RFE<br>
DecisionTreeClassifier<br>
DecisionTreeClassifier HP-tuning<br>
RandomForestClassifier<br>
RandomForestClassifier HP-tuning<br>
KNeighborsClassifier<br>
KNeighborsClassifier Hp-tuning<br>
LGBMClassifier<br>
LGBMClassifier HP-tuning<br>
GaussianNB w/o SMOTE<br>
GradientBoostingClassifier<br>
XGBoost<br>
AdaBoost<br>


#### Please let me know if I need to upload a notebook with these models.

# Thank You