In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.tools

#Libraries for handling imbalance data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials,space_eval
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [None]:
df = pd.read_csv('/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df.nunique()

In [None]:
# Dropping columns that are not used in the analysis (presumably ID-like or
# constant-valued -- check the df.nunique() output above to confirm).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts; reassigning with errors='ignore' keeps the cell safe to re-run.
df = df.drop(columns=['EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours'],
             errors='ignore')

# **Step 1**


# Analysing Data

In [None]:
# Class balance of the target; each bar is labelled with its share in percent
attrition_counts = df['Attrition'].value_counts()
attrition_pct = attrition_counts / len(df['Attrition']) * 100

fig = px.bar(x=attrition_counts.index,
             y=attrition_counts,
             text=attrition_pct,
             title='Attrition Distribution')

bar_marker = dict(color=['silver', 'gainsboro'], line=dict(color="black", width=3))
fig.update_traces(marker=bar_marker,
                  texttemplate='%{text:.4s}%',
                  textposition='outside')

fig.update_layout(height=500, width=600)
fig.show()

The data is highly imbalanced: only about 16.1% of the employees left the company (attrition), while about 83.9% stayed.

Will use handling imbalance data techniques later.


In [None]:
fig = px.histogram(df['Age'],nbins=100, height=500, width=700, template='ggplot2')
fig.show()

In [None]:
fig = px.histogram(df, x=df['Age'], color=df['Attrition'],nbins=70, height=500, width=700, template='ggplot2')
fig.show()

The majority of employees who left the company were between **28** and **35** years old.

In [None]:
# Share of employees per business-travel category, labelled in percent
travel_counts = df['BusinessTravel'].value_counts()
travel_pct = travel_counts / len(df['BusinessTravel']) * 100

fig = px.bar(x=travel_counts.index,
             y=travel_counts,
             text=travel_pct,
             title='Business Travel',
             height=500,
             width=600)

travel_marker = dict(color=['peachpuff', 'moccasin', 'papayawhip'],
                     line=dict(color="black", width=3))
fig.update_traces(marker=travel_marker,
                  texttemplate='%{text:.4s}%',
                  textposition='outside')
fig.show()



Almost **71%** of employees are rarely required to travel for work.

**18.8%** employees travel frequently for their Jobs.

**10.2%** of employees have jobs that don't require travel at all.

In [None]:

fig = px.histogram(x=df['BusinessTravel'], 
                   color=df['Attrition'],
                   barmode='group',
                   color_discrete_sequence=['lemonchiffon','darkkhaki'], 
                   height=500, 
                   width=600,
                  title='Business Travel VS Attrition')
fig.show()

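**Most of the employees who left are the ones whose jobs rarely require travel. This is also by far the largest group (about 71% of all employees), so compare attrition rates, not raw counts, before drawing conclusions.**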

In [None]:
# One histogram per pay-rate column: (column, chart title, bar colour)
rate_charts = [('DailyRate', 'Daily Rate Distribution', 'darkgray'),
               ('HourlyRate', 'Hourly Rate Distribution', 'darkslategray')]

for rate_col, chart_title, bar_colour in rate_charts:
    fig = px.histogram(x=df[rate_col],
                       nbins=100,
                       height=500,
                       width=700,
                       title=chart_title,
                       color_discrete_sequence=[bar_colour])
    fig.show()



In [None]:
# Pay-rate histograms split by attrition: (column, chart title, colour pair)
rate_vs_attrition_charts = [
    ('DailyRate', 'Daily Rates VS Attrition', ['black', 'silver']),
    ('HourlyRate', 'Hourly Rates VS Attrition', ['ghostwhite', 'darkslategray']),
]

for rate_col, chart_title, colour_pair in rate_vs_attrition_charts:
    fig = px.histogram(x=df[rate_col],
                       color=df['Attrition'],
                       nbins=100,
                       height=500,
                       width=700,
                       title=chart_title,
                       color_discrete_sequence=colour_pair)
    fig.show()

In [None]:
# Headcount share per department
department_counts = df['Department'].value_counts()

fig = px.pie(names=department_counts.index,
             values=department_counts,
             title='Department')
slice_style = dict(colors=['violet', 'plum', 'thistle'],
                   line=dict(color="mediumpurple", width=2))
fig.update_traces(marker=slice_style)
fig.show()


**65.4%** of employees belong to **Research and Development Department**

**30.3%** belong to **Sales** Department

Only **4.2%** belong to **Human Resources**

In [None]:
fig = px.histogram(x=df['Department'], 
                   color=df['Attrition'],
                   barmode='group',
                   color_discrete_sequence=['plum','purple'], 
                   height=500, 
                   width=600,
                  title='Department VS Attrition')
fig.show()

In [None]:
fig = px.histogram(x=df['DistanceFromHome'], nbins=100, height=500, width=700, color_discrete_sequence=['deeppink'])
fig.show()

**Most of the employees stay nearby to Office.**

In [None]:
fig = px.histogram(x=df['DistanceFromHome'],
                  marginal='box',
                  color=df['Attrition'],
                  barmode='group',
                  nbins=25,
                  title='Distance From Home VS Attrition',
                  height=500,
                  width=800,
                  color_discrete_sequence=['deeppink','lightpink'])
fig.show()

In [None]:
# Headcount share per education level (codes 1-5)
education_counts = df['Education'].value_counts()

fig = px.pie(names=education_counts.index,
             values=education_counts,
             title='Education Level')
slice_style = dict(colors=['darkorange', 'orange', 'gold', 'goldenrod', 'khaki'],
                   line=dict(color='chocolate', width=2))
fig.update_traces(marker=slice_style)
fig.show()

**Education**

1 'Below College'

2 'College'

3 'Bachelor'

4 'Master'

5 'Doctor'

**38.9%** of employees have Bachelor's Qualification.

**27%** employees have a Master's Degree

Only 3% of employees hold a doctorate.

In [None]:
fig = px.histogram(x=df['Education'], color=df['Attrition'], barmode='group', height=500, width=600,color_discrete_sequence=['gold','orange'])

fig.show()

In [None]:
# Employees per education field, labelled with each field's share in percent
field_counts = df['EducationField'].value_counts()
field_pct = field_counts / len(df['EducationField']) * 100

fig = px.bar(x=field_counts.index,
             y=field_counts,
             text=field_pct,
             title='Education Fields Count',
             height=500, width=700)

field_marker = dict(
    color=['dodgerblue', 'deepskyblue', 'skyblue', 'lightskyblue', 'lightblue', 'powderblue'],
    line=dict(color='navy', width=2),
)
fig.update_traces(marker=field_marker,
                  texttemplate='%{text:.4s}%',
                  textposition='outside')
fig.show()

**71%** of the employees are associated with **Science** field

**10**% are associated with **Marketing**

Only **1.8%** work in **Human resources**

In [None]:
fig = px.histogram(x=df['EducationField'],
                   color=df['Attrition'],
                  barmode='group',
                  height=500,
                  width=700,
                  color_discrete_sequence=['cornflowerblue','steelblue'])
fig.show()

In [None]:
# Share of employees per environment-satisfaction score (1-4)
env_sat_counts = df['EnvironmentSatisfaction'].value_counts()

fig = px.pie(names=env_sat_counts.index,
             values=env_sat_counts,
             title='Environment Satisfaction Distribution ')

slice_style = dict(colors=['lightcoral', 'darksalmon', 'salmon', 'lightsalmon'],
                   line=dict(color='darkred', width=2))
fig.update_traces(marker=slice_style)
fig.show()

**Environment Satisfaction**

1 'Low'

2 'Medium'

3 'High'

4 'Very High'

**Almost 80% Employees are highly satisfied with work Environment.**

**And 20% of employees are not satisfied.**

In [None]:
fig = px.histogram(x=df['EnvironmentSatisfaction'],
                  color=df['Attrition'],
                  barmode='group',
                  height=500,
                  width=700,
                  title='Environment Satisfaction VS Attrition',
                  color_discrete_sequence=['palevioletred','orangered'])

fig.show()

In [None]:

# Headcount per gender, labelled with each group's share in percent
gender_counts = df['Gender'].value_counts()
gender_pct = gender_counts / len(df['Gender']) * 100

fig = px.bar(x=gender_counts.index,
             y=gender_counts,
             text=gender_pct,
             title='Gender Distribution ',
             height=500,
             width=500)

gender_marker = dict(color=['powderblue', 'pink'],
                     line=dict(color=['darkblue', 'mediumvioletred'], width=2))
fig.update_traces(marker=gender_marker,
                  texttemplate='%{text:.4s}%',
                  textposition='outside')

fig.show()

In [None]:
fig = px.histogram(x=df['Gender'],
                  color=df['Attrition'],
                  barmode='group',
                  color_discrete_sequence=['mediumvioletred','darkblue'],
                  height=500,
                  width=700,
                  title='Gender VS Attrition')
fig.show()

In [None]:
# Share of employees per job-involvement score (1-4)
involvement_counts = df['JobInvolvement'].value_counts()

fig = px.pie(names=involvement_counts.index,
             values=involvement_counts,
             title='Job Involvement')

slice_style = dict(colors=['darkcyan', 'turquoise', 'mediumturquoise', 'paleturquoise'],
                   line=dict(color='white', width=2))
fig.update_traces(marker=slice_style)

fig.show()

**Job Involvement**

1 'Low'

2 'Medium'

3 'High'

4 'Very High'

**70% Employees are highly involved with their job.**


In [None]:
# Job-involvement score split by attrition (chart title typo 'Attriition' fixed)
fig = px.histogram(x=df['JobInvolvement'],
                  color=df['Attrition'],
                  barmode='group',
                  height=500,
                  width=700,
                  color_discrete_sequence=['turquoise','darkcyan'],
                  title='Job Involvement VS Attrition')
fig.show()

In [None]:
# Employees per job role, labelled with each role's share in percent.
# The counts are computed once and reused for the axis, the bars and the labels.
job_role_counts = df['JobRole'].value_counts()

fig = px.bar(x=job_role_counts.index,
             y=job_role_counts,
             text=job_role_counts / len(df['JobRole']) * 100,
             title='Job Role',
             height=500,
             width=700)
fig.update_traces(textposition='outside',
                  texttemplate='%{text:.4s}%',
                  marker=dict(color='snow', line=dict(color='black', width=3)))
# Explicit show(), like every other chart cell, so rendering does not depend on
# update_traces() happening to be the last expression of the cell.
fig.show()

In [None]:
fig = px.histogram(x=df['JobRole'],
                  color=df['Attrition'],
                  barmode='group',
                  color_discrete_sequence=['chocolate','burlywood'],
                  height=500,
                  width=900,
                  title='Job Role VS Attrition')

fig.show()

Highest Attrition rate is seen among **Lab technicians**.

In [None]:
# Share of employees per job-satisfaction score (1-4)
job_sat_counts = df['JobSatisfaction'].value_counts()

fig = px.pie(names=job_sat_counts.index,
             values=job_sat_counts,
             title='Job Satisfaction')
slice_style = dict(colors=['dimgray', 'gray', 'darkgray', 'silver'],
                   line=dict(color=['black'], width=2))
fig.update_traces(marker=slice_style)
fig.show()

Job Satisfaction

1 'Low'

2 'Medium'

3 'High'

4 'Very High'

**60% Employees are Satisfied with their Job**

In [None]:
fig = px.histogram(x=df['JobSatisfaction'],
                  color=df['Attrition'],
                  color_discrete_sequence=['black','silver'],
                  barmode='group',
                  height=500,
                  width=700,
                  title='Job Satisfaction VS Attrition')
fig.show()

In [None]:
fig = px.histogram(x = df['MonthlyIncome'], 
                  nbins = 100,
                  title='Monthly Income Distribution',
                  height=500,
                  width=600,
                  color_discrete_sequence=['lightgreen'])
fig.show()

**Monthly income ranges from about 1,000 up to 20k.**

In [None]:
fig = px.histogram(x=df['MonthlyIncome'],
                  color=df['Attrition'],
                  height=500,
                  width=700,
                  color_discrete_sequence=['yellowgreen','olive'],
                  barmode='group',
                  title='Monthly Income VS Attrition')
fig.show()

***Average Monthly Income Among Male and Female Employees***

In [None]:
# Mean monthly income per gender as a two-column frame (Gender, MonthlyIncome)
gen_income = df.groupby('Gender', as_index=False)['MonthlyIncome'].mean()

fig = px.bar(x=gen_income['Gender'],
             y=gen_income['MonthlyIncome'],
             title='Average Monthly Income Of Gender',
             height=500,
             width=600)
income_marker = dict(color='whitesmoke', line=dict(color='olivedrab', width=3))
fig.update_traces(marker=income_marker)
fig.show()

**Slight Difference Between average salary of Male and Female.**

**Female earning slightly more than Males.**

# Step 2  
# Data PreProcessing

In [None]:
# Encode the target explicitly: 1 = employee left ('Yes'), 0 = employee stayed ('No').
# pd.factorize numbers labels in order of first appearance, so which class became 1
# depended on row order. recall_score treats label 1 as the positive class, so the
# attrition class must be the one mapped to 1 for the recall figures to be about attrition.
# isin(['Yes', 1]) also accepts the already-encoded column, keeping the cell re-runnable.
df['Attrition'] = df['Attrition'].isin(['Yes', 1]).astype(int)

# Features only; `columns=` replaces the positional axis argument rejected by pandas 2.0
df1 = df.drop(columns='Attrition')

In [None]:
# One-hot encode the categorical features, then append the encoded target column
df1 = pd.get_dummies(df1).assign(Attrition=df['Attrition'])
df1.head(3)

In [None]:
# Feature matrix = every encoded column except the target.
# `columns=` replaces the positional axis argument rejected by pandas 2.0.
x = df1.drop(columns='Attrition')
y = df['Attrition']

print(x.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=101, test_size=0.2)

In [None]:
accuracies =dict()

recall = dict()

# **Model Selection**

# RandomForest

In [None]:
# Seeded so the baseline forest, and every score derived from it, is reproducible
# (101 is the seed already used for the train/test split).
rf = RandomForestClassifier(random_state=101)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

In [None]:
# Score once (y_true first, as in the other model cells) and reuse the values
# for both the results tables and the printed report.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_recall = metrics.recall_score(y_test, y_pred)

accuracies['RandomForest Classifier'] = rf_accuracy
recall['RandomForest Classifier'] = rf_recall

print('Accuracy of RandomForest Classifier is: ', rf_accuracy)
print('Recall Score of RandomForest Classifier is: ', rf_recall)

# XGBOOST

In [None]:
# Baseline XGBoost model on the untouched (imbalanced) training split
xgb = XGBClassifier(use_label_encoder=False)
xgb.fit(x_train, y_train)

# Predict the held-out split and score it once
y_pred = xgb.predict(x_test)
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_recall = metrics.recall_score(y_test, y_pred)

# Record the scores for the comparison table
accuracies['XGB Classifier'] = xgb_accuracy
recall['XGB Classifier'] = xgb_recall

print('Accuracy Score of XGB Classifier is: ', xgb_accuracy)
print('Recall Score of XGB Classifier is: ', xgb_recall)

# Logistic Regression Classifier

In [None]:
# Baseline logistic regression; the large max_iter gives the solver room to converge
lgr = LogisticRegression(max_iter=100000)
lgr.fit(x_train, y_train)
y_pred = lgr.predict(x_test)

lgr_accuracy = accuracy_score(y_test, y_pred)
lgr_recall = metrics.recall_score(y_test, y_pred)

accuracies['Logistic Regression'] = lgr_accuracy
recall['Logistic Regression'] = lgr_recall

print('Accuracy Score of Logistic Regression is: ', lgr_accuracy)
print('Recall Score of Logistic Regression Model is: ', lgr_recall)

# LGBM Classifier

In [None]:
# Baseline LightGBM model with default hyper-parameters
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)
y_pred = lgbm.predict(x_test)

lgbm_accuracy = accuracy_score(y_test, y_pred)
lgbm_recall = metrics.recall_score(y_test, y_pred)

accuracies['LGBM Classifier'] = lgbm_accuracy
recall['LGBM Classifier'] = lgbm_recall

print('Accuracy Score of LGBM Classifier is: ', lgbm_accuracy)
print('Recall Score of LGBM Classifier Model is: ', lgbm_recall)

In [None]:
# Turn the {model name: score} dicts into two-column frames.
# Both names are rebound from dict to DataFrame here, so run this cell only once.
recall = pd.DataFrame({'Model': list(recall.keys()),
                       'Recall Score': list(recall.values())})
accuracies = pd.DataFrame({'Model': list(accuracies.keys()),
                           'Accuracy Score': list(accuracies.values())})

In [None]:
accuracies_df = pd.merge(accuracies, recall, on='Model')
accuracies_df

**The XGB and LGBM classifiers have the highest accuracy scores, whereas the RandomForest classifier has the highest recall score.**

# **Step 3** 
# Handling Imbalance Data

# Smote (OverSampling)

Since the data is highly imbalanced, accuracy alone is a misleading metric: a trivial model that always predicts the majority class would score a high accuracy while having no discriminatory power at all.
The training data needs to be balanced to obtain more meaningful predictions.

SMOTE is an oversampling technique where the synthetic samples are generated for the minority class. This algorithm helps to overcome the overfitting problem posed by random oversampling.


In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x,y, random_state=22, test_size=0.2, shuffle=True)

In [None]:
# Class counts in the training split before SMOTE: label 1 versus everything else
one_count = sum(1 for label in y_train1 if label == 1)
zero_count = sum(1 for label in y_train1 if label != 1)

print('Number of one count is:', one_count)
print('Number of zero count is: ', zero_count)
    

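In the full dataset (before the train/test split and before SMOTE), **1233** employees stayed and **237** left.

The cell above counts the labels of the training split only, so its totals are smaller, but the imbalance between the two classes is the same.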

Lets apply SMOTE and see how the values changes.

In [None]:

oversample = SMOTE(random_state=101)

x_train1, y_train1 = oversample.fit_resample(x_train1, y_train1)

In [None]:
# Class counts in the training split after SMOTE: label 1 versus everything else
one_count = sum(1 for label in y_train1 if label == 1)
zero_count = sum(1 for label in y_train1 if label != 1)

print('Number of one count after applying SMOTE is: ', one_count)
print('Number of zero count after applying SMOTE is: ', zero_count)

Both classes (ones and zeros) now have the same number of samples.

Lets apply Classifier model to Oversampled data and see if the prediction has improved or not.

**Oversampling and Undersampling Techniques should only be applied to train data not test data.**

In [None]:
accuracies_smote = dict()
recall_smote = dict()

In [None]:
# LGBM Classifier after SMOTE: refit on the oversampled training data,
# evaluate on the untouched test split
lgbm.fit(x_train1, y_train1)
y_smote_pred = lgbm.predict(x_test1)

lgbm_smote_accuracy = accuracy_score(y_test1, y_smote_pred)
lgbm_smote_recall = metrics.recall_score(y_test1, y_smote_pred)

print('Accuracy Score of LGBM Classifier after applying SMOTE is: ', lgbm_smote_accuracy)
print('Recall:', lgbm_smote_recall)

accuracies_smote['LGBM Classifier'] = lgbm_smote_accuracy
recall_smote['LGBM Classifier'] = lgbm_smote_recall

In [None]:
#RandomForest After SMOTE

rf.fit(x_train1, y_train1)
y_smote_pred = rf.predict(x_test1)

# Score once against y_test1 -- the labels that belong to x_test1. The printed
# recall previously used y_test, which comes from a different train/test split.
rf_smote_accuracy = accuracy_score(y_test1, y_smote_pred)
rf_smote_recall = metrics.recall_score(y_test1, y_smote_pred)

accuracies_smote['RandomForest Classifier'] = rf_smote_accuracy
recall_smote['RandomForest Classifier'] = rf_smote_recall

print('Accuracy Score of RandomForest Classifier after applying SMOTE is: ', rf_smote_accuracy)
print('Recall:', rf_smote_recall)

In [None]:
#Logistic Regression After SMOTE

lgr.fit(x_train1, y_train1)
y_smote_pred = lgr.predict(x_test1)

# Score once against y_test1 -- the labels that belong to x_test1. The printed
# recall previously used y_test, which comes from a different train/test split.
lgr_smote_accuracy = accuracy_score(y_test1, y_smote_pred)
lgr_smote_recall = metrics.recall_score(y_test1, y_smote_pred)

accuracies_smote['Logistic Regression'] = lgr_smote_accuracy
recall_smote['Logistic Regression'] = lgr_smote_recall

print('Accuracy Score of Logistic Regression after applying SMOTE is: ', lgr_smote_accuracy)
print('Recall:', lgr_smote_recall)

In [None]:
# XGB Classifier After SMOTE

xgb.fit(x_train1, y_train1)
y_smote_pred = xgb.predict(x_test1)

# Score once against y_test1 -- the labels that belong to x_test1. The printed
# recall previously used y_test, which comes from a different train/test split.
xgb_smote_accuracy = accuracy_score(y_test1, y_smote_pred)
xgb_smote_recall = metrics.recall_score(y_test1, y_smote_pred)

accuracies_smote['XGB Classifier'] = xgb_smote_accuracy
recall_smote['XGB Classifier'] = xgb_smote_recall

print('Accuracy Score of XGB Classifier after applying SMOTE is: ', xgb_smote_accuracy)
print('Recall:', xgb_smote_recall)

In [None]:
# Turn the {model name: score} dicts into two-column frames.
# Both names are rebound from dict to DataFrame here, so run this cell only once.
recall_smote = pd.DataFrame({'Model': list(recall_smote.keys()),
                             'Recall Score': list(recall_smote.values())})
accuracies_smote = pd.DataFrame({'Model': list(accuracies_smote.keys()),
                                 'Accuracy Score': list(accuracies_smote.values())})

In [None]:
smote_df = pd.merge(accuracies_smote, recall_smote, on='Model')
smote_df

**RandomForest Classifier has the highest Accuracy and Recall Score after using SMOTE(OverSampling)**

# RandomUnder Sampling

In [None]:
accuracies_under = dict()
recall_under = dict()

In [None]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x,y, random_state=101, test_size= 0.2)

In [None]:
# Before Applying RandomUNDER Sampling
# Class counts of the training split: label 1 versus everything else.

one_count = int((y_train2 == 1).sum())
zero_count = int((y_train2 != 1).sum())

# This cell runs BEFORE the sampler, so the messages say "before"
# (they were copied from the "after" cell and mislabelled the output).
print('Number of one count before applying RandomUnder Sampler is: ', one_count)
print('Number of zero count before applying RandomUnder Sampler is: ', zero_count)

In [None]:
# sampling_strategy=0.6 -> after resampling, minority / majority = 0.6.
# Seeded (like the SMOTE sampler) so the retained majority rows, and therefore
# the scores below, are the same on every run.
under = RandomUnderSampler(sampling_strategy=0.6, random_state=101)

x_train2, y_train2 = under.fit_resample(x_train2, y_train2)

In [None]:
# Class counts in the training split after random under-sampling
one_count = sum(1 for label in y_train2 if label == 1)
zero_count = sum(1 for label in y_train2 if label != 1)

print('Number of one count after applying RandomUnder Sampler is: ', one_count)
print('Number of zero count after applying RandomUnder Sampler is: ', zero_count)

In [None]:
# Logistic regression refit on the under-sampled training data
lgr.fit(x_train2, y_train2)
y_under_pred = lgr.predict(x_test2)

lgr_under_accuracy = accuracy_score(y_test2, y_under_pred)
lgr_under_recall = metrics.recall_score(y_test2, y_under_pred)

print('Accuracy Score of Logistic Regression is: ', lgr_under_accuracy)
print('Recall: ', lgr_under_recall)

accuracies_under['Logistic Regression'] = lgr_under_accuracy
recall_under['Logistic Regression'] = lgr_under_recall

In [None]:
# LGBM classifier refit on the under-sampled training data
lgbm.fit(x_train2, y_train2)
y_under_pred = lgbm.predict(x_test2)

lgbm_under_accuracy = accuracy_score(y_test2, y_under_pred)
lgbm_under_recall = metrics.recall_score(y_test2, y_under_pred)

print('Accuracy Score of LGBM Classifier is: ', lgbm_under_accuracy)
print('Recall: ', lgbm_under_recall)

accuracies_under['LGBM Classifier'] = lgbm_under_accuracy
recall_under['LGBM Classifier'] = lgbm_under_recall

In [None]:
# Random forest refit on the under-sampled training data
rf.fit(x_train2, y_train2)
y_under_pred = rf.predict(x_test2)

rf_under_accuracy = accuracy_score(y_test2, y_under_pred)
rf_under_recall = metrics.recall_score(y_test2, y_under_pred)

print('Accuracy Score of RandomForest Classifier is: ', rf_under_accuracy)
print('Recall: ', rf_under_recall)

accuracies_under['RandomForest Classifier'] = rf_under_accuracy
recall_under['RandomForest Classifier'] = rf_under_recall

In [None]:
# XGB classifier refit on the under-sampled training data
xgb.fit(x_train2, y_train2)
y_under_pred = xgb.predict(x_test2)

xgb_under_accuracy = accuracy_score(y_test2, y_under_pred)
xgb_under_recall = metrics.recall_score(y_test2, y_under_pred)

print('Accuracy Score of xgb Classifier is: ', xgb_under_accuracy)
print('Recall: ', xgb_under_recall)

accuracies_under['XGB Classifier'] = xgb_under_accuracy
recall_under['XGB Classifier'] = xgb_under_recall

In [None]:
# Turn the {model name: score} dicts into two-column frames.
# Both names are rebound from dict to DataFrame here, so run this cell only once.
accuracies_under = pd.DataFrame({'Model': list(accuracies_under.keys()),
                                 'Accuracy Score': list(accuracies_under.values())})
recall_under = pd.DataFrame({'Model': list(recall_under.keys()),
                             'Recall Score': list(recall_under.values())})

In [None]:
under_df = pd.merge(accuracies_under, recall_under, on='Model')
under_df

**Although the accuracy scores have dropped a little with this method, I will select the RandomForest classifier, as its recall score is the highest.**

# SMOTE Tomek (OverSampling and UnderSampling Combined)

SMOTE may be the most popular oversampling technique and can be combined with many different undersampling techniques.

SMOTE+TOMEK is such a hybrid technique that aims to clean overlapping data points for each of the classes distributed in sample space.


In [None]:
accuracies_tomek = dict()
recall_tomek = dict()

In [None]:
x_train3, x_test3, y_train3, y_test3 = train_test_split(x,y, random_state=22, test_size=0.2, shuffle=True)


In [None]:
# Before applying SMOTE Tomek
# Class counts of the training split: label 1 versus everything else.

one_count = int((y_train3 == 1).sum())
zero_count = int((y_train3 != 1).sum())

# This cell runs BEFORE the resampling, so the messages say "before"
# (they were copied from the "after" cell and mislabelled the output).
print('Number of one before applying SMOTE Tomek is: ', one_count)
print('Number of zero before applying SMOTE Tomek is: ', zero_count)

In [None]:

# Seeded (like the SMOTE sampler) so the synthetic samples, and therefore the
# scores and the hyper-parameter search below, are the same on every run.
combine = SMOTETomek(random_state=101)

x_train3, y_train3 = combine.fit_resample(x_train3, y_train3)

In [None]:
# Class counts in the training split after SMOTE Tomek
one_count = sum(1 for label in y_train3 if label == 1)
zero_count = sum(1 for label in y_train3 if label != 1)

print('Number of one after applying SMOTE Tomek is: ', one_count)
print('Number of zero after applying SMOTE Tomek is: ', zero_count)

In [None]:
# Logistic regression refit on the SMOTE Tomek training data
lgr.fit(x_train3, y_train3)
y_tomek_pred = lgr.predict(x_test3)

lgr_tomek_accuracy = accuracy_score(y_test3, y_tomek_pred)
lgr_tomek_recall = metrics.recall_score(y_test3, y_tomek_pred)

print('Accuracy Score of Logistic Regression is: ', lgr_tomek_accuracy)
print('Recall: ', lgr_tomek_recall)

accuracies_tomek['Logistic Regression'] = lgr_tomek_accuracy
recall_tomek['Logistic Regression'] = lgr_tomek_recall

In [None]:
# LGBM classifier refit on the SMOTE Tomek training data
lgbm.fit(x_train3, y_train3)
y_tomek_pred = lgbm.predict(x_test3)

lgbm_tomek_accuracy = accuracy_score(y_test3, y_tomek_pred)
lgbm_tomek_recall = metrics.recall_score(y_test3, y_tomek_pred)

print('Accuracy Score of LGBM Classifier is: ', lgbm_tomek_accuracy)
print('Recall: ', lgbm_tomek_recall)

accuracies_tomek['LGBM Classifier'] = lgbm_tomek_accuracy
recall_tomek['LGBM Classifier'] = lgbm_tomek_recall

In [None]:
# Random forest refit on the SMOTE Tomek training data
rf.fit(x_train3, y_train3)
y_tomek_pred = rf.predict(x_test3)

rf_tomek_accuracy = accuracy_score(y_test3, y_tomek_pred)
rf_tomek_recall = metrics.recall_score(y_test3, y_tomek_pred)

print('Accuracy Score of RandomForest Classifier is: ', rf_tomek_accuracy)
print('Recall: ', rf_tomek_recall)

accuracies_tomek['RandomForest Classifier'] = rf_tomek_accuracy
recall_tomek['RandomForest Classifier'] = rf_tomek_recall

In [None]:
# XGBoost classifier refit on the SMOTE Tomek training data
xgb.fit(x_train3, y_train3)
y_tomek_pred = xgb.predict(x_test3)

xgb_tomek_accuracy = accuracy_score(y_test3, y_tomek_pred)
xgb_tomek_recall = metrics.recall_score(y_test3, y_tomek_pred)

print('Accuracy Score of XGB Classifier is: ', xgb_tomek_accuracy)
print('Recall: ', xgb_tomek_recall)

accuracies_tomek['XGB Classifier'] = xgb_tomek_accuracy
recall_tomek['XGB Classifier'] = xgb_tomek_recall

In [None]:
# Turn the {model name: score} dicts into two-column frames.
# Both names are rebound from dict to DataFrame here, so run this cell only once.
recall_tomek = pd.DataFrame({'Model': list(recall_tomek.keys()),
                             'Recall Score': list(recall_tomek.values())})
accuracies_tomek = pd.DataFrame({'Model': list(accuracies_tomek.keys()),
                                 'Accuracy Score': list(accuracies_tomek.values())})

In [None]:
tomek_df = pd.merge(accuracies_tomek, recall_tomek, on='Model')
tomek_df

**In this method, again RandomForest Classifier has the highest Accuracy & Recall score.**

In [None]:
# Horizontal bars: one per model, accuracy on the untouched training data
accuracy_fig, ax = plt.subplots(figsize=(8, 6))
sns.set_color_codes('pastel')
sns.barplot(data=accuracies, x='Accuracy Score', y='Model',
            color='pink', edgecolor='black', ax=ax)
ax.set_title('Accuracy Score Before Using Imbalance Handling Data Technique', fontsize=18)
plt.show()

In [None]:
# Horizontal bars: one per model, accuracy after SMOTE oversampling
accuracy_fig, ax = plt.subplots(figsize=(8, 6))
sns.set_color_codes('pastel')
sns.barplot(data=accuracies_smote, x='Accuracy Score', y='Model',
            color='lightblue', edgecolor='black', ax=ax)
ax.set_title('Accuracy Score After Using SMOTE', fontsize=18)
plt.show()

In [None]:
# Horizontal bars: one per model, accuracy after random under-sampling
accuracy_fig, ax = plt.subplots(figsize=(8, 6))
sns.set_color_codes('pastel')
sns.barplot(data=accuracies_under, x='Accuracy Score', y='Model',
            color='lightsalmon', edgecolor='black', ax=ax)
ax.set_title('Accuracy Score After Using RandomUnder Sampling', fontsize=18)
plt.show()

In [None]:
# Horizontal bars: one per model, accuracy after SMOTE Tomek resampling
accuracy_fig, ax = plt.subplots(figsize=(8, 6))
sns.set_color_codes('pastel')
sns.barplot(data=accuracies_tomek, x='Accuracy Score', y='Model',
            color='plum', edgecolor='black', ax=ax)
ax.set_title('Accuracy Score After Using SMOTE Tomek', fontsize=18)
plt.show()

# Bayesian optimisation

In [None]:
# Hyperopt search space for the random forest.
# - hp.choice options: the notebook later decodes `best` through index->value
#   dicts, so the option order here must stay in sync with those dicts.
# - hp.quniform yields floats (multiples of 10 here), not ints.
# NOTE(review): 'auto' for max_features is deprecated in recent scikit-learn
# releases -- confirm the installed version still accepts it.
space = dict(
    criterion=hp.choice('criterion', ['entropy', 'gini']),
    max_depth=hp.quniform('max_depth', 10, 1200, 10),
    max_features=hp.choice('max_features', ['auto', 'sqrt', 'log2']),
    min_samples_leaf=hp.uniform('min_samples_leaf', 0, 0.5),
    min_samples_split=hp.uniform('min_samples_split', 0, 1),
    n_estimators=hp.choice('n_estimators', [10, 50, 300, 750, 1200, 1300]),
)

In [None]:
def objective(space):
    """Hyperopt objective for tuning the random forest.

    Builds a RandomForestClassifier from one sampled point of the search space
    and scores it with 5-fold cross-validation on the notebook-level
    ``x_train3`` / ``y_train3``.

    NOTE(review): ``x_train3`` / ``y_train3`` were replaced by the SMOTETomek
    resampled data earlier in the notebook, so the validation folds contain
    resampled rows. Resampling inside each fold (e.g. an imblearn Pipeline)
    would give a less optimistic estimate -- confirm the intent.

    Parameters
    ----------
    space : dict
        One sampled configuration with the keys 'criterion', 'max_depth',
        'max_features', 'min_samples_leaf', 'min_samples_split' and
        'n_estimators'.

    Returns
    -------
    dict
        ``{'loss': <negative mean CV accuracy>, 'status': STATUS_OK}``.
    """
    model = RandomForestClassifier(criterion=space['criterion'],
                                   # hp.quniform samples floats (e.g. 50.0);
                                   # scikit-learn expects an integer depth
                                   max_depth=int(space['max_depth']),
                                   max_features=space['max_features'],
                                   min_samples_leaf=space['min_samples_leaf'],
                                   min_samples_split=space['min_samples_split'],
                                   n_estimators=space['n_estimators'])

    # fmin minimises the loss; we want to maximise accuracy, so return its negative
    accuracy = cross_val_score(model, x_train3, y_train3, cv=5).mean()
    return {'loss': -accuracy, 'status': STATUS_OK}
    
    

In [None]:
# Run 80 TPE-guided evaluations of `objective`; `trials` keeps the full history.
# For hp.choice parameters `best` holds the index of the chosen option.
trials = Trials()

best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=80,
            trials=trials)

best

In [None]:
# index -> value lookups mirroring the hp.choice option lists of the search space
# (fmin reports the index of the chosen option, not the option itself)
crit = dict(enumerate(['entropy', 'gini']))
feat = dict(enumerate(['auto', 'sqrt', 'log2']))
n_est = dict(enumerate([10, 50, 300, 750, 1200, 1300]))

In [None]:
# Decode hyperopt's result. For hp.choice parameters `best` only holds the index
# of the chosen option; space_eval maps the assignment back onto the search
# space, so the real values always match the space definition.
best_params = space_eval(space, best)

rf_clf = RandomForestClassifier(criterion=best_params['criterion'],
                                # hp.quniform yields a float (e.g. 50.0);
                                # scikit-learn expects an integer depth
                                max_depth=int(best_params['max_depth']),
                                max_features=best_params['max_features'],
                                min_samples_leaf=best_params['min_samples_leaf'],
                                min_samples_split=best_params['min_samples_split'],
                                n_estimators=best_params['n_estimators'])

# Fit on the SMOTE Tomek training data and evaluate on the untouched test split
rf = rf_clf.fit(x_train3, y_train3)
pred = rf.predict(x_test3)
print(confusion_matrix(y_test3, pred))
print('Recall score after using  Bayesian optimisation is: ',metrics.recall_score(y_test3,pred))
print('Accuracy score after using  Bayesian optimisation is: ',accuracy_score(y_test3, pred))