# EMPLOYEE ATTRITION | COMPARATIVE ANALYSIS OF CLASSIFICATION ML TECHNIQUES

# **TABLE OF CONTENTS**

1. [Importing libraries](#1.)
1. [Data Description](#2.)
 * [Checking for null values](#2.1)
 * [Checking for imbalanced dataset or not](#2.2)
1. [Exploratory Data Analysis](#3.)
 * [Analyzing Numerical features](#3.1) 
 * [Outliers in the dataset](#3.2)
 * [Analyzing Categorical features](#3.3)
 * [Numerical Features v/s Attrition](#3.4)
    * [Age feature](#3.4.1)
    * [DistanceFromHome feature](#3.4.2)
    * [MonthlyIncome feature](#3.4.3)
    * [PercentSalaryHike feature](#3.4.4)
    * [YearsAtCompany feature](#3.4.5)
    * [YearsSinceLastPromotion feature](#3.4.6)   
 * [Categorical Features v/s Attrition](#3.5)
    * [BusinessTravel feature](#3.5.1)
    * [Department feature](#3.5.2)
    * [EducationField feature](#3.5.3)
    * [Gender feature](#3.5.4)
    * [OverTime feature](#3.5.5)
    * [EnvironmentSatisfaction feature](#3.5.6)
 * [Inferences](#3.6)
1. [Feature Engineering](#4.)
 * [Correlation between features](#4.1)
 * [Feature Scaling](#4.2)
1. [Train-Test Split](#5.)
 * [Stratified K-Fold](#5.1)
1. [Model Building](#6.)
 * [Model Accuracies](#6.1)
 * [Mean Accuracy of Classification models](#6.2)
 * [Confusion Matrix of Classification models](#6.3)


 
 
 

<a id="1."></a>
# IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<a id="2."></a>
# DATA DESCRIPTION

In [None]:
# Load the employee attrition CSV into `data`.
csv_path = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
data = pd.read_csv(csv_path)

In [None]:
data.sample(10)

As we can see from our dataset 'Attrition' is our target variable.

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

<a id="2.1"></a>
### CHECKING FOR NULL VALUES

In [None]:
data.isnull().sum()

There are no missing values in our dataset

<a id="2.2"></a>
### CHECKING WHETHER THE DATASET IS IMBALANCED

In [None]:
data['Attrition'].value_counts()

237 employees out of 1470 left the company

In [None]:
# Class balance of the target column, drawn as a pie chart.
attrition_counts = data['Attrition'].value_counts()
labels = attrition_counts.index
size = attrition_counts.values
plt.figure(figsize=(6, 6))
plt.pie(
    size,
    labels=labels,
    explode=(0, 0.1),
    colors=['lightskyblue', 'gold'],
    autopct="%.2f%%",
    shadow=True,
)
plt.title('Attrition Percentage')
plt.axis('off')
plt.legend()

The positive class accounts for about 16.12% of the data, so we can say that our dataset is imbalanced.

In [None]:
data.nunique()

<a id="3."></a>
# EXPLORATORY DATA ANALYSIS

In [None]:
# Columns treated as continuous/numerical throughout the analysis.
num_features = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'HourlyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

In [None]:
# Columns treated as categorical (nominal or ordinal) throughout the analysis.
cat_features = [
    'BusinessTravel',
    'Department',
    'Education',
    'EducationField',
    'EnvironmentSatisfaction',
    'Gender',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'JobSatisfaction',
    'MaritalStatus',
    'NumCompaniesWorked',
    'TrainingTimesLastYear',
    'OverTime',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'WorkLifeBalance',
]

In [None]:
print("Number of numerical features in our dataset = ",len(num_features))

In [None]:
print("Number of categorical features in our dataset = ",len(cat_features))

<a id="3.1"></a>
### ANALYZING NUMERICAL FEATURES

In [None]:
# One histogram with a KDE overlay per numerical feature, laid out on a 4x3 grid.
plt.figure(figsize=(30, 30))
for position, feature in enumerate(num_features, start=1):
    ax = plt.subplot(4, 3, position)
    sns.histplot(data=data, x=feature, kde=True, ax=ax)
    ax.set_title(feature + " Histogram")

<a id="3.2"></a>
### OUTLIERS IN THE DATASET

In [None]:
# One box plot per numerical feature, split by the Attrition value, on a 4x3 grid.
plt.figure(figsize=(30, 30))
for position, feature in enumerate(num_features, start=1):
    ax = plt.subplot(4, 3, position)
    sns.boxplot(data=data, x='Attrition', y=feature, ax=ax)
    ax.set_title(feature)

<a id="3.3"></a>
### ANALYZING CATEGORICAL FEATURES

In [None]:
# One pie chart per categorical feature showing the share of each level (6x3 grid).
plt.figure(figsize=(40, 40))
for position, feature in enumerate(cat_features, start=1):
    level_counts = data[feature].value_counts()
    labels, size = level_counts.index, level_counts.values
    ax = plt.subplot(6, 3, position)
    ax.pie(size, labels=labels, autopct="%.2f%%", shadow=True)
    ax.set_title(feature)
    ax.axis('off')

<a id="3.4"></a>
#### Numerical features v/s Attrition

<a id="3.4.1"></a>
##### Age feature

In [None]:
# Age distribution per Attrition value, drawn as polygons with a KDE overlay.
sns.displot(data=data, x="Age", hue="Attrition", kde=True, element="poly")

<a id="3.4.2"></a>
##### DistanceFromHome feature

In [None]:
# DistanceFromHome distribution per Attrition value, with a KDE overlay.
sns.displot(data=data, x="DistanceFromHome", hue="Attrition", kde=True, palette='flare')

<a id="3.4.3"></a>
##### MonthlyIncome feature

In [None]:
# MonthlyIncome distribution per Attrition value, with a KDE overlay.
sns.displot(data=data, x="MonthlyIncome", hue="Attrition", kde=True, palette='pastel')

<a id="3.4.4"></a>
##### PercentSalaryHike feature

In [None]:
# PercentSalaryHike distribution per Attrition value, with a KDE overlay.
sns.displot(data=data, x="PercentSalaryHike", hue="Attrition", kde=True)

<a id="3.4.5"></a>
##### YearsAtCompany feature

In [None]:
# YearsAtCompany distribution per Attrition value, drawn as polygons with a KDE overlay.
sns.displot(data=data, x="YearsAtCompany", hue="Attrition", kde=True, element="poly")

<a id="3.4.6"></a>
##### YearsSinceLastPromotion feature

In [None]:
# YearsSinceLastPromotion distribution per Attrition value, with a KDE overlay.
sns.displot(data=data, x="YearsSinceLastPromotion", hue="Attrition", kde=True)

<a id="3.5"></a>
#### Categorical features v/s Attrition

<a id="3.5.1"></a>
##### BusinessTravel feature

In [None]:
# Employee counts per BusinessTravel level, split by Attrition.
sns.countplot(data=data, x="BusinessTravel", hue="Attrition")

In [None]:
# Attrition split among employees whose BusinessTravel is 'Travel_Rarely'.
subset = data.loc[data['BusinessTravel'] == 'Travel_Rarely', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees who travel rarely')
plt.axis('off')

In [None]:
# Attrition split among employees whose BusinessTravel is 'Travel_Frequently'.
subset = data.loc[data['BusinessTravel'] == 'Travel_Frequently', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees who travel frequently')
plt.axis('off')

<a id="3.5.2"></a>
##### Department feature

In [None]:
data['Department'].unique()

In [None]:
# Employee counts per Department, split by Attrition.
sns.countplot(data=data, x="Department", hue="Attrition")

In [None]:
# Attrition split among employees whose Department is 'Sales'.
subset = data.loc[data['Department'] == 'Sales', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees from sales department')
plt.axis('off')

In [None]:
# Attrition split among employees whose Department is 'Research & Development'.
subset = data.loc[data['Department'] == 'Research & Development', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees from Research and Development department')
plt.axis('off')

<a id="3.5.3"></a>
##### EducationField feature

In [None]:
# Employee counts per EducationField, split by Attrition; tick labels are
# rotated so the longer field names do not overlap.
sns.countplot(data=data, x="EducationField", hue="Attrition")
plt.xticks(rotation=90)

In [None]:
# Attrition split among employees whose EducationField is 'Marketing'.
subset = data.loc[data['EducationField'] == 'Marketing', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees from marketing education field')
plt.axis('off')

In [None]:
# Attrition split among employees whose EducationField is 'Technical Degree'.
subset = data.loc[data['EducationField'] == 'Technical Degree', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees from technical degree education field')
plt.axis('off')

In [None]:
# Attrition split among employees whose EducationField is 'Life Sciences'.
subset = data.loc[data['EducationField'] == 'Life Sciences', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees from Life Sciences education field')
plt.axis('off')

In [None]:
# Attrition split among employees whose EducationField is 'Medical'.
subset = data.loc[data['EducationField'] == 'Medical', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees from Medical education field')
plt.axis('off')

<a id="3.5.4"></a>
##### Gender feature

In [None]:
# Employee counts per Gender, split by Attrition.
sns.countplot(data=data, x="Gender", hue="Attrition")

In [None]:
# Attrition split among employees whose Gender is 'Male'.
subset = data.loc[data['Gender'] == 'Male', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in male employees')
plt.axis('off')

In [None]:
# Attrition split among employees whose Gender is 'Female'.
subset = data.loc[data['Gender'] == 'Female', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in female employees')
plt.axis('off')

<a id="3.5.5"></a>
##### OverTime feature

In [None]:
# Employee counts per OverTime value, split by Attrition.
sns.countplot(data=data, x="OverTime", hue="Attrition")

In [None]:
# Attrition split among employees whose OverTime is 'Yes'.
subset = data.loc[data['OverTime'] == 'Yes', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees who do overtime')
plt.axis('off')

In [None]:
# Attrition split among employees whose OverTime is 'No'.
subset = data.loc[data['OverTime'] == 'No', 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees who do not do overtime')
plt.axis('off')

<a id="3.5.6"></a>
##### EnvironmentSatisfaction feature

In [None]:
# Employee counts per EnvironmentSatisfaction level, split by Attrition.
sns.countplot(data=data, x="EnvironmentSatisfaction", hue="Attrition")

In [None]:
# Attrition split among employees whose EnvironmentSatisfaction equals 1.
subset = data.loc[data['EnvironmentSatisfaction'] == 1, 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees whose environment satisfaction =  1')
plt.axis('off')

In [None]:
# Attrition split among employees whose EnvironmentSatisfaction equals 4.
subset = data.loc[data['EnvironmentSatisfaction'] == 4, 'Attrition']
size = [(subset == 'Yes').sum(), (subset == 'No').sum()]
# Label the wedges so the exploded slice is identifiable as Attrition = Yes.
plt.pie(size, explode=(0.1, 0), labels=['Yes', 'No'], autopct='%.2f%%')
plt.title('Attrition % in employees whose environment satisfaction =  4')
plt.axis('off')

<a id="3.6"></a>
### INFERENCES

* Most employees are from Research & Development department (65.37 %)
* Most employees have medium (Bachelor's) level of education (38.91 %)
* Most employees have Sales executive as their job role(22.18 %)
* As expected, attrition is relatively higher among employees with a lower monthly income.
* Attrition starts decreasing as an employee's years at the company approach 15.
* The MonthlyIncome, Age, PercentSalaryHike and YearsAtCompany features have a strong impact on the attrition rate.
* Of the travel categories compared above, the attrition percentage is highest among employees who travel frequently (24.91 %).
* Of the departments compared above, the attrition percentage is highest in the Sales department (20.63 %).
* Of the education fields compared above, the attrition percentage is highest for the Technical Degree field (24.24 %).
* The attrition percentage is higher among male employees (17.01 %) than among female employees.
* The attrition percentage is higher among employees who do overtime (30.53 %) than among those who do not.
* Of the environment satisfaction levels compared above, the attrition percentage is highest among employees with a rating of 1 (low) (25.35 %).

<a id="4."></a>
# FEATURE ENGINEERING

EmployeeCount, Over18 and StandardHours each contain a single unique value, so these features are not useful for either visualization or modelling. 
The EmployeeNumber feature is just an identifier, so it is not required for modelling either.

In [None]:
# Reload the raw CSV into a separate frame for modelling and discard the
# single-valued columns and the employee identifier in one call.
unused_columns = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
df = pd.read_csv("../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df = df.drop(columns=unused_columns)

In [None]:
df.head()

<a id="4.1"></a>
### CORRELATION BETWEEN FEATURES

In [None]:
# Heatmap of pairwise correlations between the numeric columns of df.
# df still holds string columns at this point (e.g. 'Attrition',
# 'BusinessTravel'), and a bare df.corr() raises on pandas 2.x when it meets
# them, so restrict the frame to numeric dtypes before correlating.
plt.figure(figsize=(30, 30))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

Correlation coefficients with a magnitude of 0.7 or above indicate variables which can be considered highly correlated. 
The highly correlated features are:
* JobLevel and MonthlyIncome - 0.95
* JobLevel and TotalWorkingYears - 0.78
* MonthlyIncome and TotalWorkingYears - 0.77
* PercentSalaryHike and PerformanceRating - 0.77
* YearsAtCompany and YearsWithCurrManager - 0.77
* YearsAtCompany and YearsInCurrentRole - 0.76
* YearsInCurrentRole and YearsWithCurrManager - 0.71

We will drop the features which are highly correlated with each other.

In [None]:
# Remove the highly correlated columns from df in a single call.
correlated_columns = [
    'JobLevel',
    'TotalWorkingYears',
    'PerformanceRating',
    'YearsWithCurrManager',
    'YearsInCurrentRole',
]
df.drop(columns=correlated_columns, inplace=True)

In [None]:
# Take the two categorical columns that were dropped from df out of the
# feature list. Filtering (rather than list.remove) lets this cell be re-run
# without raising ValueError once the names are already gone.
removed_categorical = {'JobLevel', 'PerformanceRating'}
cat_features = [name for name in cat_features if name not in removed_categorical]

In [None]:
cat_features

In [None]:
# Take the three numerical columns that were dropped from df out of the
# feature list. Filtering (rather than list.remove) lets this cell be re-run
# without raising ValueError once the names are already gone.
removed_numerical = {'TotalWorkingYears', 'YearsInCurrentRole', 'YearsWithCurrManager'}
num_features = [name for name in num_features if name not in removed_numerical]

In [None]:
num_features

<a id="4.2"></a>
### FEATURE SCALING

In [None]:
# Integer-encode every categorical column, refitting the shared encoder on
# each column in turn.
le = LabelEncoder()
df[cat_features] = df[cat_features].apply(le.fit_transform)

In [None]:
# Standardise all numerical columns in one call; StandardScaler works
# column-wise, so each feature is still scaled independently.
# NOTE(review): the scaler is fitted on every row of df; if these features are
# cross-validated afterwards, consider fitting it inside each fold — confirm.
sc = StandardScaler()
df[num_features] = sc.fit_transform(df[num_features])

In [None]:
# Encode the target column as integers with the shared encoder.
target_codes = le.fit_transform(df['Attrition'])
df['Attrition'] = target_codes

In [None]:
df.head()

In [None]:
df.info()

<a id="5."></a>
# TRAIN-TEST SPLIT

<a id="5.1"></a>
### Stratified K-Fold

In [None]:
# Separate the target column from the feature matrix.
target_column = 'Attrition'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Five-fold stratified splitter used for cross-validation.
# The former skf.get_n_splits(X, y) call was removed: it only returns the
# number of splits and its result was discarded.
skf = StratifiedKFold(n_splits=5)
print(skf)

<a id="6."></a>
# MODEL BUILDING

Following are the classification algorithms I will use to make the model:

1. Logistic Regression
1. Decision Trees
1. Naive Bayes
1. Support Vector Machines (linear kernel; the radial kernel is not yet included in the comparison below)

In [None]:
# Cross-validate each classifier on the same stratified folds and collect
# the per-fold accuracies (`accuracy`) and their mean (`mean_acc`).
# `classifiers` holds the display names in the same order as `models`.
skf = StratifiedKFold(n_splits=5)
classifiers = ['Logistic Regression', 'Decision Tree', 'Naive Bayes', 'Linear SVM']
models = [
    LogisticRegression(),
    DecisionTreeClassifier(criterion='entropy', max_depth=3),
    GaussianNB(),
    svm.SVC(kernel='linear'),
]
accuracy = []
mean_acc = []
for model in models:
    cv_result = cross_val_score(model, X, y, cv=skf, scoring="accuracy")
    accuracy.append(cv_result)
    mean_acc.append(cv_result.mean())

<a id="6.1"></a>
### MODEL ACCURACIES

In [None]:
# Per-fold accuracy table: one row per classifier, one column per fold.
# Pass `classifiers` directly as the index: wrapping it in another list makes
# pandas build a one-level MultiIndex, which shows up as tuple labels.
cols = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5']
models_acc_df = pd.DataFrame(accuracy, index=classifiers, columns=cols)
models_acc_df

In [None]:
# Mean cross-validated accuracy, one row per classifier.
# Pass `classifiers` directly as the index: wrapping it in another list makes
# pandas build a one-level MultiIndex, which shows up as tuple labels.
models_mean_acc_df = pd.DataFrame(mean_acc, index=classifiers, columns=['Mean Accuracy'])
models_mean_acc_df

In [None]:
# Box plot of the per-fold accuracies, one box per classifier.
# A plain index (not `[classifiers]`) keeps the box labels as simple names
# rather than one-element tuples from a MultiIndex.
plt.subplots(figsize=(12, 4))
box = pd.DataFrame(accuracy, index=classifiers)
box.T.boxplot()

<a id="6.2"></a>
### MEAN ACCURACY OF CLASSIFICATION MODELS

In [None]:
# Horizontal bar chart of the mean cross-validated accuracy per classifier.
mean_accuracy = models_mean_acc_df['Mean Accuracy']
axis = mean_accuracy.plot.barh(width=0.8)
axis.set_title('CV Mean Accuracy')
axis.figure.set_size_inches(8, 5)
plt.show()

<a id="6.3"></a>
### CONFUSION MATRIX OF CLASSIFICATION MODELS

In [None]:
# Confusion matrix of the cross-validated predictions for each classifier,
# drawn on a 2x2 grid in row-major order.
f, ax = plt.subplots(2, 2, figsize=(20, 20))

panels = [
    ('Matrix for Logistic Regression', LogisticRegression()),
    ('Matrix for Decision Tree', DecisionTreeClassifier(criterion='entropy', max_depth=3)),
    ('Matrix for Naive Bayes', GaussianNB()),
    ('Matrix for Linear SVM', svm.SVC(kernel='linear')),
]

for panel, (title, estimator) in zip(ax.flat, panels):
    y_pred = cross_val_predict(estimator, X, y, cv=skf)
    sns.heatmap(confusion_matrix(y, y_pred), ax=panel, annot=True, fmt='2.0f')
    panel.set_title(title)
plt.show()

# NOTEBOOK STILL IN PROGRESS