In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List what is available in the input directory, then load the HR attrition CSV.
input_dir_listing = os.listdir("../input")
print(input_dir_listing)
data = pd.read_csv('../input/WA_Fn-UseC_-HR-Employee-Attrition.csv')
# Any results you write to the current directory are saved as output.

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# Preview the first rows of the loaded frame.
data.head()

The data above consists of the dependent variable Attrition, with the remaining columns as independent variables. Exploring the data will help us decide which model to use.

# Data Exploration

In [6]:
# Visual missing-value check: heatmap of the boolean null mask of `data`.
plt.figure(figsize=(12, 7))
null_mask = data.isnull()
sns.heatmap(
    null_mask,
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)
plt.show()

In the chart above we checked for missing values using a heatmap. The visualisation shows that there are no missing (NA) values in this data.

In [7]:
# Correlation heatmap of the numeric features.
# Only numeric columns are passed to corr(): at this point the frame still
# holds string-valued columns (they are encoded further down), and recent
# pandas versions raise on them instead of silently dropping them.
plt.figure(figsize=(14,10))
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(),yticklabels=False,cbar=True,linewidths=0)
plt.show()

The heatmap above shows strong correlation between several of the features (multicollinearity). Dropping these features could lose useful information, so we will try using decomposition to solve this problem instead.

The above shows the joint plot of Kernel Density distribution of Daily Rate against Age.

In [8]:
# Bar plot of DailyRate per Department, split by EducationField.
plt.figure(figsize=(8, 8))
sns.barplot(
    data=data,
    x='Department',
    y='DailyRate',
    hue='EducationField',
)
plt.show()

In [9]:
# Class balance of the target variable.
# The column is passed as the `x` keyword: newer seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=data['Attrition'])
plt.show()

In [10]:
# Show the column labels of the loaded frame.
data.columns

# Feature Engineering

In [11]:
# Summarise dtypes and non-null counts to spot the non-numeric columns.
data.info()

<b> From the above, we can see that there are 9 categorical columns. We need to create dummy variables for them.</b>

In [12]:
# Dummy-encode BusinessTravel; drop_first removes the first level.
BusinessTravel = pd.get_dummies(
    data['BusinessTravel'],
    drop_first=True,
)

In [13]:
# Dummy-encode Department; drop_first removes the first level.
Department = pd.get_dummies(
    data['Department'],
    drop_first=True,
)

In [14]:
# Dummy-encode EducationField; drop_first removes the first level.
EducationField = pd.get_dummies(
    data['EducationField'],
    drop_first=True,
)

In [15]:
# Dummy-encode Gender; drop_first removes the first level.
Gender = pd.get_dummies(
    data['Gender'],
    drop_first=True,
)

In [16]:
# Dummy-encode JobRole; drop_first removes the first level.
JobRole = pd.get_dummies(
    data['JobRole'],
    drop_first=True,
)

In [17]:
# Dummy-encode MaritalStatus; drop_first removes the first level.
MaritalStatus = pd.get_dummies(
    data['MaritalStatus'],
    drop_first=True,
)

In [18]:
# Work on an independent copy so the column conversions and drops applied to
# `Train` below do not modify the raw `data` frame through a shared reference.
Train = data.copy()

In [19]:
def StrToBin(a):
    """Return 1 when `a` equals the string 'Yes', otherwise 0."""
    return 1 if a == 'Yes' else 0
    

In [20]:
def StrToBinb(a):
    """Return 1 when `a` equals the string 'Y', otherwise 0."""
    return 1 if a == 'Y' else 0
    

In [21]:
# Recode the target column to 0/1 using StrToBin ('Yes' -> 1, anything else -> 0).
attrition_flags = Train['Attrition'].apply(StrToBin)
Train['Attrition'] = attrition_flags

In [22]:
# Recode OverTime to 0/1 using StrToBin ('Yes' -> 1, anything else -> 0).
overtime_flags = Train['OverTime'].apply(StrToBin)
Train['OverTime'] = overtime_flags

In [23]:
# Recode Over18 to 0/1 using StrToBinb ('Y' -> 1, anything else -> 0).
over18_flags = Train['Over18'].apply(StrToBinb)
Train['Over18'] = over18_flags

In [24]:
# Re-check dtypes after the Yes/No and Y/N columns were recoded to 0/1.
Train.info()

In [25]:
# Remove the original categorical columns; their dummy encodings were built
# from `data` in the cells above. Reassigning the result instead of using
# inplace=True avoids mutating any other reference to the same frame.
Train = Train.drop(['Department','EducationField','Gender','BusinessTravel','JobRole','MaritalStatus'],axis=1)

In [26]:
# Append the dummy-encoded frames to Train column-wise.
dummy_frames = [Department, EducationField, Gender, BusinessTravel, JobRole, MaritalStatus]
Train = pd.concat([Train] + dummy_frames, axis=1)

In [27]:
# Inspect the frame after the dummy columns were concatenated.
Train.info()

As we can see, there are many features in this data. Removing features might lose information, so instead of feature selection we are going to extract features from this data using Linear Discriminant Analysis.

In [28]:
# m: every column name in Train; n: the same list without the target 'Attrition'.
m = list(Train.columns)
n = [column for column in m if column not in ['Attrition']]

In [29]:
# Feature matrix: all columns listed in n (everything except 'Attrition').
X = Train[n]

In [30]:
# Target vector: the 0/1-encoded Attrition column.
y = Train['Attrition']

# Train Test Split

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [32]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

#  MODEL 1: Logistic Regression with LDA

# Applying LDA

In [33]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [34]:
# Project the features onto the LDA discriminant axis.
# The target has two classes (0/1), and LDA yields at most n_classes - 1
# components, so n_components must be 1; asking for 2 is rejected by current
# scikit-learn releases.
lda = LDA(n_components = 1)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
# L2-regularised logistic regression on the LDA-projected features.
# The 'sag' solver shuffles the data, so random_state is fixed (matching the
# seed used for the train/test split) to make the fit reproducible.
classifier = LogisticRegression(penalty='l2', solver='sag', C=1, random_state=0)
# The value returned by fit() is kept as fit1; later cells use that name.
fit1 = classifier.fit(X_train, y_train)

In [36]:
# Predict attrition labels for the held-out test features.
y_pred = fit1.predict(X_test)

# Confusion Matrix and Classification Report

In [37]:
from sklearn.metrics import confusion_matrix,classification_report
# Score the logistic-regression predictions against the held-out labels:
# confusion matrix and text classification report.
cm, cr = (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)

In [38]:
# Print the confusion matrix followed by the classification report.
print(cm, cr, sep='\n')

<b>Applying LDA and then performing Logistic Regression gives us an accuracy score of 87%.</b>

----------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

# MODEL 2: K- Nearest Neighbour 

# Train Test Split

In [39]:
from sklearn.model_selection import train_test_split
# Fresh split of X and y for the KNN model: 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

<b>Applying LDA</b>

In [40]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [41]:
# LDA projection with the eigen solver and automatic shrinkage.
# The target has two classes (0/1), and LDA yields at most n_classes - 1
# components, so n_components must be 1; asking for 2 is rejected by current
# scikit-learn releases.
lda = LDA(n_components = 1,solver='eigen',shrinkage='auto')
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

<b> 1) Performing KNN </b>

In [42]:
from sklearn.neighbors import KNeighborsClassifier
# Distance-weighted 17-nearest-neighbour classifier using brute-force search,
# fitted on the LDA-projected training data.
knn = KNeighborsClassifier(
    n_neighbors=17,
    weights='distance',
    algorithm='brute',
)
knn.fit(X_train, y_train)

In [43]:
# Predict test-set labels with the fitted KNN classifier.
pred = knn.predict(X_test)

<b> Choosing the best K </b>

In [44]:
# Sweep K from 1 to 39 and record the test-set misclassification rate for each.
# The candidate models use their own name (knn_i) so the classifier already
# bound to `knn` is not overwritten by the last model of the sweep; later cells
# that reference `knn` then keep using the classifier fitted above.
error_rate = []
for i in range(1,40):
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train,y_train)
    pred_i = knn_i.predict(X_test)
    # Mean of the element-wise mismatch mask = fraction of wrong predictions.
    error_rate.append(np.mean(pred_i != y_test))

In [45]:
# Plot the recorded error rate against each K value tried (1..39).
k_values = range(1, 40)
marker_style = dict(marker='o', markerfacecolor='red', markersize=10)
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', **marker_style)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

<b> Report </b>

In [46]:
from sklearn.metrics import classification_report,confusion_matrix
# Report for the KNN predictions stored in `pred`.
knn_confusion = confusion_matrix(y_test, pred)
knn_report = classification_report(y_test, pred)
print(knn_confusion)
print(knn_report)

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

#  MODEL 3: SVM

# Train Test Split

In [47]:
from sklearn.model_selection import train_test_split
# Fresh split of X and y for the SVM model: 30% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

<b> Applying LDA </b>

In [48]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# LDA projection with the SVD solver.
# The target has two classes (0/1), and LDA yields at most n_classes - 1
# components, so n_components must be 1; asking for 2 is rejected by current
# scikit-learn releases.
lda = LDA(n_components = 1,solver='svd')
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

In [49]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier with C=100, fitted on the
# LDA-projected training data.
model = SVC(
    kernel='rbf',
    C=100,
)
model.fit(X_train, y_train)

In [50]:
# Predict test-set labels with the fitted SVM.
predictions = model.predict(X_test)

<b> Report </b>

In [51]:
from sklearn.metrics import classification_report,confusion_matrix
# Report for the SVM predictions stored in `predictions`.
svm_confusion = confusion_matrix(y_test, predictions)
svm_report = classification_report(y_test, predictions)
print(svm_confusion)
print(svm_report)

# Decision Tree

In [52]:
from sklearn.tree import DecisionTreeClassifier as DTC

In [53]:
# Entropy-based decision tree with random split selection.
# splitter='random' makes the fitted tree depend on the random state, so it is
# fixed (matching the seed used for the train/test splits) for reproducibility.
dt = DTC(criterion='entropy',splitter='random',random_state=0)

In [54]:
# Fit the decision tree on the current training data; the value returned by
# fit() is kept as `tree` and is used later for cross-validation.
tree = dt.fit(X_train,y_train)

In [55]:
# Predict test-set labels with the fitted decision tree.
pred_t = dt.predict(X_test)

In [56]:
# Confusion matrix for the decision-tree predictions (reuses the name `cm`).
cm = confusion_matrix(y_test,pred_t)

In [57]:
# Classification report for the decision-tree predictions (reuses the name `cr`).
cr = classification_report(y_test,pred_t)

In [58]:
# Print the decision-tree confusion matrix followed by its classification report.
print(cm, cr, sep='\n')

# K-FOLD CROSS VALIDATION

<b> Logistic Regression </b>

In [64]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the logistic regression estimator.
# NOTE(review): X_train/y_train are whatever the most recently executed
# split + LDA cell left in scope, and that LDA was fitted before the folds
# are formed - confirm this is the intended evaluation setup.
accuracies = cross_val_score(fit1, X_train, y_train, cv=10)
mean_accuracy = accuracies.mean()
sd_accuracy = accuracies.std()
print('mean: ' ,mean_accuracy)
print('SD:' ,sd_accuracy)
print(np.mean(accuracies))

<b> KNN </b>

In [65]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of whichever classifier is currently bound to `knn`.
# NOTE(review): X_train/y_train are whatever the most recently executed
# split + LDA cell left in scope, and that LDA was fitted before the folds
# are formed - confirm this is the intended evaluation setup.
accuracies = cross_val_score(knn, X_train, y_train, cv=10)
mean_accuracy = accuracies.mean()
sd_accuracy = accuracies.std()
print('mean: ' ,mean_accuracy)
print('SD:' ,sd_accuracy)
print(np.mean(accuracies))

<b> SVM </b>

In [66]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the SVM estimator.
# NOTE(review): X_train/y_train are whatever the most recently executed
# split + LDA cell left in scope, and that LDA was fitted before the folds
# are formed - confirm this is the intended evaluation setup.
accuracies = cross_val_score(model, X_train, y_train, cv=10)
mean_accuracy = accuracies.mean()
sd_accuracy = accuracies.std()
print('mean: ' ,mean_accuracy)
print('SD:' ,sd_accuracy)
print(np.mean(accuracies))

<b> Decision Tree </b>

In [67]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree estimator.
# NOTE(review): X_train/y_train are whatever the most recently executed
# split + LDA cell left in scope, and that LDA was fitted before the folds
# are formed - confirm this is the intended evaluation setup.
accuracies = cross_val_score(tree, X_train, y_train, cv=10)
mean_accuracy = accuracies.mean()
sd_accuracy = accuracies.std()
print('mean: ' ,mean_accuracy)
print('SD:' ,sd_accuracy)
print(np.mean(accuracies))

From this we can see that the best algorithm to predict if an employee will leave or not is **KNN** followed by **Logistic Regression**.

# Thank You