# Attrition Predictor

1. Getting the data from CSV file

In [1]:
import pandas as pd
df = pd.read_csv("HR-Employee-Attrition.csv")

In [2]:
df.head() # taking a look at the data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Finding the missing values
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [4]:
# Taking out the target variable
Y = df['Attrition']

Here we can see that 'Attrition','EmployeeNumber','EmployeeCount','StandardHours' can be dropped off
1. Employee number is a series and is not connected to prediction variable - seen in correlation
2. Attrition - already isolated from data for prediction
3. Employee count and Standard hours - are not chnaging for any record

In [5]:
df = df.drop(columns = ['Attrition','EmployeeNumber','EmployeeCount','StandardHours'])

In [6]:
# Feature Encoding
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder() 
df['Gender']= le.fit_transform(df['Gender']) 
df['Over18']= le.fit_transform(df['Over18']) 
df['OverTime']= le.fit_transform(df['OverTime']) 
df['BusinessTravel']= le.fit_transform(df['BusinessTravel']) 
df['Department']= le.fit_transform(df['Department']) 
df['EducationField']= le.fit_transform(df['EducationField']) 
df['JobRole']= le.fit_transform(df['JobRole']) 
df['MaritalStatus']= le.fit_transform(df['MaritalStatus']) 

In [7]:
# Encoding the target
Y = le.fit_transform(Y)

In [8]:
# Extracting Nominal Features

NominalData = df[['BusinessTravel','Department','EducationField','JobRole','MaritalStatus']]

In [9]:
# Encoding the Nominal features

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(NominalData)
data = enc.transform(NominalData).toarray()

In [10]:
# Gluing these features back to the original dataframe

df = df.drop(columns = ['BusinessTravel','Department','EducationField','JobRole','MaritalStatus'])
data1 = pd.DataFrame(data)
df = pd.concat([df,data1], axis=1, join='outer')

In [11]:
# Testing and Training data Split

from sklearn.model_selection import train_test_split
Train_x, Test_x, Train_y, Test_y = train_test_split(df, Y, test_size=0.35, random_state= 40)

In [12]:
#Model training

from sklearn import linear_model
reg = linear_model.LogisticRegression()
reg.fit(Train_x,Train_y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Making Predictions

In [13]:
predict = reg.predict(Test_x)

Checking Accuracy

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(Test_y, predict)

0.8388349514563107

Checking confusion Matrix

In [15]:
from sklearn.metrics import confusion_matrix
confu_mat = confusion_matrix(Test_y, predict)
print(confu_mat)

[[432   0]
 [ 83   0]]


In [16]:
# Checking the random forest classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
lm2 = RandomForestClassifier(n_estimators= 300,random_state= 20).fit(Train_x,Train_y)
forest_pred = lm2.predict(Test_x)

In [18]:
# Accuracy of random forest

accuracy_score(Test_y, forest_pred)

0.8563106796116505

In [19]:
# Confusion Matrix
confu_mat = confusion_matrix(Test_y, forest_pred)
print(confu_mat)

[[431   1]
 [ 73  10]]
