In [121]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [39]:
hr = pd.read_excel('IBM_hr.xlsx')

In [40]:
hr.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [41]:
hr.shape

(1470, 35)

In [42]:
# Dropping columns which are irrelevant to attrition or they have same values throughout.
hr = hr.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours', 'Over18'], axis = 1)

In [43]:
# Check columns with NULL values
hr.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [44]:
# Making feature and target variable
X = hr.drop('Attrition', axis = 1) # Features (all columns except Attrition)
y = hr['Attrition'] # Target

### Converting target variable(y) from Yes/No to numeric

In [45]:
lb = LabelBinarizer()

In [46]:
y = lb.fit_transform(y)

### Tidying categorical data in feature variable(X)

In [59]:
# Vectorizing all 'object' columns and concatenating them with other 'integer' column of the original X dataframe
X_vect = pd.concat([X.select_dtypes('int64'), pd.get_dummies(X.select_dtypes('object'))], axis = 1)

In [61]:
X_vect.shape

(1470, 51)

# Modelling

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size = 0.1)

In [143]:
tree = DecisionTreeClassifier(max_depth = 3)

In [144]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Evaluating

In [153]:
print('{0:.2f}'.format(accuracy_score(y_test, tree.predict(X_test))*100))

84.35


In [146]:
print(classification_report(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91       127
           1       0.33      0.15      0.21        20

   micro avg       0.84      0.84      0.84       147
   macro avg       0.61      0.55      0.56       147
weighted avg       0.80      0.84      0.82       147



In [147]:
confusion_matrix(y_test, tree.predict(X_test))

array([[121,   6],
       [ 17,   3]], dtype=int64)

> Will try to improve my accuracy using ensembles.