In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [22]:
# Read the raw HR attrition table and drop four columns up front so they never
# reach the encoders or models further down.
df = (
    pd.read_csv('HR-Employee-Attrition.csv')
    .drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'])
)

In [17]:
def unique_vals(col):
    """Print ``<column name>: <number of distinct values>`` for an object-dtype column.

    Columns of any other dtype are skipped silently. Nothing is returned;
    the function is used for its printed output only.
    """
    is_object_column = col.dtype == "object"
    if not is_object_column:
        return None

    distinct_count = col.nunique()
    print(f'{col.name}: {distinct_count}')

# Run the report once per column; df.apply hands each column to unique_vals as a Series.
df.apply(unique_vals)





from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# A single LabelEncoder instance is shared by every call below. It is refit on
# each encoded column, so afterwards it only holds the classes of the last one.
le = LabelEncoder()

def label_encode(ser):
    """Integer-encode a column that has dtype ``object`` and at most two distinct values.

    A qualifying column has its name printed and is returned as the array
    produced by the shared encoder ``le``. Any other column is returned
    unchanged.
    """
    is_binary_text = ser.dtype == "object" and ser.nunique() <= 2
    if not is_binary_text:
        return ser

    print(ser.name)
    # fit() returns the encoder itself, so this is fit-then-transform on the same data.
    return le.fit(ser).transform(ser)

df = df.apply(label_encode)



# One-hot encode the three categorical columns listed here; drop_first=True
# drops the first level of each so the dummies are not redundant.
df = pd.get_dummies(
    df,
    columns=["BusinessTravel", "Department", "MaritalStatus"],
    drop_first=True,
)

# Split the label column off from the feature matrix.
target = df['Attrition'].copy()
df = df.drop(columns="Attrition")
type(target)  # bare expression: its value is discarded unless it is the last line of a cell






# The classes are imbalanced (more employees with Attrition=0 than Attrition=1),
# so pass stratify=target to keep the same class ratio in the train and test splits.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.25, random_state=7, stratify=target
)

# Report the shape of each of the four pieces.
for message, part in (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
):
    print(message, part.shape)



import category_encoders as ce

# Target-encode the two categorical columns that were not turned into dummies.
ce_target = ce.TargetEncoder(cols = ['EducationField', 'JobRole'])
# Fit on the training split only (features together with y_train) and replace
# X_train with its encoded version.
X_train = ce_target.fit_transform(X_train, y_train)
# The test split is only transformed with the already-fitted encoder; y_test is
# never passed to it.
X_test = ce_target.transform(X_test)




# Upsampling the minority class using SMOTE

from imblearn.over_sampling import SMOTE
from collections import Counter

# Seed the sampler so the synthetic minority rows are identical on every run
# (7 matches the random_state used for the train/test split).
smt = SMOTE(random_state=7)
# Only the training split is resampled; X_test / y_test are left as they are.
# NOTE(review): the model cells visible in this notebook fit on X_train / y_train,
# not on X_sm / y_sm - confirm whether the resampled data is meant to be used.
X_sm, y_sm = smt.fit_resample(X_train, y_train)

# Class counts after resampling.
print('Resampled dataset shape {}'.format(Counter(y_sm)))

Attrition: 2
BusinessTravel: 3
Department: 3
EducationField: 6
Gender: 2
JobRole: 9
MaritalStatus: 3
OverTime: 2
Attrition
Gender
OverTime
Number transactions X_train dataset:  (1102, 33)
Number transactions y_train dataset:  (1102,)
Number transactions X_test dataset:  (368, 33)
Number transactions y_test dataset:  (368,)
Resampled dataset shape Counter({0: 924, 1: 924})


In [21]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree using the entropy split criterion and no depth limit.
# random_state is fixed so the fitted tree and its predictions are repeatable
# across runs, as with the other estimators in this notebook.
model = DecisionTreeClassifier(criterion='entropy', random_state=7)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Peek at the first five test-set predictions.
y_pred[:5]

In [33]:
# Imported in this cell because the loop below is the first place the notebook
# uses cross_validate; without it a top-to-bottom run fails with NameError.
from sklearn.model_selection import cross_validate

# Candidate values for max_depth.
depths = [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15]

# For each depth, run 10-fold cross-validation on the training split and report
# mean train vs. validation accuracy to show where the tree starts to overfit.
# NOTE(review): X_train was target-encoded with all of y_train before this point,
# so each validation fold has already influenced the encoded features - confirm
# whether that is acceptable or the encoder should move inside the CV loop.
for depth in depths:
    tree_clf = DecisionTreeClassifier(random_state=7, max_depth=depth)

    cv_acc_results = cross_validate(
        tree_clf, X_train, y_train, cv=10, scoring='accuracy', return_train_score=True
    )

    # Format as percentages with two decimals; multiplying a rounded float by
    # 100 printed artefacts such as 83.85000000000001.
    train_pct = cv_acc_results['train_score'].mean() * 100
    val_pct = cv_acc_results['test_score'].mean() * 100
    print(
        f"K-Fold for depth:{depth} "
        f"            Accuracy Mean: Train: {train_pct:.2f} "
        f"            Validation: {val_pct:.2f}"
    )

    print('***************')

K-Fold for depth:1             Accuracy Mean: Train: 83.85000000000001             Validation: 83.85000000000001
***************
K-Fold for depth:2             Accuracy Mean: Train: 85.87             Validation: 84.48
***************
K-Fold for depth:3             Accuracy Mean: Train: 86.91             Validation: 84.75
***************
K-Fold for depth:4             Accuracy Mean: Train: 88.92999999999999             Validation: 84.03
***************
K-Fold for depth:5             Accuracy Mean: Train: 90.72             Validation: 83.03
***************
K-Fold for depth:6             Accuracy Mean: Train: 92.52             Validation: 83.03
***************
K-Fold for depth:7             Accuracy Mean: Train: 94.16             Validation: 81.75
***************
K-Fold for depth:9             Accuracy Mean: Train: 96.98             Validation: 80.94
***************
K-Fold for depth:11             Accuracy Mean: Train: 98.77             Validation: 77.94
***************
K-Fold for depth:1

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Fit one depth-6 tree on the training split (fit() returns the estimator itself)
# and predict the test split.
tree_clf = DecisionTreeClassifier(random_state=7, max_depth=6).fit(X_train, y_train)
pred = tree_clf.predict(X_test)

# Score on the training split first, then on the held-out test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(tree_clf.score(features, labels))

0.9228675136116152
0.8478260869565217


In [29]:
# Draw 600 rows from X_train with replacement, so the same row may appear more
# than once. random_state is fixed so the draw (and the table shown below) is
# the same on every run.
row_sample = X_train.sample(n=600, replace=True, random_state=7)
row_sample

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Research & Development,Department_Sales,MaritalStatus_Married,MaritalStatus_Single
337,29,738,9,5,0.138715,2,1,30,2,1,...,3,2,2,2,False,True,True,False,False,True
1435,44,1037,1,3,0.127479,2,1,42,3,1,...,4,3,1,2,False,True,True,False,False,True
711,29,906,10,3,0.151584,4,0,92,2,1,...,0,0,0,0,False,True,True,False,False,True
369,31,408,9,4,0.151584,3,1,42,2,1,...,2,2,2,2,False,True,True,False,False,True
691,40,1469,9,4,0.127479,4,1,35,3,1,...,1,1,0,0,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,33,586,1,3,0.127479,1,1,48,4,2,...,9,8,0,8,False,True,False,True,False,False
460,26,775,29,2,0.127479,1,1,45,3,2,...,0,0,0,0,False,True,False,True,False,False
508,35,1017,6,4,0.151584,2,1,82,1,2,...,17,11,11,8,False,True,True,False,False,True
901,48,969,2,2,0.239544,4,1,76,4,1,...,1,0,0,0,False,True,True,False,False,True


In [30]:
# Sampling columns (we do not wish to repeat columns, hence replace=False in this case).
# axis=1 makes sample() pick columns instead of rows; random_state is fixed so
# the same five columns are chosen on every run.
row_sample = row_sample.sample(n=5, replace=False, axis=1, random_state=7)
row_sample

Unnamed: 0,DistanceFromHome,MonthlyRate,MonthlyIncome,RelationshipSatisfaction,Department_Sales
337,9,7621,3983,3,False
1435,1,13422,2436,3,False
711,10,11479,2404,3,False
369,9,7551,2657,4,False
691,9,25063,3617,4,False
...,...,...,...,...,...
619,1,21816,4037,1,True
460,29,4267,4306,1,True
508,6,19368,6646,2,False
901,2,16620,2559,3,False


In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
# Random forest: n_estimators is the number of trees (100 here), each limited to
# depth 6, with a fixed seed.
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=7)

In [33]:
from sklearn.model_selection import cross_validate

In [35]:
# 10-fold cross-validated accuracy of the forest on the training split, keeping
# the per-fold training scores as well as the validation ones.
cv_acc_results = cross_validate(
    rf_clf,
    X_train,
    y_train,
    scoring='accuracy',
    cv=10,
    return_train_score=True,
)

In [36]:
# Mean accuracy across the folds, as percentages with two decimals.
train_pct = cv_acc_results['train_score'].mean() * 100
val_pct = cv_acc_results['test_score'].mean() * 100
print(
    "K-Fold Accuracy Mean: \n"
    f"Train: {train_pct:.2f}\n"
    f"Validation: {val_pct:.2f}"
)

K-Fold Accuracy Mean: 
Train: 89.77
Validation: 84.94
