In [5]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Employee will be Attrited or not is the Dependent Variable

df = pd.read_csv("HR_Employee_Attrition_Data.csv")

In [6]:
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,1102,1,2,1,1,2,94,3,...,1,80,0,8,0,1,6,4,0,5
1,49,0,279,8,1,1,2,3,61,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,1373,2,2,1,3,4,92,2,...,2,80,0,7,3,3,0,0,0,0
3,33,0,1392,3,4,1,4,4,56,3,...,3,80,0,8,3,3,8,7,3,0
4,27,0,591,2,1,1,5,1,40,3,...,4,80,1,6,3,3,2,2,2,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       2940 non-null   int64
 1   Attrition                 2940 non-null   int64
 2   DailyRate                 2940 non-null   int64
 3   DistanceFromHome          2940 non-null   int64
 4   Education                 2940 non-null   int64
 5   EmployeeCount             2940 non-null   int64
 6   EmployeeNumber            2940 non-null   int64
 7   EnvironmentSatisfaction   2940 non-null   int64
 8   HourlyRate                2940 non-null   int64
 9   JobInvolvement            2940 non-null   int64
 10  JobLevel                  2940 non-null   int64
 11  JobSatisfaction           2940 non-null   int64
 12  MonthlyIncome             2940 non-null   int64
 13  MonthlyRate               2940 non-null   int64
 14  NumCompaniesWorked        2940 non-null 

In [7]:
df.drop(['EmployeeCount', 'EmployeeNumber'], axis=1, inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       2940 non-null   int64
 1   Attrition                 2940 non-null   int64
 2   DailyRate                 2940 non-null   int64
 3   DistanceFromHome          2940 non-null   int64
 4   Education                 2940 non-null   int64
 5   EnvironmentSatisfaction   2940 non-null   int64
 6   HourlyRate                2940 non-null   int64
 7   JobInvolvement            2940 non-null   int64
 8   JobLevel                  2940 non-null   int64
 9   JobSatisfaction           2940 non-null   int64
 10  MonthlyIncome             2940 non-null   int64
 11  MonthlyRate               2940 non-null   int64
 12  NumCompaniesWorked        2940 non-null   int64
 13  PercentSalaryHike         2940 non-null   int64
 14  PerformanceRating         2940 non-null 

In [11]:
df.Attrition.sum()/df.Attrition.count()

0.16122448979591836

In [12]:
# Split the data as train test split

x = df.drop('Attrition', axis=1)
y = df.pop('Attrition')

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.30, random_state=1)

# Decision Tree Classifier

In [13]:
# Decision Tree Classifier

dt_model = DecisionTreeClassifier(criterion='gini')

In [14]:
dt_model.fit(x_train, y_train)

DecisionTreeClassifier()

In [15]:
from sklearn import tree

In [18]:
train_char_labels = ['No', 'Yes']
HR_Tree_File = open('hr_tree.doc', 'w')

dot_data = tree.export_graphviz(dt_model, out_file=HR_Tree_File, feature_names= list(x_train), class_names=list(train_char_labels))

HR_Tree_File.close()

## Grid Search CV to check for the optimal depth for D-Tree

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
param_grid = {
    'max_depth': [7, 8, 9, 10],
    'min_samples_leaf': [15, 20, 25],
    'min_samples_split': [45, 60, 75]
}

dt_model = DecisionTreeClassifier()

In [21]:
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=3)

In [22]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [7, 8, 9, 10],
                         'min_samples_leaf': [15, 20, 25],
                         'min_samples_split': [45, 60, 75]})

In [23]:
grid_search.best_params_

{'max_depth': 7, 'min_samples_leaf': 25, 'min_samples_split': 45}

In [24]:
best_grid = grid_search.best_estimator_

In [25]:
ytrain_predict = best_grid.predict(x_train)
ytest_predict = best_grid.predict(x_test)

In [26]:
from sklearn.metrics import classification_report

In [27]:
print(classification_report(y_train, ytrain_predict))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1737
           1       0.66      0.21      0.32       321

    accuracy                           0.86      2058
   macro avg       0.76      0.60      0.62      2058
weighted avg       0.84      0.86      0.83      2058



In [28]:
print(classification_report(y_test, ytest_predict))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90       729
           1       0.51      0.15      0.23       153

    accuracy                           0.83       882
   macro avg       0.68      0.56      0.57       882
weighted avg       0.79      0.83      0.79       882



Use Predict_Proba

In [29]:
ytrain_predict_proba = best_grid.predict_proba(x_train)
ytest_predict_proba = best_grid.predict_proba(x_test)

In [30]:
ytrain_predict_proba[:,1]

array([0.        , 0.04      , 0.1875    , ..., 0.05079365, 0.        ,
       0.13953488])

# Random Forest Classifier

In [33]:
param_grid = {
    'max_depth': [6, 7],
    'max_features': [10, 14],
    'min_samples_leaf': [15, 20],
    'min_samples_split': [40, 50],
    'n_estimators': [101, 301]
}

rfcl = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rfcl, param_grid = param_grid, cv = 3)

In [35]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 7], 'max_features': [10, 14],
                         'min_samples_leaf': [15, 20],
                         'min_samples_split': [40, 50],
                         'n_estimators': [101, 301]})

In [36]:
grid_search.best_params_

{'max_depth': 6,
 'max_features': 14,
 'min_samples_leaf': 15,
 'min_samples_split': 50,
 'n_estimators': 301}

In [37]:
best_grid = grid_search.best_estimator_

In [38]:
ytrain_predict = best_grid.predict(x_train)
ytest_predict = best_grid.predict(x_test)

In [39]:
print(classification_report(y_train, ytrain_predict))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1737
           1       0.93      0.16      0.28       321

    accuracy                           0.87      2058
   macro avg       0.90      0.58      0.60      2058
weighted avg       0.88      0.87      0.83      2058



In [40]:
print(classification_report(y_test, ytest_predict))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       729
           1       0.91      0.13      0.23       153

    accuracy                           0.85       882
   macro avg       0.88      0.56      0.57       882
weighted avg       0.86      0.85      0.80       882



# Artificial Neural Networks

In [41]:
# Scaling is necessary here
from sklearn.preprocessing import StandardScaler

In [42]:
sc = StandardScaler()

In [43]:
x_trains = sc.fit_transform(x_train)

In [45]:
x_tests = sc.transform(x_test)

In [69]:
param_grid = {
    'hidden_layer_sizes': [100],
    'activation': ['logistic', 'relu'],
    'solver': ['sgd', 'adam'],
    'tol': [0.01,0.001],
    'max_iter' : [10000]
}

mlp = MLPClassifier()

grid_search = GridSearchCV(estimator = mlp, param_grid = param_grid, cv = 3)

In [70]:
grid_search.fit(x_trains, y_train)

GridSearchCV(cv=3, estimator=MLPClassifier(),
             param_grid={'activation': ['logistic', 'relu'],
                         'hidden_layer_sizes': [100], 'max_iter': [10000],
                         'solver': ['sgd', 'adam'], 'tol': [0.01, 0.001]})

In [71]:
grid_search.best_params_

{'activation': 'relu',
 'hidden_layer_sizes': 100,
 'max_iter': 10000,
 'solver': 'adam',
 'tol': 0.001}

In [72]:
best_grid = grid_search.best_estimator_

In [74]:
ytrain_predict = best_grid.predict(x_train)
ytest_predict = best_grid.predict(x_test)

In [56]:
print(classification_report(y_train, ytrain_predict))
print(classification_report(y_test, ytest_predict))

              precision    recall  f1-score   support

           0       0.84      1.00      0.92      1737
           1       0.00      0.00      0.00       321

    accuracy                           0.84      2058
   macro avg       0.42      0.50      0.46      2058
weighted avg       0.71      0.84      0.77      2058

              precision    recall  f1-score   support

           0       0.83      1.00      0.91       729
           1       0.00      0.00      0.00       153

    accuracy                           0.83       882
   macro avg       0.41      0.50      0.45       882
weighted avg       0.68      0.83      0.75       882



  _warn_prf(average, modifier, msg_start, len(result))
