# Demostration of HR Analytics (Predicting Attrition and Performance)

[Using IBM HR Analytics Employee Attrition & Performance](https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-datase)

***

## Import relevant libraries

In [31]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

% matplotlib inline

## load and process data

In [32]:
homedir = os.path.expanduser('~')
DATA_FILEPATH = os.path.join(homedir, 'data/HR Analytics/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df = pd.read_csv(DATA_FILEPATH)

In [33]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [34]:
df.shape

(1470, 35)

In [35]:
# remove noisy fields
df.pop('EmployeeCount')
df.pop('EmployeeNumber')
df.pop('Over18')
df.pop('StandardHours')

print('Removed noisy features')

Removed noisy features


In [36]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [37]:
print("normal data distribution: {}".format(Counter(df['Attrition'])))
targets = df['Attrition'].value_counts()

pcts = targets / targets.sum()
pcts

normal data distribution: Counter({'No': 1233, 'Yes': 237})


No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64

In [38]:
y = df['Attrition']

#remove target and set all remaining fields as features
df.pop('Attrition')
print('Set target variable')

Set target variable


In [39]:
from sklearn import preprocessing

In [40]:
# binarize the target variable from Yes / No to [0|1]
le = preprocessing.LabelBinarizer()
y = le.fit_transform(y).ravel()

In [41]:
# one hot encode categortical data using pandas built-in dummies method
all_categorical_fields = df.select_dtypes('object').columns
all_categorical_fields_indicator = []

for categorical_field in all_categorical_fields:
        all_categorical_fields_indicator.append(pd.get_dummies(df[categorical_field], prefix=categorical_field))
        
# create a new dataframe with one hot encoded features and continous variables
# now add remaining continous variables
all_categorical_fields_indicator.append(df.select_dtypes(['int64']))
X = pd.concat([categorical_field for categorical_field in all_categorical_fields_indicator], axis=1)

In [42]:
print('shape of X: {}'.format(X.shape))
print('shape of y: {}'.format(y.shape))

shape of X: (1470, 51)
shape of y: (1470,)


In [43]:
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE

X, y = SMOTE().fit_sample(X, y)
print("SMOTE data distribution: {}".format(Counter(y)))

SMOTE data distribution: Counter({1: 1233, 0: 1233})


***

## Fit and Test Model

In [44]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [45]:
# split train test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))    

In [47]:
gbc_clf = GradientBoostingClassifier(
    criterion = 'friedman_mse',
    init = None,
    learning_rate = 0.3,
    loss = 'deviance',
    max_depth = 3,
    max_features = None,
    max_leaf_nodes = None,
    min_impurity_decrease = 0.0,
    min_impurity_split = None,
    min_samples_leaf = 3,
    min_samples_split = 10,
    min_weight_fraction_leaf = 0.0,
    n_estimators = 100,
    presort = 'auto',
    random_state = 42,
    subsample = 1.0,
    verbose = 0,
    warm_start = False)

In [48]:
# fit in sample data
gbc_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.3, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

### Evaluate model performance

In [49]:
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=True) # training set
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=False) # testing set

Train Result:

accuracy score: 0.9973

Classification Report: 
              precision    recall  f1-score   support

          0       0.99      1.00      1.00       939
          1       1.00      0.99      1.00       910

avg / total       1.00      1.00      1.00      1849


Confusion Matrix: 
 [[939   0]
 [  5 905]]

Average Accuracy: 	 0.9151
Accuracy SD: 		 0.0290
Test Result:

accuracy score: 0.9287

Classification Report: 
              precision    recall  f1-score   support

          0       0.90      0.96      0.93       294
          1       0.96      0.90      0.93       323

avg / total       0.93      0.93      0.93       617


Confusion Matrix: 
 [[282  12]
 [ 32 291]]



## Improve basemodel via parameter optimisation

from sklearn.model_selection import GridSearchCV

params_grid = {"max_depth": [3, 5, None],
               "min_samples_split": [2, 3, 10],
               "min_samples_leaf": [1, 3, 10],
               "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5]}

grid_search = GridSearchCV(gbc_clf, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print(grid_search.best_score_)
grid_search.best_estimator_.get_params()

In [24]:
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=True) # training set
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=False) # testing set

Train Result:

accuracy score: 0.9555

Classification Report: 
              precision    recall  f1-score   support

          0       0.95      1.00      0.97       920
          1       0.99      0.74      0.85       182

avg / total       0.96      0.96      0.95      1102


Confusion Matrix: 
 [[919   1]
 [ 48 134]]

Average Accuracy: 	 0.8766
Accuracy SD: 		 0.0248
Test Result:

accuracy score: 0.8668

Classification Report: 
              precision    recall  f1-score   support

          0       0.88      0.97      0.93       313
          1       0.62      0.27      0.38        55

avg / total       0.85      0.87      0.84       368


Confusion Matrix: 
 [[304   9]
 [ 40  15]]

