# Model for employee turnover

In [1]:
# Processing the data
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Ensemble algorithm
from sklearn.ensemble import RandomForestClassifier

# Create a pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To measure performance
from sklearn import metrics

# Save the model
import joblib

## Train Model

In [2]:
# Read the raw HR attrition dataset (path is relative to the notebook's working directory).
hrdata = pd.read_csv(filepath_or_buffer='HR_Employee_Attrition.csv')

In [3]:
# Build the modelling frame: a copy of the raw data without the columns
# that are not used as model features.
train_hrdata = hrdata.drop(
    labels=[
        'EmployeeCount',
        'EmployeeNumber',
        'JobLevel',
        'Over18',
        'StandardHours',
        'TotalWorkingYears',
    ],
    axis='columns',
)

In [4]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The mapping reads from the untouched `hrdata` column (same row index as
# `train_hrdata`) instead of from `train_hrdata` itself, so re-running this
# cell does not map the already-encoded 0/1 values to NaN.
train_hrdata['Attrition'] = hrdata['Attrition'].map({'Yes': 1, 'No': 0})

In [5]:
# String-valued columns that are ordinal-encoded before reaching the classifier.
# The order is kept as-is because it determines the column order produced by
# the encoder step.
categorical_attributes = [
    'BusinessTravel',
    'OverTime',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
]

In [6]:
# Random-forest classifier with fixed hyperparameters and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=180,
    max_depth=12,
    max_features=11,
    n_jobs=-1,
    random_state=2021,
)

# Ordinal-encode the categorical columns; every other column is passed
# through unchanged.
cat_pipe = ColumnTransformer(
    transformers=[
        ('ordinal_encoder', OrdinalEncoder(), categorical_attributes),
    ],
    remainder='passthrough',
)

# Encoding followed by classification, handled as a single estimator so the
# same preprocessing is applied at fit and at predict time.
pipe_model = Pipeline(steps=[
    ('encoder', cat_pipe),
    ('classification', rf),
])

In [7]:
# Rebalance the classes: keep a random sample of 600 rows with Attrition == 0
# and include every row with Attrition == 1 twice.
# random_state pins the sample so that Restart & Run All rebuilds the same
# training set (and therefore the same metrics) every time.
df1 = (
    train_hrdata[train_hrdata.Attrition == 0]
    .sample(600, random_state=2021)
    .reset_index(drop=True)
)
df2 = train_hrdata[train_hrdata.Attrition == 1]

# NOTE(review): the positive rows are duplicated *before* the train/test
# split performed further down, so copies of the same employee can land on
# both sides of the split and make the hold-out metrics optimistic.
# Consider splitting first and oversampling only the training partition.
train_set = pd.concat([df1, df2, df2], axis=0).reset_index(drop=True)

In [8]:
# Separate the feature columns from the target column.
# `columns=` replaces the positional axis argument (`drop('Attrition', 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 on.
x = train_set.drop(columns='Attrition')
y = train_set['Attrition']

print(x.shape)
print(y.shape)

(1074, 28)
(1074,)


In [9]:
# Hold out 20% of the rows for evaluation; stratifying on `y` keeps the
# class ratio the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)

In [10]:
# Fit the whole pipeline on the training partition, then predict the hold-out rows.
pipe_model.fit(x_train, y_train)
y_pred = pipe_model.predict(x_test)

# Headline metrics on the hold-out partition.
accuracy = metrics.accuracy_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
print('Accuracy Score of Random Forest Classifier is: ', accuracy)
print('Recall Score of Random Forest Classifier Model is: ', recall)

Accuracy Score of Random Forest Classifier is:  0.9116279069767442
Recall Score of Random Forest Classifier Model is:  0.9157894736842105


In [11]:
# Per-class precision / recall / F1 for the hold-out predictions.
report_text = metrics.classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       120
           1       0.89      0.92      0.90        95

    accuracy                           0.91       215
   macro avg       0.91      0.91      0.91       215
weighted avg       0.91      0.91      0.91       215



In [12]:
# Start from every column name of the training set (target still included here).
val_cols = train_set.columns.tolist()

In [13]:
# Keep only the feature columns by filtering out the target.
# A filter is used instead of `list.remove` so that re-running this cell is
# harmless; a second `remove` would raise ValueError because the name is
# already gone from the list.
val_cols = [col for col in val_cols if col != 'Attrition']

## Generate Dataset with Turnover score column

In [14]:
# Score every employee in the original dataset. Column 1 of predict_proba
# is the probability of the positive class (Attrition encoded as 1).
class_probabilities = pipe_model.predict_proba(hrdata[val_cols])
hrdata["turnover_score"] = class_probabilities[:, 1]

In [30]:
# Preview the identifier and score of the first two employees.
hrdata.loc[:, ['EmployeeNumber', 'turnover_score']].head(2)

Unnamed: 0,EmployeeNumber,turnover_score
0,1,0.829019
1,2,0.29012


## Save model

In [16]:
# Persist the fitted pipeline (encoder + classifier) as a single file.
joblib.dump(value=pipe_model, filename='clf.model')

['clf.model']

## Load Model

In [17]:
# Reload the persisted pipeline. joblib files are pickle-based, so only load
# model files that come from a trusted source.
clf = joblib.load(filename='clf.model')

### Create a record for testing

In [18]:
# Fresh, unmodified copy of the dataset used to build a test record.
hrdata2 = pd.read_csv(filepath_or_buffer='HR_Employee_Attrition.csv')

In [19]:
# Pick the position of one random row to use as a test record.
# The upper bound must be the number of ROWS (shape[0]). The previous
# range(1, shape[1]) was bounded by the number of columns and started at 1,
# so row 0 and every row past the column count could never be selected.
collaborator_rn = np.random.randint(0, hrdata2.shape[0])

In [20]:
# Turn the selected row (a Series) into a one-row DataFrame.
collaborator = hrdata2.iloc[collaborator_rn].to_frame().T

In [21]:
# Strip the target and the columns that are not model features.
# 'EmployeeNumber' stays in the record: hr_predict reads it as the record's ID.
collaborator.drop(
    labels=[
        'EmployeeCount',
        'Attrition',
        'JobLevel',
        'Over18',
        'StandardHours',
        'TotalWorkingYears',
    ],
    axis='columns',
    inplace=True,
)

### Generate a JSON example from the original dataset

In [22]:
# Display the record serialised as a JSON array with one object per row.
collaborator.to_json(path_or_buf=None, orient="records")

'[{"Age":39,"BusinessTravel":"Travel_Rarely","DailyRate":895,"Department":"Sales","DistanceFromHome":5,"Education":3,"EducationField":"Technical Degree","EmployeeNumber":42,"EnvironmentSatisfaction":4,"Gender":"Male","HourlyRate":56,"JobInvolvement":3,"JobRole":"Sales Representative","JobSatisfaction":4,"MaritalStatus":"Married","MonthlyIncome":2086,"MonthlyRate":3335,"NumCompaniesWorked":3,"OverTime":"No","PercentSalaryHike":14,"PerformanceRating":3,"RelationshipSatisfaction":3,"StockOptionLevel":1,"TrainingTimesLastYear":6,"WorkLifeBalance":4,"YearsAtCompany":1,"YearsInCurrentRole":0,"YearsSinceLastPromotion":0,"YearsWithCurrManager":0}]'

In [23]:
# Keep the serialised record as the example request payload (a JSON string).
request = collaborator.to_json(path_or_buf=None, orient="records")

### Create a new JSON example, not present in the original dataset, for testing

In [24]:
# Hand-written employee record that does not exist in the dataset.
# Built from ordered (field, value) pairs so the key order, and therefore the
# serialised JSON, stays exactly as listed.
# NOTE(review): EnvironmentSatisfaction is 13 here while the dataset record
# shown earlier in this notebook uses a single-digit rating - confirm whether
# 3 was intended.
example = dict([
    ("Age", 37),
    ("BusinessTravel", "Travel_Frequently"),
    ("DailyRate", 29),
    ("Department", "Research & Development"),
    ("DistanceFromHome", 12),
    ("Education", 3),
    ("EducationField", "Life Sciences"),
    ("EmployeeNumber", 23333999),
    ("EnvironmentSatisfaction", 13),
    ("Gender", "Male"),
    ("HourlyRate", 61),
    ("JobInvolvement", 2),
    ("JobRole", "Research Scientist"),
    ("JobSatisfaction", 2),
    ("MaritalStatus", "Married"),
    ("MonthlyIncome", 5130),
    ("MonthlyRate", 24907),
    ("NumCompaniesWorked", 1),
    ("OverTime", "No"),
    ("PercentSalaryHike", 23),
    ("PerformanceRating", 4),
    ("RelationshipSatisfaction", 4),
    ("StockOptionLevel", 1),
    ("TrainingTimesLastYear", 3),
    ("WorkLifeBalance", 3),
    ("YearsAtCompany", 10),
    ("YearsInCurrentRole", 1),
    ("YearsSinceLastPromotion", 1),
    ("YearsWithCurrManager", 1),
])

In [25]:
# Serialise the hand-written record to a JSON string, as a client would send it.
new_example = json.dumps(obj=example)

In [26]:
# Display the serialised JSON string for inspection.
new_example

'{"Age": 37, "BusinessTravel": "Travel_Frequently", "DailyRate": 29, "Department": "Research & Development", "DistanceFromHome": 12, "Education": 3, "EducationField": "Life Sciences", "EmployeeNumber": 23333999, "EnvironmentSatisfaction": 13, "Gender": "Male", "HourlyRate": 61, "JobInvolvement": 2, "JobRole": "Research Scientist", "JobSatisfaction": 2, "MaritalStatus": "Married", "MonthlyIncome": 5130, "MonthlyRate": 24907, "NumCompaniesWorked": 1, "OverTime": "No", "PercentSalaryHike": 23, "PerformanceRating": 4, "RelationshipSatisfaction": 4, "StockOptionLevel": 1, "TrainingTimesLastYear": 3, "WorkLifeBalance": 3, "YearsAtCompany": 10, "YearsInCurrentRole": 1, "YearsSinceLastPromotion": 1, "YearsWithCurrManager": 1}'

### Use the example from the dataset or the new example to test prediction

In [27]:
# Use an existing example in the dataset:
#data = json.loads(request)

# Use the new example:
# Parse the JSON string back into a Python dict, the shape hr_predict expects.
data = json.loads(s=new_example)

In [28]:
def hr_predict(request, model=None):
    """Score a single employee record with the turnover model.

    Parameters
    ----------
    request : dict
        One employee record keyed by column name. It must contain
        'EmployeeNumber', which is used as the record identifier and is
        removed before the remaining fields are passed to the model.
    model : object, optional
        Fitted estimator exposing ``predict_proba``. When omitted, the
        notebook-level ``clf`` is used, which matches the previous behaviour.

    Returns
    -------
    dict
        ``{'ID': <employee number>, 'prediction': <float>}`` where
        ``prediction`` is the value in column 1 of ``predict_proba`` for the
        record. Both values are plain Python scalars so the result can be
        handed straight to ``json.dumps``.

    Raises
    ------
    KeyError
        If ``request`` has no 'EmployeeNumber' entry.
    """
    if model is None:
        model = clf

    df = pd.DataFrame([request])

    # pandas hands back numpy scalars (e.g. numpy.int64), which json.dumps
    # rejects; unwrap them into the equivalent built-in Python value.
    employee_id = df['EmployeeNumber'].iloc[0]
    if isinstance(employee_id, np.generic):
        employee_id = employee_id.item()

    # Drop the identifier without mutating `df` in place.
    features = df.drop(columns=['EmployeeNumber'])
    prediction = model.predict_proba(features)

    return {'ID': employee_id, 'prediction': float(prediction[0, 1])}

In [29]:
# Score the parsed record and display the resulting ID / probability pair.
hr_predict(request=data)

{'ID': 23333999, 'prediction': 0.3669442821144328}

> This is the prediction output for new input data describing a new collaborator.