## Note

This model gave very similar result as the Logistic regression model.

This is also true with the actual data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0
7,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,...,2,80,1,1,2,3,1,0,0,0
8,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,...,2,80,0,10,2,3,9,7,1,8
9,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,...,2,80,2,17,3,2,7,7,7,7


## Basic Preprocessing

In [6]:
data.head().T

Unnamed: 0,0,1,2,3,4
Age,41,49,37,33,27
Attrition,Yes,No,Yes,No,No
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EmployeeNumber,1,2,4,5,7


In [7]:
data.shape

(1470, 35)

In [8]:
# are there null columns
data.isnull().all().all()

False

In [9]:
len(data.select_dtypes(include='O').columns)

9

## Basic Modelling

In [10]:
def numericalise(df):
    df = df.copy()
    for col in df.select_dtypes(include='O').columns:
        df[col] = df[col].astype("category").cat.codes
    return df

In [12]:
data_numericalised = numericalise(data)
data_numericalised.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,7,...,4,80,1,6,3,3,2,2,2,2


## Correlation between variables

In [13]:
data_numericalised.corr()['Attrition'].sort_values()

TotalWorkingYears          -0.171063
JobLevel                   -0.169105
YearsInCurrentRole         -0.160545
MonthlyIncome              -0.159840
Age                        -0.159205
YearsWithCurrManager       -0.156199
StockOptionLevel           -0.137145
YearsAtCompany             -0.134392
JobInvolvement             -0.130016
JobSatisfaction            -0.103481
EnvironmentSatisfaction    -0.103369
WorkLifeBalance            -0.063939
TrainingTimesLastYear      -0.059478
DailyRate                  -0.056652
RelationshipSatisfaction   -0.045872
YearsSinceLastPromotion    -0.033019
Education                  -0.031373
PercentSalaryHike          -0.013478
EmployeeNumber             -0.010577
HourlyRate                 -0.006846
BusinessTravel              0.000074
PerformanceRating           0.002889
MonthlyRate                 0.015170
EducationField              0.026846
Gender                      0.029453
NumCompaniesWorked          0.043494
Department                  0.063991
J

## RandomForest

In [14]:
from sklearn.ensemble import RandomForestClassifier 

In [15]:
random_forest_obj = RandomForestClassifier(n_estimators=100, 
                                           max_depth=5,
                                          max_features="sqrt",
                                          )

In [16]:
target = data_numericalised.Attrition.copy()
features = data_numericalised.drop('Attrition', axis=1).copy()

In [17]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(features, target, 
                                                    random_state=42)

In [23]:
model = random_forest_obj.fit(X_train, y_train)

In [24]:
model.score(X_train, y_train)

0.8693284936479129

In [25]:
model.score(X_test,y_test)

0.8722826086956522

## Feature Importance vs Correlation

In [30]:
feats_v_corr = pd.DataFrame({'corr_w_target': data_numericalised.corr()['Attrition'],
              'feature_imp':model.feature_importances_}, 
             index=features.columns).sort_values(by='feature_imp',
                                                ascending=False)

feats_v_corr

Unnamed: 0,corr_w_target,feature_imp
MonthlyIncome,-0.15984,0.108972
OverTime,0.246118,0.100671
Age,-0.159205,0.075635
TotalWorkingYears,-0.171063,0.063292
YearsAtCompany,-0.134392,0.056173
StockOptionLevel,-0.137145,0.050331
MaritalStatus,0.16207,0.044931
YearsInCurrentRole,-0.160545,0.039112
YearsWithCurrManager,-0.156199,0.038396
DailyRate,-0.056652,0.036525


#### Per the RandomForest Baseline Model, the following 16 features have the explanatory scope for about 80 of the data


In [33]:
feats_v_corr['Cumulative_Frequency'] = feats_v_corr['feature_imp'].cumsum()

feats_v_corr.head(16)

Unnamed: 0,corr_w_target,feature_imp,Cumulative_Frequency
MonthlyIncome,-0.15984,0.108972,0.108972
OverTime,0.246118,0.100671,0.209642
Age,-0.159205,0.075635,0.285277
TotalWorkingYears,-0.171063,0.063292,0.348569
YearsAtCompany,-0.134392,0.056173,0.404742
StockOptionLevel,-0.137145,0.050331,0.455073
MaritalStatus,0.16207,0.044931,0.500004
YearsInCurrentRole,-0.160545,0.039112,0.539116
YearsWithCurrManager,-0.156199,0.038396,0.577512
DailyRate,-0.056652,0.036525,0.614037


### Modelling with just those features again

In [34]:
random_forest_obj2 = RandomForestClassifier(n_estimators=100, 
                                            max_depth=5,
                                            max_features="sqrt",
                                            )

#### Modelling with lesser features gave us similar output with much more simpler model.

In [41]:
top_features_name_list = feats_v_corr.head(16).index.tolist()

X_train_top = X_train[top_features_name_list].copy()
X_test_top = X_test[top_features_name_list].copy()

model_top_features = random_forest_obj2.fit(X_train_top, y_train)
model_top_features.score(X_train_top, y_train)

0.8847549909255898

In [42]:
model_top_features.score(X_test_top, y_test)

0.8695652173913043