In [1]:
%matplotlib notebook

In [2]:
# Import dependencies

import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np

In [3]:
# Location of the HR dataset, given relative to the notebook's working directory.
pathHR = 'HR_comma_sep.csv'

# Read the CSV into the working DataFrame and show it as the cell output.
df_HR = pd.read_csv(filepath_or_buffer=pathHR)
df_HR

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low


Data Cleaning

In [4]:
# Print each column's dtype and non-null count for the loaded frame.
df_HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
# Note: the dataset has no specific ID variable such as an employee ID.
# Check the frequency of values for different variables, starting with `left`.

df_HR['left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [6]:
# Frequency of each distinct value in the `sales` column.
df_HR['sales'].value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: sales, dtype: int64

In [7]:
# Rename the `sales` column to `Departments`; rebinding df_HR to the renamed
# frame has the same effect as renaming in place.
df_HR = df_HR.rename(columns={'sales': 'Departments'})
df_HR.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [8]:
# Align the coding of this variable with EmployeeAttrition.csv (and related dfs).
## Coding in EmployeeAttrition.csv: 1 = Stayers, 2 = Leavers
## Coding in the current dataset:   0 = Stay,    1 = Left
# Both codes are remapped in one replace() call, so the result does not depend
# on the order in which the two substitutions are written.
# NOTE: re-running this cell recodes the new 1s (stayers) to 2 again.

status_recode = {0: 1, 1: 2}
df_HR['left'] = df_HR['left'].replace(status_recode)
df_HR['left'].value_counts()

1    11428
2     3571
Name: left, dtype: int64

In [9]:
# Use the same column name as EmployeeAttrition.csv: `left` becomes `Current_status`.

df_HR = df_HR.rename(columns={'left': 'Current_status'})
df_HR.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,Current_status,promotion_last_5years,Departments,salary
0,0.38,0.53,2,157,3,0,2,0,sales,low
1,0.8,0.86,5,262,6,0,2,0,sales,medium
2,0.11,0.88,7,272,4,0,2,0,sales,medium
3,0.72,0.87,5,223,5,0,2,0,sales,low
4,0.37,0.52,2,159,3,0,2,0,sales,low


In [10]:
# Descriptive statistics (column means) for the entire dataset
## satisfaction_level: 0-1
## last_evaluation: 0-1
## number_project: up to 7
## time_spend_company: up to 10

summary_columns = [
    "satisfaction_level",
    "last_evaluation",
    "number_project",
    "average_montly_hours",
    "time_spend_company",
]

# Series of per-column means, turned into a one-column frame labelled 'Mean'.
HR_sumStats = df_HR[summary_columns].mean().to_frame(name='Mean')
HR_sumStats

Unnamed: 0,Mean
satisfaction_level,0.612834
last_evaluation,0.716102
number_project,3.803054
average_montly_hours,201.050337
time_spend_company,3.498233


In [11]:
# Descriptive Statistics by Stayers vs Leavers
## value_counts() for Work_accident, promotion_last_5years, Departments

# numeric_only=True keeps the text columns (Departments, salary) out of the
# mean; without it pandas >= 2.0 raises a TypeError on object columns instead
# of silently dropping them.
HR_byStayLeave = df_HR.groupby('Current_status').mean(numeric_only=True)

# TODO: extend to median / variance / std for the continuous variables
# (satisfaction_level, last_evaluation, number_project,
#  average_montly_hours, time_spend_company), rounded to 2 decimals.

HR_byStayLeave

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
Current_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
2,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


In [12]:
# Flip the table so variables are rows and the two status codes are columns.
HR_byStayLeaveTP = HR_byStayLeave.T
HR_byStayLeaveTP

Current_status,1,2
satisfaction_level,0.66681,0.440098
last_evaluation,0.715473,0.718113
number_project,3.786664,3.855503
average_montly_hours,199.060203,207.41921
time_spend_company,3.380032,3.876505
Work_accident,0.175009,0.047326
promotion_last_5years,0.026251,0.005321


In [17]:
# Descriptive Statistics by Department (mean and median of every numeric column)

# Restrict the aggregation to numeric columns: the text column `salary` cannot
# be averaged and makes the aggregation raise on pandas >= 2.0.
numeric_cols = df_HR.select_dtypes(include='number').columns.tolist()

# String aggregator names select pandas' built-in implementations and avoid the
# deprecation warning triggered by passing np.mean / np.median callables.
HR_byDepts = df_HR.groupby('Departments')[numeric_cols].agg(['mean', 'median'])
HR_byDepts

Unnamed: 0_level_0,satisfaction_level,satisfaction_level,last_evaluation,last_evaluation,number_project,number_project,average_montly_hours,average_montly_hours,time_spend_company,time_spend_company,Work_accident,Work_accident,Current_status,Current_status,promotion_last_5years,promotion_last_5years
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
Departments,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
IT,0.618142,0.66,0.71683,0.72,3.816626,4,202.215974,199,3.468623,3,0.133659,0,1.222494,1,0.002445,0
RandD,0.619822,0.65,0.712122,0.71,3.853875,4,200.800508,200,3.367217,3,0.170267,0,1.153748,1,0.034307,0
accounting,0.582151,0.61,0.717718,0.73,3.825293,4,201.162973,199,3.522816,3,0.125163,0,1.265971,1,0.018253,0
hr,0.598809,0.61,0.70885,0.72,3.654939,4,198.684709,197,3.355886,3,0.120433,0,1.290934,1,0.020298,0
management,0.621349,0.655,0.724,0.73,3.860317,4,201.249206,204,4.303175,3,0.163492,0,1.144444,1,0.109524,0
marketing,0.618601,0.64,0.715886,0.71,3.687646,4,199.385781,198,3.56993,3,0.160839,0,1.236597,1,0.050117,0
product_mng,0.619634,0.64,0.714756,0.72,3.807095,4,199.965632,198,3.47561,3,0.146341,0,1.219512,1,0.0,0
sales,0.614447,0.64,0.709717,0.7,3.776329,4,200.911353,201,3.534058,3,0.141787,0,1.244928,1,0.024155,0
support,0.6183,0.65,0.723109,0.74,3.803948,4,200.758188,200,3.393001,3,0.154778,0,1.248991,1,0.008973,0
technical,0.607897,0.64,0.721099,0.73,3.877941,4,202.497426,201,3.411397,3,0.140074,0,1.25625,1,0.010294,0


In [18]:
# Flip the table so each (variable, statistic) pair is a row and departments are columns.
HR_byDeptsTP = HR_byDepts.T
HR_byDeptsTP

Unnamed: 0,Departments,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
satisfaction_level,mean,0.618142,0.619822,0.582151,0.598809,0.621349,0.618601,0.619634,0.614447,0.6183,0.607897
satisfaction_level,median,0.66,0.65,0.61,0.61,0.655,0.64,0.64,0.64,0.65,0.64
last_evaluation,mean,0.71683,0.712122,0.717718,0.70885,0.724,0.715886,0.714756,0.709717,0.723109,0.721099
last_evaluation,median,0.72,0.71,0.73,0.72,0.73,0.71,0.72,0.7,0.74,0.73
number_project,mean,3.816626,3.853875,3.825293,3.654939,3.860317,3.687646,3.807095,3.776329,3.803948,3.877941
number_project,median,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
average_montly_hours,mean,202.215974,200.800508,201.162973,198.684709,201.249206,199.385781,199.965632,200.911353,200.758188,202.497426
average_montly_hours,median,199.0,200.0,199.0,197.0,204.0,198.0,198.0,201.0,200.0,201.0
time_spend_company,mean,3.468623,3.367217,3.522816,3.355886,4.303175,3.56993,3.47561,3.534058,3.393001,3.411397
time_spend_company,median,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


In [15]:
# Compare tenure

CHARTS & PLOTS

ANALYSIS

In [16]:
# Research Question: 1) Why do valuable employees leave?