In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
        
# Reaching this line means every import above succeeded.
print('Setup complete')

#  **Healthcare Employee Attrition Data**
#  **Part 1 - EDA**

 # 1. Reading and cleaning the data
 - We first read the CSV file (indexed by EmployeeID) and check the types of columns present.
 - Ensure that there are no null values.

In [3]:
# Load the attrition dataset, using the EmployeeID column as the row index.
csv_path = 'employee-attrition-for-healthcare/watson_healthcare_modified.csv'
main_df = pd.read_csv(csv_path, index_col='EmployeeID')

# Preview the first five rows, then keep and show the column labels;
# `cols` is reused by the unique-value check further down.
display(main_df.head())
cols = main_df.columns
display(cols)

In [4]:
# Number of missing values in each column.
main_df.isna().sum()

- Next, let's check the unique values in each column.

In [5]:
# List the distinct values of every column, separated by a blank line,
# to spot constant columns and inconsistent category labels.
for column_name in cols:
    print('Unique values of', column_name)
    display(main_df[column_name].unique())
    print()

As we can see, there are some columns which lack clarification or are not meaningful from the source. These columns will be dropped.
- EmployeeCount is always 1
- StandardHours is always 80
- Over18 is always Yes
- DailyRate, HourlyRate and MonthlyRate (no meaningful description given)
- JobRole contains both 'Administrative' and 'Admin', which should refer to the same category; we will rename 'Administrative' to 'Admin'.

In [6]:
# Remove the constant columns and the undocumented rate columns.
# Each label is listed once (the original list repeated 'DailyRate').
uninformative_cols = ['Over18', 'StandardHours', 'EmployeeCount',
                      'DailyRate', 'MonthlyRate', 'HourlyRate']

# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
main_df = main_df.drop(columns=uninformative_cols, errors='ignore')
main_df.head()

In [7]:
# Fold the duplicate 'Administrative' label into 'Admin'.
cond = (main_df['JobRole'] == 'Administrative')
display(cond.sum())  # number of rows about to be relabelled
main_df.loc[cond, 'JobRole'] = 'Admin'

# Re-evaluate the condition on the updated column. The mask computed above is
# a snapshot taken before the rename, so summing it again would only repeat
# the first count instead of confirming that no 'Administrative' rows remain.
display((main_df['JobRole'] == 'Administrative').sum())

# 2. Identifying patterns within Background History of Employees
- We will see if there are any correlations between the employees' background history and their attrition.

In [8]:
# Background attributes (plus the Attrition label) analysed in this section.
bg_history_cols = [
    'Attrition', 'Age', 'EducationField', 'Education', 'DistanceFromHome',
    'MaritalStatus', 'Gender', 'NumCompaniesWorked', 'YearsAtCompany',
]
bg_history_df = main_df[bg_history_cols]

# Summary statistics rounded to two decimals.
display(bg_history_df.describe().round(2))

# Employee counts per education field, coloured by marital status,
# with one panel per Attrition value.
sns.catplot(
    data=bg_history_df,
    kind='count',
    y='EducationField',
    hue='MaritalStatus',
    col='Attrition',
    palette='Set1',
)

- **Across all education fields, we see that healthcare employees that are single have the highest count of attrition.**
- **Healthcare employees that are married tend to have a lower count of attrition**

In [9]:
# Age of each employee, grouped by Attrition value.
# Disabled alternative: sns.kdeplot(data = bg_history_df, x = 'Age', hue = 'Attrition')
fig, ax = plt.subplots(figsize=(12, 8))
sns.swarmplot(data=bg_history_df, x='Attrition', y='Age',
              size=4, palette='Set2', ax=ax)

- **We notice that employees aged between 18 and 35 have the highest density of attrition.**

In [10]:
# Distance from home per marital status; each violin is split by Attrition.
fig, ax = plt.subplots(figsize=(10, 8))
display(sns.violinplot(data=bg_history_df, x='MaritalStatus', y='DistanceFromHome',
                       hue='Attrition', split=True, palette='Set2', ax=ax))

# Mean distance for every (marital status, attrition) pair drawn above,
# printed as a 'No' line followed by a 'Yes' line per marital status.
print('\nMeans for Violinplot below')
conditions = ['Single', 'Married', 'Divorced']
for status in conditions:
    for outcome in ('No', 'Yes'):
        in_group = ((bg_history_df['MaritalStatus'] == status)
                    & (bg_history_df['Attrition'] == outcome))
        group_mean = bg_history_df[in_group].DistanceFromHome.mean()
        print(status, 'Mean ({})'.format(outcome), round(group_mean, 4))
    print()

- **Across all marital statuses, employees with attrition have a higher mean distance from home than those without; the difference is especially pronounced for married employees.**
- **Employees that have a short distance from home (below 10) tend to have a low attrition rate**

In [11]:
# Stacked histogram of the number of companies worked at, coloured by Attrition.
# Disabled alternative:
# sns.catplot(data = bg_history_df, x = 'NumCompaniesWorked', kind = 'count' , hue = 'Attrition', col = 'MaritalStatus')
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=bg_history_df, x='NumCompaniesWorked', hue='Attrition',
             multiple='stack', discrete=True, palette='Set2', ax=ax)

- **Here we see little difference between the number of companies worked against attrition rate.**
- **We note that most healthcare employees have worked at only 1 company prior to their current one.**

In [12]:
#sns.pairplot(data = bg_history_df,  y_vars = ['MaritalStatus','EducationField', 'Gender'], x_vars = ['DistanceFromHome', 'NumCompaniesWorked', 'Education'], hue = 'Attrition',kind = 'hist')


# 2. Identifying Work Engagement based on Department and Job Role
We identify employees' level of engagement based on
- JobInvolvement
- JobSatisfaction
- EnvironmentSatisfaction
- RelationshipSatisfaction
- WorkLifeBalance
- PerformanceRating
- TrainingTimesLastYear
- JobLevel
- YearsSinceLastPromotion
- YearsWithCurrManager


In [13]:
# Grouping keys (department, role) together with the Attrition label.
left = main_df[['Department', 'JobRole', 'Attrition']]

# Rating-style columns, each value doubled; the heatmap built from this frame
# labels the result as a 1-to-10 scale.
rating_cols = ['JobLevel', 'JobInvolvement', 'JobSatisfaction',
               'EnvironmentSatisfaction', 'RelationshipSatisfaction',
               'WorkLifeBalance', 'PerformanceRating']
right = main_df[rating_cols] * 2

# Both frames share main_df's index, so the join lines rows up one-to-one.
wcond_df = left.join(right)
wcond_df.describe().round(2)

# Not included: 'TrainingTimesLastYear', 'YearsSinceLastPromotion', 'YearsWithCurrManager'


In [14]:
# Mean rating for every (JobRole, Department) pair, shown as an annotated heatmap.
#
# Only the numeric rating columns are aggregated. wcond_df also carries the
# text column 'Attrition'; with no explicit `values`, pivot_table tries to
# average it as well, which raises TypeError on pandas 2.x (older versions
# silently dropped such columns).
numeric_cols = wcond_df.select_dtypes(include='number').columns.tolist()
piv = pd.pivot_table(data=wcond_df, index=['JobRole', 'Department'], values=numeric_cols)

plt.figure(figsize=(15,10))
plt.title('Heatmap of JobRole and Department against Ratings (scale of 1 to 10)')
sns.heatmap(piv, annot=True, annot_kws={'size': 13})

- **We can see that there is little difference in means of 'Environment, Relationship and Job Satisfaction' across all departments and roles**
- **JobRoles of Nurses and Others are at a lower JobLevel than others.**

In [15]:
# Mean rating per Attrition value, shown as an annotated heatmap.
#
# Only the numeric rating columns are aggregated. wcond_df also carries the
# text columns 'Department' and 'JobRole'; with no explicit `values`,
# pivot_table tries to average them too, which raises TypeError on pandas 2.x
# (older versions silently dropped such columns).
numeric_cols = wcond_df.select_dtypes(include='number').columns.tolist()
piv = pd.pivot_table(data=wcond_df, index=['Attrition'], values=numeric_cols)

plt.figure(figsize=(10,6))
plt.title('Heatmap of Attrition based on Ratings')
display(sns.heatmap(piv, annot=True, annot_kws={'size': 13}))

- **Employees with attrition tend to have lower means (below average of 5)  for** 
1. JobLevel
2. JobInvolvement
3. EnvironmentSatisfaction
4. JobSatisfaction
5. RelationshipSatisfaction
6. WorkLifeBalance

# 3. Identifying Work compensation against Attrition

In [16]:
# Compensation-related attributes (plus the Attrition label) used in this section.
wcomp_cols = [
    'Attrition', 'Age', 'Gender', 'JobLevel',
    'MonthlyIncome', 'OverTime', 'PercentSalaryHike', 'Shift',
    'YearsAtCompany', 'YearsSinceLastPromotion',
]
wcomp_df = main_df[wcomp_cols]
wcomp_df.describe()



In [17]:
# Share of all employees in each OverTime group, with side-by-side bars per Attrition value.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Histogram showing in percentage of total employees, the attrition of OT and non-OT employees')
sns.histplot(data=wcomp_df, x='OverTime', hue='Attrition',
             stat='percent', multiple='dodge', shrink=0.8, ax=ax)

In [18]:
# Boolean masks shared by both calculations below.
left_company = wcomp_df['Attrition'] == 'Yes'
works_overtime = wcomp_df['OverTime'] == 'Yes'
no_overtime = wcomp_df['OverTime'] == 'No'

# Overtime employees: head count, and how many of them have Attrition == 'Yes'.
ot_tot = works_overtime.sum()
ot_y = (works_overtime & left_company).sum()
print('Percentage of overtime employee attrition\n', round((ot_y/ot_tot*100), 3))

# Same two counts for employees who do not work overtime.
not_tot = no_overtime.sum()
not_y = (no_overtime & left_company).sum()
print('\nPercentage of non-overtime employee attrition\n', round((not_y/not_tot*100), 3))

- **We see that a significantly higher percentage of OverTime employees leave the company (29%, against 5% of non-OverTime employees).**

In [19]:
# Monthly income of each employee per OverTime group, with Attrition values drawn side by side.
fig, ax = plt.subplots(figsize=(8, 8))
sns.stripplot(data=wcomp_df, x='OverTime', y='MonthlyIncome',
              hue='Attrition', dodge=True, ax=ax)

- **A further look at the OverTime employees shows that lower MonthlyIncome also correlates with higher attrition.**

In [21]:
# Distribution of PercentSalaryHike for each Attrition value.
# Disabled alternative:
# sns.jointplot(data = wcomp_df, x = 'PercentSalaryHike', y = 'MonthlyIncome', hue = 'Attrition')
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=wcomp_df, x='Attrition', y='PercentSalaryHike', ax=ax)

- **We see that salary hikes have little impact on lowering attrition (note that this plot covers all employees, not only OverTime employees).**

In [None]:
# Employee age per Shift value, with Attrition values drawn side by side.
fig, ax = plt.subplots(figsize=(8, 8))
sns.stripplot(data=wcomp_df, x='Shift', y='Age',
              hue='Attrition', dodge=True, ax=ax)

- **Mean for shifts is 0.8. Most employees have either 1 or no shift work, and shifts have little effect on attrition.**

# Conclusions of EDA:
Factors that show a positive correlation with attrition:
1. Marital Status - single
2. Age - 35 and below
3. DistanceFromHome - for Married and Divorced employees
4. Ratings below average (2.5 and below) for
- JobInvolvement
- EnvironmentSatisfaction
- JobSatisfaction
5. OverTime employees
6. OverTime employees with monthly income up to 7500

Salary Hikes may not be a useful way to prevent attrition in employees.

#  **Part 2 Predictive Analysis**

In this section, we will use the significant factors to fit a decision tree model in predicting employee attrition.


# 1. Choosing the columns and splitting the data

There are two categorical columns (OverTime and MaritalStatus), so we will use get_dummies to one-hot encode them.

In [61]:
# Predictors carried over from the EDA conclusions.
sig_factors = [
    'OverTime', 'Age', 'DistanceFromHome', 'MaritalStatus', 'MonthlyIncome',
    'JobInvolvement', 'EnvironmentSatisfaction', 'JobSatisfaction',
]
test_df = main_df[sig_factors]

# Encode in two passes because the two columns are treated differently:
#   - OverTime: drop_first=True leaves a single indicator column, avoiding a
#     perfectly collinear pair of dummies;
#   - MaritalStatus: all three category indicators are kept.
overtime_encoded = pd.get_dummies(test_df, columns=['OverTime'], drop_first=True)
one_hot_data = pd.get_dummies(overtime_encoded, columns=['MaritalStatus'])

# Label to predict.
target = main_df['Attrition']

one_hot_data.head()

# **2. Splitting the data for training and testing.**
We will use a training size of 80%. We will try a few max_depths to find the best model accuracy and precision.

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

X = one_hot_data
y = target
# 80/20 split with a fixed seed so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, train_size = 0.8)

# Candidate values for the max_depth cap of the tree.
depths = [3,4,6,8,10,12,20]

for d in depths:
    model = DecisionTreeClassifier(max_depth = d, random_state = 1)
    model.fit(X_train, y_train)
    # Depth actually reached by the fitted tree (can be smaller than the cap).
    print('Max depth of tree is', model.tree_.max_depth)

    y_predict = model.predict(X_test)
    score = accuracy_score(y_test, y_predict)
    print('Model accuracy: {0:0.4f}'.format(score))

    # Precision for the 'Yes' (attrition) class. precision_score replaces the
    # hand-rolled TP/(TP+FP): zero_division=0 reports 0 instead of dividing by
    # zero when the model predicts no 'Yes' at all.
    ps = precision_score(y_test, y_predict, pos_label = 'Yes', zero_division = 0)
    print('Precision score:{0:0.4f}'.format(ps))

    # Pin the label order so rows/columns are always ['No', 'Yes'] and the
    # matrix stays 2x2 even if one class is missing from the test split.
    cm = confusion_matrix(y_test, y_predict, labels = ['No', 'Yes'])
    print('Confusion matrix:\n', cm)
    print()

# **Model Finalisation**
**The algorithm favours a max depth of 14; however, we see the highest accuracy and precision at a max_depth of just 3.**
**We will go ahead with 3.**

In [123]:
# Imported here so this cell does not depend on a later cell's import.
from sklearn.metrics import precision_score

# Final model: decision tree capped at depth 3, same seed as the depth sweep.
model = DecisionTreeClassifier(max_depth = 3, random_state = 1)
model.fit(X_train, y_train)
print('Max depth of tree is', model.tree_.max_depth)

y_predict = model.predict(X_test)
score = accuracy_score(y_test, y_predict)
print('Model accuracy: {0:0.4f}'.format(score))

# Precision for the 'Yes' (attrition) class. precision_score replaces the
# hand-rolled TP/(TP+FP): zero_division=0 reports 0 instead of dividing by
# zero when the model predicts no 'Yes' at all.
ps = precision_score(y_test, y_predict, pos_label = 'Yes', zero_division = 0)
print('Precision score:{0:0.4f}'.format(ps))

# Pin the label order so rows/columns are always ['No', 'Yes'].
cm = confusion_matrix(y_test, y_predict, labels = ['No', 'Yes'])
print('Confusion matrix:\n', cm)

**We'll save the tree for visualization on the model workings.**

In [124]:
from sklearn.metrics import ConfusionMatrixDisplay, precision_score

# Confusion matrix of the final model's test-set predictions, with the class
# order pinned so the axis tick labels are guaranteed to match the rows/columns.
class_labels = ['No', 'Yes']
cm = confusion_matrix(y_test, y_predict, labels = class_labels)

disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = class_labels)
disp.plot()
plt.show()

# Precision via scikit-learn: y_true is the held-out target (y_test) and
# y_pred is the model output (y_predict). The target holds the strings
# 'No'/'Yes', so the positive class must be named explicitly.
p_score = precision_score(y_test, y_predict, pos_label = 'Yes', zero_division = 0)
print('Precision score: {0:0.4f}'.format(p_score))


In [126]:
import graphviz

# Export the fitted tree as DOT source (out_file=None returns it as a string).
# Passing the training column names and the class labels makes each node show
# a readable split and predicted class instead of positional X[i] indices.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in model.classes_],
)
graph = graphviz.Source(dot_data)
# Render the graph to disk using "tree" as the base file name.
graph.render("tree")