In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Standard python import
import math, datetime, os 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualisation 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn

# Stats
from scipy import stats

# ML
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance

import lime
import shap
from yellowbrick.classifier import ConfusionMatrix, ClassificationReport, ROCAUC, ClassPredictionError, PrecisionRecallCurve
from yellowbrick.features import FeatureImportances
from yellowbrick.model_selection import LearningCurve, ValidationCurve


# Setting parameters for plotting 
plt.rcParams['figure.figsize'] = 8,6
plt.rcParams['image.cmap'] = 'viridis'
plt.style.use('ggplot')
%config InlineBackend.figure_format = 'png'

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the IBM HR attrition dataset from the Kaggle input directory and preview it
employee = pd.read_csv(
    '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
)
employee.head()

In [None]:
# Encoding attrition to binary variable ('Yes' -> 1, anything else -> 0).
# The guard makes the cell safe to re-run: without it, a second execution would
# compare the already-encoded integers with 'Yes' and turn every label into 0.
if not pd.api.types.is_numeric_dtype(employee['Attrition']):
    employee['Attrition'] = (employee['Attrition'] == 'Yes').astype('int64')

## Understanding the types of variables

In [None]:
# Inspecting the types of variables in the dataset
employee.dtypes

In [None]:
# Object-dtype columns are treated as the categorical variables
categorical = employee.select_dtypes(include='object')
print('There are {} categorical variables'.format(categorical.shape[1]))

In [None]:
# int64 / float64 columns are treated as the numerical variables
numerical = employee.select_dtypes(include=['int64','float64'])
print('There are {} numerical variables'.format(numerical.shape[1]))

In [None]:
# Viewing the categorical variables
categorical.head()

In [None]:
# Viewing the numerical variables 
numerical.head()

* **Continuous variables (13)**: Age, DailyRate, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager

* **Constant variable (2)**: StandardHours, EmployeeCount

* **Discrete variables (9)**: DistanceFromHome, Education, EnvironmentSatisfaction, JobInvolvement, JobLevel, JobSatisfaction, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance

* **Binary variables (1)**: StockOptionLevel, Attrition (target)

* **ID variable (1)**: EmployeeNumber

In [None]:
# Print the distinct values taken by each discrete-looking column
for col in ("DistanceFromHome", "Education", "EnvironmentSatisfaction",
            "JobInvolvement", "JobLevel", "JobSatisfaction", "PerformanceRating",
            "RelationshipSatisfaction", 'TrainingTimesLastYear'):
    distinct_values = employee[col].unique()
    print(col, 'values: ', distinct_values)

## Understanding the types of problem within the variables

In [None]:
# Fraction (not count) of missing values per column; 0.0 means the column has no nulls
employee.isnull().mean()

There are no missing values within the dataset, which is great! Let's move on to explore the outliers present.

### Outliers

#### Outliers in continuous variables

In [None]:
# Columns that are NOT handled as continuous: the target, categoricals, ordinal
# ratings, small-range counts, constants and the ID column
non_cont = [
    'Attrition', 'BusinessTravel', 'Department', 'Education', 'EducationField',
    'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole',
    'JobSatisfaction', 'MaritalStatus', 'NumCompaniesWorked', 'Over18', 'OverTime',
    'PercentSalaryHike', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'DistanceFromHome', 'EmployeeCount', 'EmployeeNumber', 'StandardHours',
    'StockOptionLevel',
]

# Whatever numeric column is left over is treated as continuous
continuous = [col for col in numerical.columns if col not in non_cont]
continuous

In [None]:
# Box plot (left) and histogram (right) for every continuous variable, to expose
# outliers and the shape of each distribution
for var in continuous:
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(10,4), dpi=300)

    # Left panel: box plot of the variable
    employee.boxplot(column=var, ax=box_ax)
    box_ax.set_title('')
    box_ax.set_ylabel(var)

    # Right panel: 20-bin histogram of the same variable
    employee[var].hist(bins=20, ax=hist_ax)
    hist_ax.set_ylabel('Number of employees')
    hist_ax.set_xlabel(var)

    plt.show()

We have identified several variables that seem to contain outliers. We have also identified a few variables that are not normally distributed. 

**Outliers Present**: MonthlyIncome, NumCompaniesWorked, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager

**Not Normally Distributed**: MonthlyIncome, NumCompaniesWorked, 
TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager

#### Outliers in discrete variables

To find the outliers in the discrete variables, we calculate the percentage of employees that fall under each value a discrete variable can take. 

In [None]:
# Treat every column with fewer than 20 distinct values as discrete, leaving out
# the constant, stock-option and ID columns
discrete = [
    var for var in employee.columns
    if employee[var].nunique(dropna=False) < 20
    and var not in ('StandardHours', 'EmployeeCount', 'StockOptionLevel', 'EmployeeNumber')
]
discrete

In [None]:
# Share of employees at each value of every discrete variable.
# Plain division replaces `np.float(...)`: that alias was removed in NumPy 1.24
# and Python 3 division already yields floats.
for var in discrete:
    print(employee[var].value_counts() / len(employee))
    print()

In this case, we classify any value of a variable that accounts for less than 1% of employees as an outlier. 

**Outliers**: YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager

#### Number of labels: Cardinality

In [None]:
# Report how many distinct labels each categorical column holds
for var in categorical.columns:
    n_labels = employee[var].nunique(dropna=False)
    print(var, 'contains', n_labels, 'labels')

All the categorical variables have only a small number of labels. In this case, we do not need to be concerned about high cardinality (i.e. containing a lot of labels).

## Further EDA

In [None]:
# Attrition (target): class balance of the encoded 0/1 label
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(x='Attrition', data=employee, palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# Gender: employee counts per gender, split by attrition
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='Gender', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# BusinessTravel: employee counts per travel category, split by attrition
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='BusinessTravel', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# Department: employee counts per department, split by attrition
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='Department', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# EducationField: employee counts per field of education, split by attrition
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='EducationField', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# Gender (NOTE(review): this cell repeats the Gender count plot drawn earlier in
# this section; it produces the same figure)
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='Gender', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

In [None]:
 # JobRole
with sns.plotting_context('talk'):
    # Wider figure (25x6) than the other count plots, with roles on the x axis.
    # NOTE(review): unlike the other plots there is no hue='Attrition' split here,
    # so this shows overall head-count per role only.
    fig, ax = plt.subplots(figsize=(25,6), dpi=300)
    _ = sns.countplot(x='JobRole', data=employee, palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# MaritalStatus: employee counts per marital status, split by attrition
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='MaritalStatus', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

In [None]:
# OverTime: employee counts for overtime yes/no, split by attrition
with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    _ = sns.countplot(y='OverTime', data=employee, hue='Attrition', palette='viridis',
                     saturation=1,ax=ax)

## Feature Engineering

In [None]:
# Drop features with constant values and redundant features.
# errors='ignore' keeps the cell re-runnable: a second execution would otherwise
# raise KeyError because the columns are already gone.
employee = employee.drop(['StandardHours','Over18','EmployeeCount', 'EmployeeNumber'], axis=1, errors='ignore')

In [None]:
# Checking dataframe
employee.head()

In [None]:
# Separating into train and test set (80/20, fixed seed).
# NOTE: the whole frame -- including the Attrition column -- is passed as X.
# tree_binariser reads X_train.Attrition, and the column is only dropped from
# both splits in a later cell, so it must not be used as a feature before then.

X_train, X_test, y_train, y_test = train_test_split(employee, employee.Attrition, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

In [None]:
# Check shape
employee.shape

### Outliers in numerical variables 

In order to handle both outliers and non-normally distributed variables, we can use a Decision Tree to help us discretise the variables. The Decision Tree can also help us find the optimal number of buckets. 

For more details, refer to this article: https://towardsdatascience.com/discretisation-using-decision-trees-21910483fa4b

In [None]:
def tree_binariser(var, depths=(1, 2, 3, 4), cv=3):
    """Discretise one numeric column with a depth-tuned decision tree.

    A ``DecisionTreeRegressor`` is fitted on the single column ``var`` against
    the attrition target. Each value is then replaced by the tree's prediction
    for the leaf it falls into, so the column ends up with at most
    ``2 ** depth`` distinct values (one per leaf).

    The function works on the notebook globals ``X_train``, ``X_test`` and
    ``y_train`` and overwrites ``X_train[var]`` / ``X_test[var]`` in place.
    Calling it twice on the same column therefore bins the already-binned values.

    Parameters
    ----------
    var : str
        Name of the column to discretise; must exist in both splits.
    depths : sequence of int, optional
        Candidate ``max_depth`` values. Defaults to the original 1-4.
    cv : int, optional
        Number of cross-validation folds used to pick the depth. Defaults to 3.

    Returns
    -------
    None
    """
    depths = list(depths)
    train_feature = X_train[var].to_frame()
    test_feature = X_test[var].to_frame()

    # Score every candidate depth. The scorer is *negated* MSE, so the best
    # depth is the one with the largest (closest to zero) mean score.
    mean_scores = [
        cross_val_score(
            DecisionTreeRegressor(max_depth=depth),
            train_feature, y_train,
            cv=cv, scoring='neg_mean_squared_error',
        ).mean()
        for depth in depths
    ]
    best_depth = depths[int(np.argmax(mean_scores))]

    # Refit on the full training column using the same target as the CV above
    # (y_train), then map both splits through the training-fitted tree only.
    tree_model = DecisionTreeRegressor(max_depth=best_depth)
    tree_model.fit(train_feature, y_train)
    X_train[var] = tree_model.predict(train_feature)
    X_test[var] = tree_model.predict(test_feature)

In [None]:
# Transform the continuous variables
# NOTE: tree_binariser overwrites X_train / X_test in place, so running this
# cell a second time would bin the already-binned values.
for var in continuous:
    tree_binariser(var)

In [None]:
X_train[continuous].head()

In [None]:
# Number of distinct bins each continuous variable ended up with after binning
for var in continuous:
    n_bins = X_train[var].nunique(dropna=False)
    print(var, n_bins)

### Encoding categorical variables

In [None]:
# Initialising LabelEncoder()
le = LabelEncoder()

# Retrieving categorical columns
categorical = employee.select_dtypes(include='object')
categorical = categorical.columns

# Encode train and test with ONE mapping per column. Calling fit_transform on
# each split separately can assign different integers to the same label
# whenever the two splits do not contain exactly the same label set. The
# vocabulary is learned from the full (still string-valued) `employee` column
# so no label is unseen at transform time; only label names are used, not the target.
for var in categorical:
    le = LabelEncoder()
    le.fit(employee[var])
    X_train[var] = le.transform(X_train[var])
    X_test[var] = le.transform(X_test[var])

In [None]:
X_train.head()

In [None]:
# Creating dummy variables for all categorical features

cat = ["DistanceFromHome", "Education", "EnvironmentSatisfaction", 
            "JobInvolvement", "JobLevel", "JobSatisfaction", "PerformanceRating", 
            "RelationshipSatisfaction", 'TrainingTimesLastYear', "BusinessTravel",
        "Department", "EducationField", "Gender", "JobRole", "MaritalStatus", "OverTime", 'WorkLifeBalance',
      'StockOptionLevel', 'NumCompaniesWorked']

# get_dummies only expands object/category columns, so cast the integer-coded
# categoricals to object first
for var in cat:
    X_train[var] = X_train[var].astype('object')
    X_test[var] = X_test[var].astype('object')
    
X_train = pd.get_dummies(X_train)
# Align the test dummies to the training layout: a level that is missing from
# the test split becomes an all-zero column, a level unseen in training is
# dropped, and the column order matches. Without this the scaler and models
# can receive a different number or order of features at predict time.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [None]:
X_train.head()

In [None]:
# Drop attrition (the target) from the feature matrices.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
X_train = X_train.drop('Attrition', axis=1, errors='ignore')
X_test = X_test.drop('Attrition', axis=1, errors='ignore')

### Feature Scaling

In [None]:
# Initialise StandardScaler
# Fitted on the training features only; the same fitted scaler is reused later
# through sc.transform(...) for both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)

Once the scaler has been fitted on the training set, we can use it to transform the data for algorithms that perform better on a normalized dataset.

## Building Machine Learning Models

### Logistic Regression

In [None]:
# 1st model - Logistic Regression 
logr = LogisticRegression()
logr.fit(sc.transform(X_train), y_train)
# Report (held-out test accuracy, mean 5-fold CV accuracy).
# The cross-validation now runs on the TRAINING data: cross-validating on the
# test split refits throw-away clones on a few hundred test rows and says
# nothing about the model trained above.
logr.score(sc.transform(X_test), y_test), cross_val_score(logr, sc.transform(X_train), y_train, cv=5).mean()

In [None]:
# Plotting confusion matrix for logr on the held-out test set.
# The visualizer is given standardised features because logr was trained on
# sc.transform(X_train).
with sns.plotting_context('paper'):
    fig, ax = plt.subplots(figsize=(8,8), dpi=300)
    cm_viz = ConfusionMatrix(logr, cmap=False, percent=False)
    cm_viz.fit(sc.transform(X_train), y_train)
    cm_viz.score(sc.transform(X_test), y_test)
    cm_viz.poof()

In [None]:
# Classification report for logr
print(classification_report(y_test, logr.predict(sc.transform(X_test))))

### Random Forest

In [None]:
# 2nd model - Random Forest
# Seeded so the reported scores are reproducible across runs
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
# Report (held-out test accuracy, mean 5-fold CV accuracy on the TRAINING data);
# cross-validating on the test split would not evaluate the model fitted above.
rf.score(X_test, y_test), cross_val_score(rf, X_train, y_train, cv=5).mean()

In [None]:
# Plotting confusion matrix for rf on the held-out test set (unscaled features,
# matching how rf was trained)
with sns.plotting_context('paper'):
    fig, ax = plt.subplots(figsize=(8,8), dpi=300)
    cm_viz = ConfusionMatrix(rf, cmap=False, percent=False)
    cm_viz.fit(X_train, y_train)
    cm_viz.score(X_test, y_test)
    cm_viz.poof()

In [None]:
# Classification report for rf
print(classification_report(y_test, rf.predict(X_test)))

In [None]:
# 3rd model - XGBoost
xgb = XGBClassifier()

xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
# Report (held-out test accuracy, mean 5-fold CV accuracy on the TRAINING data);
# cross-validating on the test split would not evaluate the model fitted above.
xgb.score(X_test, y_test), cross_val_score(xgb, X_train, y_train, cv=5).mean()

In [None]:
# Plotting confusion matrix for xgb on the held-out test set (unscaled features,
# matching how xgb was trained)
with sns.plotting_context('paper'):
    fig, ax = plt.subplots(figsize=(8,8), dpi=300)
    cm_viz = ConfusionMatrix(xgb, cmap=False, percent=False)
    cm_viz.fit(X_train, y_train)
    cm_viz.score(X_test, y_test)
    cm_viz.poof()

In [None]:
# Classification report for xgb
print(classification_report(y_test, xgb.predict(X_test)))

In [None]:
# Plotting ROC curve for logr, rf, xgb

with sns.plotting_context('notebook'):
    fig, (ax, ax2, ax3) = plt.subplots(ncols=3, figsize=(15,8), dpi=300)

    # logr was trained on standardised features, so it must be scored on
    # standardised features as well; feeding it raw X_test gives a meaningless curve.
    # Each visualizer is fitted before score(), as in the confusion-matrix cells.
    roc_viz = ROCAUC(logr, ax=ax, micro=False)
    roc_viz.fit(sc.transform(X_train), y_train)
    roc_viz.score(sc.transform(X_test), y_test)
    roc_viz.finalize()

    # The tree-based models were trained on the unscaled features
    roc_viz2 = ROCAUC(rf, ax=ax2, micro=False)
    roc_viz2.fit(X_train, y_train)
    roc_viz2.score(X_test, y_test)
    roc_viz2.finalize()

    roc_viz3 = ROCAUC(xgb, ax=ax3, micro=False)
    roc_viz3.fit(X_train, y_train)
    roc_viz3.score(X_test, y_test)
    roc_viz3.finalize()

### SMOTE

Since our target is imbalanced, we attempt to oversample the minority class and see if our models perform better.

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=45)

In [None]:
# Oversample the minority class on the TRAINING split only.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn;
# .values passes the target as a plain 1-D array, as .ravel() did.
X_train_new, y_train_new = sm.fit_resample(X_train, y_train.values)

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# Logistic Regression retrained on the SMOTE-balanced training set
logr = LogisticRegression()
logr.fit(sc.transform(X_train_new), y_train_new)

# Cross-validate the same recipe (SMOTE -> scale -> logistic regression) on the
# training data. The imblearn pipeline applies SMOTE to the fitting folds only,
# so no synthetic sample leaks into a validation fold. The previous
# cross_val_score on the test split refitted plain clones and ignored SMOTE.
logr_smote_cv = make_imb_pipeline(SMOTE(random_state=45), StandardScaler(), LogisticRegression())
logr.score(sc.transform(X_test), y_test), cross_val_score(logr_smote_cv, X_train, y_train, cv=5).mean()

In [None]:
print(classification_report(y_test, logr.predict(sc.transform(X_test))))

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# 2nd model - Random Forest, retrained on the SMOTE-balanced training set
# (seeded so the reported scores are reproducible)
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_new, y_train_new)

# Cross-validate SMOTE + random forest on the training data; the imblearn
# pipeline resamples the fitting folds only. The previous cross_val_score on
# the test split refitted plain clones and ignored SMOTE.
rf_smote_cv = make_imb_pipeline(SMOTE(random_state=45), RandomForestClassifier(random_state=0))
rf.score(X_test, y_test), cross_val_score(rf_smote_cv, X_train, y_train, cv=5).mean()

In [None]:
print(classification_report(y_test, rf.predict(X_test)))

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# 3rd model - XGBoost, retrained on the SMOTE-balanced training set
xgb = XGBClassifier()

xgb.fit(X_train_new, y_train_new, eval_set=[(X_test, y_test)], verbose=False)

# Cross-validate SMOTE + XGBoost on the training data; the imblearn pipeline
# resamples the fitting folds only. The previous cross_val_score on the test
# split refitted plain clones and ignored SMOTE.
xgb_smote_cv = make_imb_pipeline(SMOTE(random_state=45), XGBClassifier())
xgb.score(X_test, y_test), cross_val_score(xgb_smote_cv, X_train, y_train, cv=5).mean()

In [None]:
print(classification_report(y_test, xgb.predict(X_test)))

We observed that Logistic Regression remains the champion model with both the highest accuracy and the highest recall. Let's try to tune the model and see if we can achieve better performance.

## Tuning: Regularization and Hyperparameters
Below are some of the hyperparameters that can be optimized for Logistic Regression to get better results.

**penalty** - Used to specify the norm used in the penalization. The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties. ‘elasticnet’ is only supported by the ‘saga’ solver. If ‘none’ (not supported by the liblinear solver), no regularization is applied.

**C** - Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

**solver** - Algorithm to use in the optimization problem.

Let's setup our search grid for Logistic Regression!

In [None]:
%%time
# Hyperparameter turning of logr
param_grid = {
    'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty' : ['l1', 'l2'],
    'C' : [100, 10, 1.0, 0.1, 0.01]
}

# Instantiate the grid search
logr_g = GridSearchCV(logr, param_grid=param_grid, n_jobs=-1, verbose=0, cv=5, error_score=0)
logr_g.fit(sc.transform(X_train), y_train)
# Summarizing results
print("Best: %f using %s" % (logr_g.best_score_, logr_g.get_params()))
print("\n")

In [None]:
# (held-out test accuracy of the refitted best model, its mean 5-fold CV accuracy
# on the training data). best_score_ is already that CV estimate; re-running the
# whole grid search inside cross_val_score on the test split was slow and
# evaluated models fitted on test folds.
logr_g.score(sc.transform(X_test), y_test), logr_g.best_score_

Performance remains approximately the same, but the tuned model fares slightly worse than the untuned one. 

In [None]:
# Re-create the untuned logistic regression explicitly.
# Only the settings that define the model are spelled out. The other keyword
# arguments from the original call were library defaults, and `multi_class` is
# deprecated in recent scikit-learn releases, so passing it only produces
# warnings (or an error once it is removed).
logr_g = LogisticRegression(C=1.0, solver='lbfgs', max_iter=100, tol=0.0001)

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.base import clone

# Using untuned model's parameter, trained on the SMOTE-balanced training set
logr_g.fit(sc.transform(X_train_new), y_train_new)

# Cross-validate the same recipe (SMOTE -> scale -> this estimator) on the
# training data; the imblearn pipeline resamples the fitting folds only.
# The previous cross_val_score on the test split ignored SMOTE altogether.
logr_g_smote_cv = make_imb_pipeline(SMOTE(random_state=45), StandardScaler(), clone(logr_g))
logr_g.score(sc.transform(X_test), y_test), cross_val_score(logr_g_smote_cv, X_train, y_train, cv=5).mean()

In [None]:
print(classification_report(y_test, logr_g.predict(sc.transform(X_test))))

In [None]:
# Confusion matrix for the final logistic regression on the held-out test set.
# The visualizer is fitted on the SMOTE-balanced training data the model was
# trained on. Fitting it on the un-resampled X_train could silently replace the
# SMOTE-trained model if the visualizer refits the estimator.
with sns.plotting_context('paper'):
    fig, ax = plt.subplots(figsize=(8,8), dpi=300)
    cm_viz = ConfusionMatrix(logr_g, cmap=False, percent=False)
    cm_viz.fit(sc.transform(X_train_new), y_train_new)
    cm_viz.score(sc.transform(X_test), y_test)
    cm_viz.poof()

## Model Interpretation

Logistic Regression has performed the best out of the 3 models, but one thing is still missing: which variables are the best at predicting employee attrition? Let's explore the model further!

In [None]:
# Initialising js
shap.initjs()

# logr_g was trained on standardised features, so the explainer must see the
# same representation; explaining it on raw X_test attributes importance on
# the wrong scale.
X_test_scaled = sc.transform(X_test)

# Create a linear explainer and compute SHAP values for every test row.
# `vals` keeps the row/column layout of X_test, so later plots can still pass
# X_test for feature names and raw feature values.
shap_ex = shap.LinearExplainer(logr_g, X_test_scaled)
vals = shap_ex.shap_values(X_test_scaled)

In [None]:
# Looking at feature importance 
shap.summary_plot(vals, X_test, plot_type="bar")

Based on the summary plot generated, we identified OverTime, YearsSinceLastPromotion, JobLevel and EnvironmentSatisfaction as the top features in predicting employee attrition. However, the summary plot only shows the average impact, so let's dive deeper and look at how the values of these variables affect the outcome as a whole.

In [None]:
# Plotting a summary plot to see how the values of the features push the
# prediction towards or away from employee attrition

with sns.plotting_context('talk'):
    fig, ax = plt.subplots(figsize=(10,6), dpi=300)
    shap.summary_plot(vals, X_test, alpha=.5)

From the summary plot generated, we identified a few key points that results in employee attrition.

1. **OverTime** - Both OverTime_0 and OverTime_1 indicate the same thing, which is whether an employee works overtime. As reflected by the plot, employees who work overtime are more strongly associated with employee attrition. This could mean that employees who do not have a proper work-life balance, or who spend more time cooped up at work, are likely to be unhappier and hence leave the company. 


2. **YearsSinceLastPromotion** - We observed that employees who have not been promoted in a long time (higher up the range within the dataset) are strongly associated with employee attrition. This could mean that employees who have stayed in a company for a long period of time and yet been passed over for promotion feel neglected by the management, which possibly results in them leaving the company. 


3. **JobLevel_1** - Employees at the lower job levels, such as JobLevel=1, are also more likely to leave. This could indicate that these employees are still new in their careers, possibly in entry-level jobs, and hence would pursue other jobs if the current one is not suitable or if they are looking for a change. 


4. **EnvironmentSatisfaction_1** - Employees who indicated 'low' satisfaction with their work environment are also associated with employee attrition. This can be explained by the fact that an environment that is unsuitable or not to their liking is not a good way to keep an employee. This could result in employee attrition as employees look for a change in their job environment. 

In [None]:
# Creating a force plot to explain the first 100 samples
shap.force_plot(shap_ex.expected_value, vals[:100], X_test.iloc[:100])

To better explain why each individual employee is classified into the respective classes (0,1), let's use the individual force plot and LIME!

Let's take employee no.50 of the test set as an example for explanation!

In [None]:
# Retrieving employee's 50 details
X_test.iloc[[50]]

In [None]:
# Predicting using the logr_g
logr_g.predict(sc.transform(X_test.iloc[[50]]))

In [None]:
# Explaining the classification of the employee at test-set position 50
# (the same row inspected with X_test.iloc[[50]] above)
shap.force_plot(shap_ex.expected_value, vals[50,:], X_test.iloc[50,:])

In [None]:
# `import lime` alone does not load the lime_tabular submodule, so import the
# explainer class explicitly.
from lime.lime_tabular import LimeTabularExplainer

# Plain float matrix of the test features (DataFrame.as_matrix() was removed in pandas 1.0)
X_test_values = X_test.values.astype(float)


def predict_proba_unscaled(rows):
    """Class probabilities of the trained logr_g for rows given in the raw feature space.

    LIME perturbs samples in the unscaled space, so each batch is standardised
    with the training-fitted scaler before it reaches the model.
    """
    frame = pd.DataFrame(rows, columns=X_test.columns)
    return logr_g.predict_proba(sc.transform(frame))


# The model is deliberately NOT refitted here: refitting logr_g on the test set
# would make LIME explain a different model from the one evaluated above.
explainer = LimeTabularExplainer(
    X_test_values,
    feature_names=X_test.columns,
    class_names=[0,1]
)

# Taking row 50 and interpreting the prediction
pos = 50
exp = explainer.explain_instance(X_test_values[pos],
                                 predict_proba_unscaled)
_ = exp.show_in_notebook()

From both SHAP and LIME we saw that employee 50 is indeed classified as no employee attrition. Based on SHAP, we understood that the main reason for this classification was the lower number of years in the current role and with the current manager. This could indicate that the employee is still relatively new to the job and hence there is no employee attrition.