# Employee Burnout Prediction

- Regression Models:
  - Linear Regression
  - Multivariate Adaptive Regression Splines
  - KNN Regressor
  - Decision Tree Regressor
  - Random Forest Regressor
  - Gradient Boosting Regressor
  - Extra Trees Regressor
  - XGBoost Regressor
  - LightGBM Regressor
  - CatBoost Regressor
  - Deep Neural Network

- Performance Metrics:
  - R-Squared
  - Mean Absolute Error

# 1. Import Libraries and Load Dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(color_codes=True)

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from pyearth import Earth
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor




from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the training and test splits from the Kaggle input directory
data_train = pd.read_csv("/kaggle/input/are-your-employees-burning-out/train.csv")
data_test = pd.read_csv("/kaggle/input/are-your-employees-burning-out/test.csv")

# Report the (rows, columns) of each split, then preview the first training rows
for label, frame in (("Shape of Training Data:", data_train),
                     ("Shape of Test Data:", data_test)):
    print(label, frame.shape)
data_train.head()

The test data does not contain the output variable "Burn Rate", so let's train the models on the training data and then use the trained models to predict the Burn Rate for the test data.

In [None]:
# Data Information
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None".
data_train.info()
data_test.info()

There are no missing values in the test set. So we need to impute missing data only for the training data.

In [None]:
import datetime as dt

# Parse the "Date of Joining" strings into pandas datetimes in both splits
for frame in (data_train, data_test):
    frame["Date of Joining"] = pd.to_datetime(frame["Date of Joining"])

# Show the resulting column dtypes of the training split
data_train.dtypes

In [None]:
# Stamp every row of both splits with the current date; it is the reference
# point from which the tenure is measured further down.
# NOTE(review): using the run date makes the absolute tenure values depend on
# when the notebook is executed.
dt_today = dt.date.today()
for frame in (data_train, data_test):
    frame["today"] = dt_today
data_train.head()

In [None]:
# Convert the "today" column (Python date objects) into pandas datetimes so it
# can be subtracted from "Date of Joining"
for frame in (data_train, data_test):
    frame["today"] = pd.to_datetime(frame["today"])

In [None]:
# Tenure = time elapsed between the joining date and the reference date
data_train['tenure'] = data_train['today'].sub(data_train["Date of Joining"])
data_test['tenure'] = data_test['today'].sub(data_test["Date of Joining"])
data_train.head()

In [None]:
# Convert the tenure timedelta into a whole number of days.
# Casting a timedelta column with astype(int) yields the raw nanosecond count
# (hard to read in plots and tables) and depends on the platform's default
# integer width; the .dt.days accessor gives an interpretable, portable integer.
data_train['tenure'] = data_train['tenure'].dt.days
data_test['tenure'] = data_test['tenure'].dt.days

In [None]:
# Remove the identifier and the raw date columns; only the derived "tenure"
# feature is kept for modelling.
unused_cols = ["Employee ID", "Date of Joining", "today"]
data_train = data_train.drop(columns=unused_cols)
data_test = data_test.drop(columns=unused_cols)

# DataFrame.info() prints by itself and returns None, so it is not wrapped in
# print() (which would emit an extra "None" line).
data_train.info()
data_test.info()

# 2. Imputing Missing Values

- Drop rows where target variable "Burn Rate" is missing.
- Impute the other missing values with the column median, because all columns with missing values are numeric (float).

In [None]:
# Discard training rows whose target variable "Burn Rate" is missing.
target_col = 'Burn Rate'
data_train = data_train.dropna(subset=[target_col])
data_train.info()

In [None]:
# Impute remaining missing values with medians
# Work on an explicit copy: a plain `df_train = data_train` is only an alias,
# so the imputation would also mutate `data_train` and would assign into the
# object returned by dropna().
df_train = data_train.copy()
impute_cols = ['Resource Allocation', 'Mental Fatigue Score']
# fillna() with a Series of per-column medians fills each column with its own median
df_train[impute_cols] = df_train[impute_cols].fillna(df_train[impute_cols].median())

df_train.info()

# 2. Data Exploration and Preprocessing

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns of the imputed training data
df_train.describe()

## 2.2. Exploration of Categorical Variables

In [None]:
# Show how many employees fall into each level of the three categorical features
categorical_cols = ["Gender", "Company Type", "WFH Setup Available"]
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(10,5))
for axis, col in zip(ax, categorical_cols):
    # Name the column via x= and pass the frame via data=: a Series passed
    # positionally is deprecated and, in newer seaborn releases, is taken as
    # `data` rather than as the x variable.
    sns.countplot(x=col, data=df_train, palette="winter", ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Letter-value (boxen) plots of Burn Rate for each level of every categorical feature
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(10,5))
for axis, feature in zip(ax, ("Gender", "Company Type", "WFH Setup Available")):
    sns.boxenplot(x=feature, y="Burn Rate", data=df_train,
                  palette="winter", linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Letter-value (boxen) plots of Burn Rate for pairs of categorical features:
# the first feature of each pair goes on the x axis, the second one is the hue
panels = [("Gender", "Company Type"),
          ("Gender", "WFH Setup Available"),
          ("WFH Setup Available", "Company Type")]
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
for idx, (x_col, hue_col) in enumerate(panels):
    sns.boxenplot(x=x_col, y="Burn Rate", hue=hue_col, data=df_train,
                  palette="winter", linewidth=0.0, ax=ax[idx])
plt.tight_layout()
plt.show()

## 2.3. Exploration of Continuous Variables

In [None]:
# Names of the continuous columns (the target plus the numeric features)
cont = ["Burn Rate", "Resource Allocation", "Mental Fatigue Score", "tenure"]

# Sub-frame holding only those continuous columns
df_cont = df_train.loc[:, cont]

In [None]:
# Visualize correlation between continuous variables

# Compute the correlation matrix
#corr = df_cont.corr()

# Generate a mask for the upper triangle
#mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
#f, ax = plt.subplots(figsize=(4, 3))

# Draw the heatmap with the mask and correct aspect ratio
#sns.heatmap(corr, mask=mask, cmap=sns.diverging_palette(128, 240,as_cmap=True), 
#            vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=False)

In [None]:
# Visualize the regression between each continuous feature and the output variable

# Number of variables to rank by correlation (including the target itself)
k = 4
fig = plt.figure(figsize=(14,5))
# Pairwise correlations between the continuous variables
corrmat = df_cont.corr()
# The k variables most correlated with "Burn Rate", in descending order
cols = corrmat.nlargest(k, "Burn Rate")["Burn Rate"].index
# The first entry is normally "Burn Rate" itself (self-correlation of 1.0), so
# it is skipped. The number of panels is derived from the ranking instead of
# being hard-coded, so changing k (or having fewer columns than k) neither
# breaks the subplot grid nor indexes past the end of `cols`.
features = cols[1:]
for i, feature in enumerate(features, start=1):
    ax = fig.add_subplot(1, len(features), i)
    sns.regplot(x=df_cont[feature], y=df_train['Burn Rate'],
                scatter_kws={"color": "royalblue", "s": 3},
                line_kws={"color": "turquoise"})
plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of Burn Rate against each continuous feature, coloured by Gender
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
for axis, feature in zip(ax, ("Mental Fatigue Score", "Resource Allocation", "tenure")):
    sns.scatterplot(x=feature, y="Burn Rate", hue="Gender",
                    data=df_train, linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of Burn Rate against each continuous feature, coloured by
# whether a work-from-home setup is available
x_features = ["Mental Fatigue Score", "Resource Allocation", "tenure"]
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(14,5))
for idx, feature in enumerate(x_features):
    sns.scatterplot(x=feature, y="Burn Rate", hue="WFH Setup Available",
                    data=df_train, linewidth=0.0, ax=ax[idx])
plt.tight_layout()
plt.show()

In [None]:
# Letter-value (boxen) plots of Mental Fatigue Score for each level of every
# categorical feature
fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(10,5))
for axis, feature in zip(ax, ("Gender", "Company Type", "WFH Setup Available")):
    sns.boxenplot(x=feature, y="Mental Fatigue Score", data=df_train,
                  palette="winter", linewidth=0.0, ax=axis)
plt.tight_layout()
plt.show()

## 2.4. Data Preparation for Modeling

In [None]:
import category_encoders as ce

list_cols = ['Gender','Company Type','WFH Setup Available']

# Fit ONE encoder on the training features and reuse it for the test set.
# Fitting a second, independent encoder on the test data lets it assign the
# `_1` / `_2` suffixes in a different category order than for the training
# data, so the same dummy column could mean different categories in each split.
# The target is left out while encoding so that the training features and the
# test set expose identical columns to the encoder.
ce_ohe = ce.OneHotEncoder(cols=list_cols)
train_features = df_train.drop(columns=['Burn Rate'])
train_target = df_train['Burn Rate']
ce_ohe.fit(train_features)

df_train = ce_ohe.transform(train_features)
df_train['Burn Rate'] = train_target

data_test = ce_ohe.transform(data_test)

In [None]:
# Separate the target from the features. For every binary categorical variable
# one of its two dummy columns is dropped (the `_2` one).
redundant_dummies = ['Gender_2','Company Type_2','WFH Setup Available_2']

y = df_train['Burn Rate']
X = df_train.drop(columns=['Burn Rate'] + redundant_dummies)

data_test = data_test.drop(columns=redundant_dummies)

In [None]:
# Create dummies for categorical variables

# subset all categorical variables
#categorical = X.select_dtypes(include=['object'])
# convert into dummies
#dummies = pd.get_dummies(categorical, drop_first=True)
# drop categorical variables 
#X = X.drop(list(categorical.columns), axis=1)
#data_test = data_test.drop(list(categorical.columns), axis=1)
# concat dummy variables with X
#X = pd.concat([X, cars_dummies], axis=1)
#data_test = data_test([data_test, cars_dummies], axis=1)

In [None]:
# Hold out 30% of the labelled data for evaluation; the seed is fixed so the
# split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Keep the feature names: the scaler returns plain arrays without labels
cols = X.columns

# Fit the scaler on the training split only and apply the same transformation
# to the hold-out split, then wrap both results back into labelled DataFrames
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=cols)
X_test = pd.DataFrame(sc.transform(X_test), columns=cols)

# 3. Regression

## 3.1. Linear Regression

In [None]:
# Ordinary least-squares regression on the scaled training split
lm = LinearRegression()
lm.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
lm_y_pred = lm.predict(X_test)
lr_r2, lr_mae = (metric(y_test, lm_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("Linear Regression R2: ", lr_r2),
                     ("Linear Regression MAE: ", lr_mae)):
    print(label, value)

In [None]:
# Evaluate the model based on the assumptions of linear regression:

# Assumption 1. The error terms are normally distributed with mean approximately 0.

# Distribution (histogram + density estimate) of the hold-out residuals
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; switch
# to sns.histplot(..., kde=True) once the environment's seaborn provides it.
fig = plt.figure()
sns.distplot((y_test - lm_y_pred),bins=50, color="blue")
fig.suptitle('Error Terms', fontsize=14)
plt.xlabel('y_test-y_pred', fontsize=12)
# The vertical axis of a distribution plot is a density, not an observation index
plt.ylabel('Density', fontsize=12)
plt.show()

The first assumption seems to be met.

In [None]:
# Assumption 2: Homoscedasticity, i.e. the variance of the error term (y_true-y_pred) is constant.

# Plot the hold-out residuals against their position in the hold-out split
c = list(range(len(lm_y_pred)))
residuals = y_test - lm_y_pred

fig = plt.figure()
plt.plot(c, residuals, color="blue", linewidth=2.5, linestyle="-", alpha=0.4)
fig.suptitle('Error Terms', fontsize=14)
plt.xlabel('Index', fontsize=12)
plt.ylabel('ytest-ypred', fontsize=12)
plt.show()

The second assumption seems to be met.

In [None]:
# Assumption 3: There is little correlation between the predictors. i.e., Multicollinearity:

# Pairwise correlations between all predictors
cors = X.corr()

# Boolean mask covering the upper triangle (diagonal included), so that each
# pair of predictors is drawn only once
mask_2 = np.triu(np.ones_like(cors, dtype=bool))

# Draw the masked correlation matrix as a heatmap
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(
    cors,
    mask=mask_2,
    cmap="coolwarm",
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=False,
)
plt.show()

Some features are highly correlated, so let's check for multicollinearity using the variance inflation factor (VIF).

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Build a table with one row per feature holding its variance inflation factor,
# computed on the scaled training split and sorted from highest to lowest
design_matrix = X_train.values
vif_values = [variance_inflation_factor(design_matrix, col_idx)
              for col_idx in range(X_train.shape[1])]

vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

All VIFs are below 5, so multicollinearity is not a concern.

## 3.2. Multivariate Adaptive Regression Splines (MARS)

In [None]:
# Initiate the model
mars = Earth()

# By default, we do not need to set any of the algorithm hyperparameters.
# The algorithm automatically discovers the number and type of basis functions to use.

# Fit the model
mars.fit(X_train, y_train)

# Making predictions
# The fitted estimator is `mars`; the previously referenced `mars_model` is
# never defined and raised a NameError.
mars_y_pred = mars.predict(X_test)

# Performance Metrics
mars_r2 = r2_score(y_test, mars_y_pred)
mars_mae = mean_absolute_error(y_test, mars_y_pred)

# Show the model performance
print("MARS R2: ", mars_r2)
print("MARS MAE: ", mars_mae)

## 3.3. KNN Regression

In [None]:
# k-nearest-neighbours regressor with default hyperparameters
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
knn_y_pred = knn.predict(X_test)
knn_r2, knn_mae = (metric(y_test, knn_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("KNN R2: ", knn_r2), ("KNN MAE: ", knn_mae)):
    print(label, value)

## 3.4. Decision Tree Regression

In [None]:
# Initiate the model
# The estimator is deliberately NOT bound to the name `dt`: `dt` is the alias
# of the `datetime` module imported earlier in this notebook, and rebinding it
# would break a re-run of the tenure cells (`dt.date.today()`).
dt_model = DecisionTreeRegressor()

# Fit the model
dt_model.fit(X_train, y_train)

# Make predictions
dt_y_pred = dt_model.predict(X_test)

# Performance metrics
dt_r2 = r2_score(y_test, dt_y_pred)
dt_mae = mean_absolute_error(y_test, dt_y_pred)

# Show the model performance
print("DT R2: ", dt_r2)
print("DT MAE: ", dt_mae)

## 3.5. Random Forest Regression 

In [None]:
# Random forest regressor with default hyperparameters
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
rf_y_pred = rf.predict(X_test)
rf_r2, rf_mae = (metric(y_test, rf_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("RF R2: ", rf_r2), ("RF MAE: ", rf_mae)):
    print(label, value)

## 3.6. Gradient Boosting Regression

In [None]:
# Gradient boosting regressor with default hyperparameters
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
gb_y_pred = gb.predict(X_test)
gb_r2, gb_mae = (metric(y_test, gb_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("GB R2: ", gb_r2), ("GB MAE: ", gb_mae)):
    print(label, value)

## 3.7. Extra Trees Regression

In [None]:
# Extra-trees regressor with default hyperparameters
et = ExtraTreesRegressor()
et.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
et_y_pred = et.predict(X_test)
et_r2, et_mae = (metric(y_test, et_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("ET R2: ", et_r2), ("ET MAE: ", et_mae)):
    print(label, value)

## 3.8. XGB Regression

In [None]:
# XGBoost regressor with default hyperparameters
xg = XGBRegressor()
xg.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
xg_y_pred = xg.predict(X_test)
xg_r2, xg_mae = (metric(y_test, xg_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("XGB R2: ", xg_r2), ("XGB MAE: ", xg_mae)):
    print(label, value)

## 3.9. Light GBM Regression

In [None]:
# LightGBM regressor with default hyperparameters
lg = LGBMRegressor()
lg.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
lg_y_pred = lg.predict(X_test)
lg_r2, lg_mae = (metric(y_test, lg_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("LGBM R2: ", lg_r2), ("LGBM MAE: ", lg_mae)):
    print(label, value)

## 3.10. CatBoost Regression

In [None]:
# CatBoost regressor with default hyperparameters
cb = CatBoostRegressor()
cb.fit(X_train, y_train)

# Predict the hold-out split and score the predictions
cb_y_pred = cb.predict(X_test)
cb_r2, cb_mae = (metric(y_test, cb_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("CATB R2: ", cb_r2), ("CATB MAE: ", cb_mae)):
    print(label, value)

## 3.11. Deep Neural Network

In [None]:
# Define a DNN
def create_model(optimizer='adam'):
    """Build and compile the feed-forward network used for regression.

    The network stacks three fully connected layers: a ReLU layer with as many
    units as the global ``X_train`` has columns, a ReLU layer with 4 units and
    a single-unit output layer with no activation specified.

    Args:
        optimizer: Optimizer handed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model, using mean squared error as loss.
    """
    # The input width is read from the module-level training frame
    n_features = X_train.shape[1]
    layers = [
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(4, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [None]:
# Wrap the Keras model builder in a scikit-learn style regressor
dnn_settings = dict(epochs=1000, batch_size=20, verbose=1)
dnn = KerasRegressor(build_fn=create_model, **dnn_settings)

# Train on the scaled training split; fit() returns the training history
dnn_history = dnn.fit(X_train, y_train)

In [None]:
# Visualize the DNN learning
loss_train = dnn_history.history['loss']
# Derive the x axis from the recorded history instead of hard-coding 1000
# epochs, so the plot stays valid when the number of training epochs changes.
epochs = range(1, len(loss_train) + 1)
plt.figure(figsize=(8,6))
plt.plot(epochs, loss_train, 'royalblue', label='Training loss', linewidth=3)
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict the hold-out split with the trained network and score the predictions
dnn_y_pred = dnn.predict(X_test)
dnn_r2, dnn_mae = (metric(y_test, dnn_y_pred) for metric in (r2_score, mean_absolute_error))

# Report both metrics
for label, value in (("DNN R2: ", dnn_r2), ("DNN MAE: ", dnn_mae)):
    print(label, value)

# 4. Summary of Results

In [None]:
# One (display name, R2, MAE) entry per model, in the order they were trained
model_scores = [
    ("Linear Regression", lr_r2, lr_mae),
    ("MARS", mars_r2, mars_mae),
    ("KNN", knn_r2, knn_mae),
    ("Decision Tree", dt_r2, dt_mae),
    ("Random Forest", rf_r2, rf_mae),
    ("Gradient Boosting", gb_r2, gb_mae),
    ("Extra Trees", et_r2, et_mae),
    ("XGBoost", xg_r2, xg_mae),
    ("LightGBM", lg_r2, lg_mae),
    ("CatBoost", cb_r2, cb_mae),
    ("DNN", dnn_r2, dnn_mae),
]

# Summary table: one row per model, one column per metric
results_table = pd.DataFrame(
    [[np.mean(r2), np.mean(mae)] for _, r2, mae in model_scores],
    columns=['R2', 'MAE'],
    index=[name for name, _, _ in model_scores],
)
# Show three decimal places in pandas output
pd.options.display.precision = 3
results_table

In [None]:
# Hold-out predictions of every model, keyed by the model's display name
model_predictions = {
    "Linear Regression": lm_y_pred,
    "MARS": mars_y_pred,
    "KNN": knn_y_pred,
    "Decision Tree": dt_y_pred,
    "Random Forest": rf_y_pred,
    "Gradient Boosting": gb_y_pred,
    "Extra Trees": et_y_pred,
    "XGBoost": xg_y_pred,
    "LightGBM": lg_y_pred,
    "CatBoost": cb_y_pred,
    "DNN": dnn_y_pred,
}

# One column per model plus the true target values.
# NOTE(review): the column labels say "Price" although the values are Burn Rate
# predictions; the labels are kept as they are because the plotting cell looks
# the columns up by these exact names.
pred_columns = {f"{name}: Predicted Price": pred for name, pred in model_predictions.items()}
pred_columns["Actual Price"] = y_test
pred_table = pd.DataFrame(pred_columns)

In [None]:
# Plot the actual values against each model's predictions, one panel per model
# on a 3x4 grid, each with a fitted regression line
model_names = [
    "Linear Regression", "MARS", "KNN", "Decision Tree", "Random Forest",
    "Gradient Boosting", "Extra Trees", "XGBoost", "LightGBM", "CatBoost", "DNN",
]

fig = plt.figure(figsize=(16,10))
for position, model_name in enumerate(model_names, start=1):
    plt.subplot(3, 4, position)
    sns.regplot(x=f'{model_name}: Predicted Price', y='Actual Price', data=pred_table,
                color='royalblue', scatter_kws={"s": 5}, line_kws={"color": "turquoise"})
plt.tight_layout()
plt.show()

# 5. Feature Importance

In [None]:
# Feature importances reported by the fitted LightGBM model, printed one
# feature per line next to its column name
fti = lg.feature_importances_

print('Feature Importances:')
for feat, importance in zip(X.columns, fti):
    print('\t{0:10s} : {1:>12.4f}'.format(feat, importance))

### Mental Fatigue and Tenure are important features.