### Prediction of Employee Attrition/Burnout
Authors: Florencia Luque and Simon Schmetz

In [41]:
import pandas as pd
import numpy as np
import sklearn as skn


In [42]:
data_train = pd.read_csv("attrition_availabledata_10.csv.gz", compression='gzip')
data_test = pd.read_csv("attrition_competition_10.csv.gz", compression='gzip')


In [43]:
data_train.head()

Unnamed: 0,hrs,absences,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,7.462297,13.0,3.0,3.0,3.0,3.0,3.0,48.0,Travel_Rarely,Research & Development,...,Y,13.0,8.0,1.0,18.0,2.0,8.0,7.0,7.0,No
1,7.488597,6.0,1.0,3.0,3.0,4.0,4.0,31.0,Travel_Frequently,Research & Development,...,Y,17.0,8.0,1.0,3.0,2.0,1.0,0.0,0.0,Yes
2,7.116541,16.0,2.0,4.0,1.0,1.0,3.0,36.0,Travel_Rarely,Research & Development,...,Y,24.0,8.0,0.0,6.0,6.0,5.0,0.0,3.0,No
3,9.709125,6.0,3.0,4.0,3.0,3.0,3.0,49.0,Travel_Rarely,Research & Development,...,Y,22.0,8.0,0.0,25.0,2.0,7.0,0.0,7.0,No
4,6.297729,6.0,4.0,3.0,3.0,2.0,3.0,40.0,Travel_Frequently,Research & Development,...,Y,13.0,8.0,1.0,8.0,3.0,8.0,3.0,7.0,No


In [44]:
data_train.shape

(2940, 31)

We have 2940 instances and 31 variables including the attrition. 

In [45]:
data_train.isna().sum()/data_train.shape[0] 

hrs                        0.0
absences                   0.0
JobInvolvement             0.0
PerformanceRating          0.0
EnvironmentSatisfaction    0.0
JobSatisfaction            0.0
WorkLifeBalance            0.0
Age                        0.0
BusinessTravel             0.0
Department                 0.0
DistanceFromHome           0.0
Education                  0.0
EducationField             0.0
EmployeeCount              0.0
EmployeeID                 0.0
Gender                     0.0
JobLevel                   0.0
JobRole                    0.0
MaritalStatus              0.0
MonthlyIncome              0.0
NumCompaniesWorked         0.0
Over18                     0.0
PercentSalaryHike          0.0
StandardHours              0.0
StockOptionLevel           0.0
TotalWorkingYears          0.0
TrainingTimesLastYear      0.0
YearsAtCompany             0.0
YearsSinceLastPromotion    0.0
YearsWithCurrManager       0.0
Attrition                  0.0
dtype: float64

There's no Nan values in any of the columns in the available data

In [46]:
data_train.dtypes

hrs                        float64
absences                   float64
JobInvolvement             float64
PerformanceRating          float64
EnvironmentSatisfaction    float64
JobSatisfaction            float64
WorkLifeBalance            float64
Age                        float64
BusinessTravel              object
Department                  object
DistanceFromHome           float64
Education                  float64
EducationField              object
EmployeeCount              float64
EmployeeID                 float64
Gender                      object
JobLevel                   float64
JobRole                     object
MaritalStatus               object
MonthlyIncome              float64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike          float64
StandardHours              float64
StockOptionLevel           float64
TotalWorkingYears          float64
TrainingTimesLastYear      float64
YearsAtCompany             float64
YearsSinceLastPromot

In [47]:
data_train.columns

Index(['hrs', 'absences', 'JobInvolvement', 'PerformanceRating',
       'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age',
       'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender', 'JobLevel',
       'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'Over18', 'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition'],
      dtype='object')

The data is unbalanced with a 83.8% of No and 16.12% of Yes in the response variable

In [48]:
data_train.groupby("Attrition").count()/data_train.shape[0]

Unnamed: 0_level_0,hrs,absences,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
No,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,...,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776,0.838776
Yes,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,...,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224,0.161224


In [49]:
data_train.dtypes


hrs                        float64
absences                   float64
JobInvolvement             float64
PerformanceRating          float64
EnvironmentSatisfaction    float64
JobSatisfaction            float64
WorkLifeBalance            float64
Age                        float64
BusinessTravel              object
Department                  object
DistanceFromHome           float64
Education                  float64
EducationField              object
EmployeeCount              float64
EmployeeID                 float64
Gender                      object
JobLevel                   float64
JobRole                     object
MaritalStatus               object
MonthlyIncome              float64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike          float64
StandardHours              float64
StockOptionLevel           float64
TotalWorkingYears          float64
TrainingTimesLastYear      float64
YearsAtCompany             float64
YearsSinceLastPromot

In [50]:
data_train.head(10)

Unnamed: 0,hrs,absences,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,7.462297,13.0,3.0,3.0,3.0,3.0,3.0,48.0,Travel_Rarely,Research & Development,...,Y,13.0,8.0,1.0,18.0,2.0,8.0,7.0,7.0,No
1,7.488597,6.0,1.0,3.0,3.0,4.0,4.0,31.0,Travel_Frequently,Research & Development,...,Y,17.0,8.0,1.0,3.0,2.0,1.0,0.0,0.0,Yes
2,7.116541,16.0,2.0,4.0,1.0,1.0,3.0,36.0,Travel_Rarely,Research & Development,...,Y,24.0,8.0,0.0,6.0,6.0,5.0,0.0,3.0,No
3,9.709125,6.0,3.0,4.0,3.0,3.0,3.0,49.0,Travel_Rarely,Research & Development,...,Y,22.0,8.0,0.0,25.0,2.0,7.0,0.0,7.0,No
4,6.297729,6.0,4.0,3.0,3.0,2.0,3.0,40.0,Travel_Frequently,Research & Development,...,Y,13.0,8.0,1.0,8.0,3.0,8.0,3.0,7.0,No
5,5.742623,13.0,2.0,3.0,4.0,3.0,3.0,52.0,Travel_Rarely,Research & Development,...,Y,11.0,8.0,1.0,9.0,1.0,5.0,1.0,4.0,No
6,5.792075,21.0,2.0,3.0,3.0,4.0,2.0,27.0,Travel_Rarely,Sales,...,Y,13.0,8.0,1.0,6.0,2.0,6.0,1.0,4.0,No
7,6.792231,14.0,2.0,3.0,3.0,1.0,2.0,49.0,Travel_Rarely,Research & Development,...,Y,11.0,8.0,1.0,16.0,3.0,15.0,5.0,11.0,No
8,5.601004,17.0,2.0,3.0,4.0,3.0,3.0,47.0,Travel_Rarely,Research & Development,...,Y,19.0,8.0,3.0,27.0,3.0,5.0,1.0,0.0,No
9,6.918927,7.0,2.0,3.0,4.0,1.0,3.0,29.0,Travel_Rarely,Research & Development,...,Y,12.0,8.0,0.0,11.0,4.0,3.0,1.0,2.0,No


In [51]:
categorical_variables = ['JobInvolvement', 'PerformanceRating','EnvironmentSatisfaction', 'JobSatisfaction',
                          'WorkLifeBalance','BusinessTravel', 'Department', 'Education','EducationField',
                          'EmployeeID','Gender', 'JobLevel','JobRole','MaritalStatus','StockOptionLevel','Attrition']
numeric_variables = ['hrs','absences','Age','DistanceFromHome','MonthlyIncome','NumCompaniesWorked',
                     'PercentSalaryHike', 'StandardHours','TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']

constant_variables = ["EmployeeCount",'Over18','StandardHours']

In [52]:
len(categorical_variables) + len(numeric_variables) + len(constant_variables)

32

In [53]:
constant_variables

['EmployeeCount', 'Over18', 'StandardHours']

In [54]:
data_train[categorical_variables].nunique()

JobInvolvement                4
PerformanceRating             2
EnvironmentSatisfaction       4
JobSatisfaction               4
WorkLifeBalance               4
BusinessTravel                3
Department                    3
Education                     5
EducationField                6
EmployeeID                 2940
Gender                        2
JobLevel                      5
JobRole                       9
MaritalStatus                 3
StockOptionLevel              4
Attrition                     2
dtype: int64

# Train Test Split

In [55]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load dataset
X, y = data_train.data, data_train.Attrition

# Outer evaluation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=42)

# Inner evaluation with 3-fold CV
inner = KFold(n_splits=4, shuffle=True, random_state=42)

# Store inner evaluation scores
inner_scores = {}


AttributeError: 'DataFrame' object has no attribute 'data'

# Stuff from Class

In [26]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load dataset
data = fetch_california_housing()
X, y = data.data, data.target

# Outer evaluation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)

# Inner evaluation with 3-fold CV
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Store inner evaluation scores
inner_scores = {}

In [27]:
# Regression Tree with default parameters
tree_reg = DecisionTreeRegressor(random_state=42)
tree_default_scores = cross_val_score(tree_reg, X_train, y_train, cv=inner, scoring='neg_root_mean_squared_error')
inner_scores['Tree Default'] = -tree_default_scores.mean()

# Regression Tree with hyperparameter tuning
param_grid_tree = {'max_depth': [10, 20, 30],
                   'min_samples_split': [2, 10, 20]}
grid_search_tree = GridSearchCV(tree_reg, param_grid_tree, cv=inner, scoring='neg_root_mean_squared_error')
grid_search_tree.fit(X_train, y_train)
inner_scores['Tree Tuned'] = -grid_search_tree.best_score_

In [28]:
#KNN with StandardScaler using Pipeline:

knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

knn_std_default_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=inner, scoring='neg_root_mean_squared_error')
inner_scores['KNN Standard Default'] = -knn_std_default_scores.mean()

# KNN with HPO:
pipe_param_grid = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance']
}

grid_search_knn = GridSearchCV(knn_pipeline, pipe_param_grid, cv=inner, scoring='neg_root_mean_squared_error')
grid_search_knn.fit(X_train, y_train)
inner_scores['KNN Standard Tuned'] = -grid_search_knn.best_score_

In [29]:
#Trees with default hyper-parameters:

tree_reg = DecisionTreeRegressor(random_state=42)

empty_param_grid_tree = {}

grid_search_tree_default = GridSearchCV(tree_reg, empty_param_grid_tree, cv=inner, scoring='neg_root_mean_squared_error')

grid_search_tree_default.fit(X_train, y_train)

inner_scores['Tree Default'] = -grid_search_tree_default.best_score_

#Trees with HPO:

param_grid_tree = {'max_depth': [10, 20, 30],
                   'min_samples_split': [2, 10, 20]}

grid_search_tree = GridSearchCV(tree_reg, param_grid_tree, cv=inner, scoring='neg_root_mean_squared_error')

grid_search_tree.fit(X_train, y_train)

inner_scores['Tree Tuned'] = -grid_search_tree.best_score_

In [30]:
# Dummy Regressor using mean
dummy_reg = DummyRegressor(strategy='mean')
dummy_scores = cross_val_score(dummy_reg, X_train, y_train, cv=inner, scoring='neg_root_mean_squared_error')
inner_scores['Dummy Mean'] = -dummy_scores.mean()

NameError: name 'DummyRegressor' is not defined

In [None]:
# Print inner evaluation scores and ratios
print(f"{'Model': <21} {'Inner RMSE': <15} {'Model/Dummy RMSE Ratio': <20}")

for model, score in inner_scores.items():
    ratio = score / inner_scores['Dummy Mean']
    print(f"{model: <21} {score: <15.4f} {ratio: <20.4f}")

In [None]:
# Outer evaluation for best model (estimation of future performance):
test_predictions = grid_search_knn.predict(X_test)
test_rmse = root_mean_squared_error(y_test, test_predictions)
print(f'\nBest Model: KNN Standard Tuned')
print(f'Best Model Test RMSE: {test_rmse:.4f}')

# Outer evaluation for Dummy Regressor:

dummy_reg.fit(X_train, y_train)
dummy_predictions = dummy_reg.predict(X_test)
dummy_outer_rmse = root_mean_squared_error(y_test, dummy_predictions)
print(f'\nDummy Outer RMSE: {dummy_outer_rmse:.4f}')

In [None]:
# Train final model

final_model = grid_search_knn.fit(X,y)

In [None]:
# Imputation + Scaling for KNN and SVM

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

# Define steps in the pipeline
knn = KNeighborsRegressor()
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')  # Imputation transformer for completing missing values with the mean

# Update the pipeline to include the imputation step
classif = Pipeline([
    ('imputation', imputer),
    ('standardization', scaler),
    ('knn', knn)
])

# Now you can fit the pipeline to your data and make predictions
classif.fit(X_train, y_train)
y_hat = classif.predict(X_test)


In [None]:
### Pipeline for preprocessing data with categorical ordinal, categorical non-ordinal, and numerical features

# Define columns by type
numerical_features = ['age', 'income', 'years_experience']
ordinal_features = ['education_level']
non_ordinal_features = ['city', 'gender']

# Define transformations for each type
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))  # Impute missing
])

ordinal_transformer = OrdinalEncoder(categories=[['High School', 'Bachelor\'s', 'Master\'s', 'PhD']])

non_ordinal_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing into a column transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),  # Impute numerical features
    ('ord', ordinal_transformer, ordinal_features),  # Encode ordinal categorical features
    ('non_ord', non_ordinal_transformer, non_ordinal_features)  # One-hot encode non-ordinal categorical features
])

# Define the full pipeline with scaling applied to all features
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess all features
    ('scaler', StandardScaler()),  # Scale all features after preprocessing
    ('knn', KNeighborsRegressor())  # Apply KNN
])