### Prediction of Employee Attrition/Burnout
Authors: Florencia Luque and Simon Schmetz

In [None]:
import pandas as pd
import numpy as np
import sklearn as skn


In [2]:
# Both inputs are gzip-compressed CSV files addressed by relative path.
data_available = pd.read_csv(
    "attrition_availabledata_10.csv.gz",
    compression="gzip",
)
data_competetion = pd.read_csv(
    "attrition_competition_10.csv.gz",
    compression="gzip",
)


In [None]:
# Preview the first rows of the available data.
data_available.head()

In [None]:
# (rows, columns) of the available data.
data_available.shape

The available data contains 2940 instances and 31 variables, including the response variable `Attrition`.

In [None]:
# Fraction of missing values per column (NaN count divided by number of rows).
data_available.isna().sum() / len(data_available)

There are no NaN values in any column of the available data.

In [None]:
# Data type of every column in the available data.
data_available.dtypes

In [None]:
# Column names of the available data.
data_available.columns

The data is imbalanced: the response variable `Attrition` is 83.8% No and 16.12% Yes.

In [None]:
# Relative frequency of each Attrition class.
# value_counts(normalize=True) returns one proportion per class instead of
# repeating the same ratio across every column of the frame.
data_available["Attrition"].value_counts(normalize=True)

In [None]:
# Repeats the dtype inspection from an earlier cell of this notebook.
data_available.dtypes


In [None]:
# Preview the first 10 rows of the available data.
data_available.head(10)

In [11]:
# Column names grouped by variable type.
# Note: the response 'Attrition' and the identifier 'EmployeeID' are listed
# together with the categorical features.
categorical_variables = [
    'JobInvolvement', 'PerformanceRating', 'EnvironmentSatisfaction', 'JobSatisfaction',
    'WorkLifeBalance', 'BusinessTravel', 'Department', 'Education', 'EducationField',
    'EmployeeID', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'StockOptionLevel',
    'Attrition',
]

# NOTE(review): 'StandardHours' is listed here and in constant_variables below;
# confirm which group it should belong to before building preprocessing on these lists.
numeric_variables = [
    'hrs', 'absences', 'Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
    'PercentSalaryHike', 'StandardHours', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]

# Fixed the misspelled 'StandarHours' so the name matches the column spelled
# 'StandardHours' above; the misspelling would raise KeyError when indexing.
constant_variables = ['EmployeeCount', 'Over18', 'StandardHours']

In [None]:
# Number of distinct values in each column listed in categorical_variables.
data_available[categorical_variables].nunique()

# Stuff from Class

In [13]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Fetch the California housing data and separate features from target
data = fetch_california_housing()
X = data.data
y = data.target

# Outer evaluation: hold out one third of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

# Inner evaluation: shuffled 3-fold cross-validation on the training part
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Maps a model label to its inner-evaluation RMSE
inner_scores = dict()

In [None]:
# Baseline regression tree (default hyper-parameters), scored with the inner CV
tree_reg = DecisionTreeRegressor(random_state=42)
tree_default_scores = cross_val_score(
    tree_reg,
    X_train,
    y_train,
    cv=inner,
    scoring='neg_root_mean_squared_error',
)
# The scorer yields negated RMSE, so the sign is flipped before storing
inner_scores['Tree Default'] = -tree_default_scores.mean()

# Same tree, tuned by grid search over depth and minimum split size
param_grid_tree = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 10, 20],
}
grid_search_tree = GridSearchCV(
    estimator=tree_reg,
    param_grid=param_grid_tree,
    cv=inner,
    scoring='neg_root_mean_squared_error',
)
grid_search_tree.fit(X_train, y_train)
inner_scores['Tree Tuned'] = -grid_search_tree.best_score_

In [None]:
# KNN regressor preceded by standardization, bundled into a single pipeline
knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])

# Inner-CV score of the pipeline with default hyper-parameters
knn_std_default_scores = cross_val_score(
    knn_pipeline,
    X_train,
    y_train,
    cv=inner,
    scoring='neg_root_mean_squared_error',
)
inner_scores['KNN Standard Default'] = -knn_std_default_scores.mean()

# Hyper-parameter search; the 'knn__' prefix addresses the 'knn' pipeline step
pipe_param_grid = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
}
grid_search_knn = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=pipe_param_grid,
    cv=inner,
    scoring='neg_root_mean_squared_error',
)
grid_search_knn.fit(X_train, y_train)
inner_scores['KNN Standard Tuned'] = -grid_search_knn.best_score_

In [None]:
# Trees with default hyper-parameters, evaluated through GridSearchCV with an
# empty grid. This overwrites the 'Tree Default' entry written by the earlier
# cross_val_score cell.
tree_reg = DecisionTreeRegressor(random_state=42)
empty_param_grid_tree = dict()
grid_search_tree_default = GridSearchCV(
    estimator=tree_reg,
    param_grid=empty_param_grid_tree,
    cv=inner,
    scoring='neg_root_mean_squared_error',
)
grid_search_tree_default.fit(X_train, y_train)
inner_scores['Tree Default'] = -grid_search_tree_default.best_score_

# Trees with hyper-parameter optimisation over depth and minimum split size
param_grid_tree = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 10, 20],
}
grid_search_tree = GridSearchCV(
    estimator=tree_reg,
    param_grid=param_grid_tree,
    cv=inner,
    scoring='neg_root_mean_squared_error',
)
grid_search_tree.fit(X_train, y_train)
inner_scores['Tree Tuned'] = -grid_search_tree.best_score_

In [None]:
# Dummy Regressor using mean
# Imported here because none of the import cells shown earlier in the notebook
# bring DummyRegressor into scope; without it this cell raises NameError on a
# fresh kernel.
from sklearn.dummy import DummyRegressor

# Baseline that always predicts the training mean; its inner-CV RMSE is the
# reference value the other models are compared against.
dummy_reg = DummyRegressor(strategy='mean')
dummy_scores = cross_val_score(dummy_reg, X_train, y_train, cv=inner, scoring='neg_root_mean_squared_error')
# The scorer yields negated RMSE, so the sign is flipped before storing
inner_scores['Dummy Mean'] = -dummy_scores.mean()

In [None]:
# Table of inner-evaluation RMSE per model, plus the ratio to the dummy baseline
print(f"{'Model': <21} {'Inner RMSE': <15} {'Model/Dummy RMSE Ratio': <20}")

for model in inner_scores:
    score = inner_scores[model]
    # Ratio of this model's RMSE to the 'Dummy Mean' baseline RMSE
    ratio = score / inner_scores['Dummy Mean']
    print(f"{model: <21} {score: <15.4f} {ratio: <20.4f}")

In [None]:
# Imported here because none of the import cells shown earlier in the notebook
# bring this metric into scope; without it this cell raises NameError on a
# fresh kernel. Requires a scikit-learn version that ships
# sklearn.metrics.root_mean_squared_error.
from sklearn.metrics import root_mean_squared_error

# Outer evaluation for best model (estimation of future performance):
# the tuned KNN search predicts the held-out test set.
test_predictions = grid_search_knn.predict(X_test)
test_rmse = root_mean_squared_error(y_test, test_predictions)
print('\nBest Model: KNN Standard Tuned')
print(f'Best Model Test RMSE: {test_rmse:.4f}')

# Outer evaluation for Dummy Regressor: fit on the training split and score on
# the same held-out test set as a reference.
dummy_reg.fit(X_train, y_train)
dummy_predictions = dummy_reg.predict(X_test)
dummy_outer_rmse = root_mean_squared_error(y_test, dummy_predictions)
print(f'\nDummy Outer RMSE: {dummy_outer_rmse:.4f}')

In [None]:
# Train final model

# Fits the KNN grid search on the complete dataset (X, y), test rows included.
# NOTE(review): this calls fit on `grid_search_knn` itself, so re-running the
# outer-evaluation cell afterwards would score a model that has already seen
# X_test — confirm this is intended, or fit a separate copy instead.
final_model = grid_search_knn.fit(X,y)

In [None]:
# Imputation + Scaling for KNN and SVM

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

# Individual stages of the pipeline
imputer = SimpleImputer(strategy='mean')  # fills missing values with the mean
scaler = StandardScaler()
knn = KNeighborsRegressor()

# Chain the stages: imputation first, then standardization, then KNN
classif = Pipeline(steps=[
    ('imputation', imputer),
    ('standardization', scaler),
    ('knn', knn),
])

# Fit on the training split and predict the held-out test split
classif.fit(X_train, y_train)
y_hat = classif.predict(X_test)


In [None]:
### Pipeline for preprocessing data with categorical ordinal, categorical non-ordinal, and numerical features

# Imported here because none of the import cells shown earlier in the notebook
# bring these three names into scope; without them this cell raises NameError
# on a fresh kernel.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Define columns by type
numerical_features = ['age', 'income', 'years_experience']
ordinal_features = ['education_level']
non_ordinal_features = ['city', 'gender']

# Define transformations for each type
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))  # Impute missing values with the mean
])

# The explicit category list fixes the order used for the integer encoding
ordinal_transformer = OrdinalEncoder(categories=[['High School', 'Bachelor\'s', 'Master\'s', 'PhD']])

# Categories unseen during fit are ignored instead of raising at transform time
non_ordinal_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing into a column transformer.
# sparse_threshold=0 forces a dense result: the one-hot block can otherwise make
# the combined output sparse, and the StandardScaler that follows cannot centre
# sparse input (it raises ValueError unless with_mean=False).
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),  # Impute numerical features
        ('ord', ordinal_transformer, ordinal_features),  # Encode ordinal categorical features
        ('non_ord', non_ordinal_transformer, non_ordinal_features),  # One-hot encode non-ordinal categorical features
    ],
    sparse_threshold=0,
)

# Define the full pipeline with scaling applied to all features.
# NOTE(review): this rebinds `knn_pipeline`, the name used earlier for the
# California-housing KNN pipeline; consider a distinct name to avoid confusion
# when cells are re-run out of order.
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess all features
    ('scaler', StandardScaler()),  # Scale all features after preprocessing
    ('knn', KNeighborsRegressor())  # Apply KNN
])