In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, TargetEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb

pd.set_option('future.no_silent_downcasting', True)
from tabulate import tabulate

In [248]:
df = pd.read_csv('data/mental_health_condition_data.csv')
df.shape

(5000, 20)

In [228]:
df.columns

Index(['Employee_ID', 'Age', 'Gender', 'Job_Role', 'Industry',
       'Years_of_Experience', 'Work_Location', 'Hours_Worked_Per_Week',
       'Number_of_Virtual_Meetings', 'Work_Life_Balance_Rating',
       'Stress_Level', 'Mental_Health_Condition',
       'Access_to_Mental_Health_Resources', 'Productivity_Change',
       'Social_Isolation_Rating', 'Satisfaction_with_Remote_Work',
       'Company_Support_for_Remote_Work', 'Physical_Activity', 'Sleep_Quality',
       'Region'],
      dtype='object')

In [229]:
# Null values
n_nulls = df.isnull().sum().sum()
print(n_nulls)

# fill null values
df = df.ffill().bfill()

df = df.drop(columns=["Employee_ID"])

2825


### Data Preprocessing

In [230]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ord_cols = ["Work_Location", "Industry", "Job_Role", "Gender"]
        self.mappings = {
            "Stress_Level": {"High": 3, "Medium":2, "Low": 0},
            "Access_to_Mental_Health_Resources": {"Yes": 1, "No": 0},
            "Productivity_Change": {"Decrease": -1, "No Change": 0, "Increase": 1},
            "Satisfaction_with_Remote_Work": {"Unsatisfied": -1, "Neutral": 0, "Satisfied": 1},
            "Physical_Activity": {"Weekly": 1, "Daily": 2},
            "Sleep_Quality": {"Poor": -1, "Average": 0, "Good": 1},
            "Region": {"Africa": -2, "South America": -1, "Asia": 0, "Oceania": 2, "North America": 1, "Europe": 2}
        }
        self.ord_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
  
    def _custom_encoder(self, df):
        df = df.copy()
        for col in self.mappings.keys():
            df.loc[:, col] = df[col].replace(self.mappings[col])
        
        return df
    
    def fit(self, df):
        self.ord_encoder.fit(df[self.ord_cols])
    
    def transform(self, df):
        df = df.copy()
        df[self.ord_cols] = self.ord_encoder.transform(df[self.ord_cols])
        df = self._custom_encoder(df)
        return df


In [231]:
class FeatureScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.std_scaler = StandardScaler()
    
    def fit(self, df):
        self.std_scaler.fit(df)

    def transform(self, df):
        X = self.std_scaler.transform(df)
        df = pd.DataFrame(X, columns=df.columns)
        return df

In [232]:
X, y = df.drop(columns=["Mental_Health_Condition"]), df["Mental_Health_Condition"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# categorical encoding
cat_encoder = CategoricalEncoder()
cat_encoder.fit(X_train)
X_train, X_val = cat_encoder.transform(X_train), cat_encoder.transform(X_val)
target_mapping = {"Burnout": 0, "Depression": 1, "Anxiety": 2}
y_train, y_val = y_train.replace(target_mapping).astype('int'), y_val.replace(target_mapping).astype('int')

# feature scaling
feat_scaler = FeatureScaler()
feat_scaler.fit(X_train)
X_train, X_val = feat_scaler.transform(X_train), feat_scaler.transform(X_val)

## Training

In [233]:
def evaluate_model(model, X_val, y_val):
    pred = model.predict(X_val)
    data = [
        ["Accuracy", accuracy_score(y_val, pred)],
        ["Precision", precision_score(y_val, pred, average="weighted")],
        ["Recall", recall_score(y_val, pred, average="weighted" )],
        ["F1 Score", f1_score(y_val, pred, average="weighted")],
        # ["ROC-AUC", roc_auc_score(y_val, pred, average="weighted")]
    ]

    print(tabulate(data, headers=["Metric", "Value"]))

In [234]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score

In [235]:
log_clf = LogisticRegression(max_iter=200, multi_class='multinomial', solver='lbfgs')
log_clf.fit(X_train, y_train)
evaluate_model(log_clf, X_val, y_val)

Metric        Value
---------  --------
Accuracy   0.355
Precision  0.356139
Recall     0.355
F1 Score   0.349755




In [236]:
params = {
    "objective": "multi:softmax",
    "learning_rate": 0.01,
    "n_estimators": 50,
    "max_depth": 9,
    "eval_metric": "merror"
}
xgb_clf = xgb.XGBClassifier(**params)
xgb_clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=True)

[0]	validation_0-merror:0.31450	validation_1-merror:0.62800
[1]	validation_0-merror:0.29275	validation_1-merror:0.63900
[2]	validation_0-merror:0.29375	validation_1-merror:0.64200
[3]	validation_0-merror:0.29150	validation_1-merror:0.64200
[4]	validation_0-merror:0.28925	validation_1-merror:0.63600
[5]	validation_0-merror:0.28575	validation_1-merror:0.64500
[6]	validation_0-merror:0.28200	validation_1-merror:0.65200
[7]	validation_0-merror:0.28125	validation_1-merror:0.65400
[8]	validation_0-merror:0.28200	validation_1-merror:0.64600
[9]	validation_0-merror:0.28025	validation_1-merror:0.65200


[10]	validation_0-merror:0.28125	validation_1-merror:0.64900
[11]	validation_0-merror:0.27825	validation_1-merror:0.65000
[12]	validation_0-merror:0.27575	validation_1-merror:0.65200
[13]	validation_0-merror:0.27300	validation_1-merror:0.64400
[14]	validation_0-merror:0.27425	validation_1-merror:0.64500
[15]	validation_0-merror:0.26875	validation_1-merror:0.64400
[16]	validation_0-merror:0.26800	validation_1-merror:0.64900
[17]	validation_0-merror:0.26400	validation_1-merror:0.64100
[18]	validation_0-merror:0.26200	validation_1-merror:0.65000
[19]	validation_0-merror:0.25925	validation_1-merror:0.65300
[20]	validation_0-merror:0.25500	validation_1-merror:0.64700
[21]	validation_0-merror:0.24750	validation_1-merror:0.64700
[22]	validation_0-merror:0.24350	validation_1-merror:0.64600
[23]	validation_0-merror:0.23850	validation_1-merror:0.64700
[24]	validation_0-merror:0.23450	validation_1-merror:0.64800
[25]	validation_0-merror:0.22975	validation_1-merror:0.65000
[26]	validation_0-merror

In [237]:
evaluate_model(xgb_clf, X_val, y_val)

Metric        Value
---------  --------
Accuracy   0.335
Precision  0.33349
Recall     0.335
F1 Score   0.333249


In [238]:
svm_classifier = SVC(kernel='rbf', decision_function_shape='ovr')
svm_classifier.fit(X_train, y_train)
evaluate_model(svm_classifier, X_val, y_val)

Metric        Value
---------  --------
Accuracy   0.359
Precision  0.359445
Recall     0.359
F1 Score   0.357502


In [239]:
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
evaluate_model(knn_clf, X_val, y_val)

Metric        Value
---------  --------
Accuracy   0.302
Precision  0.301827
Recall     0.302
F1 Score   0.290843


### Model Export

In [240]:
import joblib
import pickle

In [None]:
joblib.dump(cat_encoder, "models/categorical_encoder.pkl")
joblib.dump(feat_scaler, "models/feature_scaler.pkl")
joblib.dump(xgb_clf, "models/xgb_classifier.pkl")

['xgb_classifier.pkl']