In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [4]:
# Load the survey CSV and print quick sanity checks: frame shape, column index,
# and the relative frequency of each target class.
df = pd.read_csv("../data/Impact_of_Remote_Work_on_Mental_Health.csv")
print(f"Shape: {df.shape}")
print(f"\nColumns:\n {df.columns}")
# Single quotes inside the replacement field: reusing the f-string's own double
# quotes there is a SyntaxError on Python versions before 3.12.
print(f"\nTarget:\n {df['Stress_Level'].value_counts(normalize=True)}")

Shape: (5000, 20)

Columns:
 Index(['Employee_ID', 'Age', 'Gender', 'Job_Role', 'Industry',
       'Years_of_Experience', 'Work_Location', 'Hours_Worked_Per_Week',
       'Number_of_Virtual_Meetings', 'Work_Life_Balance_Rating',
       'Stress_Level', 'Mental_Health_Condition',
       'Access_to_Mental_Health_Resources', 'Productivity_Change',
       'Social_Isolation_Rating', 'Satisfaction_with_Remote_Work',
       'Company_Support_for_Remote_Work', 'Physical_Activity', 'Sleep_Quality',
       'Region'],
      dtype='object')

Target:
 Stress_Level
High      0.3372
Medium    0.3338
Low       0.3290
Name: proportion, dtype: float64


In [5]:
# Candidate feature columns, grouped by the kind of preprocessing each group receives.

# Columns treated as ordered categories.
ordinal_features = [
    "Work_Life_Balance_Rating",
    "Social_Isolation_Rating",
    "Satisfaction_with_Remote_Work",
    "Company_Support_for_Remote_Work",
    "Sleep_Quality",
]

# Columns treated as unordered categories.
nominal_features = [
    "Work_Location",
    "Job_Role",
]

# Columns treated as plain numbers.
numeric_features = [
    "Age",
    "Years_of_Experience",
    "Hours_Worked_Per_Week",
    "Number_of_Virtual_Meetings",
]

# Echo the groups so the starting configuration is visible in the notebook output.
print(f"Ordinal: {ordinal_features}")
print(f"Nominal: {nominal_features}")
print(f"Numeric: {numeric_features}")

Ordinal: ['Work_Life_Balance_Rating', 'Social_Isolation_Rating', 'Satisfaction_with_Remote_Work', 'Company_Support_for_Remote_Work', 'Sleep_Quality']
Nominal: ['Work_Location', 'Job_Role']
Numeric: ['Age', 'Years_of_Experience', 'Hours_Worked_Per_Week', 'Number_of_Virtual_Meetings']


In [6]:
# Columns that are left out of the modelling frame.
drop_cols = [
    "Employee_ID",
    "Mental_Health_Condition",
    "Productivity_Change",
    "Physical_Activity",
    "Gender",
    "Industry",
    "Region",
]

# Rebind df to a frame without those columns. errors="ignore" skips any name that
# is already absent, so re-running this cell does not raise.
df = df.drop(drop_cols, axis="columns", errors="ignore")
print(df)

      Age           Job_Role  Years_of_Experience Work_Location  \
0      32                 HR                   13        Hybrid   
1      40     Data Scientist                    3        Remote   
2      59  Software Engineer                   22        Hybrid   
3      27  Software Engineer                   20        Onsite   
4      49              Sales                   32        Onsite   
...   ...                ...                  ...           ...   
4995   32              Sales                    4        Onsite   
4996   39              Sales                   27        Onsite   
4997   42              Sales                   21        Hybrid   
4998   27              Sales                   26        Remote   
4999   29                 HR                   30        Onsite   

      Hours_Worked_Per_Week  Number_of_Virtual_Meetings  \
0                        47                           7   
1                        52                           4   
2                 

In [7]:
# Target (y): the stress-level label column.
y = df["Stress_Level"]

# Features (X): every other column of df, i.e. df with the target column removed.
X = df.drop("Stress_Level", axis="columns")

In [8]:
# Sanity check: dimensions of the feature frame and class proportions of the target.
print(
    f"X shape: {X.shape}",
    f"\ny distribution: \n {y.value_counts(normalize = True)}",
    sep="\n",
)

X shape: (5000, 12)

y distribution: 
 Stress_Level
High      0.3372
Medium    0.3338
Low       0.3290
Name: proportion, dtype: float64


In [9]:
# Ordinal columns that are present in df with a numeric dtype need no encoding,
# so they are moved from the ordinal group into the numeric group.
numeric_ordinals = [
    col
    for col in ordinal_features
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
]

# Append them to the numeric group; dict.fromkeys removes repeats while keeping
# first-seen order, so re-running the cell does not duplicate entries.
numeric_features = list(dict.fromkeys([*numeric_features, *numeric_ordinals]))

# Whatever was not moved stays in the ordinal group.
ordinal_features = [col for col in ordinal_features if col not in numeric_ordinals]

print(f" Numeric ordinals moved from ordinal features to numeric features: {numeric_ordinals}")
print(f"final numeric features: {numeric_features}")
print(f"final ordinal features: {ordinal_features}")
print(f"final nominal features: {nominal_features}")

 Numeric ordinals moved from ordinal features to numeric features: ['Work_Life_Balance_Rating', 'Social_Isolation_Rating', 'Company_Support_for_Remote_Work']
final numeric features: ['Age', 'Years_of_Experience', 'Hours_Worked_Per_Week', 'Number_of_Virtual_Meetings', 'Work_Life_Balance_Rating', 'Social_Isolation_Rating', 'Company_Support_for_Remote_Work']
final ordinal features: ['Satisfaction_with_Remote_Work', 'Sleep_Quality']
final nominal features: ['Work_Location', 'Job_Role']


In [11]:
# Encoders for the categorical feature groups.
# OrdinalEncoder maps each category to an integer code (0, 1, 2, ...); a category
# not seen during fit is encoded as -1 instead of raising.
# NOTE(review): no explicit `categories=` order is passed, so the integer codes follow
# the encoder's default ordering of the fitted values — confirm this matches the
# intended ranking of the ordinal columns.
ord_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
# OneHotEncoder creates one dummy column per category; a category not seen during
# fit yields all zeros for that feature instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore")

# Column-wise preprocessing: numeric columns pass through unchanged, ordinal columns
# become integer codes, nominal columns are one-hot encoded, and every column not
# listed in one of the groups is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("ord", ord_enc, ordinal_features),
        ("nom", ohe, nominal_features),
    ],
    remainder="drop",
    # Always return a dense array. With the default threshold the stacked output turns
    # sparse once the one-hot block makes it sparse enough, and
    # HistGradientBoostingClassifier does not accept sparse input.
    sparse_threshold=0,
)

In [14]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate classifiers, keyed by the label printed in the report below.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42),
}

for name, model in models.items():
    # Preprocessing and classifier are chained so both are fitted on the training part only.
    pipe = Pipeline(steps=[("prep", preprocessor), ("model", model)])
    preds = pipe.fit(X_train, y_train).predict(X_test)

    # Hold-out metrics for this model.
    report = classification_report(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    f1 = f1_score(y_test, preds, average="macro")
    acc = accuracy_score(y_test, preds)

    print(f"\n{'='*60}")
    print(f"MODEL: {name}")
    print(f"Accuracy:   {acc:.4f} ({acc:.2%})")
    print(f"F1 (macro): {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"\nDetailed Classification Report:\n{report}")


MODEL: RandomForest
Accuracy:   0.3370 (33.70%)
F1 (macro): 0.3368
Confusion Matrix:
[[115 110 112]
 [109 105 115]
 [111 106 117]]

Detailed Classification Report:
              precision    recall  f1-score   support

        High       0.34      0.34      0.34       337
         Low       0.33      0.32      0.32       329
      Medium       0.34      0.35      0.35       334

    accuracy                           0.34      1000
   macro avg       0.34      0.34      0.34      1000
weighted avg       0.34      0.34      0.34      1000


MODEL: HistGradientBoosting
Accuracy:   0.3310 (33.10%)
F1 (macro): 0.3309
Confusion Matrix:
[[105 109 123]
 [114 111 104]
 [102 117 115]]

Detailed Classification Report:
              precision    recall  f1-score   support

        High       0.33      0.31      0.32       337
         Low       0.33      0.34      0.33       329
      Medium       0.34      0.34      0.34       334

    accuracy                           0.33      1000
   macro 

In [15]:
# One summary row (accuracy and macro F1 on the test split) per candidate model.
# Each model is fitted again here on the training split before it is scored.
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[("prep", preprocessor), ("model", model)])
    preds = pipe.fit(X_train, y_train).predict(X_test)

    row = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "F1_macro": f1_score(y_test, preds, average="macro"),
    }
    results.append(row)

# Last expression of the cell, so the comparison table is rendered as its output.
pd.DataFrame(results)


Unnamed: 0,Model,Accuracy,F1_macro
0,RandomForest,0.337,0.336824
1,HistGradientBoosting,0.331,0.330906
