In [3]:
import pandas as pd
import numpy as py

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [10]:
#loading the data and quick checks
df = pd.read_csv("../data/Impact_of_Remote_Work_on_Mental_Health.csv")
print(f"Shape: {df.shape}")
print(f"\nColumns:\n {df.columns}")
print(f"\nTarget:\n {df["Stress_Level"].value_counts(normalize=True)}")

Shape: (5000, 20)

Columns:
 Index(['Employee_ID', 'Age', 'Gender', 'Job_Role', 'Industry',
       'Years_of_Experience', 'Work_Location', 'Hours_Worked_Per_Week',
       'Number_of_Virtual_Meetings', 'Work_Life_Balance_Rating',
       'Stress_Level', 'Mental_Health_Condition',
       'Access_to_Mental_Health_Resources', 'Productivity_Change',
       'Social_Isolation_Rating', 'Satisfaction_with_Remote_Work',
       'Company_Support_for_Remote_Work', 'Physical_Activity', 'Sleep_Quality',
       'Region'],
      dtype='object')

Target:
 Stress_Level
High      0.3372
Medium    0.3338
Low       0.3290
Name: proportion, dtype: float64


In [12]:
# Feature groups for preprocessing

ordinal_features = ["Work_Life_Balance_Rating", "Social_Isolation_Rating", "Satisfaction_with_Remote_Work", "Company_Support_for_Remote_Work", 
                    "Sleep_Quality"]
nominal_features = ["Work_Location", "Job_Role"]
numeric_features = ["Age", "Years_of_Experience", "Hours_Worked_Per_Week","Number_of_Virtual_Meetings"]

print(f"Ordinal: {ordinal_features}")
print(f"Nominal: {nominal_features}")
print(f"Numeric: {numeric_features}")

Ordinal: ['Work_Life_Balance_Rating', 'Social_Isolation_Rating', 'Satisfaction_with_Remote_Work', 'Company_Support_for_Remote_Work', 'Sleep_Quality']
Nominal: ['Work_Location', 'Job_Role']
Numeric: ['Age', 'Years_of_Experience', 'Hours_Worked_Per_Week', 'Number_of_Virtual_Meetings']


In [14]:
#unwanted columns from the data
drop_cols = ["Employee_ID", "Mental_Health_Condition", "Productivity_Change", "Physical_Activity","Gender",
            "Industry", "Region"]

#deleting the columns and updating df
df = df.drop(columns=drop_cols, errors="ignore")
print(df)

      Age           Job_Role  Years_of_Experience Work_Location  \
0      32                 HR                   13        Hybrid   
1      40     Data Scientist                    3        Remote   
2      59  Software Engineer                   22        Hybrid   
3      27  Software Engineer                   20        Onsite   
4      49              Sales                   32        Onsite   
...   ...                ...                  ...           ...   
4995   32              Sales                    4        Onsite   
4996   39              Sales                   27        Onsite   
4997   42              Sales                   21        Hybrid   
4998   27              Sales                   26        Remote   
4999   29                 HR                   30        Onsite   

      Hours_Worked_Per_Week  Number_of_Virtual_Meetings  \
0                        47                           7   
1                        52                           4   
2                 

In [16]:
#dropping the target from feature and defining feature 
x = df.drop(columns = ["Stress_Level"])

#defining the target (y)
y = df["Stress_Level"]

In [19]:
#evaluation
print(f"x shape: {x.shape}")
print(f"\ny distribution: \n {y.value_counts(normalize = True)}")

x shape: (5000, 12)

y distribution: 
 Stress_Level
High      0.3372
Medium    0.3338
Low       0.3290
Name: proportion, dtype: float64
