In [1]:
# This notebook covers model development with Stress_Level as the target variable

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve

In [3]:
# Read the cleaned dataset from a path relative to the working directory into `df`,
# the frame every later cell in this notebook builds on.
file_path = 'Data/Cleaned_Impact_of_Remote_Work_on_Mental_Health.csv'
df = pd.read_csv(file_path)

#### Encoding

In [4]:
# Binary flag: 1 where the value is exactly 'Satisfied', 0 for anything else (missing included).
df['Satisfaction_with_Remote_Work_Encoded'] = df['Satisfaction_with_Remote_Work'].eq('Satisfied').astype('int64')

In [5]:
# Ordinal code: 'Good' -> 2, 'Average' -> 1, every other value (missing included) -> 0.
df['Sleep_Quality_Encoded'] = (
    df['Sleep_Quality']
    .map({'Good': 2, 'Average': 1})   # unmapped values come back as NaN
    .fillna(0)
    .astype('int64')
)

In [6]:
def classify_stress(row):
    """Return the integer class code for a row's 'Stress_Level' value.

    'High' maps to 0 and 'Medium' maps to 1; any other value falls through to 2.
    `row` only needs to support `row['Stress_Level']`.
    """
    if row['Stress_Level'] == 'High':
        return 0
    return 1 if row['Stress_Level'] == 'Medium' else 2

# Encode the target row by row; only the single column the encoder reads is handed to it.
df['Stress_Level_Encoded'] = df[['Stress_Level']].apply(classify_stress, axis = 'columns')

In [7]:
# Columns kept out of the feature matrix: the target (raw and encoded), the ID column,
# and the two raw text columns that already have `_Encoded` counterparts.
features_to_drop = [
    'Stress_Level',
    'Employee_ID',
    'Satisfaction_with_Remote_Work',
    'Sleep_Quality',
    'Stress_Level_Encoded',
]

In [8]:
# Features: remove the excluded columns, then one-hot encode what is left
# (drop_first removes one dummy level per encoded column).
X = df.drop(columns = features_to_drop).pipe(pd.get_dummies, drop_first = True)
# Target: the integer-encoded stress level.
y = df['Stress_Level_Encoded']

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [10]:
# Count the training rows in each encoded stress class to see how balanced the split is.
class_distribution = y_train.value_counts()
print(class_distribution)

Stress_Level_Encoded
0    1351
1    1330
2    1319
Name: count, dtype: int64


## Using XGBoost

#### Tuning

In [11]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import numpy as np

def XGB_Bayesian(max_depth, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    """Objective for the Bayesian search: mean cross-validated macro-F1 of an XGBoost classifier.

    `max_depth` and `n_estimators` are truncated to integers with `int()` before the
    model is built; the remaining arguments are passed through unchanged. Scoring
    uses the notebook-level `X_train` / `y_train` with 5 shuffled, stratified folds.

    Returns:
        The mean `f1_macro` score across the folds.
    """
    hyperparams = {
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }
    model = XGBClassifier(
        objective = 'multi:softprob',
        num_class = 3,
        random_state = 42,
        **hyperparams
    )

    # Seeded folds so every candidate is scored on the same partitions.
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    fold_scores = cross_val_score(model, X_train, y_train, cv = folds, scoring = 'f1_macro')
    return fold_scores.mean()

# Search space: (lower, upper) bounds per hyperparameter. Key order is kept as-is
# because it can influence how the optimizer samples points.
param_bounds = dict(
    max_depth = (3, 10),
    learning_rate = (0.01, 0.1),
    n_estimators = (50, 300),
    subsample = (0.4, 1.0),
    reg_alpha = (0.0, 1.0),
    reg_lambda = (0.0, 1.0),
    colsample_bytree = (0.6, 1.0),
)

optimizer = BayesianOptimization(f = XGB_Bayesian, pbounds = param_bounds, random_state = 42, verbose = 2)

# 20 initial points, then 50 further iterations of the search.
optimizer.maximize(init_points = 20, n_iter = 50)
print("Best Parameters:", optimizer.max)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.3373   [39m | [39m0.7498   [39m | [39m0.09556  [39m | [39m8.124    [39m | [39m199.7    [39m | [39m0.156    [39m | [39m0.156    [39m | [39m0.4349   [39m |
| [35m2        [39m | [35m0.3413   [39m | [35m0.9465   [39m | [35m0.0641   [39m | [35m7.957    [39m | [35m55.15    [39m | [35m0.9699   [39m | [35m0.8324   [39m | [35m0.5274   [39m |
| [35m3        [39m | [35m0.3425   [39m | [35m0.6727   [39m | [35m0.02651  [39m | [35m5.13     [39m | [35m181.2    [39m | [35m0.4319   [39m | [35m0.2912   [39m | [35m0.7671   [39m |
| [39m4        [39m | [39m0.3263   [39m | [39m0.6558   [39m | [39m0.03629  [39m | [39m5.565    [39m | [39m164.0    [39m | [39m0.7852   [39m | [39m0.1997   [39m | [

#### Train

In [12]:
# Rebuild the classifier from the best point found by the search and fit it on the
# training split. The two tree-count/depth parameters get the same int() truncation
# that the tuning objective applies.
best_params = optimizer.max['params']

xgb_model = XGBClassifier(
    objective = 'multi:softprob',
    num_class = 3,
    random_state = 42,
    n_estimators = int(best_params['n_estimators']),
    max_depth = int(best_params['max_depth']),
    learning_rate = best_params['learning_rate'],
    subsample = best_params['subsample'],
    colsample_bytree = best_params['colsample_bytree'],
    reg_alpha = best_params['reg_alpha'],
    reg_lambda = best_params['reg_lambda'],
)

xgb_model.fit(X_train, y_train)

#### Predict

In [13]:
# Predict on the held-out split and print per-class metrics plus the confusion matrix.
y_pred = xgb_model.predict(X_test)

print("\nClassification Report:\n", classification_report(y_true = y_test, y_pred = y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true = y_test, y_pred = y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.37      0.36       335
           1       0.33      0.31      0.32       339
           2       0.34      0.34      0.34       326

    accuracy                           0.34      1000
   macro avg       0.34      0.34      0.34      1000
weighted avg       0.34      0.34      0.34      1000

Confusion Matrix:
 [[125 112  98]
 [119 105 115]
 [116  98 112]]


#### Feature Importance

In [14]:
# Pair every feature-matrix column with the fitted model's importance score, largest first.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': xgb_model.feature_importances_}
).sort_values('Importance', ascending = False)

print(feature_importance)

                                  Feature  Importance
31          Productivity_Change_No Change    0.027634
18                     Industry_Education    0.027618
2                   Hours_Worked_Per_Week    0.027305
29  Access_to_Mental_Health_Resources_Yes    0.027301
14                     Job_Role_Marketing    0.027105
17             Job_Role_Software Engineer    0.026837
0                                     Age    0.026683
4                Work_Life_Balance_Rating    0.026616
6         Company_Support_for_Remote_Work    0.026580
19                       Industry_Finance    0.026454
3              Number_of_Virtual_Meetings    0.026331
23                        Industry_Retail    0.026232
9                             Gender_Male    0.025973
34                            Region_Asia    0.025935
16                         Job_Role_Sales    0.025839
38                   Region_South America    0.025755
5                 Social_Isolation_Rating    0.025753
32  Physical_Activity_Occasi