In [1]:
# This notebook includes model developement with target variable as Stress_Level

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
file_path = 'Data/Cleaned_Impact_of_Remote_Work_on_Mental_Health.csv'
df = pd.read_csv(file_path)

#### Remove outliers

In [4]:
df = df.drop(df[(df['Stress_Level'] == 'High') & (df['Work_Life_Balance_Rating'] == 5)].index)
df = df.drop(df[(df['Stress_Level'] == 'Low') & (df['Work_Life_Balance_Rating'] == 1)].index)

#### Encoding

In [5]:
df['Satisfaction_with_Remote_Work_Encoded'] = df['Satisfaction_with_Remote_Work'].apply(lambda x: 1 if x == 'Satisfied' else 0)

In [6]:
df['Sleep_Quality_Encoded'] = df['Sleep_Quality'].apply(lambda x: 2 if x == 'Good' else (1 if x == 'Average' else 0))

In [7]:
def classify_stress(row):
    if row['Stress_Level'] == 'High':
        return 1
    elif row['Stress_Level'] == 'Medium' and (row['Work_Life_Balance_Rating'] < 3 or row['Sleep_Quality_Encoded'] == 0):
        return 1
    else:
        return 0

df['Stress_Level_Binary'] = df.apply(classify_stress, axis = 1)

In [8]:
features_to_drop = ['Stress_Level', 'Employee_ID', 'Satisfaction_with_Remote_Work', 'Sleep_Quality', 'Region', 'Gender', 
                    'Job_Role', 'Stress_Level_Binary']

In [9]:
X = pd.get_dummies(df.drop(features_to_drop, axis = 1), drop_first = True)
y = df['Stress_Level_Binary']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [11]:
class_distribution = y_train.value_counts()
print(class_distribution)

Stress_Level_Binary
1    1679
0    1350
Name: count, dtype: int64


In [12]:
scale_pos_weight = class_distribution.max() / class_distribution.min()
scale_pos_weight

np.float64(1.2437037037037038)

## Using XGBoost

#### Tuning

In [14]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import numpy as np

def XGB_Bayesian(max_depth, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    model = XGBClassifier(
        max_depth = int(max_depth),
        learning_rate = learning_rate,
        n_estimators = int(n_estimators),
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        scale_pos_weight = scale_pos_weight,
        random_state = 42
    )
    
    # Performing cross-validation
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'accuracy')
    return np.mean(scores)

param_bounds = {
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.1),
    'n_estimators': (50, 300),
    'subsample': (0.4, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'colsample_bytree': (0.6, 1.0)
}

optimizer = BayesianOptimization(
    f = XGB_Bayesian,
    pbounds = param_bounds,
    random_state = 42,
    verbose = 2
)

optimizer.maximize(init_points = 20, n_iter = 50)
print("Best Parameters:", optimizer.max)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.7045   [39m | [39m0.7498   [39m | [39m0.09556  [39m | [39m8.124    [39m | [39m199.7    [39m | [39m0.156    [39m | [39m0.156    [39m | [39m0.4349   [39m |
| [35m2        [39m | [35m0.7276   [39m | [35m0.9465   [39m | [35m0.0641   [39m | [35m7.957    [39m | [35m55.15    [39m | [35m0.9699   [39m | [35m0.8324   [39m | [35m0.5274   [39m |
| [35m3        [39m | [35m0.7339   [39m | [35m0.6727   [39m | [35m0.02651  [39m | [35m5.13     [39m | [35m181.2    [39m | [35m0.4319   [39m | [35m0.2912   [39m | [35m0.7671   [39m |
| [39m4        [39m | [39m0.7329   [39m | [39m0.6558   [39m | [39m0.03629  [39m | [39m5.565    [39m | [39m164.0    [39m | [39m0.7852   [39m | [39m0.1997   [39m | [

#### Train

In [15]:
xgb_model = XGBClassifier(
    scale_pos_weight = scale_pos_weight,
    learning_rate = optimizer.max['params']['learning_rate'],
    n_estimators = int(optimizer.max['params']['n_estimators']),
    max_depth = int(optimizer.max['params']['max_depth']),
    colsample_bytree = optimizer.max['params']['colsample_bytree'],
    subsample = optimizer.max['params']['subsample'],
    reg_alpha = optimizer.max['params']['reg_alpha'],
    reg_lambda = optimizer.max['params']['reg_lambda'],
    random_state = 42
)

xgb_model.fit(X_train, y_train)

#### Predict

In [16]:
y_pred = xgb_model.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.69      0.70       596
           1       0.75      0.77      0.76       703

    accuracy                           0.73      1299
   macro avg       0.73      0.73      0.73      1299
weighted avg       0.73      0.73      0.73      1299

Confusion Matrix:
 [[411 185]
 [162 541]]
