In [83]:
# This notebook includes model developement with target variable as Stress_Level

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [39]:
file_path = 'Data/Cleaned_Impact_of_Remote_Work_on_Mental_Health.csv'
df = pd.read_csv(file_path)

In [40]:
df['Mental_Health_Condition'].value_counts()

Mental_Health_Condition
Burnout       1280
Anxiety       1278
Depression    1246
Unknown       1196
Name: count, dtype: int64

In [41]:
df['Satisfaction_with_Remote_Work_Encoded'] = df['Satisfaction_with_Remote_Work'].apply(lambda x: 1 if x == 'Satisfied' else 0)

In [42]:
df['Sleep_Quality_Encoded'] = df['Sleep_Quality'].apply(lambda x: 2 if x == 'Good' else (1 if x == 'Average' else 0))

In [43]:
def classify_stress(row):
    if row['Stress_Level'] == 'High':
        return 1
    elif row['Stress_Level'] == 'Medium' and (row['Mental_Health_Condition'] != 'Unknown' or row['Sleep_Quality_Encoded'] == 0):
        return 1
    elif row['Stress_Level'] == 'Low' and (row['Mental_Health_Condition'] != 'Unknown' or row['Sleep_Quality_Encoded'] == 0):
        return 1
    else:
        return 0

df['Stress_Level_Binary'] = df.apply(classify_stress, axis = 1)

In [44]:
features_to_drop = ['Stress_Level', 'Employee_ID', 'Satisfaction_with_Remote_Work', 'Sleep_Quality', 'Region', 'Gender', 
                    'Job_Role', 'Stress_Level_Binary']

In [45]:
X = pd.get_dummies(df.drop(features_to_drop, axis = 1), drop_first = True)
y = df['Stress_Level_Binary']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [47]:
class_distribution = y_train.value_counts()
print(class_distribution)

Stress_Level_Binary
1    3584
0     416
Name: count, dtype: int64


In [48]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [49]:
class_distribution2 = y_train_resampled.value_counts()
print(class_distribution2)

Stress_Level_Binary
1    3584
0    3584
Name: count, dtype: int64


## Using XGBoost

In [57]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

def XGB_Bayesian(max_depth, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    model = XGBClassifier(
        max_depth = int(max_depth),
        learning_rate = learning_rate,
        n_estimators = int(n_estimators),
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        scale_pos_weight = 1,
        random_state = 42
    )
    
    # Performing cross-validation
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'f1')
    return np.mean(scores)

param_bounds = {
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.1),
    'n_estimators': (50, 300),
    'subsample': (0.4, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'colsample_bytree': (0.6, 1.0)
}

optimizer = BayesianOptimization(
    f = XGB_Bayesian,
    pbounds = param_bounds,
    random_state = 42,
    verbose = 2
)

optimizer.maximize(init_points = 20, n_iter = 50)
print("Best Parameters:", optimizer.max)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9607   [39m | [39m0.7498   [39m | [39m0.09556  [39m | [39m8.124    [39m | [39m199.7    [39m | [39m0.156    [39m | [39m0.156    [39m | [39m0.4349   [39m |
| [35m2        [39m | [35m0.9668   [39m | [35m0.9465   [39m | [35m0.0641   [39m | [35m7.957    [39m | [35m55.15    [39m | [35m0.9699   [39m | [35m0.8324   [39m | [35m0.5274   [39m |
| [39m3        [39m | [39m0.9663   [39m | [39m0.6727   [39m | [39m0.02651  [39m | [39m5.13     [39m | [39m181.2    [39m | [39m0.4319   [39m | [39m0.2912   [39m | [39m0.7671   [39m |
| [39m4        [39m | [39m0.9656   [39m | [39m0.6558   [39m | [39m0.03629  [39m | [39m5.565    [39m | [39m164.0    [39m | [39m0.7852   [39m | [39m0.1997   [39m | [

In [58]:
xgb_model = XGBClassifier(
    scale_pos_weight = 1,
    learning_rate = 0.016436662989208373,
    n_estimators = 222,
    max_depth = 4,
    colsample_bytree = 0.9800724956182749,
    subsample = 0.7919377274900352,
    reg_alpha = 0.48518241591646816,
    reg_lambda = 0.46933473866314557,
    random_state = 42
)

xgb_model.fit(X_train, y_train)

In [74]:
y_pred = xgb_model.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.93      0.76        98
           1       0.99      0.94      0.97       902

    accuracy                           0.94      1000
   macro avg       0.81      0.94      0.86      1000
weighted avg       0.96      0.94      0.95      1000

Confusion Matrix:
 [[ 91   7]
 [ 52 850]]


In [91]:
from sklearn.metrics import classification_report, precision_recall_curve

y_probs = xgb_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

y_pred_adjusted = (y_probs >= best_threshold).astype(int)
print("\nClassification Report:\n", classification_report(y_test, y_pred_adjusted))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted))


Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.91      0.76        98
           1       0.99      0.95      0.97       902

    accuracy                           0.94      1000
   macro avg       0.82      0.93      0.87      1000
weighted avg       0.96      0.94      0.95      1000

Confusion Matrix:
 [[ 89   9]
 [ 46 856]]


## Using RandomForest

In [82]:
from sklearn.ensemble import RandomForestClassifier

def RandomForest_Bayesian(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    model = RandomForestClassifier(
        max_depth = int(max_depth),
        n_estimators = int(n_estimators),
        min_samples_split = int(min_samples_split),
        min_samples_leaf = int(min_samples_leaf),
        random_state = 42
    )
    
    # Performing cross-validation
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'f1')
    return np.mean(scores)

param_bounds = {
    'n_estimators': (50, 300),
    'max_depth': (3, 20),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 5)
}

optimizer = BayesianOptimization(
    f = RandomForest_Bayesian,
    pbounds = param_bounds,
    random_state = 42,
    verbose = 2
)

optimizer.maximize(init_points = 20, n_iter = 50)
print("Best Parameters:", optimizer.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.9654   [39m | [39m9.367    [39m | [39m4.803    [39m | [39m7.856    [39m | [39m199.7    [39m |
| [39m2        [39m | [39m0.9454   [39m | [39m5.652    [39m | [39m1.624    [39m | [39m2.465    [39m | [39m266.5    [39m |
| [39m3        [39m | [39m0.9648   [39m | [39m13.22    [39m | [39m3.832    [39m | [39m2.165    [39m | [39m292.5    [39m |
| [39m4        [39m | [39m0.962    [39m | [39m17.15    [39m | [39m1.849    [39m | [39m3.455    [39m | [39m95.85    [39m |
| [39m5        [39m | [39m0.9617   [39m | [39m8.172    [39m | [39m3.099    [39m | [39m5.456    [39m | [39m122.8    [39m |
| [39m6        [39m | [39m0.9642   [39m | [39m13.4     [39m | [39m1.558    [39m | [39m4.337    [39m | [39m141.6    [39m |
| [39m7        [39m | [39m0.9637   [39m | [

In [87]:
rf_model = RandomForestClassifier(
    n_estimators = 63,
    max_depth = 19,
    min_samples_split = 2,
    min_samples_leaf = 4,
    random_state = 42,
)

rf_model.fit(X_train, y_train)

In [88]:
y_pred2 = xgb_model.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test, y_pred2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred2))


Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.93      0.76        98
           1       0.99      0.94      0.97       902

    accuracy                           0.94      1000
   macro avg       0.81      0.94      0.86      1000
weighted avg       0.96      0.94      0.95      1000

Confusion Matrix:
 [[ 91   7]
 [ 52 850]]


In [89]:
y_probs2 = xgb_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

y_pred_adjusted2 = (y_probs >= best_threshold).astype(int)
print("\nClassification Report:\n", classification_report(y_test, y_pred_adjusted2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted2))


Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.91      0.76        98
           1       0.99      0.95      0.97       902

    accuracy                           0.94      1000
   macro avg       0.82      0.93      0.87      1000
weighted avg       0.96      0.94      0.95      1000

Confusion Matrix:
 [[ 89   9]
 [ 46 856]]
