In [16]:
# This notebook includes model developement with target a new target variable as Health_Risk_Indicator

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
file_path = 'Data/Cleaned_Impact_of_Remote_Work_on_Mental_Health.csv'
df = pd.read_csv(file_path)

#### Encoding

In [4]:
df['Satisfaction_with_Remote_Work_Encoded'] = df['Satisfaction_with_Remote_Work'].apply(lambda x: 1 if x == 'Satisfied' else 0)

In [5]:
df['Sleep_Quality_Encoded'] = df['Sleep_Quality'].apply(lambda x: 2 if x == 'Good' else (1 if x == 'Average' else 0))

In [6]:
def classify_heath_risk(row):
    if row['Stress_Level'] == 'High':
        return 1
    elif row['Stress_Level'] == 'Medium' and (row['Mental_Health_Condition'] != 'Unknown' or row['Sleep_Quality_Encoded'] == 0):
        return 1
    elif row['Stress_Level'] == 'Low' and (row['Mental_Health_Condition'] != 'Unknown' or row['Sleep_Quality_Encoded'] == 0):
        return 1
    else:
        return 0

df['Health_Risk_Indicator'] = df.apply(classify_heath_risk, axis = 1)

In [7]:
features_to_drop = ['Stress_Level', 'Employee_ID', 'Satisfaction_with_Remote_Work', 'Sleep_Quality', 'Region', 'Gender', 
                    'Job_Role', 'Health_Risk_Indicator']

In [8]:
X = pd.get_dummies(df.drop(features_to_drop, axis = 1), drop_first = True)
y = df['Health_Risk_Indicator']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
class_distribution = y_train.value_counts()
print(class_distribution)

Health_Risk_Indicator
1    3584
0     416
Name: count, dtype: int64


In [11]:
scale_pos_weight = class_distribution.max() / class_distribution.min()
scale_pos_weight

np.float64(8.615384615384615)

## Using XGBoost

#### Tuning

In [12]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import numpy as np

def XGB_Bayesian(max_depth, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    model = XGBClassifier(
        max_depth = int(max_depth),
        learning_rate = learning_rate,
        n_estimators = int(n_estimators),
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        scale_pos_weight = scale_pos_weight,
        random_state = 42
    )
    
    # Performing cross-validation
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'f1')
    return np.mean(scores)

param_bounds = {
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.1),
    'n_estimators': (50, 300),
    'subsample': (0.4, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'colsample_bytree': (0.6, 1.0)
}

optimizer = BayesianOptimization(
    f = XGB_Bayesian,
    pbounds = param_bounds,
    random_state = 42,
    verbose = 2
)

optimizer.maximize(init_points = 20, n_iter = 50)
print("Best Parameters:", optimizer.max)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9555   [39m | [39m0.7498   [39m | [39m0.09556  [39m | [39m8.124    [39m | [39m199.7    [39m | [39m0.156    [39m | [39m0.156    [39m | [39m0.4349   [39m |
| [39m2        [39m | [39m0.9478   [39m | [39m0.9465   [39m | [39m0.0641   [39m | [39m7.957    [39m | [39m55.15    [39m | [39m0.9699   [39m | [39m0.8324   [39m | [39m0.5274   [39m |
| [39m3        [39m | [39m0.9448   [39m | [39m0.6727   [39m | [39m0.02651  [39m | [39m5.13     [39m | [39m181.2    [39m | [39m0.4319   [39m | [39m0.2912   [39m | [39m0.7671   [39m |
| [39m4        [39m | [39m0.9445   [39m | [39m0.6558   [39m | [39m0.03629  [39m | [39m5.565    [39m | [39m164.0    [39m | [39m0.7852   [39m | [39m0.1997   [39m | [

#### Train

In [13]:
xgb_model = XGBClassifier(
    scale_pos_weight = scale_pos_weight,
    learning_rate = optimizer.max['params']['learning_rate'],
    n_estimators = int(optimizer.max['params']['n_estimators']),
    max_depth = int(optimizer.max['params']['max_depth']),
    colsample_bytree = optimizer.max['params']['colsample_bytree'],
    subsample = optimizer.max['params']['subsample'],
    reg_alpha = optimizer.max['params']['reg_alpha'],
    reg_lambda = optimizer.max['params']['reg_lambda'],
    random_state = 42
)

xgb_model.fit(X_train, y_train)

#### Predict

In [14]:
y_pred = xgb_model.predict(X_test)

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.55      0.58        98
           1       0.95      0.96      0.96       902

    accuracy                           0.92      1000
   macro avg       0.78      0.76      0.77      1000
weighted avg       0.92      0.92      0.92      1000

Confusion Matrix:
 [[ 54  44]
 [ 35 867]]


#### Change threshold to improve performance

In [15]:
from sklearn.metrics import classification_report, precision_recall_curve

y_probs = xgb_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]

y_pred_adjusted = (y_probs >= best_threshold).astype(int)
print("\nClassification Report:\n", classification_report(y_test, y_pred_adjusted))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted))


Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.96      0.77        98
           1       1.00      0.94      0.97       902

    accuracy                           0.94      1000
   macro avg       0.82      0.95      0.87      1000
weighted avg       0.96      0.94      0.95      1000

Confusion Matrix:
 [[ 94   4]
 [ 51 851]]
