In [1]:
# This notebook covers model development with Satisfaction_with_Remote_Work as the target variable.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve

In [3]:
# Location of the cleaned survey export, relative to the notebook's working directory.
file_path = 'Data/Cleaned_Impact_of_Remote_Work_on_Mental_Health.csv'

# Read the whole CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer = file_path)

#### Data cleanup: remove Onsite rows, which are unrelated to Satisfaction_with_Remote_Work

In [4]:
# Keep every row whose Work_Location is anything other than 'Onsite'.
not_onsite = df['Work_Location'].ne('Onsite')
df = df[not_onsite]

#### Remove outliers

In [5]:
# Rows that report 'Unsatisfied' together with the top Work_Life_Balance_Rating (5)
# are treated as outliers and removed.
# NOTE(review): this filter is conditioned on the target column and runs before the
# train/test split, so the held-out rows are filtered by their label too — confirm
# that this is intended.
unsatisfied_with_top_balance = (
    (df['Satisfaction_with_Remote_Work'] == 'Unsatisfied')
    & (df['Work_Life_Balance_Rating'] == 5)
)
df = df.drop(index = df[unsatisfied_with_top_balance].index)

#### Encoding

In [6]:
# Ordinal encoding of Sleep_Quality: 'Good' -> 2, 'Average' -> 1, any other value -> 0.
sleep_quality_rank = {'Good': 2, 'Average': 1}
df['Sleep_Quality_Encoded'] = df['Sleep_Quality'].apply(
    lambda label: sleep_quality_rank.get(label, 0)
)

In [7]:
# Ordinal encoding of Stress_Level: 'Low' -> 2, 'Medium' -> 1, any other value -> 0,
# so a higher code means less stress.
stress_level_rank = {'Low': 2, 'Medium': 1}
df['Stress_Level_Encoded'] = df['Stress_Level'].apply(
    lambda label: stress_level_rank.get(label, 0)
)

In [8]:
def classify_satifaction(row):
    """Map a row's Satisfaction_with_Remote_Work label to an ordinal code.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'Satisfaction_with_Remote_Work'
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    int
        2 for 'Satisfied', 1 for 'Neutral', 0 for every other value.
    """
    codes = {'Satisfied': 2, 'Neutral': 1}
    # Any label not listed above falls back to 0.
    return codes.get(row['Satisfaction_with_Remote_Work'], 0)

# Encode the target row by row with classify_satifaction
# (2 = 'Satisfied', 1 = 'Neutral', 0 = any other value).
df['Satisfaction_with_Remote_Work_Encoded'] = df.apply(classify_satifaction, axis = 1)

In [9]:
# Columns excluded from the feature matrix: the row identifier, the target in both
# its raw and encoded form, and the raw text columns that already have an
# *_Encoded counterpart.
features_to_drop = [
    'Employee_ID',
    'Satisfaction_with_Remote_Work',
    'Satisfaction_with_Remote_Work_Encoded',
    'Sleep_Quality',
    'Stress_Level',
]

In [10]:
# Feature matrix: every remaining column, with categorical columns one-hot encoded
# (drop_first removes the first level of each to avoid a redundant indicator).
predictors = df.drop(columns = features_to_drop)
X = pd.get_dummies(predictors, drop_first = True)

# Target vector: the ordinal satisfaction code.
y = df['Satisfaction_with_Remote_Work_Encoded']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for final evaluation; the fixed seed makes the split repeatable.
# No `stratify` argument is passed, so class proportions in the two parts are not enforced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [12]:
# Row count per encoded satisfaction class, computed on the full (pre-split) frame.
target_column = 'Satisfaction_with_Remote_Work_Encoded'
class_distribution = df[target_column].value_counts()
print(class_distribution)

Satisfaction_with_Remote_Work_Encoded
1    1142
2    1083
0     904
Name: count, dtype: int64


## Using RandomForest

#### Tuning

In [13]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def RandomForest_Bayesian(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Objective for the hyperparameter search: mean cross-validated macro-F1 of a random forest.

    Each argument is truncated to an integer with ``int()`` before the forest is
    built (the optimiser log below shows that non-integer values are proposed).

    Parameters
    ----------
    n_estimators, max_depth, min_samples_split, min_samples_leaf : numeric
        Candidate values for the ``RandomForestClassifier`` parameters of the
        same names.

    Returns
    -------
    float
        Mean ``f1_macro`` score over 5 shuffled, stratified folds of the
        notebook-level ``X_train`` / ``y_train``.
    """
    hyperparameters = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'min_samples_split': int(min_samples_split),
        'min_samples_leaf': int(min_samples_leaf),
    }
    forest = RandomForestClassifier(
        class_weight = 'balanced',
        random_state = 42,
        **hyperparameters
    )

    # Fixed seed: every candidate is scored on the same five folds.
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    fold_scores = cross_val_score(forest, X_train, y_train, cv = folds, scoring = 'f1_macro')
    return np.mean(fold_scores)

# Search range (lower, upper) for each hyperparameter explored by the optimiser.
# Key order is kept exactly as originally declared.
param_bounds = dict(
    n_estimators = (50, 300),
    max_depth = (3, 20),
    min_samples_split = (2, 10),
    min_samples_leaf = (1, 5),
)

# Maximise the cross-validated score returned by RandomForest_Bayesian over param_bounds.
optimizer = BayesianOptimization(f = RandomForest_Bayesian, pbounds = param_bounds, random_state = 42, verbose = 2)

# 20 initial points followed by 50 optimisation iterations (70 evaluations in total).
optimizer.maximize(
    init_points = 20,
    n_iter = 50,
)
print("Best Parameters:", optimizer.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.3306   [39m | [39m9.367    [39m | [39m4.803    [39m | [39m7.856    [39m | [39m199.7    [39m |
| [35m2        [39m | [35m0.3308   [39m | [35m5.652    [39m | [35m1.624    [39m | [35m2.465    [39m | [35m266.5    [39m |
| [35m3        [39m | [35m0.3373   [39m | [35m13.22    [39m | [35m3.832    [39m | [35m2.165    [39m | [35m292.5    [39m |
| [35m4        [39m | [35m0.3479   [39m | [35m17.15    [39m | [35m1.849    [39m | [35m3.455    [39m | [35m95.85    [39m |
| [39m5        [39m | [39m0.34     [39m | [39m8.172    [39m | [39m3.099    [39m | [39m5.456    [39m | [39m122.8    [39m |
| [39m6        [39m | [39m0.3427   [39m | [39m13.4     [39m | [39m1.558    [39m | [39m4.337    [39m | [39m141.6    [39m |
| [39m7        [39m | [39m0.3325   [39m | [

#### Train

In [14]:
# Best hyperparameter set found by the search.
best_params = optimizer.max['params']

# Truncate with int() exactly as RandomForest_Bayesian does, so the refit model
# uses the same configuration that was scored during tuning.
rf_model = RandomForestClassifier(
    n_estimators = int(best_params['n_estimators']),
    max_depth = int(best_params['max_depth']),
    min_samples_split = int(best_params['min_samples_split']),
    min_samples_leaf = int(best_params['min_samples_leaf']),
    class_weight = 'balanced',
    random_state = 42,
)

rf_model.fit(X_train, y_train)

#### Predict

In [15]:
# Score the fitted forest on the held-out test split.
y_pred = rf_model.predict(X_test)

test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("\nClassification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)


Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.32      0.30       174
           1       0.39      0.39      0.39       231
           2       0.40      0.37      0.38       221

    accuracy                           0.36       626
   macro avg       0.36      0.36      0.36       626
weighted avg       0.36      0.36      0.36       626

Confusion Matrix:
 [[55 62 57]
 [77 89 65]
 [63 77 81]]


#### Feature Importance

In [16]:
# Pair each feature-matrix column with the fitted forest's importance score,
# then list the features from most to least important.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('Importance', ascending = False)

print(feature_importance)

                                  Feature  Importance
1                     Years_of_Experience    0.092815
0                                     Age    0.091412
2                   Hours_Worked_Per_Week    0.087171
4                Work_Life_Balance_Rating    0.082332
3              Number_of_Virtual_Meetings    0.075181
5                 Social_Isolation_Rating    0.046731
6         Company_Support_for_Remote_Work    0.045858
7                   Sleep_Quality_Encoded    0.032101
8                    Stress_Level_Encoded    0.029740
24                   Work_Location_Remote    0.018010
28  Access_to_Mental_Health_Resources_Yes    0.017582
29           Productivity_Change_Increase    0.017454
26     Mental_Health_Condition_Depression    0.016633
30          Productivity_Change_No Change    0.016569
32               Physical_Activity_Weekly    0.016455
9                             Gender_Male    0.016345
31  Physical_Activity_Occasional or Never    0.016332
11               Gender_Pref