# MSc. Dissertation - Data Science By Robert Solomon
### Analysis of Remote Work Impact on Employee Well-Being (Cleaned Secondary Dataset)

In [8]:
# Importing necessary libraries here below:

import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

### 1. Loading in the cleaned pre-processed secondary dataset

In [10]:
# Path of the cleaned secondary dataset, relative to this notebook
cleaned_dataset_csv = '../Secondary_Research/SR_Dataset/cleaned_Remote_Work_on_Mental_Health.csv'

# Reading the CSV into the DataFrame used by the analysis cells below
wfh_mentalHealth_data_cleaned = pd.read_csv(cleaned_dataset_csv)

In [11]:
# Render the loaded DataFrame as the cell output to inspect its columns and rows.
wfh_mentalHealth_data_cleaned

Unnamed: 0,Employee_ID,Age,Gender,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Access_to_Mental_Health_Resources,...,Industry_Healthcare,Industry_IT,Industry_Manufacturing,Industry_Retail,Region_Asia,Region_Europe,Region_North America,Region_Oceania,Region_South America,Work_Stress_Score
0,EMP0002,40,1.0,3,1,52,4,1,2,0,...,False,True,False,False,True,False,False,False,False,55
1,EMP0010,30,1.0,28,2,57,6,1,1,1,...,False,True,False,False,False,False,True,False,False,62
2,EMP0013,40,1.0,1,1,21,7,2,3,1,...,False,False,False,False,False,True,False,False,False,26
3,EMP0016,56,1.0,13,1,44,11,4,1,0,...,True,False,False,False,False,False,False,False,False,51
4,EMP0020,59,0.0,13,1,59,4,3,2,0,...,False,False,False,False,False,False,False,False,True,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300,EMP4980,26,0.0,26,2,34,10,5,2,1,...,False,False,False,True,False,False,False,False,False,39
1301,EMP4982,57,0.0,17,3,58,15,2,2,1,...,False,False,False,False,False,False,False,True,False,71
1302,EMP4992,60,1.0,9,2,57,4,2,1,1,...,False,True,False,False,False,False,False,True,False,59
1303,EMP4996,32,0.0,4,3,24,2,5,3,1,...,False,False,False,False,True,False,False,False,False,21


In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numerical columns
summary_stats = wfh_mentalHealth_data_cleaned.describe()

# Writing the statistics table to a CSV file in the results folder
descriptive_stats_csv = "../Secondary_Research/results/descriptive_stats.csv"
summary_stats.to_csv(descriptive_stats_csv)

# Printing the statistics table as plain text
print(summary_stats)

               Age       Gender  Years_of_Experience  Work_Location  \
count  1305.000000  1305.000000          1305.000000    1305.000000   
mean     40.558621     0.508812            17.850575       1.977011   
std      11.367123     0.500114            10.052312       0.820234   
min      22.000000     0.000000             1.000000       1.000000   
25%      30.000000     0.000000             9.000000       1.000000   
50%      41.000000     1.000000            18.000000       2.000000   
75%      50.000000     1.000000            26.000000       3.000000   
max      60.000000     1.000000            35.000000       3.000000   

       Hours_Worked_Per_Week  Number_of_Virtual_Meetings  \
count            1305.000000                 1305.000000   
mean               39.346360                    7.429885   
std                11.927035                    4.637788   
min                20.000000                    0.000000   
25%                29.000000                    3.000000   


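The table above gives the descriptive statistics (count, mean, standard deviation, minimum, quartiles and maximum) for the numerical columns of the cleaned dataset.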

### 2. Grouping stress levels by work location

In [15]:
# One Stress_Level Series per Work_Location code (1 = Remote, 2 = Hybrid, 3 = Onsite).
# A single .loc[row_mask, column] lookup replaces filtering the frame and then picking the column.
remote_stress = wfh_mentalHealth_data_cleaned.loc[
    wfh_mentalHealth_data_cleaned['Work_Location'] == 1, 'Stress_Level'
]
hybrid_stress = wfh_mentalHealth_data_cleaned.loc[
    wfh_mentalHealth_data_cleaned['Work_Location'] == 2, 'Stress_Level'
]
onsite_stress = wfh_mentalHealth_data_cleaned.loc[
    wfh_mentalHealth_data_cleaned['Work_Location'] == 3, 'Stress_Level'
]

### 2.1 Performing one-way ANOVA (for Work Location and Stress Levels) for Secondary Data

In [17]:
# One-way ANOVA on stress level, with one sample per work-location group
stress_groups = (remote_stress, hybrid_stress, onsite_stress)
anova_result = stats.f_oneway(*stress_groups)

In [18]:
# Reporting the test statistic and p-value, both rounded to four decimal places
print("One-Way ANOVA Results for Work Location and Stress Levels")
print(f"F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")

# The p-value is compared against 0.05 to choose which conclusion is printed
is_significant = anova_result.pvalue < 0.05
print(
    "Result: Significant difference found in stress levels across work locations."
    if is_significant
    else "Result: No significant difference in stress levels across work locations."
)

One-Way ANOVA Results for Work Location and Stress Levels
F-statistic: 0.0822, p-value: 0.9211
Result: No significant difference in stress levels across work locations.


### 2.1.1 Interpretation of the above Output
If p-value < 0.05, we reject the null hypothesis, meaning stress levels differ significantly across remote, hybrid, and onsite workers.
If p-value ≥ 0.05, we fail to reject the null hypothesis, meaning stress levels are not significantly different between work types. That is the case here: the p-value is 0.9211.

In [20]:
# Displaying the cleaned dataset again for reassurance
# (the rendered output shows only the first and last rows, with the middle elided):
wfh_mentalHealth_data_cleaned

Unnamed: 0,Employee_ID,Age,Gender,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Access_to_Mental_Health_Resources,...,Industry_Healthcare,Industry_IT,Industry_Manufacturing,Industry_Retail,Region_Asia,Region_Europe,Region_North America,Region_Oceania,Region_South America,Work_Stress_Score
0,EMP0002,40,1.0,3,1,52,4,1,2,0,...,False,True,False,False,True,False,False,False,False,55
1,EMP0010,30,1.0,28,2,57,6,1,1,1,...,False,True,False,False,False,False,True,False,False,62
2,EMP0013,40,1.0,1,1,21,7,2,3,1,...,False,False,False,False,False,True,False,False,False,26
3,EMP0016,56,1.0,13,1,44,11,4,1,0,...,True,False,False,False,False,False,False,False,False,51
4,EMP0020,59,0.0,13,1,59,4,3,2,0,...,False,False,False,False,False,False,False,False,True,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300,EMP4980,26,0.0,26,2,34,10,5,2,1,...,False,False,False,True,False,False,False,False,False,39
1301,EMP4982,57,0.0,17,3,58,15,2,2,1,...,False,False,False,False,False,False,False,True,False,71
1302,EMP4992,60,1.0,9,2,57,4,2,1,1,...,False,True,False,False,False,False,False,True,False,59
1303,EMP4996,32,0.0,4,3,24,2,5,3,1,...,False,False,False,False,True,False,False,False,False,21


### 2.2 Conducting a Post-Hoc Test (Tukey’s HSD)

In [22]:
# Tukey’s HSD: Stress_Level is the measured variable, Work_Location supplies the group labels
tkey_results = pairwise_tukeyhsd(
    endog=wfh_mentalHealth_data_cleaned['Stress_Level'],
    groups=wfh_mentalHealth_data_cleaned['Work_Location'],
)

# Showing the pairwise comparison table under a heading
print("Tukey’s HSD Post-Hoc Test Results")
print(tkey_results)

Tukey’s HSD Post-Hoc Test Results
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2    0.022 0.9142 -0.1061 0.1501  False
     1      3   0.0126 0.9712 -0.1157 0.1409  False
     2      3  -0.0094 0.9842 -0.1397 0.1208  False
---------------------------------------------------


### 2.2.1 Interpretation of the above Tukey’s HSD Output
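The above test shows whether each specific pair of work arrangement groups (Remote(1) vs. Hybrid(2), Remote(1) vs. Onsite(3), Hybrid(2) vs. Onsite(3)) has a significant difference in stress levels. Here none of the pairs differs significantly: every adjusted p-value is above 0.9 and `reject` is False for all three comparisons, which is consistent with the non-significant ANOVA result above.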

### 2.3 Work-Life Balance Differences

In [25]:
# One-way ANOVA on Work_Life_Balance_Rating across the three Work_Location codes.
# The generator yields one rating Series per code, in the order 1, 2, 3.
anova_result_1 = stats.f_oneway(
    *(
        wfh_mentalHealth_data_cleaned.loc[
            wfh_mentalHealth_data_cleaned['Work_Location'] == location_code,
            'Work_Life_Balance_Rating',
        ]
        for location_code in (1, 2, 3)
    )
)

In [26]:
# Show the F-statistic and p-value of the work-life balance ANOVA as the cell output.
anova_result_1

F_onewayResult(statistic=1.4663814318161144, pvalue=0.23113983826766768)