In [6]:
import pandas as pd
import numpy as np
from IPython.display import display


# Load data
df = pd.read_csv('post_pandemic_remote_work_health_impact_2025.csv')
df.head()

# Filter to include only remote workers
df_remote = df[df['Work_Arrangement'] == 'Remote'].copy()
# Check info
display(df)

from IPython.display import display

# Check for duplicates
print("Duplicates:", df.duplicated().sum())

# Quick look at nulls
print("Missing values per column:\n", df.isnull().sum())

# Take care of columns with missing values
df['Mental_Health_Status'] = df['Mental_Health_Status'].fillna('None')
df['Physical_Health_Issues'] = df['Physical_Health_Issues'].fillna('None')
print(df.isnull().sum()) 

# Categorical summaries (using display for better output)
categoricals = ['Gender', 'Region', 'Industry', 'Job_Role', 'Work_Arrangement', 
                'Mental_Health_Status', 'Burnout_Level', 'Physical_Health_Issues', 'Salary_Range']

for col in categoricals:
    print(f"\n{col}:")
    display(df[col].value_counts(dropna=False).to_frame(name='Count'))


Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Physical_Health_Issues,Social_Isolation_Score,Salary_Range
0,2025-06-01,27,Female,Asia,Professional Services,Data Analyst,Onsite,64,Stress Disorder,High,3,Shoulder Pain; Neck Pain,2,$40K-60K
1,2025-06-01,37,Female,Asia,Professional Services,Data Analyst,Onsite,37,Stress Disorder,High,4,Back Pain,2,$80K-100K
2,2025-06-01,32,Female,Africa,Education,Business Analyst,Onsite,36,ADHD,High,3,Shoulder Pain; Eye Strain,2,$80K-100K
3,2025-06-01,40,Female,Europe,Education,Data Analyst,Onsite,63,ADHD,Medium,1,Shoulder Pain; Eye Strain,2,$60K-80K
4,2025-06-01,30,Male,South America,Manufacturing,DevOps Engineer,Hybrid,65,,Medium,5,,4,$60K-80K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3152,2025-06-26,62,Female,South America,Professional Services,Data Analyst,Hybrid,38,PTSD,Medium,4,Shoulder Pain; Neck Pain,3,$80K-100K
3153,2025-06-26,24,Female,South America,Professional Services,Software Engineer,Remote,54,,Medium,4,Eye Strain,4,$100K-120K
3154,2025-06-26,45,Female,North America,Professional Services,HR Manager,Onsite,59,PTSD,Medium,1,Shoulder Pain,3,$40K-60K
3155,2025-06-26,38,Male,North America,Education,Operations Manager,Onsite,52,Depression,Medium,3,Shoulder Pain; Eye Strain; Neck Pain,5,$80K-100K


Duplicates: 0
Missing values per column:
 Survey_Date                  0
Age                          0
Gender                       0
Region                       0
Industry                     0
Job_Role                     0
Work_Arrangement             0
Hours_Per_Week               0
Mental_Health_Status       799
Burnout_Level                0
Work_Life_Balance_Score      0
Physical_Health_Issues     280
Social_Isolation_Score       0
Salary_Range                 0
dtype: int64
Survey_Date                0
Age                        0
Gender                     0
Region                     0
Industry                   0
Job_Role                   0
Work_Arrangement           0
Hours_Per_Week             0
Mental_Health_Status       0
Burnout_Level              0
Work_Life_Balance_Score    0
Physical_Health_Issues     0
Social_Isolation_Score     0
Salary_Range               0
dtype: int64

Gender:


Unnamed: 0_level_0,Count
Gender,Unnamed: 1_level_1
Male,1535
Female,1500
Non-binary,90
Prefer not to say,32



Region:


Unnamed: 0_level_0,Count
Region,Unnamed: 1_level_1
South America,575
Africa,532
Oceania,523
Asia,517
Europe,513
North America,497



Industry:


Unnamed: 0_level_0,Count
Industry,Unnamed: 1_level_1
Professional Services,730
Technology,593
Manufacturing,370
Finance,367
Education,287
Healthcare,247
Marketing,202
Retail,188
Customer Service,173



Job_Role:


Unnamed: 0_level_0,Count
Job_Role,Unnamed: 1_level_1
Research Scientist,151
DevOps Engineer,149
Social Media Manager,144
Customer Service Manager,144
Data Analyst,143
Operations Manager,142
HR Manager,141
IT Support,140
Data Scientist,136
Project Manager,136



Work_Arrangement:


Unnamed: 0_level_0,Count
Work_Arrangement,Unnamed: 1_level_1
Onsite,1562
Hybrid,1007
Remote,588



Mental_Health_Status:


Unnamed: 0_level_0,Count
Mental_Health_Status,Unnamed: 1_level_1
,799
PTSD,423
Anxiety,394
Burnout,392
Depression,386
ADHD,385
Stress Disorder,378



Burnout_Level:


Unnamed: 0_level_0,Count
Burnout_Level,Unnamed: 1_level_1
Medium,1366
High,1046
Low,745



Physical_Health_Issues:


Unnamed: 0_level_0,Count
Physical_Health_Issues,Unnamed: 1_level_1
,280
Shoulder Pain; Eye Strain,262
Back Pain; Eye Strain,258
Eye Strain,256
Back Pain; Shoulder Pain; Eye Strain,255
Back Pain,253
Shoulder Pain,235
Back Pain; Shoulder Pain,218
Back Pain; Eye Strain; Neck Pain,93
Neck Pain,92



Salary_Range:


Unnamed: 0_level_0,Count
Salary_Range,Unnamed: 1_level_1
$60K-80K,1014
$80K-100K,964
$40K-60K,510
$100K-120K,459
$120K+,210


In [8]:
# Summary statistics for all numeric columns
display(df.describe())

# Outlier checks
print("\nRows with Age < 18 (should be none):")
display(df[df['Age'] < 18])

print("\nRows with Age > 65 (potential high-end outliers):")
display(df[df['Age'] > 65])

print("\nRows with Hours_Per_Week < 0 (invalid, should be none):")
display(df[df['Hours_Per_Week'] < 0])

print("\nRows with Hours_Per_Week > 80 (should be none):")
display(df[df['Hours_Per_Week'] > 80])

# Standardize Gender (strip spaces, set title case)
df['Gender'] = df['Gender'].str.strip().str.title()

# List all unique physical health issues (excluding 'None')
issues_set = set()
df['Physical_Health_Issues'].dropna().str.split(';').apply(
    lambda x: [issues_set.add(i.strip()) for i in x if i.strip() != 'None']
)
print("\nAll unique physical health issues in data:", issues_set)

# One-hot encode selected health issues for remote workers, using True/False
issues = ['Back Pain', 'Shoulder Pain', 'Neck Pain', 'Eye Strain', 'Wrist Pain']
df_remote['Physical_Health_Issues'] = df_remote['Physical_Health_Issues'].fillna('None').astype(str)
for issue in issues:
    colname = issue.replace(' ', '_')
    df_remote[colname] = df_remote['Physical_Health_Issues'].apply(lambda x: issue in x)


Unnamed: 0,Age,Hours_Per_Week,Work_Life_Balance_Score,Social_Isolation_Score
count,3157.0,3157.0,3157.0,3157.0
mean,43.732024,49.904973,2.996516,2.704783
std,12.661095,8.897699,1.163307,1.188887
min,22.0,35.0,1.0,1.0
25%,33.0,42.0,2.0,2.0
50%,44.0,50.0,3.0,3.0
75%,55.0,57.0,4.0,4.0
max,65.0,65.0,5.0,5.0



Rows with Age < 18 (should be none):


Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Physical_Health_Issues,Social_Isolation_Score,Salary_Range



Rows with Age > 65 (potential high-end outliers):


Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Physical_Health_Issues,Social_Isolation_Score,Salary_Range



Rows with Hours_Per_Week < 0 (invalid, should be none):


Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Physical_Health_Issues,Social_Isolation_Score,Salary_Range



Rows with Hours_Per_Week > 80 (should be none):


Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Physical_Health_Issues,Social_Isolation_Score,Salary_Range



All unique physical health issues in data: {'Wrist Pain', 'Back Pain', 'Shoulder Pain', 'Neck Pain', 'Eye Strain'}


In [27]:
print(df_remote.columns)


Index(['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score',
       'Social_Isolation_Score', 'Back_Pain', 'Shoulder_Pain', 'Neck_Pain',
       'Eye_Strain', 'Wrist_Pain', 'Gender_Male', 'Gender_Non-binary',
       'Gender_Prefer not to say', 'Region_Asia', 'Region_Europe',
       'Region_North America', 'Region_Oceania', 'Region_South America',
       'Industry_Education', 'Industry_Finance', 'Industry_Healthcare',
       'Industry_Manufacturing', 'Industry_Marketing',
       'Industry_Professional Services', 'Industry_Retail',
       'Industry_Technology', 'Job_Role_Business Analyst',
       'Job_Role_Consultant', 'Job_Role_Content Writer',
       'Job_Role_Customer Service Manager', 'Job_Role_Data Analyst',
       'Job_Role_Data Scientist', 'Job_Role_DevOps Engineer',
       'Job_Role_Digital Marketing Specialist', 'Job_Role_Executive Assistant',
       'Job_Role_Financial Analyst', 'Job_Role_HR Manager',
       'Job_Role_IT Support', 'Job_Role_Marketing Specialist',
       'Job_Role_