In [16]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('post_pandemic_remote_work_health_impact_2025.csv')
df.head()

# Filter to include only remote workers
df_remote = df[df['Work_Arrangement'] == 'Remote'].copy()
# Check info
df.info()

# Check for duplicates
print("Duplicates:", df.duplicated().sum())

# Quick look at nulls
print("Missing values per column:\n", df.isnull().sum())

#Take care of columns with missing values
df['Mental_Health_Status'] = df['Mental_Health_Status'].fillna('Unknown')
df['Physical_Health_Issues'] = df['Physical_Health_Issues'].fillna('None')
print(df.isnull().sum()) 

# Categorical summaries
categoricals = ['Gender', 'Region', 'Industry', 'Job_Role', 'Work_Arrangement', 
                'Mental_Health_Status', 'Burnout_Level', 'Physical_Health_Issues', 'Salary_Range']

for col in categoricals:
    print(f"\n{col}:\n", df[col].value_counts(dropna=False))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3157 entries, 0 to 3156
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Survey_Date              3157 non-null   object
 1   Age                      3157 non-null   int64 
 2   Gender                   3157 non-null   object
 3   Region                   3157 non-null   object
 4   Industry                 3157 non-null   object
 5   Job_Role                 3157 non-null   object
 6   Work_Arrangement         3157 non-null   object
 7   Hours_Per_Week           3157 non-null   int64 
 8   Mental_Health_Status     2358 non-null   object
 9   Burnout_Level            3157 non-null   object
 10  Work_Life_Balance_Score  3157 non-null   int64 
 11  Physical_Health_Issues   2877 non-null   object
 12  Social_Isolation_Score   3157 non-null   int64 
 13  Salary_Range             3157 non-null   object
dtypes: int64(4), object(10)
memory usage: 34

In [24]:
#Summary statistics for all numeric columns
print(df.describe())

#Outlier checks
print("\nRows with Age < 18 (should be none):")
print(df[df['Age'] < 18])

print("\nRows with Hours_Per_Week > 80 (should be none):")
print(df[df['Hours_Per_Week'] > 80])

#Standardize Gender (strip spaces, set title case)
df['Gender'] = df['Gender'].str.strip().str.title()

#List all unique physical health issues (excluding 'None')
issues = set()
df['Physical_Health_Issues'].dropna().str.split(';').apply(
    lambda x: [issues.add(i.strip()) for i in x if i.strip() != 'None']
)
print("\nAll unique physical health issues in data:", issues)



issues = ['Back Pain', 'Shoulder Pain', 'Neck Pain', 'Eye Strain', 'Wrist Pain']
df_remote['Physical_Health_Issues'] = df_remote['Physical_Health_Issues'].fillna('None').astype(str)
for issue in issues:
    colname = issue.replace(' ', '_')
    df_remote[colname] = df_remote['Physical_Health_Issues'].apply(lambda x: int(issue in x))

               Age  Hours_Per_Week  Work_Life_Balance_Score  \
count  3157.000000     3157.000000              3157.000000   
mean     43.732024       49.904973                 2.996516   
std      12.661095        8.897699                 1.163307   
min      22.000000       35.000000                 1.000000   
25%      33.000000       42.000000                 2.000000   
50%      44.000000       50.000000                 3.000000   
75%      55.000000       57.000000                 4.000000   
max      65.000000       65.000000                 5.000000   

       Social_Isolation_Score  
count             3157.000000  
mean                 2.704783  
std                  1.188887  
min                  1.000000  
25%                  2.000000  
50%                  3.000000  
75%                  4.000000  
max                  5.000000  

Rows with Age < 18 (should be none):
Empty DataFrame
Columns: [Survey_Date, Age, Gender, Region, Industry, Job_Role, Work_Arrangement, Hours_Pe

In [27]:
print(df_remote.columns)


Index(['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score',
       'Social_Isolation_Score', 'Back_Pain', 'Shoulder_Pain', 'Neck_Pain',
       'Eye_Strain', 'Wrist_Pain', 'Gender_Male', 'Gender_Non-binary',
       'Gender_Prefer not to say', 'Region_Asia', 'Region_Europe',
       'Region_North America', 'Region_Oceania', 'Region_South America',
       'Industry_Education', 'Industry_Finance', 'Industry_Healthcare',
       'Industry_Manufacturing', 'Industry_Marketing',
       'Industry_Professional Services', 'Industry_Retail',
       'Industry_Technology', 'Job_Role_Business Analyst',
       'Job_Role_Consultant', 'Job_Role_Content Writer',
       'Job_Role_Customer Service Manager', 'Job_Role_Data Analyst',
       'Job_Role_Data Scientist', 'Job_Role_DevOps Engineer',
       'Job_Role_Digital Marketing Specialist', 'Job_Role_Executive Assistant',
       'Job_Role_Financial Analyst', 'Job_Role_HR Manager',
       'Job_Role_IT Support', 'Job_Role_Marketing Specialist',
       'Job_Role_

In [29]:
df_remote.to_csv('remote_workers_model_ready.csv', index=False)
