In [1]:
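# This notebook cleans the remote-work mental-health dataset: it fills missing values in
# Mental_Health_Condition and Physical_Activity, casts the categorical columns to the
# pandas `category` dtype, and writes the result to a new CSV file.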

In [2]:
import pandas as pd

In [10]:
# Load the raw dataset from a path relative to the notebook's working directory.
# NOTE(review): read_csv converts a default set of strings (e.g. 'NA', 'N/A', 'null', and
# 'None' on recent pandas versions) to NaN. The two columns reported as having nulls below
# each have a NaN share about as large as one real category, so the raw file may
# contain a literal label that is being parsed away — confirm against the CSV and, if
# so, consider keep_default_na=False / an explicit na_values list.
file_path = 'Data/Impact_of_Remote_Work_on_Mental_Health.csv'
df = pd.read_csv(file_path)

In [11]:
# Per-column count of missing entries in the freshly loaded frame.
null_counts = df.isna().sum()

In [12]:
# Display the missing-value count for every column.
null_counts

Employee_ID                             0
Age                                     0
Gender                                  0
Job_Role                                0
Industry                                0
Years_of_Experience                     0
Work_Location                           0
Hours_Worked_Per_Week                   0
Number_of_Virtual_Meetings              0
Work_Life_Balance_Rating                0
Stress_Level                            0
Mental_Health_Condition              1196
Access_to_Mental_Health_Resources       0
Productivity_Change                     0
Social_Isolation_Rating                 0
Satisfaction_with_Remote_Work           0
Company_Support_for_Remote_Work         0
Physical_Activity                    1629
Sleep_Quality                           0
Region                                  0
dtype: int64

In [19]:
# Missing entries per column as a percentage of the total row count.
null_counts.div(len(df)).mul(100)

Employee_ID                           0.00
Age                                   0.00
Gender                                0.00
Job_Role                              0.00
Industry                              0.00
Years_of_Experience                   0.00
Work_Location                         0.00
Hours_Worked_Per_Week                 0.00
Number_of_Virtual_Meetings            0.00
Work_Life_Balance_Rating              0.00
Stress_Level                          0.00
Mental_Health_Condition              23.92
Access_to_Mental_Health_Resources     0.00
Productivity_Change                   0.00
Social_Isolation_Rating               0.00
Satisfaction_with_Remote_Work         0.00
Company_Support_for_Remote_Work       0.00
Physical_Activity                    32.58
Sleep_Quality                         0.00
Region                                0.00
dtype: float64

In [13]:
# Frequency of each Mental_Health_Condition label; value_counts() leaves NaN rows out
# by default, so only the non-missing labels are listed.
df['Mental_Health_Condition'].value_counts()

Mental_Health_Condition
Burnout       1280
Anxiety       1278
Depression    1246
Name: count, dtype: int64

In [15]:
# Give rows with a missing Mental_Health_Condition the explicit label "Unknown";
# no other column is touched.
# NOTE(review): if the blanks come from a raw label that pandas parsed as NaN
# (e.g. 'None' meaning "no condition"), "Unknown" misstates them — confirm.
df = df.fillna({'Mental_Health_Condition': "Unknown"})

In [16]:
# Re-count the labels after the fill; the formerly missing rows now show up as "Unknown".
df['Mental_Health_Condition'].value_counts()

Mental_Health_Condition
Burnout       1280
Anxiety       1278
Depression    1246
Unknown       1196
Name: count, dtype: int64

In [20]:
# Recount missing entries per column after filling Mental_Health_Condition.
null_counts_2 = df.isna().sum()

In [21]:
# Remaining missing entries per column as a percentage of the total row count.
null_counts_2.div(len(df)).mul(100)

Employee_ID                           0.00
Age                                   0.00
Gender                                0.00
Job_Role                              0.00
Industry                              0.00
Years_of_Experience                   0.00
Work_Location                         0.00
Hours_Worked_Per_Week                 0.00
Number_of_Virtual_Meetings            0.00
Work_Life_Balance_Rating              0.00
Stress_Level                          0.00
Mental_Health_Condition               0.00
Access_to_Mental_Health_Resources     0.00
Productivity_Change                   0.00
Social_Isolation_Rating               0.00
Satisfaction_with_Remote_Work         0.00
Company_Support_for_Remote_Work       0.00
Physical_Activity                    32.58
Sleep_Quality                         0.00
Region                                0.00
dtype: float64

In [22]:
# Frequency of each Physical_Activity label; value_counts() leaves NaN rows out
# by default, so only the non-missing labels are listed.
df['Physical_Activity'].value_counts()

Physical_Activity
Weekly    1755
Daily     1616
Name: count, dtype: int64

In [23]:
# Give rows with a missing Physical_Activity the label "Occasional or Never";
# no other column is touched.
# NOTE(review): this assigns a substantive meaning to missing data. Nothing in the
# notebook shows that a blank means "occasional or never" — confirm against the raw
# file or data dictionary before relying on this category.
df = df.fillna({'Physical_Activity': "Occasional or Never"})

In [24]:
# Re-count the labels after the fill; the formerly missing rows now show up under
# "Occasional or Never".
df['Physical_Activity'].value_counts()

Physical_Activity
Weekly                 1755
Occasional or Never    1629
Daily                  1616
Name: count, dtype: int64

In [25]:
# Final recount of missing entries per column after both fills.
null_counts_3 = df.isna().sum()

In [26]:
# Percentage of missing entries per column; every column should now read 0.
null_counts_3.div(len(df)).mul(100)

Employee_ID                          0.0
Age                                  0.0
Gender                               0.0
Job_Role                             0.0
Industry                             0.0
Years_of_Experience                  0.0
Work_Location                        0.0
Hours_Worked_Per_Week                0.0
Number_of_Virtual_Meetings           0.0
Work_Life_Balance_Rating             0.0
Stress_Level                         0.0
Mental_Health_Condition              0.0
Access_to_Mental_Health_Resources    0.0
Productivity_Change                  0.0
Social_Isolation_Rating              0.0
Satisfaction_with_Remote_Work        0.0
Company_Support_for_Remote_Work      0.0
Physical_Activity                    0.0
Sleep_Quality                        0.0
Region                               0.0
dtype: float64

In [27]:
# Summarise column dtypes and non-null counts after the missing-value fills.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Employee_ID                        5000 non-null   object
 1   Age                                5000 non-null   int64 
 2   Gender                             5000 non-null   object
 3   Job_Role                           5000 non-null   object
 4   Industry                           5000 non-null   object
 5   Years_of_Experience                5000 non-null   int64 
 6   Work_Location                      5000 non-null   object
 7   Hours_Worked_Per_Week              5000 non-null   int64 
 8   Number_of_Virtual_Meetings         5000 non-null   int64 
 9   Work_Life_Balance_Rating           5000 non-null   int64 
 10  Stress_Level                       5000 non-null   object
 11  Mental_Health_Condition            5000 non-null   object
 12  Access

In [31]:
# Collect the distinct values of every object-dtype column, keyed by column name,
# to see which columns hold a small fixed set of labels.
object_columns = df.select_dtypes(include='object').columns
unique_values = {name: df[name].unique() for name in object_columns}

In [32]:
# Display the distinct values found in each object-dtype column.
unique_values

{'Employee_ID': array(['EMP0001', 'EMP0002', 'EMP0003', ..., 'EMP4998', 'EMP4999',
        'EMP5000'], dtype=object),
 'Gender': array(['Non-binary', 'Female', 'Male', 'Prefer not to say'], dtype=object),
 'Job_Role': array(['HR', 'Data Scientist', 'Software Engineer', 'Sales', 'Marketing',
        'Designer', 'Project Manager'], dtype=object),
 'Industry': array(['Healthcare', 'IT', 'Education', 'Finance', 'Consulting',
        'Manufacturing', 'Retail'], dtype=object),
 'Work_Location': array(['Hybrid', 'Remote', 'Onsite'], dtype=object),
 'Stress_Level': array(['Medium', 'High', 'Low'], dtype=object),
 'Mental_Health_Condition': array(['Depression', 'Anxiety', 'Unknown', 'Burnout'], dtype=object),
 'Access_to_Mental_Health_Resources': array(['No', 'Yes'], dtype=object),
 'Productivity_Change': array(['Decrease', 'Increase', 'No Change'], dtype=object),
 'Satisfaction_with_Remote_Work': array(['Unsatisfied', 'Satisfied', 'Neutral'], dtype=object),
 'Physical_Activity': array(['Weekly

In [33]:
# Columns that hold a small, fixed set of labels and should be stored as `category`.
categorical_columns = [
    'Gender',
    'Job_Role',
    'Industry',
    'Work_Location',
    'Stress_Level',
    'Mental_Health_Condition',
    'Access_to_Mental_Health_Resources',
    'Productivity_Change',
    'Satisfaction_with_Remote_Work',
    'Physical_Activity',
    'Sleep_Quality',
    'Region',
]

# Cast all of them in a single astype call via a column -> dtype mapping.
df = df.astype(dict.fromkeys(categorical_columns, 'category'))

In [34]:
# Re-check dtypes: the columns listed in categorical_columns should now report `category`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   Employee_ID                        5000 non-null   object  
 1   Age                                5000 non-null   int64   
 2   Gender                             5000 non-null   category
 3   Job_Role                           5000 non-null   category
 4   Industry                           5000 non-null   category
 5   Years_of_Experience                5000 non-null   int64   
 6   Work_Location                      5000 non-null   category
 7   Hours_Worked_Per_Week              5000 non-null   int64   
 8   Number_of_Virtual_Meetings         5000 non-null   int64   
 9   Work_Life_Balance_Rating           5000 non-null   int64   
 10  Stress_Level                       5000 non-null   category
 11  Mental_Health_Condition            5000 non

In [35]:
# Write the cleaned frame to disk, leaving the row index out of the file.
# NOTE: CSV stores plain text, so the `category` dtypes set earlier are not kept in the
# file; anything that reloads it must re-apply them.
cleaned_path = 'Data/Cleaned_Impact_of_Remote_Work_on_Mental_Health.csv'
df.to_csv(cleaned_path, index=False)