In [1]:
pip install pyspark



In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Mount drive de google
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Spark Session

In [5]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Mental Health Analysis") \
    .getOrCreate()

## Load Data

In [6]:
# Load the dataset into a Spark DataFrame
data_path = "/content/drive/MyDrive/4. Projects/Healthcare/Healthcare - Remote Work/Impact_of_Remote_Work_on_Mental_Health.csv"
df = spark.read.option("header", "true").csv(data_path)

# Show the first few records of the DataFrame
df.show(5)

+-----------+---+----------+-----------------+----------+-------------------+-------------+---------------------+--------------------------+------------------------+------------+-----------------------+---------------------------------+-------------------+-----------------------+-----------------------------+-------------------------------+-----------------+-------------+-------------+
|Employee_ID|Age|    Gender|         Job_Role|  Industry|Years_of_Experience|Work_Location|Hours_Worked_Per_Week|Number_of_Virtual_Meetings|Work_Life_Balance_Rating|Stress_Level|Mental_Health_Condition|Access_to_Mental_Health_Resources|Productivity_Change|Social_Isolation_Rating|Satisfaction_with_Remote_Work|Company_Support_for_Remote_Work|Physical_Activity|Sleep_Quality|       Region|
+-----------+---+----------+-----------------+----------+-------------------+-------------+---------------------+--------------------------+------------------------+------------+-----------------------+--------------------

## Data processing and cleaning

In [8]:
from pyspark.sql.functions import col

In [9]:
# Convert appropriate columns to their correct data types
df = df.withColumn("Age", col("Age").cast("int")) \
       .withColumn("Years_of_Experience", col("Years_of_Experience").cast("int")) \
       .withColumn("Hours_Worked_Per_Week", col("Hours_Worked_Per_Week").cast("int"))



In [10]:
# Drop rows with missing values in critical columns (if needed)
df = df.dropna(subset=["Age", "Gender", "Industry", "Region"])

In [12]:
# Show schema to check the changes
df.printSchema()

root
 |-- Employee_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Job_Role: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Years_of_Experience: integer (nullable = true)
 |-- Work_Location: string (nullable = true)
 |-- Hours_Worked_Per_Week: integer (nullable = true)
 |-- Number_of_Virtual_Meetings: string (nullable = true)
 |-- Work_Life_Balance_Rating: string (nullable = true)
 |-- Stress_Level: string (nullable = true)
 |-- Mental_Health_Condition: string (nullable = true)
 |-- Access_to_Mental_Health_Resources: string (nullable = true)
 |-- Productivity_Change: string (nullable = true)
 |-- Social_Isolation_Rating: string (nullable = true)
 |-- Satisfaction_with_Remote_Work: string (nullable = true)
 |-- Company_Support_for_Remote_Work: string (nullable = true)
 |-- Physical_Activity: string (nullable = true)
 |-- Sleep_Quality: string (nullable = true)
 |-- Region: string (nullable = true)



In [13]:
# Display the cleaned data
df.show(5)

+-----------+---+----------+-----------------+----------+-------------------+-------------+---------------------+--------------------------+------------------------+------------+-----------------------+---------------------------------+-------------------+-----------------------+-----------------------------+-------------------------------+-----------------+-------------+-------------+
|Employee_ID|Age|    Gender|         Job_Role|  Industry|Years_of_Experience|Work_Location|Hours_Worked_Per_Week|Number_of_Virtual_Meetings|Work_Life_Balance_Rating|Stress_Level|Mental_Health_Condition|Access_to_Mental_Health_Resources|Productivity_Change|Social_Isolation_Rating|Satisfaction_with_Remote_Work|Company_Support_for_Remote_Work|Physical_Activity|Sleep_Quality|       Region|
+-----------+---+----------+-----------------+----------+-------------------+-------------+---------------------+--------------------------+------------------------+------------+-----------------------+--------------------

### Data Aggregation

In [19]:
# Group by Region and calculate average Age, Years_of_Experience and "Hours_Worked_Per_Week"
df.groupBy("Region").agg(
    {"Age": "avg", "Years_of_Experience": "avg", "Hours_Worked_Per_Week":"avg"}
).show()

+-------------+------------------------+------------------+--------------------------+
|       Region|avg(Years_of_Experience)|          avg(Age)|avg(Hours_Worked_Per_Week)|
+-------------+------------------------+------------------+--------------------------+
|       Europe|      17.948809523809523| 41.13690476190476|        39.385714285714286|
|       Africa|      17.309302325581395| 40.45348837209303|         39.32093023255814|
|North America|      17.925353925353924|40.792792792792795|        39.742599742599744|
|South America|      18.020556227327692| 40.78960096735187|         40.21644498186215|
|      Oceania|      17.559400230680506|41.021914648212224|         39.44290657439446|
|         Asia|      18.133896260554884| 41.77925211097708|         39.61037394451146|
+-------------+------------------------+------------------+--------------------------+



In [18]:
# Count employees by Industry and Mental_Health_Condition
df.groupBy("Industry", "Mental_Health_Condition").count().show()


+-------------+-----------------------+-----+
|     Industry|Mental_Health_Condition|count|
+-------------+-----------------------+-----+
|      Finance|                Burnout|  202|
|    Education|                   None|  169|
|   Consulting|                Anxiety|  178|
|       Retail|                Burnout|  180|
|    Education|             Depression|  171|
|   Consulting|                Burnout|  166|
|      Finance|             Depression|  179|
|   Healthcare|                Burnout|  191|
|Manufacturing|                Anxiety|  181|
|           IT|             Depression|  182|
|       Retail|                Anxiety|  192|
|Manufacturing|                Burnout|  175|
|      Finance|                   None|  180|
|       Retail|                   None|  163|
|    Education|                Anxiety|  176|
|      Finance|                Anxiety|  186|
|   Healthcare|                Anxiety|  160|
|           IT|                   None|  167|
|   Consulting|             Depres

In [17]:
# Average Hours_Worked_Per_Week by Job_Role
df.groupBy("Job_Role").agg(
    {"Hours_Worked_Per_Week": "avg"}
).show()


+-----------------+--------------------------+
|         Job_Role|avg(Hours_Worked_Per_Week)|
+-----------------+--------------------------+
|            Sales|        39.860845839017735|
|  Project Manager|        39.922764227642276|
|               HR|        39.660614525139664|
|         Designer|        38.881051175656985|
|   Data Scientist|         38.95402298850575|
|        Marketing|        39.734992679355784|
|Software Engineer|        40.271448663853725|
+-----------------+--------------------------+



In [20]:
# Employees who are "Unsatisfied" with remote work
unsatisfied_employees = df.filter(df.Satisfaction_with_Remote_Work == "Unsatisfied")
unsatisfied_employees.show(5)


+-----------+---+----------+-----------------+----------+-------------------+-------------+---------------------+--------------------------+------------------------+------------+-----------------------+---------------------------------+-------------------+-----------------------+-----------------------------+-------------------------------+-----------------+-------------+-------------+
|Employee_ID|Age|    Gender|         Job_Role|  Industry|Years_of_Experience|Work_Location|Hours_Worked_Per_Week|Number_of_Virtual_Meetings|Work_Life_Balance_Rating|Stress_Level|Mental_Health_Condition|Access_to_Mental_Health_Resources|Productivity_Change|Social_Isolation_Rating|Satisfaction_with_Remote_Work|Company_Support_for_Remote_Work|Physical_Activity|Sleep_Quality|       Region|
+-----------+---+----------+-----------------+----------+-------------------+-------------+---------------------+--------------------------+------------------------+------------+-----------------------+--------------------