In [0]:
from pyspark.sql.types import *

# Employee schema: integer employee id, employee name, and the integer id of
# the employee's manager. Every column is declared nullable.
employee_schema = (
    StructType()
    .add("employee_id", IntegerType(), True)
    .add("employee_name", StringType(), True)
    .add("manager_id", IntegerType(), True)
)

# Manager schema: integer manager id and manager name, both nullable.
manager_schema = (
    StructType()
    .add("manager_id", IntegerType(), True)
    .add("manager_name", StringType(), True)
)

# Sample employee data (10 entries). Each (name, manager_id) pair is listed in
# employee_id order; the ids 1..10 are attached by position below.
_employee_assignments = [
    ("John Doe", 101),
    ("Jane Smith", 102),
    ("Sam Brown", 101),
    ("Lisa White", 102),
    ("Mike Johnson", 103),
    ("Emma Watson", 104),
    ("Robert Downey", 103),
    ("Chris Evans", 102),
    ("Scarlett Johansson", 104),
    ("Tom Holland", 101),
]
employee_data = [
    (employee_id, employee_name, manager_id)
    for employee_id, (employee_name, manager_id) in enumerate(
        _employee_assignments, start=1
    )
]

# Sample manager data (4 entries), produced as (manager_id, manager_name)
# tuples in insertion order.
manager_data = list(
    {
        101: "Michael Scott",
        102: "Jim Halpert",
        103: "Dwight Schrute",
        104: "Pam Beesly",
    }.items()
)
# Build one Spark DataFrame per sample dataset, applying the explicit schemas
# defined above. `spark` is the session object supplied by the notebook
# environment.
employee_df = spark.createDataFrame(data=employee_data, schema=employee_schema)
manager_df = spark.createDataFrame(data=manager_data, schema=manager_schema)

# Render both DataFrames: employees first, then managers.
for sample_df in (employee_df, manager_df):
    sample_df.display()

employee_id,employee_name,manager_id
1,John Doe,101
2,Jane Smith,102
3,Sam Brown,101
4,Lisa White,102
5,Mike Johnson,103
6,Emma Watson,104
7,Robert Downey,103
8,Chris Evans,102
9,Scarlett Johansson,104
10,Tom Holland,101


manager_id,manager_name
101,Michael Scott
102,Jim Halpert
103,Dwight Schrute
104,Pam Beesly


In [0]:
from pyspark.sql.functions import col

# Pair each employee with their manager: inner-join the two DataFrames on
# equal manager_id values, then keep only the two name columns.
manager_id_matches = employee_df["manager_id"] == manager_df["manager_id"]
employees_with_managers = employee_df.join(
    manager_df, on=manager_id_matches, how="inner"
)
org_chart_df = employees_with_managers.select(
    employee_df["employee_name"], manager_df["manager_name"]
)

# Render the resulting employee -> manager listing.
org_chart_df.display()

employee_name,manager_name
John Doe,Michael Scott
Sam Brown,Michael Scott
Tom Holland,Michael Scott
Jane Smith,Jim Halpert
Lisa White,Jim Halpert
Chris Evans,Jim Halpert
Mike Johnson,Dwight Schrute
Robert Downey,Dwight Schrute
Emma Watson,Pam Beesly
Scarlett Johansson,Pam Beesly


In [0]:
# Expose both DataFrames to Spark SQL as temporary views, replacing any
# existing temporary view of the same name.
for view_name, source_df in (("Employees", employee_df), ("Managers", manager_df)):
    source_df.createOrReplaceTempView(view_name)

In [0]:
# SQL query pairing each employee with their manager by matching manager_id.
# A bare JOIN is an inner join, so employees without a matching manager row
# are dropped. The view names are spelled exactly as they were registered
# ("Employees" / "Managers") so the lookup does not depend on case-insensitive
# identifier resolution being enabled.
org_chart_sql = """
SELECT e.employee_name, m.manager_name
FROM Employees e
JOIN Managers m
ON e.manager_id = m.manager_id
"""

# Run the query against the temporary views; the result is a DataFrame.
org_chart_df_sql = spark.sql(org_chart_sql)

# Display the resulting employee -> manager listing.
org_chart_df_sql.display()

employee_name,manager_name
John Doe,Michael Scott
Sam Brown,Michael Scott
Tom Holland,Michael Scott
Jane Smith,Jim Halpert
Lisa White,Jim Halpert
Chris Evans,Jim Halpert
Mike Johnson,Dwight Schrute
Robert Downey,Dwight Schrute
Emma Watson,Pam Beesly
Scarlett Johansson,Pam Beesly
