In [0]:
from pyspark.sql.types import *

# ---------------------------------------------------------------------------
# Employees: schema, sample rows and DataFrame.
# manager_id is the only nullable column; it is None for the employee who has
# no manager.
# ---------------------------------------------------------------------------
employees_schema = StructType([
    StructField("employee_id", IntegerType(), False),
    StructField("employee_name", StringType(), False),
    StructField("manager_id", IntegerType()),  # nullable defaults to True
])
employees_data = [
    (1, "John Doe", None),
    (2, "Jane Smith", 1),
    (3, "Alex Brown", 2),
    (4, "Emily Davis", 2),
    (5, "Michael Clark", 1),
    (6, "Sarah Wilson", 5),
    (7, "David Lee", 5),
    (8, "Laura Turner", 1),
    (9, "Kevin Green", 8),
]
employees_df = spark.createDataFrame(employees_data, employees_schema)

# ---------------------------------------------------------------------------
# Departments: schema, sample rows and DataFrame.
# parent_department_id is the only nullable column; it is None for departments
# that have no parent.
# ---------------------------------------------------------------------------
departments_schema = StructType([
    StructField("department_id", IntegerType(), False),
    StructField("department_name", StringType(), False),
    StructField("parent_department_id", IntegerType()),  # nullable defaults to True
])
departments_data = [
    (1, "Human Resources", None),
    (2, "Recruitment", 1),
    (3, "Employee Relations", 1),
    (4, "Training", 3),
    (5, "Finance", None),
    (6, "Accounting", 5),
    (7, "Payroll", 5),
]
departments_df = spark.createDataFrame(departments_data, departments_schema)

# Render each DataFrame under its caption, employees first.
for caption, frame in (
    ("Employees DataFrame:", employees_df),
    ("Departments DataFrame:", departments_df),
):
    print(caption)
    frame.display()

Employees DataFrame:


employee_id,employee_name,manager_id
1,John Doe,
2,Jane Smith,1.0
3,Alex Brown,2.0
4,Emily Davis,2.0
5,Michael Clark,1.0
6,Sarah Wilson,5.0
7,David Lee,5.0
8,Laura Turner,1.0
9,Kevin Green,8.0


Departments DataFrame:


department_id,department_name,parent_department_id
1,Human Resources,
2,Recruitment,1.0
3,Employee Relations,1.0
4,Training,3.0
5,Finance,
6,Accounting,5.0
7,Payroll,5.0


In [0]:
# Register both DataFrames as temporary views.
# The employees view name previously carried a stray trailing space
# ("Employees "); it is now spelled consistently with "Departments".
employees_df.createOrReplaceTempView("Employees")
departments_df.createOrReplaceTempView("Departments")

# Department Hierarchy

In [0]:
from pyspark.sql.functions import *

# Build the department hierarchy one level at a time (breadth-first): level 0
# holds the departments without a parent, and each pass joins the most recently
# found level to its direct children.

# Safety cap on how many child levels are expanded. The loop normally stops
# earlier, as soon as a level turns out to be empty; the cap only guards
# against runaway expansion on malformed data. Raise it for deeper hierarchies.
MAX_DEPARTMENT_DEPTH = 20

# Start building hierarchy by getting the top-level departments
hierarchy_df = departments_df.filter(col("parent_department_id").isNull()).withColumn(
    "level", lit(0)
)

# The most recently discovered level; its rows act as parents in the next join
current_hierarchy_df = hierarchy_df

# Loop to add child departments (`level` is only an iteration counter; the
# stored level is derived from the parent row's level)
for level in range(1, MAX_DEPARTMENT_DEPTH + 1):
    next_level_df = (
        departments_df.alias("d")
        .join(
            current_hierarchy_df.alias("h"),
            col("d.parent_department_id") == col("h.department_id"),
        )
        .select(
            col("d.department_id"),
            col("d.department_name"),
            col("d.parent_department_id"),
            (col("h.level") + 1).alias("level"),
        )
    )

    # Stop once no department has a parent in the previous level. take(1)
    # triggers a Spark job but fetches at most one row.
    if not next_level_df.take(1):
        break

    # Union the new level to the existing hierarchy (same column order on both sides)
    hierarchy_df = hierarchy_df.union(next_level_df)

    # Update current hierarchy for the next iteration
    current_hierarchy_df = next_level_df
else:
    # The loop ran out of iterations without ever seeing an empty level.
    print(
        f"Warning: department hierarchy expansion stopped at the "
        f"{MAX_DEPARTMENT_DEPTH}-level cap; deeper levels, if any, are not included."
    )

# Show the final department hierarchy
hierarchy_df.orderBy("level", "department_id").display()

department_id,department_name,parent_department_id,level
1,Human Resources,,0
5,Finance,,0
2,Recruitment,1.0,1
3,Employee Relations,1.0,1
6,Accounting,5.0,1
7,Payroll,5.0,1
4,Training,3.0,2


# Employee Hierarchy

In [0]:
# Build the employee reporting hierarchy one level at a time (breadth-first):
# level 0 holds the employees without a manager, and each pass joins the most
# recently found level to its direct reports.
# NOTE: this cell rebinds hierarchy_df / current_hierarchy_df / next_level_df,
# which the department-hierarchy cell also assigns.

# Safety cap on how many reporting levels are expanded. The loop normally stops
# earlier, as soon as a level turns out to be empty; the cap only guards
# against runaway expansion on malformed data. Raise it for deeper hierarchies.
MAX_EMPLOYEE_DEPTH = 20

# Start building hierarchy by getting the top-level employees (those without a manager)
hierarchy_df = employees_df.filter(col("manager_id").isNull()).withColumn(
    "level", lit(0)
)

# The most recently discovered level; its rows act as managers in the next join
current_hierarchy_df = hierarchy_df

# Loop to simulate recursion and add employees under managers (`level` is only
# an iteration counter; the stored level is derived from the manager's level)
for level in range(1, MAX_EMPLOYEE_DEPTH + 1):
    next_level_df = (
        employees_df.alias("e")
        .join(
            current_hierarchy_df.alias("h"), col("e.manager_id") == col("h.employee_id")
        )
        .select(
            col("e.employee_id"),
            col("e.employee_name"),
            col("e.manager_id"),
            (col("h.level") + 1).alias("level"),
        )
    )

    # Stop once nobody reports to the previous level. take(1) triggers a Spark
    # job but fetches at most one row.
    if not next_level_df.take(1):
        break

    # Union the new level to the existing hierarchy (same column order on both sides)
    hierarchy_df = hierarchy_df.union(next_level_df)

    # Update current hierarchy for the next iteration
    current_hierarchy_df = next_level_df
else:
    # The loop ran out of iterations without ever seeing an empty level.
    print(
        f"Warning: employee hierarchy expansion stopped at the "
        f"{MAX_EMPLOYEE_DEPTH}-level cap; deeper levels, if any, are not included."
    )

# Show the final employee hierarchy
hierarchy_df.orderBy("level", "employee_id").display()

employee_id,employee_name,manager_id,level
1,John Doe,,0
2,Jane Smith,1.0,1
5,Michael Clark,1.0,1
8,Laura Turner,1.0,1
3,Alex Brown,2.0,2
4,Emily Davis,2.0,2
6,Sarah Wilson,5.0,2
7,David Lee,5.0,2
9,Kevin Green,8.0,2
