In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain the Spark session for this analysis (an already-active session is
# reused by getOrCreate; otherwise a new one is started under this app name).
spark = (
    SparkSession.builder
    .appName("EmployeeDepartmentAnalysis")
    .getOrCreate()
)

# Explicit schemas for the Employee and Department tables.
# Each entry is a (column name, Spark type) pair; every column is declared
# nullable (third StructField argument is True).
employee_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ("EmployeeID", IntegerType()),
        ("EmployeeName", StringType()),
        ("Age", IntegerType()),
        ("JobTitle", StringType()),
        ("DepartmentID", IntegerType()),
        ("Salary", IntegerType()),
    )
])

department_schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ("DepartmentID", IntegerType()),
        ("DepartmentName", StringType()),
        ("JobTitle", StringType()),
    )
])

# Sample rows for the Employee table.
# Tuple layout: (EmployeeID, EmployeeName, Age, JobTitle, DepartmentID, Salary)
# NOTE(review): employee names repeat across different EmployeeIDs, so
# EmployeeID is the only unique column in this sample.
employee_data = [
    (1, "John Smith", 30, "Manager", 1, 75000),
    (2, "Jane Doe", 25, "Sales Associate", 2, 45000),
    (3, "Mark Johnson", 35, "Developer", 3, 90000),
    (4, "Sarah Lee", 28, "Marketing", 2, 50000),
    (5, "James Brown", 31, "Analyst", 1, 65000),
    (6, "John Smith", 28, "Sales Associate", 2, 40000),
    (7, "Jane Doe", 32, "Developer", 3, 85000),
    (8, "Mark Johnson", 29, "Marketing", 2, 55000),
    (9, "Sarah Lee", 26, "Analyst", 1, 60000),
    (10, "James Brown", 34, "Manager", 1, 80000),
]

# Sample rows for the Department table.
# Tuple layout: (DepartmentID, DepartmentName, JobTitle)
# Only DepartmentIDs 1-3 are referenced by the employee rows above.
# NOTE(review): the "Sales Associate" employees carry DepartmentID 2
# ("Marketing" here) rather than 1 ("Sales") - confirm this is intended.
department_data = [
    (1, "Sales", "Sales Associate"),
    (2, "Marketing", "Marketing Manager"),
    (3, "IT", "Developer"),
    (4, "Operations", "Operations Manager"),
    (5, "HR", "HR Manager"),
    (6, "Finance", "Finance Manager"),
    (7, "Administration", "Administrator"),
    (8, "Customer Support", "Customer Support Agent"),
    (9, "Quality Assurance", "QA Tester"),
    (10, "Production", "Production Manager"),
    (11, "Logistics", "Logistics Coordinator"),
    (12, "R&D", "Research Scientist"),
    (13, "Legal", "Legal Counsel"),
    (14, "Public Relations", "PR Manager"),
    (15, "Procurement", "Procurement Manager"),
    (16, "Engineering", "Engineer"),
]

# Employee table: build the DataFrame with its explicit schema, then expose it
# to Spark SQL under the view name "employee".
employee_df = spark.createDataFrame(employee_data, schema=employee_schema)
employee_df.createOrReplaceTempView("employee")

# Department table: same steps, exposed under the view name "department".
department_df = spark.createDataFrame(department_data, schema=department_schema)
department_df.createOrReplaceTempView("department")

# Example query: attach each employee's department name by inner-joining the
# "employee" and "department" views on DepartmentID. Being an inner join, rows
# without a matching DepartmentID on the other side are omitted. The query has
# no ORDER BY, so the order of the returned rows is not guaranteed.
joined_df = spark.sql("""
    SELECT e.EmployeeID, e.EmployeeName, e.Age, e.JobTitle, d.DepartmentName, e.Salary
    FROM employee e
    JOIN department d ON e.DepartmentID = d.DepartmentID
""")

# Show the result.
# `DataFrame.display()` is a Databricks notebook extension and is absent from
# open-source PySpark, where calling it raises AttributeError. Use it when it
# is available and fall back to the standard `show()` otherwise, so the script
# runs in both environments.
if hasattr(joined_df, "display"):
    joined_df.display()
else:
    joined_df.show(truncate=False)


EmployeeID,EmployeeName,Age,JobTitle,DepartmentName,Salary
1,John Smith,30,Manager,Sales,75000
5,James Brown,31,Analyst,Sales,65000
9,Sarah Lee,26,Analyst,Sales,60000
10,James Brown,34,Manager,Sales,80000
2,Jane Doe,25,Sales Associate,Marketing,45000
4,Sarah Lee,28,Marketing,Marketing,50000
6,John Smith,28,Sales Associate,Marketing,40000
8,Mark Johnson,29,Marketing,Marketing,55000
3,Mark Johnson,35,Developer,IT,90000
7,Jane Doe,32,Developer,IT,85000
