In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .appName("Combining Existing Data")\
        .getOrCreate()

spark

In [0]:
# Employee Data
employee_data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000)
]
columns_emp = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(employee_data, columns_emp)
df_emp.show()

# Performance Data
performance_data = [
    ("Ananya", 2023, 4.5),
    ("Rahul",  2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance_data, columns_perf)
df_perf.show()

# Project Data
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90)
]
columns_proj = ["Name", "Project", "HoursWorked"]
df_proj = spark.createDataFrame(project_data, columns_proj)
df_proj.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+



In [0]:
#Joins and Advanced Aggregations
from pyspark.sql.functions import sum, avg
# 1. Join employee_data , performance_data , and project_data .
df_joined = df_emp.join(df_perf, on="Name").join(df_proj, on="Name")
df_joined.show()

# 2. Compute total hours worked per department.
df_hours_dept = df_joined.groupBy("Department").agg(sum("HoursWorked").alias("TotalHours"))
df_hours_dept.show()

# 3. Compute average rating per project.
df_avg_rating_proj = df_joined.groupBy("Project").agg(avg("Rating").alias("AvgRating"))
df_avg_rating_proj.show()

+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+

+-----------+----------+
| Department|TotalHours|
+-----------+----------+
|         HR|       250|
|Engineering|       600|
|  Marketing|       190|
+-----------+----------+

+----------------+------------------+
|         Project|        

In [0]:
#Handling Missing Data
from pyspark.sql import Row
from pyspark.sql.functions import when, avg
# 4. Add a row to performance_data with a None rating.
new_perf_row = Row("Meena", 2023, None)
df_perf_with_null = df_perf.union(spark.createDataFrame([new_perf_row], df_perf.schema))
df_perf_with_null.show()

# 5. Filter rows with null values.
df_perf_with_null.filter(df_perf_with_null["Rating"].isNull()).show()

# 6. Replace null ratings with the department average.
# Join to get department info
df_perf_joined = df_perf_with_null.join(df_emp, on="Name", how="left")

# Calculate department average
dept_avg = df_perf_joined.groupBy("Department").agg(avg("Rating").alias("DeptAvg"))

# Join again to get average per row
df_filled = df_perf_joined.join(dept_avg, on="Department", how="left")

# Replace nulls
df_filled = df_filled.withColumn(
    "RatingFilled",
    when(df_filled["Rating"].isNull(), df_filled["DeptAvg"]).otherwise(df_filled["Rating"])
)
df_filled.select("Name", "Department", "Rating", "RatingFilled").show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
| Meena|2023|  NULL|
+------+----+------+

+-----+----+------+
| Name|Year|Rating|
+-----+----+------+
|Meena|2023|  NULL|
+-----+----+------+

+------+-----------+------+------------+
|  Name| Department|Rating|RatingFilled|
+------+-----------+------+------------+
|Ananya|         HR|   4.5|         4.5|
| Rahul|Engineering|   4.9|         4.9|
| Priya|Engineering|   4.3|         4.3|
|  Zoya|  Marketing|   3.8|         3.8|
| Karan|         HR|   4.1|         4.1|
|Naveen|Engineering|   4.7|         4.7|
|Fatima|  Marketing|   3.9|         3.9|
| Meena|       NULL|  NULL|        NULL|
+------+-----------+------+------------+



In [0]:
#Built-In Functions and UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# 7. Create a column PerformanceCategory :
# Excellent (>=4.7),
# Good (4.0–4.69),
# Average (<4.0)
df_perf_cat = df_joined.withColumn(
    "PerformanceCategory",
    when(df_joined["Rating"] >= 4.7, "Excellent")
    .when(df_joined["Rating"] >= 4.0, "Good")
    .otherwise("Average")
)
df_perf_cat.select("Name", "Rating", "PerformanceCategory").show()

# 8. Create a UDF to assign bonus:
# If project hours > 200 →
# 10,000
# Else →
# 5,000
def calc_bonus(hours):
    return 10000 if hours > 200 else 5000

bonus_udf = udf(calc_bonus, IntegerType())

df_bonus = df_perf_cat.withColumn("Bonus", bonus_udf("HoursWorked"))
df_bonus.select("Name", "HoursWorked", "Bonus").show()

+------+------+-------------------+
|  Name|Rating|PerformanceCategory|
+------+------+-------------------+
|Ananya|   4.5|               Good|
| Rahul|   4.9|          Excellent|
| Priya|   4.3|               Good|
|  Zoya|   3.8|            Average|
| Karan|   4.1|               Good|
|Naveen|   4.7|          Excellent|
|Fatima|   3.9|            Average|
+------+------+-------------------+

+------+-----------+-----+
|  Name|HoursWorked|Bonus|
+------+-----------+-----+
|Ananya|        120| 5000|
| Priya|        180| 5000|
| Rahul|        200| 5000|
|  Zoya|        100| 5000|
| Karan|        130| 5000|
|Naveen|        220|10000|
|Fatima|         90| 5000|
+------+-----------+-----+



In [0]:
#Date and Time Functions
from pyspark.sql.functions import lit, months_between, current_date, to_date
from pyspark.sql.functions import to_date
# 9. Add a column JoinDate with 2021-06-01 for all, then add MonthsWorked as difference from today.
df_with_dates = df_bonus.withColumn("JoinDate", to_date(lit("2021-06-01")))
df_with_dates = df_with_dates.withColumn("MonthsWorked", months_between(current_date(), "JoinDate"))
df_with_dates.select("Name", "JoinDate", "MonthsWorked").show()

# 10. Calculate how many employees joined before 2022.
count_before_2022 = df_with_dates.filter(df_with_dates["JoinDate"] < to_date(lit("2022-01-01"))).count()
print("Employees joined before 2022:", count_before_2022)

+------+----------+------------+
|  Name|  JoinDate|MonthsWorked|
+------+----------+------------+
|Ananya|2021-06-01| 48.32258065|
| Priya|2021-06-01| 48.32258065|
| Rahul|2021-06-01| 48.32258065|
|  Zoya|2021-06-01| 48.32258065|
| Karan|2021-06-01| 48.32258065|
|Naveen|2021-06-01| 48.32258065|
|Fatima|2021-06-01| 48.32258065|
+------+----------+------------+

Employees joined before 2022: 7


In [0]:
#Unions
#11. Create another small team DataFrame and union() it with employee_data .
# extra_employees = [
# ("Meena", "HR", 48000),
# ("Raj", "Marketing", 51000)
# ]
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000)
]
df_extra = spark.createDataFrame(extra_employees, columns_emp)

df_emp_extended = df_emp.union(df_extra)
df_emp_extended.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



In [0]:
#Saving Results
#12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based on Department .
df_with_dates.write.mode("overwrite").partitionBy("Department").parquet("dbfs:/tmp/final_employee_data_partitioned")