In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present).
# Parentheses replace the backslash line continuations.
spark = (
    SparkSession.builder
    .appName("PySpark Advanced Exercise")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [0]:
# Column names, in the same order as the fields of each record below.
columns_employee = ["Name", "Department", "Salary"]

# One (name, department, salary) tuple per employee.
employee_data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 75000),
    ("Priya", "Marketing", 58000),
    ("Zoya", "Engineering", 62000),
    ("Karan", "Sales", 49000),
    ("Naveen", "Engineering", 68000),
    ("Fatima", "HR", 54000),
]

# Column types are inferred by Spark from the Python values.
df_employee = spark.createDataFrame(data=employee_data, schema=columns_employee)
df_employee.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 75000|
| Priya|  Marketing| 58000|
|  Zoya|Engineering| 62000|
| Karan|      Sales| 49000|
|Naveen|Engineering| 68000|
|Fatima|         HR| 54000|
+------+-----------+------+



In [0]:
# Column names, in the same order as the fields of each record below.
columns_perf = ["Name", "Year", "Rating"]

# One (name, review year, rating) tuple per employee.
performance_data = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]

# Column types are inferred by Spark from the Python values.
df_perf = spark.createDataFrame(data=performance_data, schema=columns_perf)
df_perf.show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+



In [0]:
# GroupBy and Aggregations
# max/min are imported under underscore aliases so they do not shadow
# Python's built-in max() and min() for the rest of the notebook.
from pyspark.sql.functions import avg, count, max as _max, min as _min

# 1. Average salary by department
df_employee.groupBy("Department").agg(avg("Salary").alias("Average_Salary")).show()

# 2. Count of employees per department
df_employee.groupBy("Department").agg(count("*").alias("Employee_Count")).show()

# 3. Max and min salary in Engineering
# No groupBy here: after the filter, agg() collapses the remaining rows
# into a single summary row.
df_employee.filter(df_employee.Department == "Engineering") \
    .agg(
        _max("Salary").alias("Max_Salary"),
        _min("Salary").alias("Min_Salary")
    ).show()

+-----------+-----------------+
| Department|   Average_Salary|
+-----------+-----------------+
|         HR|          53000.0|
|Engineering|68333.33333333333|
|  Marketing|          58000.0|
|      Sales|          49000.0|
+-----------+-----------------+

+-----------+--------------+
| Department|Employee_Count|
+-----------+--------------+
|         HR|             2|
|Engineering|             3|
|  Marketing|             1|
|      Sales|             1|
+-----------+--------------+

+----------+----------+
|Max_Salary|Min_Salary|
+----------+----------+
|     75000|     62000|
+----------+----------+



In [0]:
# Join and Combine Data
# 4. Inner join on Name: only employees present in both frames are kept
df_joined = df_employee.join(df_perf, on="Name", how="inner")

# 5. Show each employee's salary and performance rating
salary_and_rating = df_joined.select("Name", "Salary", "Rating")
salary_and_rating.show()

# 6. Filter: rating > 4.5 and salary > 60000
is_top_rated = df_joined["Rating"] > 4.5
is_well_paid = df_joined["Salary"] > 60000
df_joined.filter(is_top_rated & is_well_paid).show()

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 54000|   3.9|
| Karan| 49000|   4.1|
|Naveen| 68000|   4.7|
| Priya| 58000|   4.3|
| Rahul| 75000|   4.9|
|  Zoya| 62000|   3.8|
+------+------+------+

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 68000|2023|   4.7|
| Rahul|Engineering| 75000|2023|   4.9|
+------+-----------+------+----+------+



In [0]:
# Window & Rank (Bonus Challenge)
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, sum as _sum

# 7. Rank employees by salary department-wise
# Highest salary first within each department; equal salaries share a rank.
window_dept = Window.partitionBy("Department").orderBy(df_employee["Salary"].desc())

df_employee.withColumn("Salary_Rank", rank().over(window_dept)).show()

# 8. Cumulative salary per department
# An ordered window defaults to a RANGE frame, under which rows with the same
# salary would all receive the same running total. An explicit ROWS frame,
# plus Name as a tie-breaker, makes every row add exactly its own salary in a
# deterministic order.
window_running = (
    Window.partitionBy("Department")
    .orderBy(df_employee["Salary"].desc(), df_employee["Name"])
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df_employee.withColumn("Cumulative_Salary", _sum("Salary").over(window_running)).show()

+------+-----------+------+-----------+
|  Name| Department|Salary|Salary_Rank|
+------+-----------+------+-----------+
| Rahul|Engineering| 75000|          1|
|Naveen|Engineering| 68000|          2|
|  Zoya|Engineering| 62000|          3|
|Fatima|         HR| 54000|          1|
|Ananya|         HR| 52000|          2|
| Priya|  Marketing| 58000|          1|
| Karan|      Sales| 49000|          1|
+------+-----------+------+-----------+

+------+-----------+------+-----------------+
|  Name| Department|Salary|Cumulative_Salary|
+------+-----------+------+-----------------+
| Rahul|Engineering| 75000|            75000|
|Naveen|Engineering| 68000|           143000|
|  Zoya|Engineering| 62000|           205000|
|Fatima|         HR| 54000|            54000|
|Ananya|         HR| 52000|           106000|
| Priya|  Marketing| 58000|            58000|
| Karan|      Sales| 49000|            49000|
+------+-----------+------+-----------------+



In [0]:
#Date Operations
from pyspark.sql.functions import current_date, datediff, col, floor
import random
from datetime import datetime

# 9. Add JoinDate with random dates between 2020 and 2023
def random_date(start=datetime(2020, 1, 1), end=datetime(2023, 12, 31), rng=random):
    """Return a random datetime between ``start`` and ``end``.

    Args:
        start: Lower bound of the range (default 2020-01-01).
        end: Upper bound of the range (default 2023-12-31).
        rng: Object exposing ``random()`` that returns a float in [0.0, 1.0).
            Defaults to the ``random`` module; pass ``random.Random(seed)``
            to get reproducible dates.

    Returns:
        ``start`` plus a random fraction of the ``end - start`` span.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError("end must not be earlier than start")
    # timedelta * float scales the whole span by the random fraction.
    return start + (end - start) * rng.random()

# months_between is imported here because this step is the only one that uses it.
from pyspark.sql.functions import months_between

# Generate one random date string (YYYY-MM-DD) per employee
random_dates = [random_date().strftime('%Y-%m-%d') for _ in range(df_employee.count())]
df_random_dates = spark.createDataFrame([(d,) for d in random_dates], ["JoinDate"])

# zipWithIndex tags every row with its position so the two frames can be
# paired row-by-row; the original row ends up in a nested struct column.
df_emp_indexed = df_employee.rdd.zipWithIndex().toDF(["data", "index"])
df_dates_indexed = df_random_dates.rdd.zipWithIndex().toDF(["date", "index"])

# Join on index, restore the original row order (a join does not guarantee
# it), flatten the structs, and cast the date string to a real date column.
df_employee_with_date = df_emp_indexed.join(df_dates_indexed, "index") \
    .orderBy("index") \
    .selectExpr("data.Name", "data.Department", "data.Salary", "date.JoinDate") \
    .withColumn("JoinDate", col("JoinDate").cast("date"))

df_employee_with_date.show(truncate=False)

# Step 10: Calculate YearsWithCompany as completed years of service.
# months_between / 12 is used instead of datediff / 365: dividing days by 365
# ignores leap days and can credit a year shortly before the anniversary.
df_employee_final = df_employee_with_date.withColumn(
    "YearsWithCompany",
    floor(months_between(current_date(), col("JoinDate")) / 12)
)

# Display final DataFrame
df_employee_final.show(truncate=False)

+------+-----------+------+----------+
|Name  |Department |Salary|JoinDate  |
+------+-----------+------+----------+
|Ananya|HR         |52000 |2020-05-22|
|Rahul |Engineering|75000 |2022-11-22|
|Priya |Marketing  |58000 |2020-08-21|
|Zoya  |Engineering|62000 |2020-02-22|
|Karan |Sales      |49000 |2020-09-03|
|Naveen|Engineering|68000 |2021-08-20|
|Fatima|HR         |54000 |2020-05-10|
+------+-----------+------+----------+

+------+-----------+------+----------+----------------+
|Name  |Department |Salary|JoinDate  |YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|HR         |52000 |2020-05-22|5               |
|Rahul |Engineering|75000 |2022-11-22|2               |
|Priya |Marketing  |58000 |2020-08-21|4               |
|Zoya  |Engineering|62000 |2020-02-22|5               |
|Karan |Sales      |49000 |2020-09-03|4               |
|Naveen|Engineering|68000 |2021-08-20|3               |
|Fatima|HR         |54000 |2020-05-10|5               |
+------+-----------+------+----------+----------------+

In [0]:
# Writing to Files
# 11. Write df_employee_final to CSV with headers
# mode("overwrite") lets the cell be re-run without failing on an existing path.
csv_output_path = "dbfs:/tmp/employee_csv_output"
df_employee_final.write.mode("overwrite").option("header", True).csv(csv_output_path)

# Read the CSV back to verify the write. No schema is supplied, so every
# column comes back as a string.
df_csv_read = spark.read.option("header", True).csv(csv_output_path)
df_csv_read.show(truncate=False)

# 12. Save df_joined (employee + performance) to Parquet
parquet_output_path = "dbfs:/tmp/employee_perf_parquet"
df_joined.write.mode("overwrite").parquet(parquet_output_path)

# Read the Parquet data back to verify the write.
df_parquet_read = spark.read.parquet(parquet_output_path)
df_parquet_read.show(truncate=False)

+------+-----------+------+----------+----------------+
|Name  |Department |Salary|JoinDate  |YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|HR         |52000 |2020-05-22|5               |
|Rahul |Engineering|75000 |2022-11-22|2               |
|Priya |Marketing  |58000 |2020-08-21|4               |
|Zoya  |Engineering|62000 |2020-02-22|5               |
|Karan |Sales      |49000 |2020-09-03|4               |
|Naveen|Engineering|68000 |2021-08-20|3               |
|Fatima|HR         |54000 |2020-05-10|5               |
+------+-----------+------+----------+----------------+

+------+-----------+------+----+------+
|Name  |Department |Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|HR         |52000 |2023|4.5   |
|Fatima|HR         |54000 |2023|3.9   |
|Karan |Sales      |49000 |2023|4.1   |
|Naveen|Engineering|68000 |2023|4.7   |
|Priya |Marketing  |58000 |2023|4.3   |
|Rahul |Engineering|75000 |2023|4.9   |
|Zoya  |Engineering|620