### You have an employee dataset with the columns `employee_id`, `leave_type`, and `leave_duration`. Write a query to find the total leave duration taken by each employee for each leave type.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum 

# Obtain the active SparkSession, creating one if none exists, so this cell
# does not rely on the runtime pre-defining a global `spark` variable.
spark = SparkSession.builder.getOrCreate()

# Sample leave records as (employee_id, leave_type, leave_duration) tuples.
# The same employee/leave-type pair appears more than once on purpose, so the
# aggregation has something to add up.
data = [
    (1, "Sick Leave", 2),
    (1, "Casual Leave", 1),
    (1, "Sick Leave", 3),
    (2, "Casual Leave", 2),
    (2, "Sick Leave", 1),
    (3, "Maternity", 30),
    (3, "Sick Leave", 2),
    (1, "Casual Leave", 2),
    (2, "Casual Leave", 1),
    (3, "Casual Leave", 3)
]

# Column names only (not a full schema); Spark infers the column types from
# the tuples above.
columns = ["employee_id", "leave_type", "leave_duration"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Show the sample DataFrame
df.show()


+-----------+------------+--------------+
|employee_id|  leave_type|leave_duration|
+-----------+------------+--------------+
|          1|  Sick Leave|             2|
|          1|Casual Leave|             1|
|          1|  Sick Leave|             3|
|          2|Casual Leave|             2|
|          2|  Sick Leave|             1|
|          3|   Maternity|            30|
|          3|  Sick Leave|             2|
|          1|Casual Leave|             2|
|          2|Casual Leave|             1|
|          3|Casual Leave|             3|
+-----------+------------+--------------+



In [0]:
# One group per (employee_id, leave_type) pair.
leave_groups = df.groupBy("employee_id", "leave_type")

# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not Python's builtin; it builds a column expression.
total_duration = sum("leave_duration").alias("total_leave_duration")

# Total leave duration for each employee and leave type.
leave_summary = leave_groups.agg(total_duration)

# Display the aggregated totals.
leave_summary.show()


+-----------+------------+--------------------+
|employee_id|  leave_type|total_leave_duration|
+-----------+------------+--------------------+
|          1|  Sick Leave|                   5|
|          1|Casual Leave|                   3|
|          2|Casual Leave|                   3|
|          2|  Sick Leave|                   1|
|          3|   Maternity|                  30|
|          3|  Sick Leave|                   2|
|          3|Casual Leave|                   3|
+-----------+------------+--------------------+

