In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
spark = SparkSession.builder.appName("IPLAggregations").getOrCreate()
match_data = [
    (2020, "MI", "CSK", "MI", "completed", 120, 1),
    (2020, "DC", "RCB", "DC", "completed", 119, 1),
    (2020, "SRH", "KKR", "SRH", "completed", 156, 1),
    (2020, "MI", "DC", "MI", "completed", 120, 2),
    (2020, "RCB", "SRH", None, "abandoned", None, None),
    (2021, "CSK", "MI", "CSK", "completed", 120, 1),
    (2021, "RCB", "KKR", "KKR", "completed", 90, 1),
    (2021, "DC", "SRH", "DC", "completed", 120, 1),
    (2021, "MI", "RCB", "MI", "completed", 120, 2),
    (2021, "KKR", "DC", None, "tied", 120, 1),
    (2022, "MI", "CSK", "MI", "completed", 120, 1),
    (2022, "GT", "RR", "GT", "completed", 120, 1),
    (2022, "RCB", "KKR", "RCB", "completed", 120, 1),
    (2022, "LSG", "DC", "LSG", "completed", 120, 1),
    (2022, "SRH", "PBKS", None, "abandoned", None, None)
]
ball_data = [
    (2020, "MI", 1, 1, "Rohit"),
    (2020, "MI", 1, 2, "Rohit"),
    (2020, "MI", 1, 3, "De Kock"),
    (2020, "CSK", 1, 1, "Dhoni"),
    (2021, "CSK", 1, 1, "Ruturaj"),
    (2021, "CSK", 1, 2, "Ruturaj"),
    (2021, "MI", 1, 1, "Rohit"),
    (2022, "GT", 1, 1, "Hardik"),
    (2022, "GT", 1, 2, "Hardik")
]
match_df = spark.createDataFrame(match_data, ["year", "team1", "team2", "winner", "status", "ball_no", "inning"])
ball_df = spark.createDataFrame(ball_data, ["year", "team", "match_id", "ball_no", "batsman"])

match_status = match_df.groupBy("year", "status").count() \
    .groupBy("status").pivot("year").sum("count") \
    .na.fill(0).orderBy("status")
avg_balls = match_df.filter(col("ball_no").isNotNull()) \
    .groupBy("year", "inning").agg(
        avg("ball_no").alias("avg_balls")
    ).groupBy("year").agg(
        avg("avg_balls").alias("avg_balls_per_inning")
    ).orderBy("year")
wins_per_team = match_df.filter(col("status") == "completed") \
    .groupBy("year", "winner").count() \
    .orderBy("year", desc("count"))
batsmen_list = ball_df.groupBy("year", "team").agg(
    collect_list("batsman").alias("batsmen_list"),
    collect_set("batsman").alias("batsmen_set")
)

print("\n2. Match status counts by year:")
match_status.show()

print("\n3. Average balls per inning:")
avg_balls.show()

print("\n4. Number of wins per team by year:")
wins_per_team.show()

print("\n5. Batsmen collection comparison:")
batsmen_list.show(truncate=False)

spark.stop()


2. Match status counts by year:
+---------+----+----+----+
|   status|2020|2021|2022|
+---------+----+----+----+
|abandoned|   1|   0|   1|
|completed|   4|   4|   4|
|     tied|   0|   1|   0|
+---------+----+----+----+


3. Average balls per inning:
+----+--------------------+
|year|avg_balls_per_inning|
+----+--------------------+
|2020|  125.83333333333333|
|2021|              116.25|
|2022|               120.0|
+----+--------------------+


4. Number of wins per team by year:
+----+------+-----+
|year|winner|count|
+----+------+-----+
|2020|    MI|    2|
|2020|    DC|    1|
|2020|   SRH|    1|
|2021|   CSK|    1|
|2021|   KKR|    1|
|2021|    DC|    1|
|2021|    MI|    1|
|2022|    MI|    1|
|2022|    GT|    1|
|2022|   LSG|    1|
|2022|   RCB|    1|
+----+------+-----+


5. Batsmen collection comparison:
+----+----+-----------------------+----------------+
|year|team|batsmen_list           |batsmen_set     |
+----+----+-----------------------+----------------+
|2020|MI  |[Rohit,