In [None]:
# Install a JDK: PySpark launches a JVM, so Java must be present before a session is created.
!sudo apt-get install -y openjdk-17-jdk

In [None]:
!pip install -q pyspark

from pyspark.sql import SparkSession, functions as F, Window

# Configure the session step by step, then create it (or reuse a running one).
session_builder = SparkSession.builder.appName("NBA-PlayByPlay-PA5")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "8")
session_builder = session_builder.config("spark.driver.memory", "4g")
spark = session_builder.getOrCreate()

# Bare expression: shows the session summary as the cell output.
spark

In [3]:
# Location of the cleaned play-by-play CSV.
PBP_PATH = "/content/clean_play_by_play.csv"

# Read with the first line as the header. No schema is supplied, so every
# column is loaded as a string (see the printed schema).
pbp = spark.read.csv(PBP_PATH, header=True)

# Quick sanity checks: schema, row count, and a small sample of rows.
pbp.printSchema()
row_count = pbp.count()
print("Number of Rows:", row_count)
pbp.show(5, truncate=False)

root
 |-- game_id: string (nullable = true)
 |-- period: string (nullable = true)
 |-- pctimestring: string (nullable = true)
 |-- eventmsgtype: string (nullable = true)
 |-- eventmsgactiontype: string (nullable = true)
 |-- actor_id: string (nullable = true)
 |-- actor_name: string (nullable = true)
 |-- actor_team_id: string (nullable = true)
 |-- actor_team_abbr: string (nullable = true)
 |-- homedescription: string (nullable = true)
 |-- visitordescription: string (nullable = true)

Number of Rows: 925
+--------+------+------------------------+------------+------------------+--------+--------------+-------------+---------------+--------------------------------------------+--------------------------+
|game_id |period|pctimestring            |eventmsgtype|eventmsgactiontype|actor_id|actor_name    |actor_team_id|actor_team_abbr|homedescription                             |visitordescription        |
+--------+------+------------------------+------------+------------------+--------+---

In [4]:
from pyspark.sql import functions as F

# Filter the dataset down to the rows that represent rebound events
# (eventmsgtype 4).
is_rebound = F.col("eventmsgtype") == 4
rebounds = pbp.where(is_rebound)

# Per player/team: number of rebound rows, and number of distinct games in
# which the player has at least one rebound row.
rebound_aggs = [
    F.count("*").alias("total_rebounds"),
    F.countDistinct("game_id").alias("games_played"),
]

player_rebnd_stats = (
    rebounds.groupBy("actor_id", "actor_name", "actor_team_abbr")
    .agg(*rebound_aggs)
    .orderBy(F.col("total_rebounds").desc())
)

# Show the ten players with the most rebounds.
player_rebnd_stats.show(10, truncate=False)

+--------+---------------------+---------------+--------------+------------+
|actor_id|actor_name           |actor_team_abbr|total_rebounds|games_played|
+--------+---------------------+---------------+--------------+------------+
|1903    |Kenny Thomas         |PHI            |3             |3           |
|203991  |Clint Capela         |ATL            |2             |2           |
|1502    |Adonal Foyle         |GSW            |2             |2           |
|203507  |Giannis Antetokounmpo|MIL            |2             |2           |
|703     |Kurt Thomas          |NYK            |2             |2           |
|2547    |Chris Bosh           |TOR            |2             |2           |
|101107  |Marvin Williams      |CHA            |2             |2           |
|2216    |Zach Randolph        |MEM            |2             |2           |
|1628425 |Sterling Brown       |MIL            |1             |1           |
|2744    |Al Jefferson         |MIN            |1             |1           |

In [5]:
# Seasonal rebound summary: tag every event with a season label via a lookup
# join, then aggregate rebound events per player and season.
#
# A season spans two calendar years, so the join key is the season's START
# year: events in months before SEASON_START_MONTH belong to the season that
# began in the previous calendar year (e.g. Jan 2024 -> "2023-24").
#
# NOTE(review): October is assumed as the first month of a season; adjust the
# cutoff if the data contains atypically scheduled seasons.
# NOTE(review): the displayed output places every listed player in "2025-26",
# which suggests pctimestring may not carry the real game date -- confirm the
# column's meaning before relying on these season labels.
SEASON_START_MONTH = 10
FIRST_SEASON_START = 1946
LAST_SEASON_START = 2099

pbp_with_date = (
    pbp
    .withColumn("event_time", F.to_timestamp("pctimestring"))
    .withColumn(
        "season_year",
        # Unparseable timestamps yield a null season_year (and a null label
        # after the left join) instead of failing.
        F.when(F.month("event_time") >= SEASON_START_MONTH, F.year("event_time"))
         .otherwise(F.year("event_time") - 1)
         .cast("string"),
    )
)

# Lookup table: season start year -> label such as "2023-24". Generated for a
# wide range so no start year silently falls out of the left join.
season_map = [
    (str(start), f"{start}-{(start + 1) % 100:02d}")
    for start in range(FIRST_SEASON_START, LAST_SEASON_START + 1)
]
season_df = spark.createDataFrame(season_map, ["season_year", "season_label"])
pbp_joined = pbp_with_date.join(season_df, "season_year", "left")

# Keep only rebound events (eventmsgtype 4).
rebounds_joined = pbp_joined.filter(F.col("eventmsgtype") == 4)

player_season_rebnd = (
    rebounds_joined
    .groupBy("season_label", "actor_id", "actor_name")
    .agg(F.count("*").alias("total_rebounds_season"),
         # Distinct games in which the player has at least one rebound row.
         F.countDistinct("game_id").alias("game_played"),
         (F.count("*") / F.countDistinct("game_id")).alias("avg_rebnd_per_game"))
    .orderBy(F.desc("total_rebounds_season"))
)
print("Seasonal rebound summary per player:")
player_season_rebnd.show(10, truncate=False)

Seasonal rebound summary per player:
+------------+--------+----------------+---------------------+-----------+------------------+
|season_label|actor_id|actor_name      |total_rebounds_season|game_played|avg_rebnd_per_game|
+------------+--------+----------------+---------------------+-----------+------------------+
|2025-26     |1903    |Kenny Thomas    |4                    |4          |1.0               |
|2025-26     |203991  |Clint Capela    |3                    |3          |1.0               |
|2025-26     |1502    |Adonal Foyle    |2                    |2          |1.0               |
|2025-26     |763     |Tony Massenburg |2                    |2          |1.0               |
|2025-26     |201142  |Kevin Durant    |2                    |2          |1.0               |
|2025-26     |2730    |Dwight Howard   |2                    |2          |1.0               |
|2025-26     |406     |Shaquille O'Neal|2                    |2          |1.0               |
|2025-26     |389     |