In [4]:
# Install the OpenJDK 17 JDK system-wide with apt; `-y` answers the
# confirmation prompt automatically so the cell runs unattended.
# The Spark session created later in this notebook needs a JVM to start.
# NOTE(review): the recorded output below mentions the glob 'openjdk-*', which
# this exact command would not produce — the output looks stale; re-run the cell.
!sudo apt-get install -y openjdk-17-jdk

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'openjdk-25-testsupport' for glob 'openjdk-*'
Note, selecting 'openjdk-11-jdk' for glob 'openjdk-*'
Note, selecting 'openjdk-11-jre' for glob 'openjdk-*'
Note, selecting 'openjdk-11-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-19-jre-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-8-jre-zero' for glob 'openjdk-*'
Note, selecting 'openjdk-11-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-21-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-19-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-25-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-25-source' for glob 'openjdk-*'
Note, selecting 'openjdk-21-demo' for glob 'openjdk-*'
Note, selecting 'openjdk-18-jdk-headless' for glob 'openjdk-*'
Note, selecting 'openjdk-17-dbg' for glob 'openjdk-*'
Note, selecting 'openjdk-17-doc' for glob 'openjdk-*'
Note, selectin

In [6]:
import glob
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# PySpark launches its JVM gateway with $JAVA_HOME/bin/java when JAVA_HOME is
# set (otherwise with whatever `java` is on PATH). When that JVM is missing or
# unsuitable, getOrCreate() fails with JAVA_GATEWAY_EXITED. Point JAVA_HOME at
# the JDK 17 installed by the apt cell, if it is present, before the gateway
# is started.
# NOTE(review): assumes the Debian/Ubuntu layout
# /usr/lib/jvm/java-17-openjdk-<arch> — confirm on other base images.
jdk17_homes = sorted(
    path for path in glob.glob("/usr/lib/jvm/java-17-openjdk-*")
    if os.path.isdir(path)
)
if jdk17_homes:
    os.environ["JAVA_HOME"] = jdk17_homes[0]

# Local-mode session that uses every available core; shuffle stages are
# configured to produce 64 partitions.
spark = (SparkSession.builder
         .appName("NBA_Fouls_Per_Game")
         .master("local[*]")
         .config("spark.sql.shuffle.partitions", "64")
         .getOrCreate())
spark

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
#make sure to save the dataset in the drive before executing this.
from os import mkdir
from google.colab import drive
drive.mount('/content/drive')

!rm -rf data
!mkdir -p data
!cp "/content/drive/MyDrive/play_by_play.csv" data/play_by_play.csv

!ls -lh data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 2.2G
-rw------- 1 root root 2.2G Nov 28 00:18 play_by_play.csv


In [None]:
# Location of the local copy of the play-by-play CSV.
PBP_PATH = "data/play_by_play.csv"

# Read the CSV with its first line as the header and let Spark infer the
# column types from the data.
# NOTE(review): the recorded schema shows `pctimestring` inferred as a
# timestamp — confirm that is the intended type for that column.
pbp = spark.read.csv(PBP_PATH, header=True, inferSchema=True)

# Inspect the result: inferred schema, total row count, and the first 100
# rows without truncating cell contents.
pbp.printSchema()
print(f"Number of Rows: {pbp.count()}")
pbp.show(n=100, truncate=False)

root
 |-- game_id: integer (nullable = true)
 |-- eventnum: integer (nullable = true)
 |-- eventmsgtype: integer (nullable = true)
 |-- eventmsgactiontype: integer (nullable = true)
 |-- period: integer (nullable = true)
 |-- wctimestring: string (nullable = true)
 |-- pctimestring: timestamp (nullable = true)
 |-- homedescription: string (nullable = true)
 |-- neutraldescription: string (nullable = true)
 |-- visitordescription: string (nullable = true)
 |-- score: string (nullable = true)
 |-- scoremargin: string (nullable = true)
 |-- person1type: double (nullable = true)
 |-- player1_id: integer (nullable = true)
 |-- player1_name: string (nullable = true)
 |-- player1_team_id: double (nullable = true)
 |-- player1_team_city: string (nullable = true)
 |-- player1_team_nickname: string (nullable = true)
 |-- player1_team_abbreviation: string (nullable = true)
 |-- person2type: double (nullable = true)
 |-- player2_id: integer (nullable = true)
 |-- player2_name: string (nullable = t

In [None]:

# Keep only the events whose eventmsgtype equals 6.
# NOTE(review): 6 is presumably the play-by-play code for fouls — confirm
# against the dataset documentation.
fouls = pbp.where(pbp["eventmsgtype"] == 6)

# Report how many such events exist and preview the first 20 untruncated.
print(f"Number of Rows: {fouls.count()}")
fouls.show(n=20, truncate=False)

Number of Rows: 1305746
+--------+--------+------------+------------------+------+------------+-------------------+-------------------------+------------------+--------------------------------------+-----+-----------+-----------+----------+------------------+---------------+-----------------+---------------------+-------------------------+-----------+----------+------------+---------------+-----------------+---------------------+-------------------------+-----------+----------+------------+---------------+-----------------+---------------------+-------------------------+--------------------+
|game_id |eventnum|eventmsgtype|eventmsgactiontype|period|wctimestring|pctimestring       |homedescription          |neutraldescription|visitordescription                    |score|scoremargin|person1type|player1_id|player1_name      |player1_team_id|player1_team_city|player1_team_nickname|player1_team_abbreviation|person2type|player2_id|player2_name|player2_team_id|player2_team_city|player2_team_n

In [None]:
# Add a `foul_team` column that mirrors player1's team abbreviation.
# NOTE(review): this treats player1 as the player who committed the foul —
# confirm against the dataset documentation.
fouls = fouls.withColumn("foul_team", fouls["player1_team_abbreviation"])

# Preview the first 20 rows, untruncated, to check the new column.
fouls.show(n=20, truncate=False)

+--------+--------+------------+------------------+------+------------+-------------------+-------------------------+------------------+----------------------+-----+-----------+-----------+----------+----------------+---------------+-----------------+---------------------+-------------------------+-----------+----------+------------+---------------+-----------------+---------------------+-------------------------+-----------+----------+------------+---------------+-----------------+---------------------+-------------------------+--------------------+---------+
|game_id |eventnum|eventmsgtype|eventmsgactiontype|period|wctimestring|pctimestring       |homedescription          |neutraldescription|visitordescription    |score|scoremargin|person1type|player1_id|player1_name    |player1_team_id|player1_team_city|player1_team_nickname|player1_team_abbreviation|person2type|player2_id|player2_name|player2_team_id|player2_team_city|player2_team_nickname|player2_team_abbreviation|person3type|play

In [None]:
# Foul events per player per game: one output row for each distinct
# (game_id, player1_id, player1_name, foul_team) combination, with the number
# of matching rows stored in `foul_count`.
fouls_per_game = (
    fouls
    .groupBy("game_id", "player1_id", "player1_name", "foul_team")
    .count()
    .withColumnRenamed("count", "foul_count")
)

# Preview the first 30 aggregated rows without truncation.
fouls_per_game.show(n=30, truncate=False)

+--------+----------+--------------------+---------+----------+
|game_id |player1_id|player1_name        |foul_team|foul_count|
+--------+----------+--------------------+---------+----------+
|29600002|682       |Bob Sura            |CLE      |3         |
|29600011|203       |Nate McMillan       |SEA      |6         |
|29600023|143       |Terry Dehere        |LAC      |3         |
|29600015|168       |Chris Mills         |CLE      |5         |
|29600020|951       |Ray Allen           |MIL      |1         |
|29600020|386       |Elliot Perry        |MIL      |2         |
|29600028|735       |Bryant Reeves       |VAN      |6         |
|29600026|293       |Charles Smith       |SAS      |5         |
|29600034|251       |Sean Elliott        |SAS      |2         |
|29600037|170       |Joe Kleine          |PHX      |3         |
|29600043|105       |Dan Majerle         |MIA      |1         |
|29600046|324       |Anthony Peeler      |VAN      |2         |
|29600048|36        |Sarunas Marciulioni

In [None]:
# Foul events per team per game: one output row for each distinct
# (game_id, foul_team) pair, with the number of matching rows stored in
# `team_foul_count`.
fouls_per_game_team = (
    fouls
    .groupBy("game_id", "foul_team")
    .count()
    .withColumnRenamed("count", "team_foul_count")
)

# Preview the first 20 aggregated rows without truncation.
fouls_per_game_team.show(n=20, truncate=False)

+--------+---------+---------------+
|game_id |foul_team|team_foul_count|
+--------+---------+---------------+
|29600062|NYK      |36             |
|29600069|SAC      |31             |
|29600075|TOR      |27             |
|29600136|GSW      |22             |
|29600194|DEN      |34             |
|29600228|MIL      |23             |
|29600296|PHX      |25             |
|29600346|WAS      |25             |
|29600364|PHX      |21             |
|29600390|DEN      |20             |
|29600429|BOS      |21             |
|29600553|MIL      |26             |
|29600555|MIN      |21             |
|29600585|NYK      |31             |
|29600612|SAC      |17             |
|29600618|ORL      |17             |
|29600676|IND      |14             |
|29600706|LAC      |28             |
|29600721|POR      |26             |
|29600794|PHI      |18             |
+--------+---------+---------------+
only showing top 20 rows

