In [1]:
import json
import os
from io import StringIO
from time import sleep

import pandas as pd
import simdjson
from nba_api.stats.endpoints import playbyplayv2, commonplayerinfo

In [11]:
# 12_05_2015_NYK_at_MIL/0021500295.json
# NOTE(review): presumably ./test.json is a local copy of the file named above - confirm.
game_path = "./test.json"
# The context manager closes the handle even when parsing raises.
with open(game_path) as f:
    json_data = simdjson.load(f)
# Round-trip the "events" entry through JSON text so read_json applies its
# usual type inference. The text is wrapped in StringIO because passing a
# literal JSON string to read_json is deprecated (pandas >= 2.1).
df = pd.read_json(StringIO(simdjson.dumps(json_data["events"])))

In [12]:
def _flatten_team(team_record):
    """Build a roster frame from one team record.

    The record is turned into a DataFrame, its "players" column is expanded
    with ``pd.Series`` into separate columns, and those columns are placed to
    the right of the remaining team-level columns.
    """
    roster = pd.DataFrame(team_record)
    player_columns = roster["players"].apply(pd.Series)
    team_columns = roster.drop("players", axis="columns")
    return pd.concat([team_columns, player_columns], axis="columns")


# Both rosters are read from the first event row only.
first_event = df.iloc[0]
home_team = _flatten_team(first_event["home"])
visitor_team = _flatten_team(first_event["visitor"])
# The per-event team columns are dropped from the events frame afterwards.
df = df.drop(["home", "visitor"], axis="columns")
visitor_team.head()

name            string
teamid           Int64
abbreviation    string
lastname        string
firstname       string
playerid         Int64
jersey          string
position        string
dtype: object

In [16]:
# Get heights for each player in inches
def get_height_for_player(player_row, request_delay=0.6):
    """Look up one player's height in inches through the NBA stats API.

    Args:
        player_row: Mapping-like row providing a "playerid" entry.
        request_delay: Seconds to sleep before the request. Defaults to 0.6.

    Returns:
        The height in inches as an int, or ``pd.NA`` when the API reports no
        height for the player.

    Raises:
        ValueError: If a non-empty height string is not of the form
            "<feet>-<inches>".
    """
    player_id = player_row["playerid"]
    # NBA API will block the IP if we request too quickly
    # https://github.com/swar/nba_api/issues/176#issuecomment-771991604
    sleep(request_delay)
    player_info = commonplayerinfo.CommonPlayerInfo(player_id=player_id)
    height_str = player_info.get_normalized_dict()["CommonPlayerInfo"][0]["HEIGHT"]
    # NOTE(review): assumes the endpoint can return an empty/None HEIGHT for
    # some players - confirm. Returning NA keeps a whole-team apply alive.
    if not height_str:
        return pd.NA
    height_ft, height_in = height_str.split("-")
    return int(height_ft) * 12 + int(height_in)

def get_heights_for_team(team_df: pd.DataFrame) -> pd.DataFrame:
    """Return the team roster with a "height" column (inches) added.

    If ``./test-team-<teamid>.parquet`` exists it is loaded and returned
    instead of querying the API; previously this case fell through and
    returned ``None``. This function never writes that file.
    NOTE(review): the cached file is presumed to already contain the
    "height" column - confirm against whatever writes it.

    Args:
        team_df: Non-empty roster frame with "teamid" and "playerid" columns.

    Returns:
        A new DataFrame; ``team_df`` itself is left unmodified.
    """
    team_id = team_df.iloc[0]["teamid"]
    cache_path = f"./test-team-{team_id}.parquet"
    if os.path.exists(cache_path):
        return pd.read_parquet(cache_path)
    # One API lookup per player row.
    heights = team_df.apply(get_height_for_player, axis="columns")
    return team_df.assign(height=heights)
    

# Add the "height" column to both rosters.
home_team = get_heights_for_team(home_team)
visitor_team = get_heights_for_team(visitor_team)
# Only the visitor roster is exported to CSV here.
visitor_team.to_csv('./test-team.csv')
# Kept as the final expression so the notebook renders the preview;
# placed mid-cell its result was silently discarded.
visitor_team.head()

# home_team.to_parquet(f"./test-team-{home_team.iloc[0]['teamid']}.parquet")
# visitor_team.to_parquet(f"./test-team-{visitor_team.iloc[0]['teamid']}.parquet")

In [6]:
# Keep every DOWNSAMPLE_STEP-th row (original length is 161318).
DOWNSAMPLE_STEP = 5
df = (
    df.explode("moments", ignore_index=True)  # one row per element of "moments"
    .dropna()  # rows with missing values show up here; their origin is unknown
    .iloc[::DOWNSAMPLE_STEP]
)

In [7]:
# Based on https://github.com/gmf05/nba/blob/master/scripts/py/sloan1.py
# Added workarounds where data is malformed
# Needs to be vectorized/batched for better performance
def process_moment(moment_input):
    """Flatten one raw tracking moment into a Series of named fields.

    Based on https://github.com/gmf05/nba/blob/master/scripts/py/sloan1.py,
    with workarounds for malformed data.

    Args:
        moment_input: Mapping-like row with an "eventId" entry and a "moments"
            entry. The first four items of "moments" are read as period,
            moment id, seconds remaining and shot clock remaining; its last
            item is a list of position rows. A first row whose item 0 equals
            -1 is treated as the ball (items 2..4 are x, y, z). Every other
            row is treated as a player (item 1 is the id, items 2..3 are x, y).

    Returns:
        pd.Series with event_id, period, moment_id, sec_remain,
        shotclock_remain, ball_x/ball_y/ball_z (``pd.NA`` when the ball row is
        missing) and the 10-element lists player_id, player_x, player_y
        (unused slots stay 0).
    """
    max_players = 10
    moment = moment_input["moments"]
    period, moment_id, sec_remain, shotclock_remain = moment[0:4]
    positions = moment[-1]

    # Rare instances where ball or player coordinates aren't there, not sure
    # why. The length check keeps an empty positions list from raising.
    ball_present = len(positions) > 0 and positions[0][0] == -1
    if ball_present:
        ball_x, ball_y, ball_z = positions[0][2:5]
        player_rows = positions[1:]  # the ball occupies row 0
    else:
        ball_x, ball_y, ball_z = (pd.NA, pd.NA, pd.NA)
        # Without a ball row every row is a player. The previous loop still
        # ran len - 1 times here and silently dropped the last player.
        player_rows = positions

    player_id = [0] * max_players
    player_x = [0] * max_players
    player_y = [0] * max_players
    # Rows beyond the fixed number of slots are ignored instead of raising.
    for slot, row in enumerate(player_rows[:max_players]):
        player_id[slot] = row[1]
        player_x[slot], player_y[slot] = row[2:4]

    moment_output = {
        "event_id": moment_input["eventId"],
        "period": period,
        "moment_id": moment_id,
        "sec_remain": sec_remain,
        "shotclock_remain": shotclock_remain,
        "ball_x": ball_x,
        "ball_y": ball_y,
        "ball_z": ball_z,
        "player_id": player_id,
        "player_x": player_x,
        "player_y": player_y,
    }
    return pd.Series(moment_output)
# Replace each exploded row with the flat fields produced by process_moment.
# This is a row-by-row apply, so process_moment is called once per row.
df = df.apply(process_moment, axis=1)

In [8]:
# Write the processed moments to disk (use df.head(500) for a quick preview).
df.to_csv(path_or_buf='test.csv')