# Preprocess the data:

## Imports:

In [1]:
import py7zr
from datasets import load_dataset, load_dataset_builder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

## Constants:

In [3]:
# Worker-process count passed as `num_proc` to the `Dataset.map` calls further down.
# NOTE(review): os.cpu_count() can return None when the CPU count cannot be
# determined — confirm that `map(num_proc=None)` is acceptable in that case.
NUM_PROC = os.cpu_count()

## Look at dataset:

In [3]:
# Build only the dataset builder so the metadata (description, features) of the
# 'full' configuration can be inspected before loading any examples.
# The repository is loaded with `trust_remote_code=True` in the `load_dataset`
# call of this notebook, so pass the same flag here to keep both calls consistent.
ds_builder = load_dataset_builder(
    "dcayton/nba_tracking_data_15_16",
    'full',
    trust_remote_code=True,
)

In [4]:
ds_builder.info.description

'This dataset is designed to give further easy access to tracking data.\nBy merging all .7z files into one large .json file, access is easier to retrieve all information at once.\n'

In [5]:
ds_builder.info.features

{'gameid': Value(dtype='string', id=None),
 'gamedate': Value(dtype='string', id=None),
 'event_info': {'id': Value(dtype='string', id=None),
  'type': Value(dtype='int64', id=None),
  'possession_team_id': Value(dtype='float64', id=None),
  'desc_home': Value(dtype='string', id=None),
  'desc_away': Value(dtype='string', id=None)},
 'primary_info': {'team': Value(dtype='string', id=None),
  'player_id': Value(dtype='float64', id=None),
  'team_id': Value(dtype='float64', id=None)},
 'secondary_info': {'team': Value(dtype='string', id=None),
  'player_id': Value(dtype='float64', id=None),
  'team_id': Value(dtype='float64', id=None)},
 'visitor': {'name': Value(dtype='string', id=None),
  'teamid': Value(dtype='int64', id=None),
  'abbreviation': Value(dtype='string', id=None),
  'players': [{'lastname': Value(dtype='string', id=None),
    'firstname': Value(dtype='string', id=None),
    'playerid': Value(dtype='int64', id=None),
    'jersey': Value(dtype='string', id=None),
    'posit

In [8]:
ds_builder.info

DatasetInfo(description='This dataset is designed to give further easy access to tracking data.\nBy merging all .7z files into one large .json file, access is easier to retrieve all information at once.\n', citation='@misc{Linou2016,\ntitle = {NBA-Player-Movements},\nauthor={Kostya Linou},\npublisher={SportVU},\nyear={2016}\n', homepage='https://github.com/linouk23/NBA-Player-Movements/tree/master/', license='', features={'gameid': Value(dtype='string', id=None), 'gamedate': Value(dtype='string', id=None), 'event_info': {'id': Value(dtype='string', id=None), 'type': Value(dtype='int64', id=None), 'possession_team_id': Value(dtype='float64', id=None), 'desc_home': Value(dtype='string', id=None), 'desc_away': Value(dtype='string', id=None)}, 'primary_info': {'team': Value(dtype='string', id=None), 'player_id': Value(dtype='float64', id=None), 'team_id': Value(dtype='float64', id=None)}, 'secondary_info': {'team': Value(dtype='string', id=None), 'player_id': Value(dtype='float64', id=None

## Load Dataset:

In [54]:
# The dataset only ships a "train" split: loading without `split=` returns a
# dictionary whose single key is "train" (the data is not pre-split), so that
# split is requested directly. The small "tiny" configuration is loaded here.
dataset = load_dataset("dcayton/nba_tracking_data_15_16", name = "tiny", split="train", trust_remote_code=True)

In [4]:
dataset[0]['moments'][0]

{'quarter': 1,
 'game_clock': 707.65,
 'shot_clock': 24.0,
 'ball_coordinates': {'x': 5.48747, 'y': 24.25562, 'z': 4.88724},
 'player_coordinates': [{'teamid': 1610612754,
   'playerid': 201588,
   'x': 22.84608,
   'y': 43.37048,
   'z': 0.0},
  {'teamid': 1610612754,
   'playerid': 101133,
   'x': 6.9984,
   'y': 27.64791,
   'z': 0.0},
  {'teamid': 1610612754,
   'playerid': 101145,
   'x': 15.35528,
   'y': 7.98835,
   'z': 0.0},
  {'teamid': 1610612754,
   'playerid': 202730,
   'x': 28.86235,
   'y': 19.02946,
   'z': 0.0},
  {'teamid': 1610612754,
   'playerid': 202331,
   'x': 9.36919,
   'y': 46.68742,
   'z': 0.0},
  {'teamid': 1610612748,
   'playerid': 2548,
   'x': 10.42233,
   'y': 12.393,
   'z': 0.0},
  {'teamid': 1610612748,
   'playerid': 2547,
   'x': 18.48075,
   'y': 22.48179,
   'z': 0.0},
  {'teamid': 1610612748,
   'playerid': 2736,
   'x': 6.93975,
   'y': 37.22873,
   'z': 0.0},
  {'teamid': 1610612748,
   'playerid': 201609,
   'x': 10.11883,
   'y': 34.4858,

In [5]:
type(dataset[0]['moments'])

list

In [None]:
len(dataset[0]['moments'])

150

## Downsample dataset:

In [33]:
example = dataset[0]

In [None]:
# Remove the name fields from every home player and show, per player, the tuple
# (popped lastname, popped firstname, remaining fields).
# The tuple must be parenthesized: without the parentheses only the first
# `pop` belongs to the lambda and the other expressions are passed to `map`
# as extra arguments.
# `pop` edits the player dicts held by `example` in place, so running this
# cell a second time yields None for both names.
[
    (player.pop('lastname', None), player.pop('firstname', None), player)
    for player in example['home']['players']
]

[(None, None, {'playerid': 101133, 'jersey': '28', 'position': 'C'}),
 (None, None, {'playerid': 101139, 'jersey': '0', 'position': 'F-G'}),
 (None, None, {'playerid': 101145, 'jersey': '11', 'position': 'G'}),
 (None, None, {'playerid': 201155, 'jersey': '2', 'position': 'G'}),
 (None, None, {'playerid': 201588, 'jersey': '3', 'position': 'G'}),
 (None, None, {'playerid': 201941, 'jersey': '27', 'position': 'C'}),
 (None, None, {'playerid': 201978, 'jersey': '10', 'position': 'F'}),
 (None, None, {'playerid': 202331, 'jersey': '13', 'position': 'F'}),
 (None, None, {'playerid': 202730, 'jersey': '5', 'position': 'F-C'}),
 (None, None, {'playerid': 203524, 'jersey': '44', 'position': 'F'}),
 (None, None, {'playerid': 203922, 'jersey': '40', 'position': 'G-F'}),
 (None, None, {'playerid': 1626167, 'jersey': '33', 'position': 'F-C'}),
 (None, None, {'playerid': 1626202, 'jersey': '1', 'position': 'G'})]

In [15]:
example['gameid']=0

In [17]:
# Check whether editing `example` touched the dataset: indexing the dataset
# again still returns the original game id string, not the 0 assigned above.
dataset[0]['gameid']

'0021500333'

In [23]:
del example['event_info']['desc_away']

In [None]:
    # del example['gamedate']
    # del example['event_info']['desc_home']
    # del example['event_info']['desc_away']
    # del example['primary_info']['team']
    # del example['primary_info']['team_id']
    # del example['secondary_info']['team']
    # del example['secondary_info']['team_id']
    # del example['visitor']['name']
    # del example['visitor']['teamid']
    # del example['visitor']['abbreviation']
    # del example['visitor']['players'][:]['lastname']
    # del example['visitor']['players'][:]['firstname']
    # del example['visitor']['players'][:]['number']
    # del example['home']['name']
    # del example['home']['teamid']
    # del example['home']['abbreviation']
    # del example['home']['players']
    # del example['home']['players'][:]['firstname']
    # del example['home']['players'][:]['number']
    # del example['moments']['quarter']
    # del example['moments']['game_clock']
    # del example['moments']['shot_clock']
    # del example['moments']['player_coordinates']['teamid']

In [49]:
def downsample(example, step=10):
    """Keep every ``step``-th entry of an example's ``'moments'`` list.

    The example is modified in place and also returned, so the function can be
    passed directly to ``Dataset.map`` as done in this notebook.

    Args:
        example: Mapping holding a ``'moments'`` sequence.
        step: Positive stride between kept moments. The default of 10 keeps
            the moments at indices 0, 10, 20, ...

    Returns:
        The same ``example`` object with ``'moments'`` replaced by the
        thinned sequence.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    example['moments'] = example['moments'][::step]
    return example

In [71]:
def prune_example(example):
    """Return a reduced copy of one tracking example.

    Every output key is prefixed with ``n_``. The input is only read, never
    modified. Kept fields:

    * ``n_gameid``: the game id.
    * ``n_event_info``: ``id``, ``type`` and ``possession_team_id`` (the keys
      keep their original names).
    * ``n_primary_info`` / ``n_secondary_info``: ``player_id`` only.
    * ``n_visitor``: for each player, ``position`` only.
    * ``n_home``: for each player, ``playerid`` and ``position``.
    * ``n_moments``: for each moment, the ball's x/y/z and, per tracked
      player, ``playerid`` plus x/y/z.

    Args:
        example: One dataset record with the keys read below.

    Returns:
        A new dict with the ``n_``-prefixed fields described above.
    """
    axes = ("x", "y", "z")

    game_id = example["gameid"]

    event = {}
    for field in ("id", "type", "possession_team_id"):
        event[field] = example["event_info"][field]

    primary = {"player_id": example["primary_info"]["player_id"]}
    secondary = {"player_id": example["secondary_info"]["player_id"]}

    # Visiting roster: position only.
    visitor_players = []
    for player in example["visitor"]["players"]:
        visitor_players.append({"position": player["position"]})

    # Home roster: player id and position.
    home_players = []
    for player in example["home"]["players"]:
        home_players.append(
            {"playerid": player["playerid"], "position": player["position"]}
        )

    # Moments: drop the clocks, quarter and team ids; keep only coordinates.
    moments = []
    for moment in example["moments"]:
        ball = {axis: moment["ball_coordinates"][axis] for axis in axes}
        tracked = []
        for entry in moment["player_coordinates"]:
            point = {"playerid": entry["playerid"]}
            for axis in axes:
                point[axis] = entry[axis]
            tracked.append(point)
        moments.append({"ball_coordinates": ball, "player_coordinates": tracked})

    return {
        "n_gameid": game_id,
        "n_event_info": event,
        "n_primary_info": primary,
        "n_secondary_info": secondary,
        "n_visitor": {"players": visitor_players},
        "n_home": {"players": home_players},
        "n_moments": moments,
    }


In [8]:
type(downsample(dataset[0]))

dict

In [9]:
# `downsample` stores the thinned list back under 'moments'; it never creates
# a 'moments_ds' key, so read the result from 'moments'.
downsample(dataset[0])['moments']

[{'quarter': 1,
  'game_clock': 707.65,
  'shot_clock': 24.0,
  'ball_coordinates': {'x': 5.48747, 'y': 24.25562, 'z': 4.88724},
  'player_coordinates': [{'teamid': 1610612754,
    'playerid': 201588,
    'x': 22.84608,
    'y': 43.37048,
    'z': 0.0},
   {'teamid': 1610612754,
    'playerid': 101133,
    'x': 6.9984,
    'y': 27.64791,
    'z': 0.0},
   {'teamid': 1610612754,
    'playerid': 101145,
    'x': 15.35528,
    'y': 7.98835,
    'z': 0.0},
   {'teamid': 1610612754,
    'playerid': 202730,
    'x': 28.86235,
    'y': 19.02946,
    'z': 0.0},
   {'teamid': 1610612754,
    'playerid': 202331,
    'x': 9.36919,
    'y': 46.68742,
    'z': 0.0},
   {'teamid': 1610612748,
    'playerid': 2548,
    'x': 10.42233,
    'y': 12.393,
    'z': 0.0},
   {'teamid': 1610612748,
    'playerid': 2547,
    'x': 18.48075,
    'y': 22.48179,
    'z': 0.0},
   {'teamid': 1610612748,
    'playerid': 2736,
    'x': 6.93975,
    'y': 37.22873,
    'z': 0.0},
   {'teamid': 1610612748,
    'playeri

In [10]:
# Number of moments left after downsampling; the result lives under
# 'moments' (there is no 'moments_ds' key).
len(downsample(dataset[0])['moments'])

15

In [65]:
%%time
dataset_dsed = dataset.map(downsample, num_proc=NUM_PROC)

CPU times: user 146 ms, sys: 233 μs, total: 146 ms
Wall time: 149 ms


In [57]:
len(dataset_dsed[0]['moments'])

15

In [70]:
dataset_dsed.column_names

['gameid',
 'gamedate',
 'event_info',
 'primary_info',
 'secondary_info',
 'visitor',
 'home',
 'moments']

In [72]:
# Prune every example with `prune_example`, using NUM_PROC workers. All
# original columns are dropped via `remove_columns`, so the result holds only
# the 'n_'-prefixed columns that `prune_example` returns.
datased_prcsd  = dataset_dsed.map(prune_example, num_proc=NUM_PROC, remove_columns=dataset_dsed.column_names)

Map (num_proc=16): 100%|██████████| 2219/2219 [00:02<00:00, 811.12 examples/s] 


In [74]:
# `rename_columns` returns a new Dataset and leaves `datased_prcsd` as it was,
# so the result has to be kept. Binding it to a new name (instead of
# reassigning `datased_prcsd`) keeps this cell re-runnable.
dataset_renamed = datased_prcsd.rename_columns({'n_gameid':'gameid',
                                                'n_event_info':'event_info',
                                                'n_primary_info':'primary_info',
                                                'n_secondary_info':'secondary_info',
                                                'n_visitor':'visitor',
                                                'n_home':'home',
                                                'n_moments':'moments'})
dataset_renamed

Dataset({
    features: ['gameid', 'event_info', 'primary_info', 'secondary_info', 'visitor', 'home', 'moments'],
    num_rows: 2219
})

In [75]:
datased_prcsd

Dataset({
    features: ['n_gameid', 'n_event_info', 'n_primary_info', 'n_secondary_info', 'n_visitor', 'n_home', 'n_moments'],
    num_rows: 2219
})

In [12]:
dataset[0]['visitor']

{'name': 'Miami Heat',
 'teamid': 1610612748,
 'abbreviation': 'MIA',
 'players': [{'lastname': 'Andersen',
   'firstname': 'Chris',
   'playerid': 2365,
   'jersey': '11',
   'position': 'F-C'},
  {'lastname': 'Stoudemire',
   'firstname': "Amar'e",
   'playerid': 2405,
   'jersey': '5',
   'position': 'F-C'},
  {'lastname': 'Bosh',
   'firstname': 'Chris',
   'playerid': 2547,
   'jersey': '1',
   'position': 'F'},
  {'lastname': 'Wade',
   'firstname': 'Dwyane',
   'playerid': 2548,
   'jersey': '3',
   'position': 'G'},
  {'lastname': 'Haslem',
   'firstname': 'Udonis',
   'playerid': 2617,
   'jersey': '40',
   'position': 'F'},
  {'lastname': 'Deng',
   'firstname': 'Luol',
   'playerid': 2736,
   'jersey': '9',
   'position': 'F'},
  {'lastname': 'Udrih',
   'firstname': 'Beno',
   'playerid': 2757,
   'jersey': '19',
   'position': 'G'},
  {'lastname': 'Green',
   'firstname': 'Gerald',
   'playerid': 101123,
   'jersey': '14',
   'position': 'G'},
  {'lastname': 'Dragic',
   '

In [10]:
dataset.features

{'gameid': Value(dtype='string', id=None),
 'gamedate': Value(dtype='string', id=None),
 'event_info': {'id': Value(dtype='string', id=None),
  'type': Value(dtype='int64', id=None),
  'possession_team_id': Value(dtype='float64', id=None),
  'desc_home': Value(dtype='string', id=None),
  'desc_away': Value(dtype='string', id=None)},
 'primary_info': {'team': Value(dtype='string', id=None),
  'player_id': Value(dtype='float64', id=None),
  'team_id': Value(dtype='float64', id=None)},
 'secondary_info': {'team': Value(dtype='string', id=None),
  'player_id': Value(dtype='float64', id=None),
  'team_id': Value(dtype='float64', id=None)},
 'visitor': {'name': Value(dtype='string', id=None),
  'teamid': Value(dtype='int64', id=None),
  'abbreviation': Value(dtype='string', id=None),
  'players': [{'lastname': Value(dtype='string', id=None),
    'firstname': Value(dtype='string', id=None),
    'playerid': Value(dtype='int64', id=None),
    'jersey': Value(dtype='string', id=None),
    'posit