In [1]:
import pandas as pd

from glob import glob
from tqdm import tqdm

In [2]:
def get_scoring_team(row):
    """Label a row according to which description field is the empty string.

    The fields are checked in a fixed order, and the first one that equals
    ``''`` decides the label:

    * ``HOMEDESCRIPTION == ''``     -> ``'HOME'``
    * ``VISITORDESCRIPTION == ''``  -> ``'AWAY'``
    * neither is empty              -> ``'NULL'``

    Missing values must already have been replaced by ``''``; a NaN never
    compares equal to the empty string.

    NOTE(review): the label returned is the side whose description is *blank*,
    not the side that has text. Confirm this mapping is the intended one
    before relying on the label's name.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``'HOMEDESCRIPTION'`` and
        ``'VISITORDESCRIPTION'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``'HOME'``, ``'AWAY'`` or ``'NULL'``.
    """
    checks = (
        ('HOMEDESCRIPTION', 'HOME'),
        ('VISITORDESCRIPTION', 'AWAY'),
    )
    for column, label in checks:
        if row[column] == '':
            return label
    return 'NULL'

In [3]:
def calculate_time(row):
    """Convert a row's period and clock string into a single integer.

    The result is ``720 * (4 - PERIOD) + 60 * minutes + seconds``, where
    ``minutes`` and ``seconds`` are the first two ``':'``-separated fields of
    ``PCTIMESTRING``. For period 1 at ``'12:00'`` this is 2880; for period 4
    at ``'0:00'`` it is 0.

    NOTE(review): presumably this is "seconds left in a regulation game of
    four 720-second periods" -- confirm. Periods above 4 give values that can
    go negative.

    Parameters
    ----------
    row : mapping
        Must provide ``'PERIOD'`` (convertible with ``int``) and
        ``'PCTIMESTRING'`` (a ``'M:S'`` string).

    Returns
    -------
    int
    """
    period = int(row['PERIOD'])
    clock_fields = row['PCTIMESTRING'].split(':')
    clock_minutes = int(clock_fields[0])
    clock_seconds = int(clock_fields[1])
    # Algebraically identical to
    # 2880 - 720*(period - 1) - (11 - minutes)*60 - (60 - seconds).
    return 720 * (4 - period) + 60 * clock_minutes + clock_seconds

In [4]:
def process(df):
    """Reduce one play-by-play frame to one row per kept score event.

    Steps:

    1. Keep rows where ``SCORE`` is present, at least one of
       ``HOMEDESCRIPTION`` / ``VISITORDESCRIPTION`` is present, and
       ``PERIOD <= 4``; order them by ``EVENTNUM``.
    2. Add ``TIME`` (via ``calculate_time``), ``HOME`` / ``AWAY`` (the two
       halves of ``SCORE`` split on ``' - '``), ``SCORINGTEAM`` (via
       ``get_scoring_team``) and ``HOMEMARGIN`` (``SCOREMARGIN`` as an int,
       with ``'TIE'`` read as 0).
    3. Melt ``HOME`` / ``AWAY`` into ``SIDE`` / ``POINTS`` and keep only the
       row whose ``SIDE`` equals ``SCORINGTEAM``. Rows labelled ``'NULL'``
       therefore drop out.

    NOTE(review): ``HOME`` is taken from the first component of ``SCORE`` and
    ``AWAY`` from the second -- confirm against the feed's score format.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``GAME_ID``, ``EVENTNUM``, ``PERIOD``,
        ``PCTIMESTRING``, ``SCORE``, ``SCOREMARGIN``, ``HOMEDESCRIPTION`` and
        ``VISITORDESCRIPTION``. ``SCORE`` and ``SCOREMARGIN`` must be strings
        on the rows that are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``GAME_ID, EVENTNUM, PERIOD, TIME, SCORINGTEAM, HOMEMARGIN,
        SCORE, SIDE, POINTS``, ordered by ``EVENTNUM``. ``POINTS`` holds the
        string taken from ``SCORE``. The index is melt's positional index and
        carries no meaning.
    """
    g = df[
        (df.SCORE.notnull()) &
        (df.HOMEDESCRIPTION.notnull() | df.VISITORDESCRIPTION.notnull()) &
        (df.PERIOD <= 4)
    ].sort_values('EVENTNUM')

    g['TIME'] = g.apply(calculate_time, axis=1)

    # Split once and reuse the parts for both sides.
    score_parts = g['SCORE'].str.split(' - ')
    g['AWAY'] = score_parts.apply(lambda parts: parts[1])
    g['HOME'] = score_parts.apply(lambda parts: parts[0])

    # Assign the filled columns back instead of calling
    # g[col].fillna(..., inplace=True): the in-place call acts on a temporary
    # Series, is deprecated, and leaves g unchanged under pandas
    # Copy-on-Write. get_scoring_team compares against '' and would then
    # label every row 'NULL'.
    g['HOMEDESCRIPTION'] = g['HOMEDESCRIPTION'].fillna('')
    g['VISITORDESCRIPTION'] = g['VISITORDESCRIPTION'].fillna('')

    g['SCORINGTEAM'] = g.apply(get_scoring_team, axis=1)

    g['HOMEMARGIN'] = g['SCOREMARGIN'].apply(lambda x: int(x.replace('TIE', '0')))

    # Melt only the two score columns. Without value_vars every remaining
    # column is melted as well, only for the query below to discard it.
    melted = pd.melt(
        g,
        id_vars=['GAME_ID', 'EVENTNUM', 'PERIOD', 'TIME', 'SCORINGTEAM', 'HOMEMARGIN', 'SCORE'],
        value_vars=['AWAY', 'HOME'],
        var_name='SIDE',
        value_name='POINTS',
    )

    new = melted.sort_values('EVENTNUM').query('SCORINGTEAM == SIDE')

    return new

In [5]:
# Build one play-by-play summary CSV per season: read every game file for the
# season, run it through process(), and write the combined result sorted by
# game and event number.
# NOTE(review): the saved output under this cell shows many progress bars,
# while this range covers only 2019 -- re-run the cell to refresh the output.
for y in range(2019, 2020):
    # glob returns paths in arbitrary order; sorting makes the read order
    # reproducible from run to run.
    files = sorted(glob('../data/playbyplay/{}/*.csv'.format(y)))
    if not files:
        # pd.concat raises ValueError on an empty list, so report the missing
        # season explicitly and carry on with the next one.
        print('No play-by-play files found for {}; skipping.'.format(y))
        continue
    summary = pd.concat(
        [process(pd.read_csv(f, dtype={'GAME_ID': str, 'SCOREMARGIN': str})) for f in tqdm(files)]
    )
    summary.sort_values(['GAME_ID', 'EVENTNUM']).to_csv('processed/pbp-summary-{}.csv'.format(y), index=False)

100%|██████████| 1189/1189 [00:46<00:00, 25.81it/s]
100%|██████████| 1189/1189 [00:48<00:00, 24.51it/s]
100%|██████████| 725/725 [00:26<00:00, 27.30it/s]
100%|██████████| 1189/1189 [00:44<00:00, 26.54it/s]
100%|██████████| 1189/1189 [00:44<00:00, 26.87it/s]
100%|██████████| 1189/1189 [00:58<00:00, 20.25it/s]
100%|██████████| 1189/1189 [00:59<00:00, 24.58it/s]
100%|██████████| 1189/1189 [00:57<00:00, 20.51it/s]
100%|██████████| 1230/1230 [01:03<00:00, 19.25it/s]
100%|██████████| 1230/1230 [01:07<00:00, 18.09it/s]
100%|██████████| 1230/1230 [01:08<00:00, 17.97it/s]
100%|██████████| 1230/1230 [01:06<00:00, 18.58it/s]
100%|██████████| 1230/1230 [01:01<00:00, 21.16it/s]
100%|██████████| 1230/1230 [01:00<00:00, 20.18it/s]
100%|██████████| 1230/1230 [01:00<00:00, 21.16it/s]
100%|██████████| 990/990 [00:47<00:00, 21.25it/s]
100%|██████████| 1229/1229 [01:00<00:00, 20.43it/s]
100%|██████████| 1230/1230 [01:02<00:00, 19.71it/s]
100%|██████████| 1230/1230 [00:57<00:00, 21.27it/s]
100%|██████████|