### Purpose of this script:

Extract player tracking data from the tracking game range dataset and build a master table with exactly one row per player on the field per play, for every regular season game (2019 season).

In [1]:
import boto3
import pandas as pd
from sagemaker import get_execution_role
import json
import s3fs
import re
from datetime import datetime
from time import sleep
import math
import numpy as np
import sys
# import pyarrow
# from pyathena import connect

In [2]:
#read in tracking game range playID table previously extracted from Athena
# The extraction loop further down uses it to look up a play's `playid` from the
# `gameid`, `starttime` and `endtime` values of each tracking file.
tracking_playid = pd.read_csv('tracking_game_range_playid.csv')

In [53]:
#functions to download raw data from the tracking_game_range S3 bucket for a given year

def get_tracking_game_filenames(year):
    """List the tracking_game_range objects stored in S3 for one season.

    Prints how many objects were found under the ``year=<year>`` prefix and
    returns the listing produced by ``s3fs.S3FileSystem.ls``.
    """
    year_prefix = 'nyg-hackathon-154843675742/source_file/nfl/tracking_game_range/year={}/'.format(year)
    # every object path found under the season prefix
    listing = s3fs.S3FileSystem().ls('s3://{}/'.format(year_prefix))
    print('Number of Tracking Game Range files for {} : '.format(year), len(listing))
    return listing

def load_tracking_game_data(file):
    """Download one tracking file from S3 and return its parsed JSON content.

    `file` is a '<bucket>/<key>' path: everything before the first '/' is used
    as the bucket name and the remainder as the object key.
    """
    s3_resource = boto3.resource('s3')
    path_parts = file.split('/', maxsplit=1)
    raw_bytes = s3_resource.Object(path_parts[0], path_parts[1]).get()['Body'].read()
    return json.loads(raw_bytes.decode('utf-8'))

In [54]:
#import ppg_ge tables (the master play table) to get regular season gameIds
# The same table also supplies each play's snapTime and endTime, which are merged
# onto the tracking rows in extract_tracking_data.
play_master_df = pd.read_csv('../common/data/master_play_table_v1.csv', low_memory=False)

In [55]:
# Regular season gameIds as sorted strings, so they can be compared with the
# 10-character id sliced out of each S3 file path.
unique_game_ids = play_master_df['gameId'].unique()
reg_season_gameIds = sorted(map(str, unique_game_ids))
# The gameId sits immediately after this fixed prefix inside every file path.
tracking_file_prefix = 'nyg-hackathon-154843675742/source_file/nfl/tracking_game_range/year=2019/tracking_game_range_'
start_idx = len(tracking_file_prefix)

In [57]:
#extract all regular season filenames
player_dfs = []
filenames = get_tracking_game_filenames(2019)
# Use a set for the membership test: the season listing has tens of thousands of
# paths, and scanning the list of gameIds for each one is needless work.
reg_season_gameId_set = set(reg_season_gameIds)
# Keep only the paths whose embedded 10-character gameId is a regular season game.
reg_filenames = sorted(x for x in filenames if x[start_idx:start_idx+10] in reg_season_gameId_set)

print('Number of regular season files: {}'.format(len(reg_filenames)))


Number of Tracking Game Range files for 2019 :  75369
number of regular season files: 57290


In [58]:
def extract_tracking_data(df, game_data=None, play_id=None, play_master=None):
    """Add game/play ids, flattened tracking samples and play times to one team's tracking rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows built from one file's ``awayTrackingData`` or ``homeTrackingData``.
        Each row needs a ``playerTrackingData`` list of dicts with ``time``, ``x``,
        ``y`` and ``isOnField`` keys. Columns are added to this frame in place.
    game_data : dict, optional
        Parsed tracking file providing ``gameId``, ``startTime`` and ``endTime``.
        Defaults to the notebook-level ``data`` variable.
    play_id : str or int, optional
        Play identifier for the file. Defaults to the notebook-level ``playid``.
    play_master : pandas.DataFrame, optional
        Table with ``gameId``, ``playId``, ``snapTime`` and ``endTime`` columns.
        Defaults to the notebook-level ``play_master_df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with ``snapTime`` and ``play_endTime`` on gameId/playId;
        plays missing from the master table get NaN in those two columns.
    """
    # Fall back to the notebook-level variables so existing one-argument calls keep
    # working; passing the values explicitly avoids depending on hidden loop state.
    if game_data is None:
        game_data = data
    if play_id is None:
        play_id = playid
    if play_master is None:
        play_master = play_master_df

    df['gameId'] = game_data['gameId']
    df['playId'] = play_id

    #extract time slices, x and y coordinates from playerTrackingdata
    df['coordinates'] = df['playerTrackingData'].apply(
        lambda samples: [[sample['time'], sample['x'], sample['y']] for sample in samples])
    df['isOnField'] = df['playerTrackingData'].apply(
        lambda samples: [sample['isOnField'] for sample in samples])

    #get start and end times from the play master table, which holds the start (snap) time and end of the play
    # endTime is renamed so it does not collide with the file-level endTime added below
    play_times = (play_master[['gameId', 'playId', 'snapTime', 'endTime']]
                  .rename(columns={'endTime': 'play_endTime'}))

    #merge snaptime and play end time with player_df
    df['gameId'] = df['gameId'].astype(int)
    df['startTime'] = game_data['startTime']
    df['endTime'] = game_data['endTime']
    df['playId'] = df['playId'].astype(int)
    return df.merge(play_times, on=['gameId', 'playId'], how='left')


def group_data_by_play(df):
    """Roll tracking sequences up to one row per player per play and trim them to the play window.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking rows holding the columns listed in ``static_cols`` plus
        ``coordinates`` (lists of ``[time, x, y]``) and ``isOnField`` (lists of flags).

    Returns
    -------
    pandas.DataFrame
        One row per player/play for players flagged on the field at the snap.
        ``coordinates`` is reduced to ``[x, y]`` pairs from 20 samples before the
        snap to 10 samples after the play end (2 s / 1 s, assuming 10 samples per
        second). ``play_startIdx`` / ``play_endIdx`` are the slice bounds used.
        The frame is empty (same columns) when no player qualifies.
    """
    #rollup sequences to unique plays by player
    static_cols = ['nflId', 'displayName', 'position', 'positionGroup', 'gameId', 'playId', 'snapTime', 'teamType', 'play_endTime']
    # summing columns of lists concatenates them, giving one sample list per player/play
    plays = df.groupby(static_cols)[['coordinates', 'isOnField']].sum().reset_index()

    play_time_format = '%Y-%m-%d %H:%M:%S.%f'
    sample_time_format = '%Y-%m-%dT%H:%M:%S.%f'

    #subset tracking data to on field players and tracking during play time + 2 sec prior and 1 sec post play
    # Plain Python loop instead of row-wise DataFrame.apply: apply on an empty frame
    # returns a DataFrame, which cannot be assigned to a single column.
    window_starts, window_ends, on_field_flags = [], [], []
    for snap_time, play_end_time, samples, on_field in zip(
            plays['snapTime'], plays['play_endTime'], plays['coordinates'], plays['isOnField']):
        if not samples:
            # no tracking samples at all: the snap cannot be located, so treat as off the field
            window_starts.append(0)
            window_ends.append(0)
            on_field_flags.append(False)
            continue
        first_sample_time = datetime.strptime(samples[0][0], sample_time_format)
        snap_offset = (datetime.strptime(snap_time, play_time_format) - first_sample_time).total_seconds()
        end_offset = (datetime.strptime(play_end_time, play_time_format) - first_sample_time).total_seconds()
        window_start = math.ceil(snap_offset * 10) - 20
        window_end = math.ceil(end_offset * 10) + 10
        snap_position = window_start + 20
        # NOTE(review): a window starting exactly at sample 0 is rejected as well (`> 0`),
        # as in the original logic - confirm whether `>= 0` was intended.
        if len(on_field) > snap_position and window_start > 0:
            on_field_flags.append(on_field[snap_position])
        else:
            on_field_flags.append(False)
        window_starts.append(window_start)
        window_ends.append(window_end)

    plays['play_startIdx'] = window_starts
    plays['play_endIdx'] = window_ends
    plays['isOnField_atSnap'] = on_field_flags

    # filter + drop yields a new frame, so nothing below writes into a slice of `plays`
    on_field_plays = plays[plays['isOnField_atSnap'] == True].drop(columns='isOnField')
    trimmed = [
        [sample[1:3] for sample in samples[first:last]]
        for samples, first, last in zip(
            on_field_plays['coordinates'], on_field_plays['play_startIdx'], on_field_plays['play_endIdx'])
    ]
    # dtype=object keeps every nested list as a single cell value
    return on_field_plays.assign(
        coordinates=pd.Series(trimmed, index=on_field_plays.index, dtype=object))

In [60]:
#extract all player tracking data for regular season 2019 - this takes hours ~5-6 hrs to run - RUN SPARINGLY

# Bucket the (sorted) regular season files by the gameId embedded in each path in a
# single pass, instead of rescanning the whole file list for every game.
files_by_game = {}
for path in reg_filenames:
    files_by_game.setdefault(path[start_idx:start_idx+10], []).append(path)

checkpoint_path = 'S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_{}.pkl'
games_per_checkpoint = 50

# Per-game frames are collected in a list and concatenated once per checkpoint:
# growing a DataFrame with .append copies it on every call, and DataFrame.append
# was removed in pandas 2.0.
pending_game_frames = []
master_player_df = pd.DataFrame()
y=0
for game in reg_season_gameIds:
    #loop through each file and pull out relevant data for processing
    play_frames = []
    for file in files_by_game.get(game, []):

        # extract_tracking_data reads the notebook-level `data` and `playid` variables,
        # so both names have to be assigned here before it is called.
        data = load_tracking_game_data(file)
        if data['awayTrackingData'] == [] or data['homeTrackingData'] == []:
            continue

        try:
            #extract playid from tracking_game_range_playid table
            play_matches = tracking_playid.loc[(tracking_playid['gameid'] == data['gameId']) &
                    (tracking_playid['starttime'] == str(data['startTime'])) &
                    (tracking_playid['endtime'] == str(data['endTime'])), 'playid']
            if len(play_matches) != 1:
                raise LookupError('expected exactly one playid match, found {}'.format(len(play_matches)))
            playid = str(int(play_matches.iloc[0]))

            #process tracking data from away and home tracking data columns
            for team_type, tracking_key in (('away', 'awayTrackingData'), ('home', 'homeTrackingData')):
                team_tracking_proc = extract_tracking_data(pd.DataFrame(data[tracking_key]))
                team_tracking_proc['teamType'] = team_type
                play_frames.append(team_tracking_proc)

        except Exception as exc:
            # Best effort: skip the play but say why. `Exception` (not a bare except)
            # lets Ctrl-C / kernel interrupt still stop this multi-hour loop.
            print('Oops!', type(exc), 'occurred for a play in game {}'.format(y+1), '({}: {})'.format(file, exc))

    y += 1
    if play_frames:
        #roll up data to play by player level and process tracking data accordingly
        game_df = pd.concat(play_frames, ignore_index=True)
        df_groupbyplay = group_data_by_play(game_df)
        pending_game_frames.append(df_groupbyplay)
    else:
        # nothing to roll up; grouping an empty frame would fail on the missing columns
        print('No usable tracking data for game {}; nothing added'.format(game))
    print('{} games processed at {}'.format(y, datetime.now()))
    if y % games_per_checkpoint == 0:
        #save files incrementally to avoid losing progress
        master_player_df = pd.concat(pending_game_frames) if pending_game_frames else pd.DataFrame()
        master_player_df.to_pickle(checkpoint_path.format(str(y)))
        print('Saved dataframe with {} games to S3 bucket'.format(y))
        pending_game_frames = []
        master_player_df = pd.DataFrame()

#save final file
# Only when games remain since the last checkpoint: otherwise an empty frame would
# overwrite the checkpoint that was just written under the same name.
if pending_game_frames:
    master_player_df = pd.concat(pending_game_frames)
    master_player_df.to_pickle(checkpoint_path.format(str(y)))

251 games processed at 2020-07-21 13:21:22.118388
252 games processed at 2020-07-21 13:23:07.569946
253 games processed at 2020-07-21 13:24:50.547229
254 games processed at 2020-07-21 13:26:15.318559
255 games processed at 2020-07-21 13:27:48.162314
256 games processed at 2020-07-21 13:29:11.342290


In [2]:
#read in one file at a time otherwise kernel may die
# snap_player_df_50.pkl is the checkpoint the extraction loop writes after its 50th game
# (games 1-50). Pickle files should only be loaded from a trusted location.
df_1 = pd.read_pickle('S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_50.pkl')

In [3]:
# checkpoint written after the 100th game (games 51-100 of the extraction loop)
df_2 = pd.read_pickle('S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_100.pkl')

In [4]:
# checkpoint written after the 150th game (games 101-150 of the extraction loop)
df_3 = pd.read_pickle('S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_150.pkl')

In [5]:
# checkpoint written after the 200th game (games 151-200 of the extraction loop)
df_4 = pd.read_pickle('S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_200.pkl')

In [6]:
# checkpoint written after the 250th game (games 201-250 of the extraction loop)
df_5 = pd.read_pickle('S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_250.pkl')

In [7]:
# final file written after the loop finished: the remaining games (251-256)
df_6 = pd.read_pickle('S3://nyg-hackathon-154843675742/halloffame/snap_player_dataframes/snap_player_df_256.pkl')

In [8]:
# Stack the six checkpoint frames back into one table; row labels are kept as they
# were in each checkpoint, so they are not unique across the combined frame.
checkpoint_frames = [df_1, df_2, df_3, df_4, df_5, df_6]
df_combined = pd.concat(checkpoint_frames)

In [27]:
# Convert each play's list of [x, y] pairs into a numpy array.
df_combined['coordinates'] = df_combined['coordinates'].map(np.array)

In [23]:
#save the combined master player dataframe to local disk (left commented out - uncomment to write the file)
# df_combined.to_pickle('../common/data/master_player_df.pkl')