In [77]:
# Import os for relative pathing to data
import os
import pandas as pd
# Import our play by play utils file
from play_by_play_utils import *

# Set columns and width for easier printing
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [78]:
# Select a game ID
game_id = '0022300382'

In [79]:
# determine the directory that this file resides in
dirname = os.getcwd()

# generate file path for play by play and players on court data
input_play_by_play = os.path.join(dirname, 'data/{}_pbp.csv'.format(game_id))
input_players_on_court = os.path.join(dirname, 'data/{}_players_at_period.csv'.format(game_id))
output_path = os.path.join(dirname, 'data/{}_possessions.csv'.format(game_id))

In [80]:
# Read in play by play and fill null description columsn with empty string
play_by_play = pd.read_csv(input_play_by_play, index_col=False)
play_by_play['HOMEDESCRIPTION'] = play_by_play['HOMEDESCRIPTION'].fillna("")
play_by_play['NEUTRALDESCRIPTION'] = play_by_play['HOMEDESCRIPTION'].fillna("")
play_by_play['VISITORDESCRIPTION'] = play_by_play['VISITORDESCRIPTION'].fillna("")

In [81]:
play_by_play.head()

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,SCORE,SCOREMARGIN,PERSON1TYPE,PLAYER1_ID,PLAYER1_NAME,PLAYER1_TEAM_ID,PLAYER1_TEAM_CITY,PLAYER1_TEAM_NICKNAME,PLAYER1_TEAM_ABBREVIATION,PERSON2TYPE,PLAYER2_ID,PLAYER2_NAME,PLAYER2_TEAM_ID,PLAYER2_TEAM_CITY,PLAYER2_TEAM_NICKNAME,PLAYER2_TEAM_ABBREVIATION,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,VIDEO_AVAILABLE_FLAG
0,22300382,2,12,0,1,7:11 PM,12:00,,,,,,0,0,,,,,,0,0,,,,,,0,0,,,,,,0
1,22300382,4,10,0,1,7:11 PM,12:00,Jump Ball Embiid vs. Poeltl: Tip to Schroder,Jump Ball Embiid vs. Poeltl: Tip to Schroder,,,,4,203954,Joel Embiid,1610613000.0,Philadelphia,76ers,PHI,5,1627751,Jakob Poeltl,1610613000.0,Toronto,Raptors,TOR,5,203471,Dennis Schroder,1610613000.0,Toronto,Raptors,TOR,1
2,22300382,7,2,79,1,7:11 PM,11:43,,,MISS Siakam 18' Pullup Jump Shot,,,5,1627783,Pascal Siakam,1610613000.0,Toronto,Raptors,TOR,0,0,,,,,,0,0,,,,,,1
3,22300382,8,4,0,1,7:11 PM,11:41,Oubre Jr. REBOUND (Off:0 Def:1),Oubre Jr. REBOUND (Off:0 Def:1),,,,4,1626162,Kelly Oubre Jr.,1610613000.0,Philadelphia,76ers,PHI,0,0,,,,,,0,0,,,,,,1
4,22300382,9,2,79,1,7:11 PM,11:23,MISS Embiid 13' Pullup Jump Shot,MISS Embiid 13' Pullup Jump Shot,,,,4,203954,Joel Embiid,1610613000.0,Philadelphia,76ers,PHI,0,0,,,,,,0,0,,,,,,1


In [82]:
# We will need to know the game clock at each event later on. Let's take the game clock string (7:34) and convert it into seconds elapsed
def parse_time_elapsed(time_str, period):
    # Maximum minutes in a period is 12 unless overtime
    max_minutes = 12 if period < 5 else 5
    # Split string on :
    [minutes, sec] = time_str.split(':')
    # extract minutes and seconds
    minutes = int(minutes)
    sec = int(sec)

    # 7:34 (4 minutes 26 seconds have passed) -> 12 - 7 -> 5, need to subtract an extra minute.
    min_elapsed = max_minutes - minutes - 1
    sec_elapsed = 60 - sec

    return (min_elapsed * 60) + sec_elapsed

# We will also need to calculate the total time elapsed, not just the time elapsed in the period
def calculate_time_elapsed(row):
    # Caclulate time elapsed in the period
    time_in_period = calculate_time_elapsed_period(row)
    period = row['PERIOD']
    # Calculate total time elapsed up to the start of the current period
    if period > 4:
        return (12 * 60 * 4) + ((period - 5) * 5 * 60) + time_in_period
    else:
        return ((period - 1) * 12 * 60) + time_in_period

# method for calculating time elapsed in a period from a play by play event row
def calculate_time_elapsed_period(row):
    return parse_time_elapsed(row['PCTIMESTRING'], row['PERIOD'])

# Apply the methods for calculating time to add the columns to the dataframe
play_by_play['TIME_ELAPSED'] = play_by_play.apply(calculate_time_elapsed, axis=1)
play_by_play['TIME_ELAPSED_PERIOD'] = play_by_play.apply(calculate_time_elapsed_period, axis=1)


In [83]:
# Read the players at the start of each period
players_at_start_of_period = pd.read_csv(input_players_on_court)

In [84]:
# Players at the start of each period are stored as an string in the dataframe column
# We need to parse out that string into an array of player Ids
def split_row(list_str):
    return [x.replace('[', '').replace(']', '').strip() for x in list_str.split(',')]

In [85]:
sub_map = {}
# Pre-populate the map with the players at the start of each period
for row in players_at_start_of_period.iterrows():
    sub_map[row[1]['PERIOD']] = {row[1]['TEAM_ID_1']: split_row(row[1]['TEAM_1_PLAYERS']),
                                 row[1]['TEAM_ID_2']: split_row(row[1]['TEAM_2_PLAYERS'])}

In [86]:
def update_subs(row):
    period = row['PERIOD']
    # If the event is a substitution we need to sub out the players on the court
    if is_substitution(row):
        team_id = row['PLAYER1_TEAM_ID']
        player_in = str(row['PLAYER2_ID'])
        player_out = str(row['PLAYER1_ID'])
        players = sub_map[period][team_id]
        players_index = players.index(player_out)
        players[players_index] = player_in
        players.sort()
        sub_map[period][team_id] = players

    for i, k in enumerate(sub_map[period].keys()):
        row['TEAM{}_ID'.format(i + 1)] = k
        row['TEAM{}_PLAYER1'.format(i + 1)] = sub_map[period][k][0]
        row['TEAM{}_PLAYER2'.format(i + 1)] = sub_map[period][k][1]
        row['TEAM{}_PLAYER3'.format(i + 1)] = sub_map[period][k][2]
        row['TEAM{}_PLAYER4'.format(i + 1)] = sub_map[period][k][3]
        row['TEAM{}_PLAYER5'.format(i + 1)] = sub_map[period][k][4]

In [87]:
def is_end_of_possession(ind, row, rows):
    return is_turnover(row) or (is_last_free_throw_made(ind, row, rows)) or is_defensive_rebound(ind, row, rows) or \
           is_make_and_not_and_1(ind, row, rows) or is_end_of_period(row)


In [88]:
def parse_possession(rows):
    possessions = []
    current_posession = []
    for ind, row in rows:
        # update our subs
        update_subs(row)
        # No need to include subs or end of period events in our possession list
        if not is_substitution(row) and not is_end_of_period(row):
            current_posession.append(row)
        # if the current event is the last event of a possession, add the current possession to our list of possessions
        # and start a new possession
        if is_end_of_possession(ind, row, rows):
            # No need to add empty end of period possessions
            if len(current_posession) > 0:
                possessions.append(current_posession)
            current_posession = []
    return possessions

In [89]:
pbp_rows = list(play_by_play.iterrows())
possessions = parse_possession(pbp_rows)

In [106]:
possessions[1:3]

[[GAME_ID                                              22300382
  EVENTNUM                                                    9
  EVENTMSGTYPE                                                2
  EVENTMSGACTIONTYPE                                         79
  PERIOD                                                      1
  WCTIMESTRING                                          7:11 PM
  PCTIMESTRING                                            11:23
  HOMEDESCRIPTION              MISS Embiid 13' Pullup Jump Shot
  NEUTRALDESCRIPTION           MISS Embiid 13' Pullup Jump Shot
  VISITORDESCRIPTION                                           
  SCORE                                                     NaN
  SCOREMARGIN                                               NaN
  PERSON1TYPE                                                 4
  PLAYER1_ID                                             203954
  PLAYER1_NAME                                      Joel Embiid
  PLAYER1_TEAM_ID                       

In [95]:
# Print out the first couple of possessions so that you can see how the parser split them.
for possession in possessions[:4]:
    print('POSSESSION')
    for p in possession:
        print(p[home_description], p[neutral_description], p[away_description])
    print('\n')

POSSESSION
  
Jump Ball Embiid vs. Poeltl: Tip to Schroder Jump Ball Embiid vs. Poeltl: Tip to Schroder 
  MISS Siakam 18' Pullup Jump Shot
Oubre Jr. REBOUND (Off:0 Def:1) Oubre Jr. REBOUND (Off:0 Def:1) 


POSSESSION
MISS Embiid 13' Pullup Jump Shot MISS Embiid 13' Pullup Jump Shot 
  Barnes REBOUND (Off:0 Def:1)


POSSESSION
Maxey C.P.FOUL (P1.T1) (S.Twardoski) Maxey C.P.FOUL (P1.T1) (S.Twardoski) 
  
  Siakam Free Throw Clear Path 1 of 2 (1 PTS)
  Siakam Free Throw Clear Path 2 of 2 (2 PTS)
  Poeltl 4' Layup (2 PTS) (Barnes 1 AST)


POSSESSION
Harris 8' Turnaround Jump Shot (2 PTS) (Morris Sr. 1 AST) Harris 8' Turnaround Jump Shot (2 PTS) (Morris Sr. 1 AST) 




In [96]:
# We need to count up each teams points from a possession
def count_points(possession):
    # points will be a map where the key is the team id and the value is the number of points scored in that possesion
    points = {}
    for p in possession:
        if is_made_shot(p) or (not is_miss(p) and is_free_throw(p)):
            if p['PLAYER1_TEAM_ID'] in points:
                points[p['PLAYER1_TEAM_ID']] += extract_points(p)
            else:
                points[p['PLAYER1_TEAM_ID']] = extract_points(p)
    return points

# We need to know how many points each shot is worth:
def extract_points(p):
    if is_free_throw(p) and not is_miss(p):
        return 1
    elif is_made_shot(p) and is_three(p):
        return 3
    elif is_made_shot(p) and not is_three(p):
        return 2
    else:
        return 0


In [97]:
def determine_possession_team(p, team1, team2):
    if is_made_shot(p) or is_free_throw(p):
        return str(int(p['PLAYER1_TEAM_ID']))
    elif is_rebound(p):
        if is_team_rebound(p):
            if p['PLAYER1_ID'] == team1:
                return team2
            else:
                return team1
        else:
            if p['PLAYER1_TEAM_ID'] == team1:
                return team2
            else:
                return team1
    elif is_turnover(p):
        if is_team_turnover(p):
           return str(int(p['PLAYER1_ID']))
        else:
            return str(int(p['PLAYER1_TEAM_ID']))
    else:
        if math.isnan(p['PLAYER1_TEAM_ID']):
            return str(int(p['PLAYER1_ID']))
        else:
            return str(int(p['PLAYER1_TEAM_ID']))


In [107]:
def parse_possession(possession):
    times_of_events = [p[time_elapsed] for p in possession]
    possession_start = min(times_of_events)
    possession_end = max(times_of_events)
    points = count_points(possession)
    game_id = possession[0]['GAME_ID']
    period = possession[0]['PERIOD']

    team1_id = possession[0]['TEAM1_ID']
    team1_player1 = possession[0]['TEAM1_PLAYER1']
    team1_player2 = possession[0]['TEAM1_PLAYER2']
    team1_player3 = possession[0]['TEAM1_PLAYER3']
    team1_player4 = possession[0]['TEAM1_PLAYER4']
    team1_player5 = possession[0]['TEAM1_PLAYER5']
    team1_points = points[team1_id] if team1_id in points else 0

    team2_id = possession[0]['TEAM2_ID']
    team2_player1 = possession[0]['TEAM2_PLAYER1']
    team2_player2 = possession[0]['TEAM2_PLAYER2']
    team2_player3 = possession[0]['TEAM2_PLAYER3']
    team2_player4 = possession[0]['TEAM2_PLAYER4']
    team2_player5 = possession[0]['TEAM2_PLAYER5']
    team2_points = points[team2_id] if team2_id in points else 0

    possession_team = determine_possession_team(possession[-1], team1_id, team2_id)

    return {
        'team1_id': str(team1_id),
        'team1_player1': str(team1_player1),
        'team1_player2': str(team1_player2),
        'team1_player3': str(team1_player3),
        'team1_player4': str(team1_player4),
        'team1_player5': str(team1_player5),
        'team2_id': str(team2_id),
        'team2_player1': str(team2_player1),
        'team2_player2': str(team2_player2),
        'team2_player3': str(team2_player3),
        'team2_player4': str(team2_player4),
        'team2_player5': str(team2_player5),
        'game_id': str(game_id),
        'period': period,
        'possession_start': possession_start,
        'possession_end': possession_end,
        'team1_points': team1_points,
        'team2_points': team2_points,
        'possession_team': str(possession_team)
    }

In [108]:
# Build a list of parsed possession objects
parsed_possessions = []
for possession in possessions:
    parsed_possessions.append(parse_possession(possession))

# Build a dataframe from the list of parsed possession
df = pd.DataFrame(parsed_possessions)

print(df)
print(df.groupby(by='team1_id')['team1_points'].sum())
print(df.groupby(by='team2_id')['team2_points'].sum())
print(df.groupby(by=['possession_team'])['possession_team'].count())

df.to_csv(output_path, index=False)

       team1_id team1_player1 team1_player2 team1_player3 team1_player4 team1_player5    team2_id team2_player1 team2_player2 team2_player3 team2_player4 team2_player5   game_id  period  possession_start  possession_end  team1_points  team2_points possession_team
0    1610612761        203471       1627751       1627783       1628384       1630567  1610612755        202694        202699        203954       1626162       1630178  22300382       1                 0              19             0             0      1610612761
1    1610612761        203471       1627751       1627783       1628384       1630567  1610612755        202694        202699        203954       1626162       1630178  22300382       1                37              39             0             0      1610612755
2    1610612761        203471       1627751       1627783       1628384       1630567  1610612755        202694        202699        203954       1626162       1630178  22300382       1                42     

In [109]:
df.shape

(200, 19)