In [1]:
import os
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)
import re
from scipy.signal import argrelmin, argrelmax
from bisect import bisect_right
from scipy.interpolate import UnivariateSpline
from scipy.stats import mode
import time

#Plotting
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15, 10)
import seaborn as sns
pd.options.mode.chained_assignment = None

In [2]:
def processData(game_id, data_dir=r"C:\Users\ckchiruka\Desktop\Current Courses\STA 160"):
    """Load one game's motion CSV and drop every row that contains a null.

    Parameters
    ----------
    game_id : int or str
        Game identifier without the leading "00"; the file that is read is
        ``CSV Data/00<game_id>.csv`` inside ``data_dir``.
    data_dir : str, optional
        Project directory holding the ``CSV Data`` folder. Defaults to the
        location that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without any row that has a missing value, re-indexed
        from 0.

    Notes
    -----
    The process working directory is changed to ``data_dir`` as a side effect.
    Other cells in this notebook open files by relative path, so that side
    effect is kept on purpose.
    """
    # The default path is a raw string: in a normal literal the backslash-U in
    # the Windows path starts a unicode escape and is a SyntaxError on Python 3.
    os.chdir(data_dir)
    motion = pd.read_csv("CSV Data/00" + str(game_id) + ".csv", low_memory=False)
    # keep only complete rows and renumber them
    return motion.dropna(how="any").reset_index(drop=True)

def getPlayRanges(motion):
    """Compute [start, end] row ranges of in-play motion, snapped to ball-height minima.

    Rows are de-duplicated on (quarter, sec_in_quarter, shot_clock) keeping the
    last row of each group; the boundaries between runs of kept and dropped
    rows become candidate range edges. Each edge is then moved outwards to the
    nearest local minimum of the smoothed ball height (see getRelMinima).

    Parameters
    ----------
    motion : pandas.DataFrame
        Motion data with 'quarter', 'sec_in_quarter', 'shot_clock' and
        'ball_z' columns. The index is compared against ``range(...)``, so it
        is presumably the default 0..n-1 index produced by processData.

    Returns
    -------
    numpy.ndarray
        Array of shape (k, 2); each row is a [start, end] pair of row positions.
    """
    start = 0
    end = 0
    inplay = []

    #remove all duplicated quarter/secinquarter/shotclock to get in play rows
    no_dups = motion.drop_duplicates(subset=['quarter', 'sec_in_quarter', 'shot_clock'], keep='last')

    #labels of the rows that survived de-duplication
    nd_index = no_dups.index

    #walk every index below the largest kept one and record run boundaries
    for i in range(max(nd_index)):
        if i in nd_index:
            inplay.append(start + 1)
            end = i
        else:
            start = i
            inplay.append(end)

    #keep unique boundaries that sit next to a kept row
    inplay = sorted(set(inplay))
    inplay = [i for i in inplay if i + 1 in nd_index or i - 1 in nd_index]

    #an unpaired boundary is closed with the last row of motion
    if len(inplay) % 2 == 1:
        inplay.append(len(motion) - 1)
    # Integer division: reshape rejects the float that "/" yields on Python 3.
    inplay = np.asarray(inplay).reshape((len(inplay) // 2, 2))

    #get all minima
    minima = getRelMinima(motion)

    #use bisect right to widen every range to the surrounding minima
    for row in range(len(inplay)):
        before = bisect_right(minima, inplay[row][0]) - 1
        # before == -1 means no minimum precedes the start. Indexing with -1
        # would silently pick the LAST minimum, so the start is left untouched.
        if before >= 0:
            inplay[row][0] = minima[before]

        after = bisect_right(minima, inplay[row][1])
        if after < len(minima):
            inplay[row][1] = minima[after]
        else:
            # no minimum after the end: extend to the last row of motion
            inplay[row][1] = len(motion) - 1

    return inplay


def getRelMinima(motion, window=5, min_gap=3):
    """Return row positions of local minima of the smoothed ball height.

    Parameters
    ----------
    motion : pandas.DataFrame
        Must contain a 'ball_z' column.
    window : int, optional
        Size of the centered rolling-mean window applied before the search.
    min_gap : int, optional
        When two consecutive minima are fewer than ``min_gap`` rows apart, the
        earlier one of the pair is discarded.

    Returns
    -------
    list
        Increasing positions (numpy integers) of the retained minima.
    """
    # .values replaces Series.as_matrix(), which was removed in pandas 1.0
    smoothed = motion['ball_z'].rolling(window, center=True).mean().values
    #use argrelmin to get minima
    minima = [i for i in argrelmin(smoothed)[0]]
    #minima need to be at least min_gap snapshots apart
    too_close = np.where(np.diff(minima) < min_gap)[0]
    # delete from the back so the remaining positions stay valid
    for index in sorted(too_close, reverse=True):
        del minima[index]
    return minima

def getRelMaxima(motion, window=5, min_gap=3):
    """Return row positions of local maxima of the smoothed ball height.

    Parameters
    ----------
    motion : pandas.DataFrame
        Must contain a 'ball_z' column.
    window : int, optional
        Size of the centered rolling-mean window applied before the search.
    min_gap : int, optional
        When two consecutive maxima are fewer than ``min_gap`` rows apart, the
        earlier one of the pair is discarded.

    Returns
    -------
    list
        Increasing positions (numpy integers) of the retained maxima.
    """
    # .values replaces Series.as_matrix(), which was removed in pandas 1.0
    smoothed = motion['ball_z'].rolling(window, center=True).mean().values
    #use argrelmax to get maxima
    maxima = [i for i in argrelmax(smoothed)[0]]
    #maxima need to be at least min_gap snapshots apart
    too_close = np.where(np.diff(maxima) < min_gap)[0]
    # delete from the back so the remaining positions stay valid
    for index in sorted(too_close, reverse=True):
        del maxima[index]
    return maxima

def deleteNotInPlay(motion, ranges):
    """Keep only the rows of ``motion`` that fall inside the given ranges.

    Parameters
    ----------
    motion : pandas.DataFrame
        Motion data, sliced by row position.
    ranges : iterable of (start, end)
        Row-position pairs; each slice is ``motion.iloc[start:end]``, so the
        row at ``end`` itself is not included.

    Returns
    -------
    pandas.DataFrame
        The slices stacked in the given order with a fresh 0..n-1 index. When
        ``ranges`` is empty, an empty frame with the columns of ``motion``.
    """
    pieces = [motion.iloc[bounds[0]:bounds[1]] for bounds in ranges]
    if not pieces:
        # pd.concat rejects an empty list; keep the columns so callers can
        # still select them from the (empty) result
        return motion.iloc[0:0].reset_index(drop=True)
    # a single concat instead of growing a frame inside the loop
    return pd.concat(pieces, ignore_index=True)

In [3]:
def NBA_to_SportVU(game):
    """Convert NBA API shot-chart coordinates to SportVU court coordinates.

    ``game`` needs the columns LOC_X, LOC_Y, SIDE, MINUTES_REMAINING,
    SECONDS_REMAINING, PERIOD and PLAYER_ID. Returns a 2-D array with one row
    per shot holding, in order: x, y, seconds remaining in the period, period,
    player id and side.
    """
    # boolean selectors for the two sides; rows with any other SIDE value get 0
    on_left = np.array(game.SIDE == "Left")
    on_right = np.array(game.SIDE == "Right")

    # scale by 1/10 and shift: LOC_X feeds the y axis, LOC_Y feeds the x axis
    court_y = game.LOC_X / 10 + 25
    court_x = game.LOC_Y / 10 + 5

    # mirror depending on the side: y about 50 on the left, x about 94 on the right
    y_oriented = on_left * (50 - court_y) + on_right * court_y
    x_oriented = on_left * court_x + on_right * (94 - court_x)

    seconds_left = game.MINUTES_REMAINING * 60 + game.SECONDS_REMAINING

    stacked = [x_oriented, y_oriented, seconds_left, game.PERIOD, game.PLAYER_ID, game.SIDE]
    return np.array(stacked).T

def sidingAlgorithm(game):
    """Label every shot with the court side ("Left"/"Right") it is assigned to.

    The first row whose SHOT_ZONE_AREA is "Left Side(L)" is the reference shot.
    If its LOC_X is positive, the reference team's shots are labelled "Left"
    and all other shots "Right"; otherwise the labels are swapped. Labels are
    flipped for every row with PERIOD > 2.

    Parameters
    ----------
    game : pandas.DataFrame
        Shot rows with SHOT_ZONE_AREA, TEAM_NAME, LOC_X and PERIOD columns.
        Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``game`` with a "SIDE" column (an existing "SIDE" column is
        overwritten).

    Raises
    ------
    ValueError
        If no row has SHOT_ZONE_AREA == "Left Side(L)".
    """
    #Find our candidate shot
    left_positions = np.where(np.asarray(game.SHOT_ZONE_AREA == "Left Side(L)"))[0]
    if len(left_positions) == 0:
        raise ValueError('no "Left Side(L)" shot available to determine court sides')
    the_shot = game.iloc[left_positions[0], :]

    #Determine the team that took the shot
    shot_team = the_shot.TEAM_NAME
    same_team = np.asarray(game.TEAM_NAME == shot_team)

    #Set indicators based on shot (1 means "Left").
    if the_shot.LOC_X > 0:
        side_indicator = np.where(same_team, 1, 0)
    else:
        side_indicator = np.where(same_team, 0, 1)

    #Flip indicator for 2nd half.
    second_half = np.asarray(game.PERIOD > 2)
    side_indicator[second_half] = 1 - side_indicator[second_half]
    sides_labeled = np.where(side_indicator == 1, "Left", "Right")

    # Assign by position on a copy. Concatenating a separately indexed frame
    # along axis=1 misaligns rows whenever game's index is not 0..n-1.
    game_with_sides = game.copy()
    game_with_sides["SIDE"] = sides_labeled
    return game_with_sides

def makeSecs(game):
    """Return, per row, the seconds left until the end of the last period in ``game``.

    The value is the row's own clock (MINUTES_REMAINING, SECONDS_REMAINING)
    plus 720 seconds for every period between the row's PERIOD and the largest
    PERIOD present. It is used to compare shots with the motion data.
    """
    last_period = np.max(game.PERIOD)
    # seconds left on the clock of the row's own period
    clock_seconds = game.MINUTES_REMAINING * 60 + game.SECONDS_REMAINING
    # every later period counts as 720 s (presumably 12-minute quarters; verify for overtime)
    periods_to_go = last_period - game.PERIOD
    return clock_seconds + periods_to_go * 720

<h1>Shot Algorithm</h1>
<h2>Algorithm:</h2>
Let the matrix of points (X-Y coordinates) at time $t$ be $P_{t}$.

Set the hoop coordinates as $p_{t,l} := \langle 5, 25 \rangle$ (left hoop) and $p_{t,r} := \langle 89, 25 \rangle$ (right hoop) for all $t \in T$.

Let the ball coordinates be denoted as $p_{t,0}$

Let the player with the ball $i=1,...,10$ be defined as $p_{t,i}$

Let the difference of a snapshot be defined as $\nabla x = x[i+1] - x[i]$ or $\nabla y = y[i+1] - y[i]$

Let the slope of a snapshot be defined as $\frac{\nabla y}{\nabla x}$

Let the intercept of a snapshot be defined as $y - x \cdot slope$

Then we predict y at x = 5 or 89 (depending on the side) by $intercept + x*slope$

Our final snapshot selection score (lower is better) is $norm(player\_coords - boxscore\_coords) + norm(ball\_coords - boxscore\_coords) + (pred\_y - 25)^2$

We then take the closest 10 snapshots and match them to parabolas and take the mode of that dataset (the parabola that occurs the most) to find a shot.

Finally, we remove every parabola whose maximum Z-value is below 7.

In [54]:
def makeGameShotsXY(game_id):
    """Load the shot log of one game and convert it to SportVU coordinates.

    Reads "playershotsdf.csv" from the current working directory, keeps the
    rows whose GAME_ID equals ``game_id``, orders them by descending
    'sec_in_game', assigns court sides and converts the coordinates.

    Returns a tuple ``(gameshotsXY, gameshots)``: the compact frame with the
    columns X, Y, sec_in_quarter, quarter, PLAYER_ID, SIDE, and the full shot
    frame. The full frame carries an 'index' column holding the pre-sort row
    numbers, plus the added 'sec_in_game' and 'SIDE' columns.
    """
    all_shots = pd.read_csv("playershotsdf.csv")

    # adding zero leaves a numeric id unchanged; the comparison is numeric
    this_game = all_shots.GAME_ID == 0 + game_id
    gameshots = all_shots[this_game].reset_index(drop=True)

    # time key shared with the motion data, latest-remaining first
    gameshots['sec_in_game'] = makeSecs(gameshots)
    gameshots = gameshots.sort_values('sec_in_game', ascending=False)
    # non-dropping reset: keeps the old row numbers in an 'index' column
    gameshots = gameshots.reset_index()

    # court side per shot, then SportVU coordinates
    gameshots = sidingAlgorithm(gameshots)
    xy_columns = ['X', 'Y', 'sec_in_quarter', 'quarter', 'PLAYER_ID', 'SIDE']
    gameshotsXY = pd.DataFrame(NBA_to_SportVU(gameshots), columns=xy_columns)

    return (gameshotsXY, gameshots)

def makeVoteSpace(motionXY, gameshotsXY):
    """For every box-score shot, pick the motion snapshots that best match it.

    For each shot, the motion rows of the same quarter whose 'sec_in_quarter'
    lies between the shot time + 0.01 and the shot time + 4 are scored with

        norm(player_xy - shot_xy) + norm(ball_xy - shot_xy) + (pred_y - 25)**2

    where pred_y extrapolates the ball's current x/y direction to the hoop line
    (x = 5 for 'Left', x = 89 otherwise). The 10 lowest-scoring rows (all rows
    if the window has fewer than 10) are that shot's votes.

    Parameters
    ----------
    motionXY : pandas.DataFrame
        Motion data with 'quarter', 'sec_in_quarter', 'ball_x', 'ball_y' and
        player columns. Presumably each player-id column is immediately
        followed by that player's x and y columns; verify against the CSV
        layout.
    gameshotsXY : pandas.DataFrame
        Columns in this order: X, Y, time in quarter, quarter, PLAYER_ID, SIDE
        (as produced by makeGameShotsXY).

    Returns
    -------
    (shots, remove) : tuple of lists
        ``shots`` has one list of motion index labels per matched shot;
        ``remove`` lists the row positions in ``gameshotsXY`` of shots that
        could not be matched and are therefore absent from ``shots``.
    """
    n_votes = 10
    shots = []
    remove = []
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
    for count, shot in enumerate(gameshotsXY.values):
        #get motion that is relevant to shot
        window = motionXY[(motionXY['quarter'] == shot[3]) &
                          (motionXY['sec_in_quarter'] >= shot[2] + 0.01) &
                          (motionXY['sec_in_quarter'] <= shot[2] + 4)]
        #try/except to skip shots that are missing from motion data
        try:
            # column of the SECOND cell equal to the player id; raises
            # IndexError when the id occurs fewer than two times in the window
            player_col = np.where(window.isin([shot[4]]))[1][1]
            # the two columns after the id column are read as the player's x and y
            player_xy = window.iloc[:, [player_col + 1, player_col + 2]].values
            ball_xy = window[['ball_x', 'ball_y']].values
            ball_x = ball_xy[:, 0]
            ball_y = ball_xy[:, 1]
            #box-score shot location
            target = np.array([shot[0], shot[1]], dtype=float)
            #x coordinate of the hoop the shot is aimed at
            hoop_x = 5 if shot[5] == 'Left' else 89
            #line through consecutive ball positions, extrapolated to the hoop line
            slope = np.diff(ball_y) / np.diff(ball_x)
            intercept = ball_y[:-1] - slope * ball_x[:-1]
            #the first snapshot has no preceding difference; it gets a predicted y of 0
            pred_y = np.array([0.0] + list(slope * hoop_x + intercept))
            distance = (np.linalg.norm(player_xy - target, axis=1)
                        + np.linalg.norm(ball_xy - target, axis=1)
                        + (pred_y - 25) ** 2)
            # Partition at k-1 so the first k entries really are the k smallest
            # (partitioning at 5 only guaranteed the six smallest).
            k = min(n_votes, len(distance))
            closest = np.argpartition(distance, k - 1)[:k]
            shots.append([window.index[pos] for pos in closest])
        except (IndexError, ValueError):
            #shot not found in the motion data: report it for removal
            remove.append(count)

    return (shots, remove)

def findShots(motion, shots, minima=None):
    """Turn each shot's voted snapshots into one [start, end] row range.

    Every voted snapshot is mapped to the position (in ``minima``) of the last
    minimum at or before it, or -1 if there is none. The most frequent position
    wins, with the smallest value winning ties. The returned range runs from
    the minimum one position before the winner to the minimum two positions
    after it.

    Parameters
    ----------
    motion : pandas.DataFrame
        Motion data; only its length is used unless ``minima`` is omitted.
    shots : list of lists
        Per shot, the motion row positions voted for it (see makeVoteSpace).
        Each inner list must be non-empty, otherwise ValueError is raised.
    minima : sequence of int, optional
        Sorted positions of ball-height minima. Computed with
        ``getRelMinima(motion)`` when not given.

    Returns
    -------
    list of [start, end]
        One pair per entry of ``shots``.
    """
    if minima is None:
        minima = getRelMinima(motion)
    shot_parabola = []
    for candidates in shots:
        #create votespace for shots
        votespace = [bisect_right(minima, frame) - 1 for frame in candidates]
        # Mode of the votes, smallest value on ties. Done in plain Python
        # because scipy.stats.mode returns scalars for 1-D input in current
        # SciPy, which breaks the former mode(...)[0][0] indexing.
        start = max(sorted(set(votespace)), key=votespace.count)
        #two minima later closes the range; fall back to the last row
        try:
            end = minima[start + 2]
        except IndexError:
            end = len(motion) - 1
        #one minimum earlier opens the range; fall back to the first row
        if start - 1 < 0:
            start = 0
        else:
            start = minima[start - 1]
        shot_parabola.append([start, end])

    return shot_parabola

def removeShots(shot_parabolas, gameshots, motion):
    """Discard shots whose ball height never reaches 7 inside their row range.

    ``shot_parabolas`` is edited in place (and returned); the rows at the same
    positions are dropped from ``gameshots``, which is returned re-indexed
    from 0.
    """
    ball_height = motion['ball_z']
    #positions whose highest ball_z within [start, end) stays below 7
    remove = [position for position, bounds in enumerate(shot_parabolas)
              if max(ball_height.iloc[bounds[0]:bounds[1]]) < 7]

    #drop the same positions from the shot log
    gameshots = gameshots.drop(gameshots.index[remove]).reset_index(drop=True)
    #delete back to front so earlier positions stay valid
    for position in reversed(remove):
        del shot_parabolas[position]

    return (shot_parabolas, gameshots)

def makeShotsDF(motion, gameshots, shot_parabolas, n_points=50):
    """Build one fixed-length feature row per shot from its ball trajectory.

    For every [start, end] range the ball's x, y and z series
    (``motion.iloc[start:end]``) are linearly resampled to ``n_points`` values
    each. x is first replaced by ``|47 - ball_x|`` and y by ``|ball_y - 25|``
    so that the court side does not matter.

    Parameters
    ----------
    motion : pandas.DataFrame
        Motion data with 'ball_x', 'ball_y' and 'ball_z' columns.
    gameshots : pandas.DataFrame
        Shot log aligned by position with ``shot_parabolas``; supplies
        'SHOT_MADE_FLAG' and 'ACTION_TYPE'.
    shot_parabolas : list of [start, end]
        Row ranges, one per shot. Each range must contain at least one row.
    n_points : int, optional
        Number of resampled values per coordinate (default 50).

    Returns
    -------
    pandas.DataFrame
        Columns x0.., y0.., z0.. (``n_points`` each), then 'outcome' and 'type'.
    """
    def resample(values):
        # Sample positions run 0..len-1, so the query grid must stop at len-1.
        # Running it to len made np.interp clamp the tail to the last value.
        values = np.asarray(values, dtype=float)
        grid = np.linspace(0, len(values) - 1, n_points)
        return np.interp(grid, np.arange(len(values)), values)

    allballx = []
    allbally = []
    allballz = []
    predictor = []
    for count, bounds in enumerate(shot_parabolas):
        segment = motion.iloc[bounds[0]:bounds[1]]
        allballz.append(resample(segment['ball_z']))
        #take absolute value so that side doesn't matter
        allbally.append(resample(np.absolute(segment['ball_y'] - 25)))
        allballx.append(resample(np.absolute(47 - segment['ball_x'])))
        #add in ground truth
        predictor.append([gameshots['SHOT_MADE_FLAG'].iloc[count],
                          gameshots['ACTION_TYPE'].iloc[count]])
    #make variable names
    xcol = ['x' + str(i) for i in range(n_points)]
    ycol = ['y' + str(i) for i in range(n_points)]
    zcol = ['z' + str(i) for i in range(n_points)]
    return pd.concat([pd.DataFrame(allballx, columns=xcol),
                      pd.DataFrame(allbally, columns=ycol),
                      pd.DataFrame(allballz, columns=zcol),
                      pd.DataFrame(predictor, columns=['outcome', 'type'])], axis=1)

In [None]:
# Build the shot-feature table: one pass of the full pipeline per game CSV.
# NOTE(review): the listing below is relative to the current working directory,
# which processData only changes later, so the notebook must already be started
# from the project directory.
shot_frames = []
# sorted() makes the processing order (and so the row order) reproducible
for file_name in sorted(os.listdir(os.curdir+'/CSV Data')):
    # skip anything that is not a game file (checkpoints, hidden files, ...)
    if not file_name.endswith('.csv'):
        continue
    #make gameid: strip the extension and the first "00"
    game_id = int(file_name.replace('.csv', '').replace('00', '', 1))
    #get motion data
    motion = processData(game_id)
    #get play ranges and delete not in play
    ranges = getPlayRanges(motion)
    motion = deleteNotInPlay(motion, ranges)
    #make gameshots and overwrite its coordinates with the SportVU ones
    gameshotsXY, gameshots = makeGameShotsXY(game_id)
    gameshots['LOC_X'] = gameshotsXY['X']
    gameshots['LOC_Y'] = gameshotsXY['Y']
    #get votespace and removed shots
    shots, remove = makeVoteSpace(motion, gameshotsXY)
    #remove shots that could not be matched
    gameshots = gameshots.drop(gameshots.index[remove]).reset_index(drop = True)
    # axis passed by keyword: the positional form was removed in pandas 2.0
    gameshots = gameshots.drop('index', axis=1)
    #get all shot parabolas
    shot_parabolas = findShots(motion, shots)
    #filter all shot parabolas
    shot_parabolas, gameshots = removeShots(shot_parabolas, gameshots, motion)
    #collect this game's rows; they are concatenated once after the loop
    shot_frames.append(makeShotsDF(motion, gameshots, shot_parabolas))

#make final dataframe (empty when no game file was found)
shotsdf = pd.concat(shot_frames) if shot_frames else pd.DataFrame()

In [None]:
# Remove exact duplicate rows, renumber the index from 0, then write the table
# (index column included) to shotsdf.csv in the current working directory.
shotsdf = (
    shotsdf
    .drop_duplicates()
    .reset_index(drop=True)
)
shotsdf.to_csv('shotsdf.csv')

In [None]:
#show a bounded preview of shotsdf instead of dumping the whole frame
shotsdf.head(10)