# Developing the Algorithm

The code below is still sparsely documented; its purpose is to show that meaningful features can be extracted from the ball-motion and shot data in a straightforward way.

In [362]:
import os
import pandas as pd
import numpy as np
import re

#Plotting
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
#Default size (width, height) applied to every matplotlib figure created in this notebook.
matplotlib.rcParams['figure.figsize'] = (10, 7.0)
#Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
#NOTE(review): this also hides the warning for any code that assigns columns onto a
#slice of another DataFrame, so such assignments will pass silently.
pd.options.mode.chained_assignment = None

In [1]:
###DATA LOADING FUNCTIONS###

def initializeShots(path="",loadGitHub=True):
    """Load the player shot table (playershotsdf.csv) as a DataFrame.

    Parameters
    ----------
    path : str
        Directory containing playershotsdf.csv. Only used when loadGitHub is
        falsy. A trailing separator is optional.
    loadGitHub : bool
        If truthy, read the CSV straight from the project's GitHub repository.
        If falsy, try the local file first and fall back to GitHub when the
        local file cannot be opened.

    Returns
    -------
    pandas.DataFrame
        The parsed shot table.
    """
    if not loadGitHub:
        #os.path.join copes with a directory given with or without a trailing separator.
        local_file=os.path.join(path,"playershotsdf.csv")
        try:
            nba_shots=pd.read_csv(local_file)
        except OSError:
            #Only a missing/unreadable file triggers the fallback; parse errors and
            #keyboard interrupts are no longer silently swallowed.
            print("File not found. Using GitHub instead")
        else:
            print("Loaded from directory.")
            return(nba_shots)
    nba_shots=pd.read_csv("https://raw.githubusercontent.com/pvacek/160-Team-Duncan/master/playershotsdf.csv")
    print("Loaded from GitHub.")
    return(nba_shots)

def loadGameData(file,shot_df,shot_path="",loadShots=False):
    """Load one game's ball-motion CSV and the matching rows of the shot table.

    Parameters
    ----------
    file : str
        Path to the ball-motion CSV. The game id is taken from the file name:
        everything up to and including the last "_00" and the ".csv" suffix
        are stripped, and the remainder is parsed as an integer.
    shot_df : pandas.DataFrame
        Already-loaded shot table; used when loadShots is False.
    shot_path : str
        Directory of the local shot CSV; used only when loadShots is True.
    loadShots : bool
        If True, (re)load the shot table via initializeShots instead of using
        shot_df. With an empty shot_path it is read from GitHub; otherwise the
        local directory is tried first.

    Returns
    -------
    tuple
        (motion, game): the motion DataFrame and the rows of the shot table
        whose GAME_ID equals the id parsed from the file name.

    Raises
    ------
    ValueError
        If the file name does not reduce to an integer game id.
    """
    #Load ball motion data
    motion=pd.read_csv(file)

    #Load in the shot data if asked for.
    if loadShots:
        #Previously this passed an unknown keyword and an undefined variable,
        #so loadShots=True could never succeed.
        nba_shots=initializeShots(path=shot_path,loadGitHub=not shot_path)
    else:
        nba_shots=shot_df

    #Find the matching data within the NBA shot data.
    #Raw string: same pattern as before without the invalid "\_" string escape.
    game_id=int(re.sub(r"(^.+_00|\.csv)","",file))
    game=nba_shots[nba_shots["GAME_ID"]==game_id]

    #Return the two frames.
    return(motion,game)

In [4]:
###DATA PREPROCESSING, THE NEW STUFF###

def shotSearch(motion,time,epsilon):
    """Find the motion row whose time_full is closest to a given shot time.

    Parameters
    ----------
    motion : pandas.DataFrame
        Motion data with a numeric "time_full" column.
    time : float
        Shot time to look up, on the same scale as time_full.
    epsilon : float
        Half-width of the preferred search window around time.

    Returns
    -------
    list
        [time_index, time_error]: the index label of the closest row and the
        absolute time difference to it. Rows inside [time-epsilon, time+epsilon]
        are searched first; if none exist, every row with a non-missing
        time_full is searched instead, so the returned error may exceed
        epsilon. If there are no usable rows at all, [0, 2*epsilon] is returned.
    """
    full_times=motion["time_full"]
    in_window=(full_times<=(time+epsilon))&(full_times>=(time-epsilon))
    candidates=motion[in_window]
    if len(candidates)==0:
        #Empty window: brute-force over the whole frame rather than returning an
        #arbitrary index 0, which would corrupt the partition boundaries built
        #from these indices.
        candidates=motion[full_times.notna()]
    if len(candidates)==0:
        #Nothing to match against; keep the original sentinel.
        return([0,2*epsilon])
    time_difference=np.abs(time-candidates["time_full"].values)
    nearest=np.argmin(time_difference)
    return([candidates.index[nearest],time_difference[nearest]])

def shotPartition(motion,game,epsilon=25):
    """Split the motion data into one contiguous block of rows per shot.

    Both frames get a common "time_full" clock, are sorted by it in descending
    order, and each shot is matched to its nearest motion row via shotSearch.
    Block boundaries are the rounded midpoints between consecutive matched
    row positions.

    Parameters
    ----------
    motion : pandas.DataFrame
        Motion data with "time" and "qtr" columns. Not modified.
    game : pandas.DataFrame
        Shot data with "PERIOD", "MINUTES_REMAINING" and "SECONDS_REMAINING"
        columns. Not modified.
    epsilon : float
        Search half-width passed to shotSearch (default 25, the value that was
        previously hard-coded).

    Returns
    -------
    tuple
        (motion_sort, game_sort, partition_lengths): the sorted motion frame
        with an added "id" column naming the shot each row is assigned to, the
        sorted shot frame with an added "error" column, and the number of
        motion rows in each block.
    """
    num_quarters=np.max(game.PERIOD)
    #NOTE(review): every period is weighted as 720 time units - presumably seconds in a
    #12-minute quarter; confirm this is acceptable for overtime periods.
    game_time=game.MINUTES_REMAINING*60+game.SECONDS_REMAINING+(num_quarters-game.PERIOD)*720
    motion_time=motion.time+(num_quarters-motion.qtr)*720

    #assign() returns new frames, so the caller's motion/game objects are left untouched.
    game_sort=game.assign(time_full=game_time).sort_values(by="time_full",ascending=False).reset_index(drop=True)
    motion_sort=motion.assign(time_full=motion_time).sort_values(by="time_full",ascending=False).reset_index(drop=True)

    #Locate the motion row closest in time to each shot.
    shot_searches=pd.DataFrame([shotSearch(motion_sort,time,epsilon) for time in game_sort.time_full],columns=["time_index","error"])
    shot_midpoints=(shot_searches.time_index.values[1:]+shot_searches.time_index.values[:-1])/2
    partition_lengths=np.diff(np.hstack((0,np.round(shot_midpoints),len(motion))).astype(int))

    #Also, add on the error, add on the id for the motion dataset
    game_sort["error"]=shot_searches.error
    motion_sort["id"]=np.repeat(np.linspace(0,len(game)-1,len(game)),partition_lengths)

    return(motion_sort,game_sort,partition_lengths)

def sidingAlgorithm(game):
    """Label every shot with the side ("Left"/"Right") it is attributed to.

    The first row whose SHOT_ZONE_AREA equals "Left Side(L)" is the reference
    shot. If its LOC_X is positive, rows of the reference shot's team get
    indicator 1 and all other rows 0; otherwise the assignment is reversed.
    Indicators are then flipped for every row with PERIOD > 2, and 1 is
    labelled "Left", 0 "Right".

    Parameters
    ----------
    game : pandas.DataFrame
        Shot data with "SHOT_ZONE_AREA", "TEAM_NAME", "LOC_X" and "PERIOD"
        columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of game with a "SIDE" column, one label per input row.

    Raises
    ------
    ValueError
        If no row has SHOT_ZONE_AREA equal to "Left Side(L)".
    """
    #Find our candidate shot
    left_positions=np.flatnonzero(np.asarray(game.SHOT_ZONE_AREA=="Left Side(L)"))
    if left_positions.size==0:
        raise ValueError("sidingAlgorithm needs at least one shot with SHOT_ZONE_AREA 'Left Side(L)'")
    the_shot=game.iloc[left_positions[0]]

    #Determine the team that took the shot
    is_shot_team=np.asarray(game.TEAM_NAME==the_shot.TEAM_NAME)

    #Set indicators based on shot.
    if the_shot.LOC_X>0:
        side_indicator=np.where(is_shot_team,1,0)
    else:
        side_indicator=np.where(is_shot_team,0,1)

    #Flip indicator for 2nd half.
    later_periods=np.asarray(game.PERIOD>2)
    side_indicator[later_periods]=1-side_indicator[later_periods]

    #np.where builds the label array with a dtype wide enough for both strings;
    #np.vectorize without otypes infers the output dtype from its first call only.
    sides_labeled=np.where(side_indicator==1,"Left","Right")

    #Attach positionally, so the result has one row per shot whatever the index of
    #game is (concat along axis=1 aligns on index labels and can add NaN rows).
    return(game.assign(SIDE=sides_labeled))

def NBA_to_SportVU(game):
    """Map each shot's (LOC_X, LOC_Y) to an (X, Y) pair in the motion data's frame.

    LOC_X is rescaled to LOC_X/10+25 and LOC_Y to LOC_Y/10+5. For rows with
    SIDE == "Left" the result is (LOC_Y/10+5, 50-(LOC_X/10+25)); for rows with
    SIDE == "Right" it is (94-(LOC_Y/10+5), LOC_X/10+25). Rows with any other
    SIDE value come out as (0, 0).
    NOTE(review): 94 and 50 look like court length/width in feet - confirm.

    Returns a 2-D array with one row per shot and columns (X, Y).
    """
    on_left=(game.SIDE=="Left").to_numpy()
    on_right=(game.SIDE=="Right").to_numpy()

    #Rescaled shot coordinates before the side-dependent mirroring.
    across=game.LOC_X/10+25
    along=game.LOC_Y/10+5

    #Boolean masks act as 0/1 weights selecting the formula for each side.
    sportvu_y=on_left*(50-across)+on_right*across
    sportvu_x=on_left*along+on_right*(94-along)

    return(np.column_stack((sportvu_x.to_numpy(),sportvu_y.to_numpy())))

def mergeSportVU(motion,game):
    """Attach each shot's remaining attributes to the motion rows assigned to it.

    A fixed set of columns is dropped from game, its index is exposed as an
    "id" column, and the result is inner-joined to motion on "id".
    """
    #Columns of the shot table that are not carried into the merged frame.
    unused_columns=["GRID_TYPE","SHOT_ATTEMPTED_FLAG","LOC_X","LOC_Y","MINUTES_REMAINING",
                    "SECONDS_REMAINING","EVENT_TYPE","time_full"]
    shots_by_id=(
        game.drop(columns=unused_columns)
        .reset_index()
        .rename(columns={'index':'id'})
    )
    return(pd.merge(motion,shots_by_id,how='inner',on='id'))

def computeBallHoopDists(game_motion,shot_XY,partitions):
    """Compute, for every motion row, its distance to the shot location and to the hoop.

    Parameters
    ----------
    game_motion : pandas.DataFrame
        Merged motion/shot frame with a "SIDE" column. The columns at positions
        4 and 5 are used as the row's coordinates.
        NOTE(review): assumes those two columns are the ball's X and Y - confirm.
    shot_XY : numpy.ndarray
        One (X, Y) row per shot.
    partitions : sequence of int
        Number of motion rows belonging to each shot; row i of shot_XY is
        repeated partitions[i] times to line up with game_motion.

    Returns
    -------
    pandas.DataFrame
        Columns "dshot" (Euclidean distance to the repeated shot location) and
        "dhoop" (Euclidean distance to (5, 25) when SIDE is 'Left', otherwise
        to (89, 25)), one row per motion row.
    """
    game_coords=game_motion.iloc[:,4:6].values
    shot_coords=np.repeat(shot_XY,partitions,axis=0)
    #Hoop position per row: X depends on SIDE, Y is always 25.
    hoop_coords=np.vstack((np.where(game_motion.SIDE=='Left',5,89),np.repeat(25,len(game_motion)))).T
    dshot=np.linalg.norm(game_coords-shot_coords,axis=1)
    dhoop=np.linalg.norm(game_coords-hoop_coords,axis=1)
    distance_df=pd.DataFrame(np.vstack((dshot,dhoop)).T,columns=["dshot","dhoop"])
    return(distance_df)

In [None]:
def ProcessData(motion,game):
    """Run the whole preprocessing pipeline for one game.

    Takes the raw motion frame and the game's shot frame and returns the merged
    motion/shot frame with the "dshot" and "dhoop" distance columns appended.
    """
    #STEP 0: keep only the first occurrence of each fully duplicated motion row
    deduped_motion=motion.loc[~motion.duplicated()]

    #STEP 1: split the motion rows into one block per shot
    sorted_motion,sorted_game,block_sizes=shotPartition(deduped_motion,game)

    #STEP 2: label the side attributed to each shot
    sided_game=sidingAlgorithm(sorted_game)

    #STEP 3: express the shot locations in the motion data's coordinates
    shot_locations=NBA_to_SportVU(sided_game)

    #STEP 4: join the shot attributes onto the motion rows
    merged=mergeSportVU(sorted_motion,sided_game)

    #STEP 5: distance-to-shot and distance-to-hoop features
    distances=computeBallHoopDists(merged,shot_locations,block_sizes)

    #STEP 6: final frame = merged data with the distance columns alongside
    return(pd.concat([merged,distances],axis=1))

In [7]:
###FINAL FUNCTION

def read_sportVU(file,shot_df,shot_path="",loadShots=False):
    """Load one game's motion file plus its shots and return the processed frame.

    Arguments are forwarded unchanged to loadGameData; the resulting
    (motion, game) pair is passed to ProcessData. Progress is printed before
    each of the two stages.
    """
    print("Reading in data...")
    loaded=loadGameData(file,shot_df,shot_path,loadShots)
    raw_motion,game_shots=loaded
    print("Processing data...")
    return(ProcessData(raw_motion,game_shots))

In [456]:
###EXAMPLE:

#Locations used by this example.
#NOTE(review): machine-specific absolute paths - edit to match the local layout.
MOTION_DIR="E:/output"
SHOTS_DIR="E:/ProjectTimDuncan/"

#Find where the motion files are. Only .csv files are kept and the list is sorted,
#so file_names[0] is deterministic and can never be an unrelated file.
os.chdir(MOTION_DIR)
file_names=sorted(name for name in os.listdir() if name.endswith(".csv"))

#Initialize shot data (local copy first, GitHub as fallback)
nba_shots=initializeShots(path=SHOTS_DIR,loadGitHub=False)

#Load and process the first game.
cha_tor=read_sportVU(file_names[0],nba_shots)