In [146]:
import os
import pandas as pd
import numpy as np
import re

#Plotting
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (10, 7.0)
import seaborn as sns
pd.options.mode.chained_assignment = None

__Creating a classification table from motion data__

__Step 1:__ Remove any rows nan values from dataframe

__Step 2:__ Create a $n \times 13$ tensor where each entry contains $X,Y,Z$ coordinates of each object. The $n$ rows denotes the $n$ moments we have from SportVU, and the $p=13$ denotes the 13 objects we are tracking: the ball, 10 players, and the 2 hoops.

__Step 3:__ Compute the distance between the ball and every other object at time $t$ with the Euclidean norm: $||<X_{0},Y_{0},Z_{0}>-<X,Y,Z>||$ for all $n$ rows of the tensor.

__Step 4:__ Find which player or hoop is closest to the ball at time $t$, map the player number to a corresponding name using the list of players we have.

__Step 5:__ For continuous sequences of a player/hoop being closest to the ball, compress the data with run-length encoding.

__Step 6:__ Threshold the RLE with $k \geq 7$, which removes noisy possession changes.

__Step 7:__ When the ball is considered "unpossessed", determine the sequence dependent on what the previous state is and the next state. For example, a sequence that happens to have 2 players on the same team that sandwiches a "Ball unpossessed" is considered a __pass__.

In [136]:
def processData(game_id):
    os.chdir("F:/CSV DATA")
    motion=pd.read_csv("00"+str(game_id)+".csv",low_memory=False)
    motion=motion[motion.isnull().any(axis=1)==False].reset_index(drop=True)
    return(motion)

def makePlayersList(game_id):
    objects=pd.DataFrame(np.array([["Ball unpossessed","NBA"],["Left hoop","NBA"],["Right hoop","NBA"]]),
             columns=["PLAYER_NAME","TEAM_NAME"],index=[-1,0,1])
    players=shots[shots["GAME_ID"]==game_id][["PLAYER_ID","PLAYER_NAME","TEAM_NAME"]].drop_duplicates()
    players=players.set_index(players.PLAYER_ID.astype(int)).drop('PLAYER_ID',axis=1)
    plist=pd.concat([objects,players],axis=0)
    return(plist)

def makeXYZ(motion):
    #Set the X and Y values by adding ball, hoop coordinates
    ball_x_vals=motion.iloc[:,4].values.reshape(len(motion),1)
    player_x_vals=motion.iloc[:,8::3].values.reshape(len(motion),10)
    hoop_x_vals=np.repeat(np.array([5,89]).reshape(1,2),len(motion),axis=0)
    x_data=np.hstack((ball_x_vals,player_x_vals,hoop_x_vals))
    
    ball_y_vals=motion.iloc[:,5].values.reshape(len(motion),1)
    player_y_vals=motion.iloc[:,9::3].values.reshape(len(motion),10)
    hoop_y_vals=np.repeat(np.array([25,25]).reshape(1,2),len(motion),axis=0)
    y_data=np.hstack((ball_y_vals,player_y_vals,hoop_y_vals))
    
    #Set the Z values, set de facto player height to 6, hoop height to 10
    ball_z_vals=motion.iloc[:,6].values.reshape(len(motion),1)
    other_z_vals=np.repeat(np.array([6]*10+[10]*2).reshape(1,12),len(motion),axis=0)
    z_data=np.hstack((ball_z_vals,other_z_vals))
    #Stack the data into a 3d array with dstack
    xyz_array=np.dstack((x_data,y_data,z_data))
    return(xyz_array)

def selectTeams(teams):
    search_string="|".join(teams.tolist()+["NBA"])
    which_teams=players_list.TEAM_NAME.str.contains(search_string)
    teams_list=players_list.loc[which_teams]
    return(teams_list)

def closestObjectAlgorithm(motion,players_list,ball_dist):
    obj_index=pd.Series(np.hstack((np.linspace(7,34,10,dtype="int"),np.array([37,38]))))
    ball_min=ball_dist.min(axis=1)
    obj_num=ball_dist.idxmin(axis=1)-1
    
    hoop_ids=pd.DataFrame(np.repeat(np.array([0,1]).reshape(2,1),len(motion),axis=1).T,columns=["lhoop_id","rhoop_id"])
    motion_hoop=pd.concat([motion,hoop_ids],axis=1)
    
    obj_col=obj_num.map(obj_index)
    col_names=obj_col.map(pd.Series(motion_hoop.columns))
    obj_ids=motion_hoop.lookup(col_names.index,col_names.values)
    closest_object=players_list.loc[obj_ids]
    closest_object=closest_object.reset_index().rename(columns={"index":"PLAYER_ID"})
    closest_object["dist"]=ball_min
    closest_object["height"]=motion["ball_z"]
    closest_object.ix[closest_object.dist>=5,"PLAYER_NAME"]="Ball unpossessed"
    closest_object.ix[closest_object.dist>=5,"PLAYER_ID"]=-1
    closest_object=closest_object.dropna()
    return(closest_object)

def rle(inarray):
        """ run length encoding. Partial credit to R rle function. 
            Multi datatype arrays catered for including non Numpy
            returns: tuple (runlengths, startpositions, values) """
        ia = np.array(inarray)                  # force numpy
        n = len(ia)
        if n == 0: 
            return (None, None, None)
        else:
            y = np.array(ia[1:] != ia[:-1])     # pairwise unequal (string safe)
            i = np.append(np.where(y), n - 1)   # must include last element posi
            z = np.diff(np.append(-1, i))       # run lengths
            p = np.cumsum(np.append(0, z))[:-1] # positions
            return(z, p, ia[i])

def rle_to_df(rle_seq,seq_name):
    rle_df=pd.DataFrame(np.vstack(rle_seq).T,columns=["length","idx",seq_name])
    return(rle_df)

def rle_threshold(motion,rle_df,k):
    small_runs=np.where(rle_df.length<k)[0]
    small_runs_trim=trimSequence(rle_df,small_runs)
    big_runs=np.where(np.in1d(np.linspace(0,len(rle_df)-1,len(rle_df),dtype="int"),small_runs_trim)==False)[0]
    rle_subset=rle_df.loc[big_runs]
    new_runs=np.diff(rle_subset.idx)
    new_runs=np.hstack((new_runs,len(motion)-np.sum(new_runs)))
    rle_subset.length=new_runs
    new_runs=np.repeat(rle_subset.iloc[:,2].values,rle_subset.length.astype(int))
    new_rle=rle(new_runs)
    new_rle_df=pd.DataFrame(np.vstack(new_rle).T,columns=["length","idx",rle_df.columns[2]])
    return(new_rle_df)

def trimSequence(rle_df,sequence):
    if sequence[0] == 0:
        sequence=sequence[1:]
    if sequence[-1] == len(rle_df)-1:
        sequence=sequence[:-1]
    return(sequence)

def findSequence(i,rle_df):
    connection=np.array([rle_df["name"][i-1],rle_df["name"][i+1]])
    team=rle_df["team"][i-1]
    start=rle_df["idx"][i]
    end=start+rle_df["length"][i]
    if rle_df["team"][i-1]==rle_df["team"][i+1]:
        play="PASS"
    elif rle_df["name"][i+1]=="Left hoop" or rle_df["name"][i+1]=="Right hoop":
        play="SHOT"
    elif rle_df["team"][i-1]=="NBA" and rle_df["team"][i+1]!="NBA":
        play="REBOUND"
    else:
        play="TURNOVER"
    sequence=np.hstack((team,connection,play,start,end))
    return(sequence)

In [4]:
#Get the dataframe of shot locations
shots=pd.read_csv("F:/ProjectTimDuncan/playershotsdf.csv")

#Load the list of players we are working with
players_list=pd.read_csv("F:/ProjectTimDuncan/playerlist.csv",index_col="Unnamed: 0")

#Get the play-by-play dataframe
pbp=pd.read_csv("F:/ProjectTimDuncan/pbpdf.csv")
game_pbp=pbp[pbp["GAME_ID"]==21500592]

#Create game and motion data
motion=processData(21500003)

In [85]:
motion=processData(21500093)

In [86]:
teams=np.unique(shots[shots["GAME_ID"]==21500093].TEAM_NAME)

In [138]:
def makeClassTable(motion,players_list,k):
    print("Initializing X,Y,Z coordinates of all objects...")
    XYZ=makeXYZ(motion)
    print("Finding distance between ball and other objects...")
    ball_dist=pd.DataFrame([np.linalg.norm(xyz-xyz[0],axis=1) for xyz in XYZ]).iloc[:,1:]
    print("Finding the closest objects by Euclidean distance...")
    closest_objects=closestObjectAlgorithm(motion,players_list,ball_dist)
    print("Finding run lengths of possessions...")
    name_rle=rle_threshold(motion,rle_to_df(rle(closest_objects.PLAYER_NAME),"name"),k)
    id_rle=rle_threshold(motion,rle_to_df(rle(closest_objects.PLAYER_ID),"pid"),k)
    name_id_rle=pd.concat([name_rle,id_rle["pid"]],axis=1)
    name_id_rle["team"]=players_list.loc[name_id_rle["pid"]].TEAM_NAME.values
    print("Defining game sequences...")
    change_points=trimSequence(name_id_rle,np.where(name_id_rle["name"]=="Ball unpossessed")[0])
    sequences=np.vstack([findSequence(i,name_id_rle) for i in change_points])
    sequence_df=pd.DataFrame(sequences,columns=["team","p1","p2","play","start","end"])
    duplicate_plays=sequence_df.iloc[:,1]==sequence_df.iloc[:,2]
    sequence_df=sequence_df[duplicate_plays==False].reset_index(drop=True)
    return(sequence_df)

__Extracting plays for all games__

In [None]:
game_ids=np.array([int(o.lstrip('00').strip('.csv')) for o in os.listdir()])

In [137]:
def makeGamePlays(game_id):
    print(str(game_id))
    motion=processData(game_id)
    players_list=makePlayersList(game_id)
    class_table=makeClassTable(motion,players_list,7)
    id_col=pd.DataFrame([game_id]*len(class_table),columns=["GAME_ID"])
    game_class_table=pd.concat([id_col,class_table],axis=1)
    os.chdir("F:/classification")
    game_class_table.to_csv(str(game_id)+".csv",index=False)

In [145]:
[makeGamePlays(gid) for gid in game_ids]

21500094
Initializing X,Y,Z coordinates of all objects...
Finding distance between ball and other objects...
Finding the closest objects by Euclidean distance...
Finding run lengths of possessions...
Defining game sequences...
21500095
Initializing X,Y,Z coordinates of all objects...
Finding distance between ball and other objects...
Finding the closest objects by Euclidean distance...
Finding run lengths of possessions...
Defining game sequences...
21500096
Initializing X,Y,Z coordinates of all objects...
Finding distance between ball and other objects...
Finding the closest objects by Euclidean distance...
Finding run lengths of possessions...
Defining game sequences...
21500097
Initializing X,Y,Z coordinates of all objects...
Finding distance between ball and other objects...
Finding the closest objects by Euclidean distance...
Finding run lengths of possessions...
Defining game sequences...
21500098
Initializing X,Y,Z coordinates of all objects...
Finding distance between ball and 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,