# Modeling movements to predict Defensive Pass Interference

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# setting plots' configuration
plt.rcParams["figure.figsize"]= 50,30
%config InlineBackend.figure_format = 'retina'

import matplotlib.patches as mpatches
from matplotlib.patches import FancyArrowPatch

import scipy.special
import math
import random
from scipy.stats import multivariate_normal

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

# play-level and game-level tables of the competition data
plays = pd.read_csv('../input/nfl-big-data-bowl-2021/plays.csv')
games = pd.read_csv('../input/nfl-big-data-bowl-2021/games.csv')

# one tracking table per week (files week1.csv ... week17.csv), read in week order
frames = [
    pd.read_csv('../input/nfl-big-data-bowl-2021/week{}.csv'.format(week_number))
    for week_number in range(1, 18)
]

# keep a separate name for every weekly table, as the rest of the notebook refers to them
(week1, week2, week3, week4, week5, week6, week7, week8, week9,
 week10, week11, week12, week13, week14, week15, week16, week17) = frames

# creating a unique dataset for all the tracking data, re-numbered with a fresh 0..N-1 index
weeks = pd.concat(frames, axis=0, ignore_index=True)

Imagine being an **NFL coach** at the desk of his/her office. With our staff's help, we are analyzing several passing plays shown on the huge monitor in front of us. The goal? *Studying the defensive strategies that can hinder the strengths of the opponent we will have to face.* 

It is **right at this moment** that the approach shown in this notebook can be useful. Namely, can we **predict** whether a **Defensive Pass Interference will be called** on the analyzed passing play, based on **the players' movements?** 

In other words, let's imagine that the monitor in front of us visualizes this particular scenario:

In [None]:
# one figure holding a single axes; `ax1` is the axes handle the arrows are added to
fig, ax1 = plt.subplots(nrows=1, ncols=1)

# identifier of the passing play used as the running example
playID = 75

# week-1 tracking rows of that play
# NOTE(review): the filter uses playId only; if the same playId occurs in several
# week-1 games, rows of all of them are selected - confirm, or also filter on gameId.
possible_track = week1.loc[week1['playId'] == playID]

# distinct display names found in those rows (the players involved + the ball)
names = possible_track["displayName"].unique()


def arrow(x,y,ax,color, alpha):
    """
    Function to draw the arrows of the movement.

    Arrow heads are placed at the sample 14 positions before the end of the
    trajectory and at its last sample; each arrow points from the previous
    sample to the chosen one.

    :param x: positions on x-axis (indexable sequence)
    :param y: positions on y-axis (indexable sequence, same length as x)
    :param ax: axes the arrow patches are added to
    :param color: color of the arrows
    :param alpha: color's intensity of the arrows
    :return: None; the arrows are added to `ax`
    """
    n_points = len(x)

    # indices of the samples that receive an arrow head
    ind = np.arange(n_points - 14, n_points, 13)

    # computing of the arrows
    for i in ind:
        # An arrow needs a real predecessor sample. For trajectories shorter
        # than 15 samples the first index is <= 0, and x[i-1] would silently
        # wrap around to the end of the trajectory (or fall out of range).
        if i < 1:
            continue
        ar = FancyArrowPatch((x[i-1], y[i-1]), (x[i], y[i]),
                             arrowstyle='fancy', mutation_scale=50, color=color, alpha=alpha)
        ax.add_patch(ar)


def draw(player_df, colorr, alpha, ax=None):
    """
    Function to draw the movement
    :param player_df: tracking data of the player
    :param colorr: color of the movement
    :param alpha: color's intensity of the movement
    :param ax: axes to draw on; when omitted, the current pyplot axes are used
    :return: None; the movement is plotted on axes scaled to the field
    """
    # Without an explicit axes, draw on the current one, so the trajectory,
    # the field lines and the arrows always end up on the same axes.
    if ax is None:
        ax = plt.gca()

    # removing variables of no interest: only the two position columns remain
    Y = player_df.drop(['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route'],axis=1)

    # converting into a float matrix (first column on the x-axis, second on the y-axis)
    Y = Y.to_numpy().astype(float)

    # plot's dimension proportional to the field
    ax.set_xlim([0, 120])
    ax.set_ylim([0, 53.3])

    # plot of the movement: solid line with circle markers; the colour is given
    # only through the keyword, so it is not defined twice
    ax.plot(Y[:,0], Y[:,1], '-o', color=colorr, alpha=alpha)

    # line of scrimmage
    ax.axvline(x=90, color='black')
    ax.text(90.5, 2, 'line of scrimmage ', fontsize=42, color='black', weight='bold')

    # line of endzone
    ax.axvline(x=10, color='black')
    ax.text(3, 20, 'ENDZONE ', fontsize=100, color='black', weight='bold', rotation='vertical')

    # plot of the arrows on the movement
    arrow(Y[:,0], Y[:,1], ax, colorr, alpha=alpha)

    
# colour of each trajectory drawn in Figure 1; rows with any other team label
# (the defensive team, labelled 'home') are intentionally not drawn here
team_colours = {'football': 'brown', 'away': 'blue'}

# loop over each name involved in the passing play
for player_name in names:

    # dataframe of tracking data of that "name"
    df = possible_track[possible_track['displayName'] == player_name]

    # the team label is read from the first row of that player's data
    team = df["team"].iloc[0]
    if team in team_colours:
        colorr = team_colours[team]
        draw(df, colorr, 1)

# The decorations below do not depend on the player, so they are added once
# after the loop instead of being stacked on top of each other at every iteration.

# legend
blue_patch = mpatches.Patch(color='blue', label= 'Offensive Team')
brawn_patch = mpatches.Patch(color='brown', label= 'Ball')
plt.legend(handles=[blue_patch,brawn_patch], loc='upper right',prop={'size': 30})

# arrow of the direction of the attack
plt.arrow(65, 10, -26, 0, length_includes_head=True, head_width=2, head_length=3)
plt.text(43.5, 10.4, 'direction of attack ',fontsize=42, color = 'black')

# title
plt.title("Possible opponent's strategy during a passing play",size=95)

# caption
plt.text(60, -5, 'Figure 1', ha='center',size=70)

After a large number of intensive meetings with our staff, we have decided that the following strategy is the best way to counter this passing play.

In [None]:
# setting plot's configuration
fig, (ax1) = plt.subplots(1, 1)

# colour of each trajectory: the ball, the offensive team and the defensive team
team_colours = {'football': 'brown', 'away': 'blue', 'home': 'red'}

# loop over each name involved in the passing play
for player_name in names:

    # dataframe of tracking data of that "name"
    df = possible_track[possible_track['displayName'] == player_name]

    # the team label is read from the first row of that player's data
    team = df["team"].iloc[0]
    if team in team_colours:
        colorr = team_colours[team]
        draw(df, colorr, 1)

# The decorations below do not depend on the player, so they are added once
# after the loop instead of being stacked on top of each other at every iteration.

# legend
red_patch = mpatches.Patch(color='red', label= 'Defensive Team')
blue_patch = mpatches.Patch(color='blue', label= 'Offensive Team')
brown_patch = mpatches.Patch(color='brown', label= 'Ball')
plt.legend(handles=[red_patch,blue_patch,brown_patch], loc='upper right',prop={'size': 30})

# arrow of the direction of the attack
plt.arrow(65, 10, -26, 0, length_includes_head=True, head_width=2, head_length=3)
plt.text(43.5, 10.4, 'direction of attack ',fontsize=42, color = 'black')

# title
plt.title("Defensive strategy", size=95)

# caption
plt.text(60, -5, 'Figure 2', ha='center',size=70)

[**Defensive Pass Interference (DPI)**](https://operations.nfl.com/the-rules/nfl-video-rulebook/defensive-pass-interference/#:~:text=It%20is%20pass%20interference%20by,opportunity%20to%20catch%20the%20ball) is a crucial aspect that **can shift the balance of the match.** Therefore, we would like to take it into consideration during the analysis of the defensive strategies. Namely, by moving in this way in response to this passing play, **will we commit a DPI?** 

Once we answer this question, we will be able **to avoid defensive strategies that could lead the team into a DPI.**

By keeping in mind this goal, let's move towards how to build an answer to this question.

# Datasets

The first step is to get a closer look at the two datasets that we are going to **manipulate** mainly in this analysis:
- *plays*: containing the information about all passing plays during the 2018 regular season.

In [None]:
# display the `plays` table as the output of this cell
plays

- *weeks*: containing all the passing plays' tracking data in *plays*.

In [None]:
# display the concatenated tracking table `weeks` as the output of this cell
weeks

# Problem

As we can observe, **each passing play** in *plays* is uniquely identified by a **combination of two variables**: *playId* and *gameId*.

To answer whether a DPI will be called or not, the two datasets *weeks* and *plays* are manipulated, in order to obtain a **new dataset** in which **classification algorithms** can be applied.

We want to obtain a **new dataset** in which each row is a passing play characterized by a set of **new** features. These **new** features should **efficiently collect** compelling **information** from the dataset *weeks* and *plays*, for **each different** passing play. 

# How to gather information from tracking data

Using the dataset *weeks* as it is, is **unthinkable**. We need a method that can **model** the movement for each player **without losing too much information.** 

The technique used in this notebook is based on [Bézier curves](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/edit?disco=AAAAHfghZEE), which are parametric curves that can be seen from a statistical point of view. More information about this method's theory can be found in my [Final Bachelor’s Project](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/view); in this notebook, the key concepts are shown.

By applying this method, we will **estimate a set of points** (in two-dimensional Euclidean space) which can **uniquely describe the movement of interest.** Let's see an example.

In [None]:
# setting plot's configuration
fig, (ax1) = plt.subplots(1, 1)

# dataframe of the tracking data of the player of interest
Y = possible_track[ possible_track['displayName'] == 'Nate Gerry']

# removing variables of no interest: only the two position columns remain
Y = Y.drop(['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route'],axis=1)

# converting into a float matrix
Y = Y.to_numpy().astype(float)

# colour of each background trajectory: the ball, the offensive team and the defensive team
team_colours = {'football': 'brown', 'away': 'blue', 'home': 'red'}

# loop over each name involved in the passing play; every trajectory is drawn
# faintly (alpha 0.1) so that the player of interest stands out
for player_name in names:

    # dataframe of tracking data of that "name"
    df = possible_track[possible_track['displayName'] == player_name]

    # the team label is read from the first row of that player's data
    team = df["team"].iloc[0]
    if team in team_colours:
        colorr = team_colours[team]
        draw(df, colorr, 0.1)

# Everything below is independent of the loop variable, so it is drawn once
# after the loop instead of once per player.

# legend
red_patch = mpatches.Patch(color='red', label= 'Defensive Team')
blue_patch = mpatches.Patch(color='blue', label= 'Offensive Team')
brawn_patch = mpatches.Patch(color='brown', label= 'Ball')
orange_patch = mpatches.Patch(color='darkorange', label= 'Player of interest')
plt.legend(handles=[red_patch,blue_patch,brawn_patch,orange_patch], loc='upper right',prop={'size': 30})

# arrow of the direction of the attack
plt.arrow(65, 10, -26, 0, length_includes_head=True, head_width=2, head_length=3)
plt.text(43.5, 10.4, 'direction of attack ',fontsize=42, color = 'black')

# plot of tracking data of the player of interest, fully opaque.
# alpha must lie in [0, 1]; the arrows use the same colour as the line and the legend.
plt.plot(Y[:,0], Y[:,1], '-o', color='darkorange', alpha=1)
arrow(Y[:,0], Y[:,1], ax1, color='darkorange', alpha=1)

# title
plt.title("Focus on one single player",size=95)

# caption
plt.text(60, -5, 'Figure 3', ha='center',size=70)

The above graph shows the same passing play that was seen before (Figure 2), but our focus now is on the movement of the player coloured in orange. Given this smooth trajectory, we can [estimate](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/edit?disco=AAAAHfghZGY) (exploiting Bézier curves [through a statistical point of view](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/edit?disco=AAAAHfghZDw)) a new set of points that **uniquely** describes this **movement.**

In [None]:
# new figure with a single axes; `ax1` is the handle later passed to arrow()
fig, ax1 = plt.subplots(nrows=1, ncols=1)


def b(t, d, degree=None):
    """
    Evaluate one Bernstein basis term of a Bézier curve,
    binom(degree, d) * t**d * (1-t)**(degree-d).
    These terms are the entries of the matrix formulation of Bézier curves.
    :param t: input of the Bézier curve, a value between 0 and 1
    :param d: index of the basis term (0 <= d <= degree)
    :param degree: order of the Bézier curve; when omitted, the module-level
                   variable `n` is used, as in the original two-argument call
    :return: value of the basis term at t
    """
    if degree is None:
        # backward-compatible behaviour: the order is taken from the global `n`
        degree = n
    return (t**d) * ((1-t)**(degree-d)) * scipy.special.binom(degree, d)

# order of the Bézier curve (read by b() through the global name `n`)
n = 4

# how many observations the movement has
m = Y.shape[0]

# matrix T (more information look at chap. 2.1.3 "Statistical Modeling of Trajectories with Bézier Curves"):
# one row per observation, one column per basis term; kept as the single element of a list
T = [np.zeros((m, n + 1))]

# equally spaced curve parameters in [0, 1], one per observation
t = np.linspace(0, 1, T[0].shape[0], endpoint=True)

# fill T column by column with the basis terms evaluated at every parameter value
for h in range(n + 1):
    T[0][:, h] = [b(float(value), h) for value in t]

# estimated points: pinv(T'T) T' Y
gram_inverse = np.linalg.pinv(np.dot(T[0].T, T[0]))
theta_hat = np.dot(np.dot(gram_inverse, T[0].T), Y)


def Bezier_function(t, degree, control_points=None):
    """
    This function computes the output of the Bézier curve of a given order
    for fixed control points.
    :param t: scalar input of Bézier curve that takes a value between 0 and 1
    :param degree: order of the Bézier curve
    :param control_points: array with at least degree+1 rows, one control point
                           per row; when omitted, the module-level `theta_hat`
                           is used, as in the original two-argument call
    :return: point of the curve at t (a two-dimensional vector when the
             control points have two columns)
    """
    if control_points is None:
        # backward-compatible behaviour: use the globally estimated points
        control_points = theta_hat

    n = degree

    # weighted sum of the control points, the weights being the basis terms at t
    a = 0
    for i in range(0, n+1):
        a = scipy.special.binom(n, i) * (1-t)**(n-i) * t**i * control_points[i,:] + a
    return a

# evaluate the fitted Bézier curve at 1000 equally spaced parameter values
t = np.linspace(start=0, stop=1, num=1000)
A = np.zeros((1000, 2))
for i, parameter in enumerate(t):
    A[i, :] = Bezier_function(parameter, n)

# observed positions of the player of interest, with the direction arrows
plt.scatter(Y[:, 0], Y[:, 1], c='darkorange')
arrow(Y[:, 0], Y[:, 1], ax1, color='darkorange', alpha=1)

# the estimated points, drawn larger
plt.scatter(theta_hat[:, 0], theta_hat[:, 1], c='blueviolet', s=300)

# axes limits proportional to the field
plt.xlim([0, 120])
plt.ylim([0, 53.3])

# vertical lines for the line of scrimmage (x = 90) and the endzone (x = 10)
for x_position in (90, 10):
    plt.axvline(x=x_position, color='black')

# labels of the two lines
plt.text(90.5, 2, 'line of scrimmage ', fontsize=42, color='black', weight='bold')
plt.text(3, 20, 'ENDZONE ', fontsize=100, color='black', weight='bold', rotation='vertical')

# legend (`red_patch` holds the blueviolet entry of the estimated points)
red_patch = mpatches.Patch(color='blueviolet', label='Estimated points')
orange_patch = mpatches.Patch(color='darkorange', label='Player of interest')
plt.legend(handles=[red_patch, orange_patch], loc='upper right', prop={'size': 30})

# title
plt.title("Estimated points of this specific movement", size=95)

# caption (the trailing semicolon suppresses the cell's text output)
plt.text(60, -5, 'Figure 4', ha='center', size=70);

In other words, instead of using all observations of the movement, **these five estimated points** (purple points) can **uniquely identify this particular movement!** 

If you would like to check whether these estimated points actually refer uniquely to that specific movement, we can verify it by computing the output of the [fourth-order](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/edit?disco=AAAAHhgt1Ek) Bézier curve for 1000 values of $ t \in [0,1] $, fixing $ p_0, p_1, p_2, p_3, p_4 $ equal to the estimated points computed above.

In [None]:
# new figure with a single axes; `ax1` receives the arrows of the curve
fig, ax1 = plt.subplots(nrows=1, ncols=1)

# observed positions of the player of interest
plt.scatter(Y[:, 0], Y[:, 1], c='darkorange')

# the estimated points, drawn larger
plt.scatter(theta_hat[:, 0], theta_hat[:, 1], c='blueviolet', s=300)

# axes limits proportional to the field
plt.xlim([0, 120])
plt.ylim([0, 53.3])

# vertical lines for the line of scrimmage (x = 90) and the endzone (x = 10)
for x_position in (90, 10):
    plt.axvline(x=x_position, color='black')

# labels of the two lines
plt.text(90.5, 2, 'line of scrimmage ', fontsize=42, color='black', weight='bold')
plt.text(3, 20, 'ENDZONE ', fontsize=100, color='black', weight='bold', rotation='vertical')

# points of the Bézier curve stored in A, with the direction arrows
plt.scatter(A[:, 0], A[:, 1], c='green')
arrow(A[:, 0], A[:, 1], ax1, color='green', alpha=1)

# legend (`red_patch` holds the blueviolet entry of the estimated points)
red_patch = mpatches.Patch(color='blueviolet', label='Estimated points')
orange_patch = mpatches.Patch(color='darkorange', label='Player of interest')
green_patch = mpatches.Patch(color='green', label='Output Bézier Curve')
plt.legend(handles=[red_patch, orange_patch, green_patch], loc='upper right', prop={'size': 30})

# title
plt.title("Output Bézier curve of fourth-order", size=95)

# caption (the trailing semicolon suppresses the cell's text output)
plt.text(60, -5, 'Figure 5', ha='center', size=70);

As we can observe, the curve obtained from the fourth-order Bézier curve is similar to the player's movement!

If we had chosen to adopt a higher number of estimated points, we would have obtained Bézier curves' output closer to that of actual movement. **The power** of seeing Bézier curves through a **statistical point of view** is that you can not only compute an estimate of points that uniquely describes the movement, [**but also identify the ”correct order” of the Bézier curve.**](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/edit?disco=AAAAHfghZG8) Namely, the order that can represent the movement **without losing the signal**, but at the same time, it can **avoid over-fitting.**

In this context, I decided to adopt the Bézier curve of **fourth-order** since it seemed a **good trade-off** by looking at the [AIC values](https://drive.google.com/file/d/1iNhRpEiCRHS9hBGmFIaWpwjpHlpKui5N/edit?disco=AAAAHipHaLM). 

In [None]:
# Looking at the AIC values for each player is impractical, given the dimension of the dataset.
# Here we compute the AIC values of 10 random defensive players instead.
# This cell can be run multiple times, so that the decision is not based on one single sample of 10 elements!


def vector(Matrix):
    """
    This function computes the vectorization, namely a linear transformation
    which stacks the columns of a matrix one after the other.
    :param Matrix: a two-dimensional array
    :return: one-dimensional float array with the columns of Matrix in order
    """
    return np.asarray(Matrix, dtype=float).flatten(order='F')


def Akaike(Y,theta_hat,observations,variables,parameters):
    """
    This function computes the AIC value in a multivariate linear regression context.
    The design matrix is read from the module-level list `T` (its first element).
    :param Y: response variable that is a two-dimensional matrix
    :param theta_hat: it contains the estimated points
    :param observations: number of observations
    :param variables: number of variables (in this context two)
    :param parameters: Bézier curve's order
    :return: the AIC value (a float)
    """
    p = variables
    k = parameters
    n = observations

    # T[0] is the design matrix
    design = T[0]

    # projection matrix
    P = np.dot(design, np.dot(np.linalg.pinv(np.dot(design.T, design)), design.T))

    # estimate of the error variance
    S = np.dot(np.dot(Y.T, (np.eye(Y.shape[0]) - P)), Y) / (Y.shape[0] - p - 1)

    # mean of the Multivariate Normal distribution (vectorized fitted values)
    Y_t = vector(Y)
    mean = vector(np.dot(design, theta_hat))

    # covariance matrix of the Multivariate Normal distribution
    var = np.kron(S, np.eye(Y.shape[0]))

    # computational safeguard: shift the diagonal when an eigenvalue is negative
    min_eig = np.min(np.real(np.linalg.eigvals(var)))
    if min_eig < 0:
        var -= 10 * min_eig * np.eye(*var.shape)

    # log-likelihood of vec(Y). logpdf is used instead of log(pdf): with this
    # many dimensions the density itself can underflow to 0, which makes
    # math.log fail.
    L = multivariate_normal.logpdf(Y_t, mean, var)

    # Akaike's formula
    return -2*L + 2*n*(p*k + p*(p+1)/2)/(n-(k+p+1))


# tracking variables that are not positions; dropping them leaves the two position columns
non_position_columns = ['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber',
                        'position','frameId','team','gameId','playId','playDirection','route']

# random sampling of 10 passing plays (index labels of `plays`)
sampled_list = random.sample(list(plays.index), 10)

# orders of the Bézier curve that are fitted for every sampled player: 2, 3, ..., max_order + 1
max_order = 10
orders = np.arange(2, max_order + 2)

# looping over the 10 sampled passing plays
for sampled_index in sampled_list:

    # row of the sampled play (looked up by its sampled label, not by the loop position)
    play_row = plays.loc[sampled_index]
    gameID = play_row['gameId']
    playID = play_row['playId']

    # team in possession of the ball (seventh column of `plays`)
    possTeam = play_row.iloc[6]

    # checking which team is the defender: when column 3 of `games` equals the
    # team in possession, the defence is the "away" side, otherwise the "home" side
    game_rows = games[games['gameId'] == gameID].reset_index(drop=True)
    defend = "away" if game_rows.iloc[0, 3] == possTeam else "home"

    # tracking data of the defending side in that specific play of that specific game
    focus2 = weeks[(weeks['playId'] == playID)
                   & (weeks['team'] == defend)
                   & (weeks['gameId'] == gameID)].reset_index(drop=True)

    # nothing to sample from when the play has no tracking rows for the defence
    if focus2.empty:
        continue

    # names of the players involved in that specific defensive passing play
    names = focus2['displayName'].unique().astype(str)

    # random sampling of one of the players in "names"
    name = random.sample(list(names), 1)

    # tracking data of the sampled player, reduced to the two position columns
    Y = focus2[focus2['displayName'] == name[0]]
    Y = Y.drop(non_position_columns, axis=1).to_numpy().astype(float)

    # if the attack goes from left to right, the play is mirrored on both axes
    # (field of 120 x 53.3) so that every play attacks from right to left
    if focus2['playDirection'].iloc[0] == 'right':
        Y = np.column_stack((120 - Y[:, 0], 53.3 - Y[:, 1]))

    akaike = np.zeros(max_order)
    m = Y.shape[0]

    # loop over the possible orders of the Bézier curve
    for c, order in enumerate(orders):
        # b() reads the order from the global name `n`
        n = int(order)

        # MATRIX T: one row per observation, one column per basis term
        T = [np.zeros((m, n + 1))]
        t = np.linspace(0, 1, m, endpoint=True)
        for h in range(n + 1):
            T[0][:, h] = b(t, h)

        # estimated points
        theta_hat = np.dot(np.dot(np.linalg.pinv(np.dot(np.transpose(T[0]), T[0])), np.transpose(T[0])), Y)

        # computation of AIC value
        akaike[c] = Akaike(Y, theta_hat, m, 2, n)

    # one curve per sampled player, plotted against the orders actually fitted
    plt.plot(orders, akaike, '-o')

# title and axis labels are set once for the whole figure
plt.title("AIC values for 10 defensive players chosen randomly",size=75)
plt.xlabel("Bézier curve's order",size=55)
plt.ylabel("AIC's values",size=55)

Now, by extending the approach of the estimated points to each movement of this analyzed defensive strategy, we can obtain the following result:

In [None]:
# dataframe of tracking data of all defensive players of the passing play that was taken into consideration
alls = possible_track[possible_track['team']== 'home']

# names of the defensive players (computed once, outside the loop)
defender_names = alls["displayName"].unique()

# Bezier curve's order adopted (b() reads it through the global name `n`)
n = 4

# loop over all defensive players
for defender_name in defender_names:

    # tracking data of that defender, reduced to the two position columns
    Y = possible_track[possible_track['displayName'] == defender_name]
    Y = Y.drop(['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route'],axis=1)

    # converting into a float matrix
    Y = Y.to_numpy().astype(float)

    m = Y.shape[0]

    # MATRIX T: one row per observation, one column per basis term
    T = [np.zeros((m, n + 1))]
    t = np.linspace(0, 1, m, endpoint=True)
    for h in range(n + 1):
        T[0][:, h] = b(t, h)

    # matrix of estimated points
    theta_hat = np.dot(np.dot(np.linalg.pinv(np.dot(np.transpose(T[0]), T[0])), np.transpose(T[0])), Y)

    # computation Bézier curve's output (Bezier_function reads the global `theta_hat`)
    t = np.linspace(start=0, stop=1, num=1000)
    A = np.zeros((1000, 2))
    for i in range(0, 1000):
        A[i, :] = Bezier_function(t[i], n)

    # plot each defensive movement
    plt.scatter(Y[:, 0], Y[:, 1], c='red')

    # plot estimated points
    plt.scatter(theta_hat[:, 0], theta_hat[:, 1], c='blueviolet', s=200)

    # plot Bézier curve's output
    plt.scatter(A[:, 0], A[:, 1], c='green')

# The field decorations do not depend on the player, so they are drawn once
# after the loop instead of once per defender.

# plot's dimension proportional to the field
plt.xlim([0, 120])
plt.ylim([0, 53.3])

# line of scrimmage
plt.axvline(x= 90, color =  'black')
plt.text(90.5, 2, 'line of scrimmage ',fontsize=42, color = 'black',weight = 'bold')

# line of endzone
plt.axvline(x= 10, color =  'black')
plt.text(3, 20, 'ENDZONE ',fontsize=100, color = 'black', weight = 'bold', rotation='vertical')

# legend
violet_patch = mpatches.Patch(color='blueviolet', label= 'Estimated points')
red_patch = mpatches.Patch(color='red', label= 'Defensive Team')
green_patch = mpatches.Patch(color='green', label= 'Output Bézier Curves')
plt.legend(handles=[violet_patch,red_patch,green_patch], loc='upper right',prop={'size': 30})

# plot arrow of the direction of the attack
plt.arrow(65, 10, -26, 0, length_includes_head=True, head_width=2, head_length=3)
plt.text(43.5, 10.4, 'direction of attack ',fontsize=42, color = 'black')

# title
plt.title("Estimated points for all defensive movements of this defensive strategy",size=95)

# caption
plt.text(60, -5, 'Figure 6', ha='center',size=70);

A $35 \times 2$ matrix containing the estimated points has been obtained for this particular defensive strategy ( 5 estimated points for each of the seven moving players involved in this passing play).

Let's see now how this matrix can be **effectively transformed** into a feature vector for this passing play.

## Dimensionality reduction

To use the matrix of estimated points as the feature vector of this passing play, a dimensionality reduction technique is necessary. In this context, the standard linear technique, Principal Component Analysis (PCA), is used. By applying PCA, the $35 \times 2$ matrix can be **transformed** into a $35 \times 1$ array, which can then be used as **the new feature vector of that specific passing play.**

As you can imagine, each passing play has its defensive strategy in which a different number of moving defenders is involved. By restating what we implied before, we should define new variables that can accurately collect the relevant information from the tracking data (✅), **for each different passing play.**

# Each passing play is different

A different number of moving defensive players is involved in each passing play. This means that, depending on the personnel used in the defensive strategy, we have a **different number of estimated points.** As observed before, we obtained a $35 \times 2$ matrix because there were seven moving defensive players involved. But, if nine moving defensive players had been involved, we would have obtained a $45 \times 2$ matrix.

To apply a classification algorithm, we need a dataset in which each observation has the same number of features. Once the dimensionality reduction is applied, the obtained feature vector might have a different dimension depending on the moving defensive players involved in that specific passing play. To avoid this problem, the [K-means](https://www.youtube.com/watch?v=4b5d3muPQmA&t=85s) technique is applied. In essence, **each** passing play's estimated points are **replaced** by a **fixed number** of **centroids** which is the **same** for each passing play. Therefore, the PCA is applied to **the centroids**. 

*The advantage?* Every passing play has a defensive strategy defined by **the same number** of points (**centroids obtained**) even though a different personnel is used.

*The disadvantage?* Some information can be lost after applying the K-means technique, because the estimated points are replaced by a smaller number of centroids.

This heuristic method gives us a **good trade-off** between representation and classification's practicability. 

In [None]:
# dataframe of tracking data of all defensive players of the passing play that was taken into consideration
alls = possible_track[possible_track['team']== 'home']

# names of the defensive players (computed once, outside the loop)
defender_names = alls["displayName"].unique()

# list of the estimated points, one matrix per defensive player of this passing play
thetas = []

# Bezier curve's order adopted (b() reads it through the global name `n`)
n = 4

# loop over all defensive players
for defender_name in defender_names:

    # tracking data of that defender, reduced to the two position columns
    Y = possible_track[possible_track['displayName'] == defender_name]
    Y = Y.drop(['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route'],axis=1)

    # converting into a float matrix
    Y = Y.to_numpy().astype(float)

    m = Y.shape[0]

    # MATRIX T: one row per observation, one column per basis term
    T = [np.zeros((m, n + 1))]
    t = np.linspace(0, 1, m, endpoint=True)
    for h in range(n + 1):
        T[0][:, h] = b(t, h)

    # matrix of estimated points
    theta_hat = np.dot(np.dot(np.linalg.pinv(np.dot(np.transpose(T[0]), T[0])), np.transpose(T[0])), Y)

    thetas.append(theta_hat)

    # plot each defensive movement
    plt.scatter(Y[:, 0], Y[:, 1], c='red')

    # plot estimated points of each movement
    plt.scatter(theta_hat[:, 0], theta_hat[:, 1], c='blueviolet', s=300)

# The field decorations do not depend on the player, so they are drawn once
# after the loop instead of once per defender.

# plot's dimension proportional to the field
plt.xlim([0, 120])
plt.ylim([0, 53.3])

# line of scrimmage
plt.axvline(x= 90, color =  'black')
plt.text(90.5, 2, 'line of scrimmage ',fontsize=42, color = 'black',weight = 'bold')

# line of endzone
plt.axvline(x= 10, color =  'black')
plt.text(3, 20, 'ENDZONE ',fontsize=100, color = 'black', weight = 'bold', rotation='vertical')

# stack the estimated points of all defenders into one matrix (one point per row)
thetas = np.concatenate(thetas)

# Applying K-means algorithm on the estimated points
# (25 clusters: the play must provide at least 25 estimated points)
kmeans = KMeans(n_clusters=25)
kmeans.fit_predict(thetas)

# matrix of the centroids
centroids = kmeans.cluster_centers_

# plot centroids obtained
plt.scatter(centroids[:, 0], centroids[:, 1], c='goldenrod', s=100)

# legend
violet_patch = mpatches.Patch(color='blueviolet', label= 'Estimated points')
red_patch = mpatches.Patch(color='red', label= 'Defensive Team')
golden_patch = mpatches.Patch(color='goldenrod', label= 'Centroids')
plt.legend(handles=[violet_patch,red_patch,golden_patch], loc='upper right',prop={'size': 30})

# plot arrow of the direction of the attack
plt.arrow(65, 10, -26, 0, length_includes_head=True, head_width=2, head_length=3)
plt.text(43.5, 10.4, 'direction of attack ',fontsize=42, color = 'black')

# title
plt.title("Centroids obtained with K-Means for this passing play",size=95)

# caption
plt.text(60, -5, 'Figure 7', ha='center',size=70);

## What is the "right" number of centroids that you should adopt?

In this specific context, we do not want to lose too much information when using K-means to define fewer centroids as replacements for each passing play's estimated points. Thus, we should use the **largest possible number** of centroids. The largest possible number of centroids corresponds to the minimum number of moving defensive players involved in a passing play multiplied by five.

By looking at the variable *personnelD* in *plays*, which shows the personnel used by the defensive team, we can see that every passing play has at least 5 defenders in motion, except for one single passing play. Therefore, I have chosen to delete this observation in order to increase the number of centroids (used for **each** passing play) from 20 to 25, **thereby enhancing the centroids' representativeness!**

In [None]:
# number of non-missing 'isDefensivePI' values for each value of 'personnelD'
plays.groupby('personnelD')['isDefensivePI'].count().to_frame()

# The only passing play with 4 defensive players in motion is dropped in the next cell.

# Defensive strategy of each passing play

Before generalizing the method to the defensive strategy of each passing play, we need to clean the dataset *plays*, since some passing plays have no tracking data for the defensive team (and therefore no defensive strategy to model). The resulting data frame, called *key*, becomes the reference point, since it collects the *gameId*, the *playId* and the *defending team* of each *clean* passing play.

In [None]:
# Lookup tables built in a single pass each, so that the loop below no longer
# rescans 'plays', 'games' and 'weeks' for every passing play.

# every (gameId, playId, team) combination that appears in the tracking data
tracked = weeks[['gameId', 'playId', 'team']].drop_duplicates()
tracked = set(zip(tracked['gameId'], tracked['playId'], tracked['team']))

# value of the 4th column of 'games' (compared below with the team in possession) for each game;
# the first row wins if a game is repeated
home_team = {}
for game_id, home in zip(games['gameId'], games.iloc[:, 3]):
    home_team.setdefault(game_id, home)

# team in possession of the ball (7th column of 'plays') for each play; the first row wins if a play is repeated
poss_team = {}
for game_id, play_id, poss in zip(plays['gameId'], plays['playId'], plays.iloc[:, 6]):
    poss_team.setdefault((game_id, play_id), poss)

# list of all clean passing plays
key = []

# loop over each different passing play
for gameID, playID in zip(plays['gameId'], plays['playId']):

    # team in possession of the ball
    possTeam = poss_team[(gameID, playID)]

    # checking which team is the defender
    defend = "away" if home_team[gameID] == possTeam else "home"

    # the passing play is kept only if the tracking data contain at least one row of its defending team
    if (gameID, playID, defend) in tracked:
        key.append([gameID, playID, defend])

# converting into a dataframe
key = pd.DataFrame(key, columns=['gameId', 'playId', 'Defender'])

# passing play(s) with only 4 players in motion; '.iloc'-free lookup on the (gameId, playId) pairs
# replaces int(Series), which is deprecated in recent pandas
rare = plays[plays['personnelD'] == '7 DL, 3 LB, 1 DB']
rare_ids = set(zip(rare['gameId'], rare['playId']))
is_rare = np.array([pair in rare_ids for pair in zip(key['gameId'], key['playId'])], dtype=bool)

# Let's remove the only passing play with 4 defensive players in motion
# (nothing is dropped, instead of raising, if that play is not in 'key')
key = key[~is_rare].reset_index(drop=True)
key

After the generalization, we obtain a dataset in which each passing play is characterized by dif$_1$, ..., dif$_{25}$. These values result from the PCA's application on the 25 centroids, which were obtained from the estimated points of each different passing play.

In [None]:
# list of centroids obtained from defensive strategy for each passing play, sorted by distance from the end zone (defensive)
protos = []

# variables of no interest: every column listed here is dropped before the fit
no_interest = ['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route']

# Bezier curve's order adopted: each movement is summarised by n+1 estimated points
n = 4

# loop over each clean passing play
for i in range(0, len(key)):

    gameID = key.iloc[i, 0]
    playID = key.iloc[i, 1]

    # the defending side was stored in the third column of 'key' when it was built,
    # so it is read from there instead of being recomputed from 'plays' and 'games'
    defend = key.iloc[i, 2]

    # filtering of weeks with that specific values of "gameID", "playID" and "defend"
    focus4 = weeks[weeks['playId'] == playID]
    focus3 = focus4[focus4['team'] == defend]
    focus2 = focus3[focus3['gameId'] == gameID].reset_index(drop=True)

    # names of the players involved in that specific defensive passing play
    names = focus2['displayName'].unique().astype(str)

    # the direction of the attack is read from the first row of the play (column 17)
    flip = focus2.iloc[0, 17] == 'right'

    # list of the estimated points for this passing play
    thetas = []

    # loop over each defensive player's name involved in 'i' passing play
    for name in names:

        # dataframe of tracking data of the "name"
        Y = focus2[focus2['displayName'] == name]

        if flip:
            # attack from left to right: the play is flipped so that every passing play has the same
            # direction of attack (from right to left). Columns 1 and 2 are rescaled from [0, 120] to
            # [120, 0] and from [0, 53.3] to [53.3, 0] with the same formula as before, applied to the
            # whole column at once instead of row by row.
            new_x = ((Y.iloc[:, 1].to_numpy() - 0) * (0 - 120)) / (120 - 0) + 120
            new_y = ((Y.iloc[:, 2].to_numpy() - 0) * (0 - 53.3)) / (53.3 - 0) + 53.3
            Y = np.column_stack([new_x, new_y])
        else:
            # removing variables of no interest
            Y = Y.drop(no_interest, axis=1).to_numpy()

        # converting into a float matrix
        Y = Y.astype(float)

        # one equally spaced parameter value in [0, 1] for each tracked row
        m = Y.shape[0]
        t = np.linspace(0, 1, m, endpoint=True)

        # MATRIX T: entry (j, h) is b(t_j, h), where 'b' is the basis function defined earlier in the notebook
        T = np.zeros((m, n + 1))
        for h in range(0, n + 1):
            T[:, h] = [b(float(t_j), h) for t_j in t]

        # matrix of estimated points: least-squares solution pinv(T'T) T' Y
        theta_hat = np.dot(np.dot(np.linalg.pinv(np.dot(T.T, T)), T.T), Y)
        thetas.append(theta_hat)

    thetas = np.concatenate(thetas)

    # Applying K-means algorithm on the estimated points
    kmeans = KMeans(n_clusters=25)
    kmeans.fit_predict(thetas)

    # matrix of the centroids
    centroids = kmeans.cluster_centers_

    # distance of each centroid's first coordinate from 10 (the end-zone line), used to sort the centroids
    dist = np.abs(centroids[:, 0] - 10)
    protos.append(list(centroids[np.argsort(dist)]))


# PCA on the centroids obtained: each play's 25 x 2 centroid matrix is reduced to 25 scores
_dataset = []
for centroid_list in protos:
    pca = PCA(n_components=1, whiten=True)
    X_pca = pca.fit_transform(np.array(centroid_list))
    _dataset.append(X_pca.tolist())

# transforming into a dataframe: every score is unwrapped from its one-element list while the frame
# is built, so the columns are numeric from the start (no cell-by-cell patching of an object frame)
_dataseT_dif = pd.DataFrame(
    [[score[0] for score in play_scores] for play_scores in _dataset],
    columns=[f'dif_{c}' for c in range(1, 26)],
)

_dataseT_dif

Having seen how to define a set of **new variables** that can accurately **collect** the relevant information from the tracking data for the **defensive strategies**, let's apply the same approach to the **offensive strategies** and to the **ball's movements!**

Someone may wonder: why should we also consider these movements? After all, the DPI is committed by the defensive team.

True, the DPI is committed by the defensive team, but it is also affected by the offensive strategy and ball's movement. They are both parts of the passing play. Thus, to add this information on the dataset just obtained, the same approach is applied to offensive strategies and ball's movements. 

# Offensive strategies and ball's movements

The only small differences from the method shown above are: 

- In the case of offensive strategies, we need to delete 4 passing plays so that 25 centroids can be used for every play, which increases their representativeness. Indeed, all the passing plays have a minimum of 5 moving offensive players except for these 4 passing plays.

In [None]:
# list of all passing plays that do not have offensive strategy (no tracking row for the attacking team)
null = []

# list of the passing plays with fewer than 5 offensive players in motion
min_4 = []

# Both lists hold row positions of 'key' as it is at the start of this cell;
# the two checks are done in a single pass over 'key'.

# list of centroids obtained from offensive strategy for each passing play, sorted by distance from the end zone (defensive)
protos = []

# variables of no interest: every column listed here is dropped before the fit
no_interest = ['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route']

# Bezier curve's order adopted: each movement is summarised by n+1 estimated points
n = 4

# loop over each passing play in key
for i in range(0, len(key)):

    gameID = key.iloc[i, 0]
    playID = key.iloc[i, 1]

    # the attacker is the side opposite to the defender stored in the third column of 'key'
    attack = "home" if key.iloc[i, 2] == "away" else "away"

    # filtering of weeks with that specific values of "gameID", "playID" and "attack"
    focus4 = weeks[weeks['playId'] == playID]
    focus3 = focus4[focus4['team'] == attack]
    focus2 = focus3[focus3['gameId'] == gameID].reset_index(drop=True)

    # names of the players involved in that specific offensive passing play
    names = focus2['displayName'].unique().astype(str)

    # passing play without an offensive strategy
    if len(names) == 0:
        null.append(i)
        continue

    # 25 centroids need at least 25 estimated points, i.e. at least 5 players with n+1 = 5 points each;
    # testing "fewer than 5" (rather than "exactly 4") also protects K-means from plays with 1 to 3 players
    if len(names) < 5:
        min_4.append(i)
        continue

    # the direction of the attack is read from the first row of the play (column 17)
    flip = focus2.iloc[0, 17] == 'right'

    # list of the estimated points for this passing play
    thetas = []

    # loop over each offensive player's name involved in 'i' passing play
    for name in names:

        # dataframe of tracking data of the "name"
        Y = focus2[focus2['displayName'] == name]

        if flip:
            # attack from left to right: the play is flipped so that every passing play has the same
            # direction of attack (from right to left). Columns 1 and 2 are rescaled from [0, 120] to
            # [120, 0] and from [0, 53.3] to [53.3, 0] with the same formula as before, applied to the
            # whole column at once instead of row by row.
            new_x = ((Y.iloc[:, 1].to_numpy() - 0) * (0 - 120)) / (120 - 0) + 120
            new_y = ((Y.iloc[:, 2].to_numpy() - 0) * (0 - 53.3)) / (53.3 - 0) + 53.3
            Y = np.column_stack([new_x, new_y])
        else:
            # removing variables of no interest
            Y = Y.drop(no_interest, axis=1).to_numpy()

        # converting into a float matrix
        Y = Y.astype(float)

        # one equally spaced parameter value in [0, 1] for each tracked row
        m = Y.shape[0]
        t = np.linspace(0, 1, m, endpoint=True)

        # MATRIX T: entry (j, h) is b(t_j, h), where 'b' is the basis function defined earlier in the notebook
        T = np.zeros((m, n + 1))
        for h in range(0, n + 1):
            T[:, h] = [b(float(t_j), h) for t_j in t]

        # matrix of estimated points: least-squares solution pinv(T'T) T' Y
        theta_hat = np.dot(np.dot(np.linalg.pinv(np.dot(T.T, T)), T.T), Y)
        thetas.append(theta_hat)

    thetas = np.concatenate(thetas)

    # Applying K-means algorithm on the estimated points
    kmeans = KMeans(n_clusters=25)
    kmeans.fit_predict(thetas)

    # matrix of the centroids
    centroids = kmeans.cluster_centers_

    # distance of each centroid's first coordinate from 10 (the end-zone line), used to sort the centroids
    dist = np.abs(centroids[:, 0] - 10)
    protos.append(list(centroids[np.argsort(dist)]))


# Dropping the passing plays without offensive strategy and those with too few offensive players
# in motion, from both key and _dataseT_dif, so that the two stay aligned row by row
skipped = null + min_4
key = key.drop(skipped).reset_index(drop=True)
_dataseT_dif = _dataseT_dif.drop(skipped).reset_index(drop=True)

# PCA on the centroids obtained: each play's 25 x 2 centroid matrix is reduced to 25 scores
_dataset_off = []
for centroid_list in protos:
    pca = PCA(n_components=1, whiten=True)
    X_pca = pca.fit_transform(np.array(centroid_list))
    _dataset_off.append(X_pca.tolist())

# transforming into a dataframe: every score is unwrapped from its one-element list while the frame
# is built, so the columns are numeric from the start (no cell-by-cell patching of an object frame)
_dataseT_off = pd.DataFrame(
    [[score[0] for score in play_scores] for play_scores in _dataset_off],
    columns=[f'att_{c}' for c in range(1, 26)],
)
_dataseT_off

- For the ball's movements, we do not need the K-means technique, since the ball follows a single trajectory rather than one trajectory per player. We can incorporate the estimated points directly into our dataset (after transforming them into a feature vector by using PCA).

In [None]:
# list of the estimated points for each different passing play
thetas = []

# row positions (in key) of the passing plays without ball's movement
no_ball = []

# variables of no interest: every column listed here is dropped before the fit
no_interest = ['time','s','a','dis','o','dir','event','nflId','displayName','jerseyNumber','position','frameId','team','gameId','playId','playDirection','route']

# Bezier curve's order adopted: the movement is summarised by n+1 estimated points
n = 4

# loop over each clean passing play in key
for i in range(0, len(key)):

    gameID = key.iloc[i, 0]
    playID = key.iloc[i, 1]

    # filtering of weeks with that specific values of "gameID", "playID" and "football"
    focus4 = weeks[weeks['playId'] == playID]
    focus3 = focus4[focus4['team'] == 'football']
    focus2 = focus3[focus3['gameId'] == gameID].reset_index(drop=True)

    # names of the ball involved in that specific passing play
    names = focus2['displayName'].unique().astype(str)

    # A passing play without ball's movement is recorded instead of being skipped silently:
    # otherwise 'thetas' would end up shorter than 'key' and every later row of the ball
    # features would be paired with the wrong passing play.
    if len(names) == 0:
        no_ball.append(i)
        continue

    # the direction of the attack is read from the first row of the play (column 17)
    flip = focus2.iloc[0, 17] == 'right'

    # loop over each element in "names" for each 'i' passing play
    for name in names:

        # dataframe of tracking data of the ball
        Y = focus2[focus2['displayName'] == name]

        if flip:
            # attack from left to right: the play is flipped so that every passing play has the same
            # direction of attack (from right to left). Columns 1 and 2 are rescaled from [0, 120] to
            # [120, 0] and from [0, 53.3] to [53.3, 0] with the same formula as before, applied to the
            # whole column at once instead of row by row.
            new_x = ((Y.iloc[:, 1].to_numpy() - 0) * (0 - 120)) / (120 - 0) + 120
            new_y = ((Y.iloc[:, 2].to_numpy() - 0) * (0 - 53.3)) / (53.3 - 0) + 53.3
            Y = np.column_stack([new_x, new_y])
        else:
            # removing variables of no interest
            Y = Y.drop(no_interest, axis=1).to_numpy()

        # converting into a float matrix
        Y = Y.astype(float)

        # one equally spaced parameter value in [0, 1] for each tracked row
        m = Y.shape[0]
        t = np.linspace(0, 1, m, endpoint=True)

        # MATRIX T: entry (j, h) is b(t_j, h), where 'b' is the basis function defined earlier in the notebook
        T = np.zeros((m, n + 1))
        for h in range(0, n + 1):
            T[:, h] = [b(float(t_j), h) for t_j in t]

        # matrix of estimated points: least-squares solution pinv(T'T) T' Y
        theta_hat = np.dot(np.dot(np.linalg.pinv(np.dot(T.T, T)), T.T), Y)

    # one matrix of estimated points per passing play (the one of the last name, as before)
    thetas.append(theta_hat)


# Dropping the passing plays without ball's movement from key and from the features already
# computed, so that all of them stay aligned row by row with the ball features
key = key.drop(no_ball).reset_index(drop=True)
_dataseT_dif = _dataseT_dif.drop(no_ball).reset_index(drop=True)
_dataseT_off = _dataseT_off.drop(no_ball).reset_index(drop=True)

# PCA on the estimated points obtained: each play's matrix of estimated points is reduced to one score per point
_dataset_ball = []
for estimated_points in thetas:
    pca = PCA(n_components=1, whiten=True)
    X_pca = pca.fit_transform(np.array(estimated_points))
    _dataset_ball.append(X_pca.tolist())

# transforming into a dataframe: every score is unwrapped from its one-element list while the frame
# is built, so the columns are numeric from the start (no cell-by-cell patching of an object frame)
_dataseT_ball = pd.DataFrame(
    [[score[0] for score in play_scores] for play_scores in _dataset_ball],
    columns=['ball_1', 'ball_2', 'ball_3', 'ball_4', 'ball_5'],
)
_dataseT_ball

# Other useful features

The dataset *weeks* also contains information regarding each movement, such as speed and acceleration. Incorporating these into the new dataset is crucial in **enhancing** the data’s **quality**.

Moreover, to **contextualize** each passing play, adding the variable *epa* from *plays* is necessary. This variable estimates the average of the next scoring outcome given the **down**, **distance**, **yardline** and **time remaining**. By doing so, each passing play is also characterized by its **game characteristics**.

In [None]:
# Each list holds one row per passing play. The first element of a row is the average over the
# defensive movements, the second element the average over the offensive movements and the third
# element (only for 's', 'a' and 'dis') the average over the ball's movement.
average_s = []
average_a = []
average_dis = []
average_o = []
average_dir = []

# NOTE: the temporary frames below are deliberately NOT called 'a', 'b', 'c': 'b' is the name of the
# basis function used by the earlier cells, and rebinding it here would break them on a re-run.

# loop over each passing play in key
for i in range(0, len(key)):

    gameID = key.iloc[i, 0]
    playID = key.iloc[i, 1]
    defender = key.iloc[i, 2]

    # the attacker is the side opposite to the defender
    attacker = 'home' if defender == 'away' else 'away'

    # filtering of weeks with that specific values of "playID" and "gameID"
    play_rows = weeks[weeks['playId'] == playID]
    play_rows = play_rows[play_rows['gameId'] == gameID]

    # tracking rows of the defender, of the attacker and of the ball
    defence_rows = play_rows[play_rows['team'] == defender]
    attack_rows = play_rows[play_rows['team'] == attacker]
    ball_rows = play_rows[play_rows['team'] == 'football']

    average_s.append([np.mean(defence_rows['s']), np.mean(attack_rows['s']), np.mean(ball_rows['s'])])
    average_a.append([np.mean(defence_rows['a']), np.mean(attack_rows['a']), np.mean(ball_rows['a'])])
    average_dis.append([np.mean(defence_rows['dis']), np.mean(attack_rows['dis']), np.mean(ball_rows['dis'])])

    # 'o' and 'dir' are averaged for the two teams only, not for the ball
    average_o.append([np.mean(defence_rows['o']), np.mean(attack_rows['o'])])
    average_dir.append([np.mean(defence_rows['dir']), np.mean(attack_rows['dir'])])


# converting into a dataframe
speed = pd.DataFrame(average_s, columns=['speed_dif', 'speed_att', 'speed_ball'])
acc = pd.DataFrame(average_a, columns=['acc_dif', 'acc_att', 'acc_ball'])
dis = pd.DataFrame(average_dis, columns=['dis_dif', 'dis_att', 'dis_ball'])
o = pd.DataFrame(average_o, columns=['o_dif', 'o_att'])
dire = pd.DataFrame(average_dir, columns=['dire_dif', 'dire_att'])


add_info = []

# loop over each passing play in 'key'
for i in range(0, len(key)):

    # filtering of plays with that specific values of "playID" and 'gameID'
    play_row = plays[plays['playId'] == key.iloc[i, 1]]
    play_row = play_row[play_row['gameId'] == key.iloc[i, 0]]

    # adding 'epa' (first matching row) for the passing play 'i'
    add_info.append(play_row['epa'].iloc[0])

add_info = pd.DataFrame(add_info, columns=['epa'])
add_info

We have now obtained a *new* dataset that effectively represents the **relevant information** from the tracking data, and to which a **classification algorithm** can be applied to predict DPI penalties.

In other words, we can now **predict** whether a Defensive Pass Interference will be called on the analyzed passing play, based on the **tracking data**!

In [None]:
# every block of features, placed side by side (one row per passing play)
frames = [
    _dataseT_dif, _dataseT_off, _dataseT_ball,
    speed, acc, dis, o, dire,
    add_info,
]
__final_ = pd.concat(frames, axis="columns")
__final_

In [None]:
# response variable
_y = []

# NOTE: the temporary frame is deliberately NOT called 'a' or 'b': 'b' is the name of the basis
# function used by the earlier cells, and rebinding it here would break them on a re-run.

# loop over each passing play in key
for i in range(0, len(key)):

    # filtering of plays with that specific values of "gameID" and "playID"
    play_row = plays[plays['playId'] == key.iloc[i, 1]]
    play_row = play_row[play_row['gameId'] == key.iloc[i, 0]]

    # value of 'isDefensivePI' (first matching row)
    _y.append([play_row['isDefensivePI'].iloc[0]])

_y = pd.DataFrame(_y, columns=['isDefensivePI'])
_y

# Predicting DPI

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import confusion_matrix

As you can imagine, we have far more passing plays without a DPI foul than passing plays with one. This means that we are working with an extremely unbalanced dataset. Specifically, the Imbalance Ratio (IR) is equal to 0.01.

In [None]:
# number of passing plays for each value of the response variable
_y['isDefensivePI'].value_counts()

The "standard" classification algorithms, such as Random Forest or Linear SVC, do not work correctly with this type of dataset. Therefore, we will use algorithms that perform appropriately with an unbalanced dataset from the library [*imbalanced-learn*](https://github.com/scikit-learn-contrib/imbalanced-learn).

We will try to maintain this Imbalance Ratio when we split the dataset in train and test set, trying to keep the representativeness of the results.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedShuffleSplit

# response variable encoded as a flat vector of floats (single column of the encoder's output)
ordinal_encoder = OrdinalEncoder()
y_encoder = ordinal_encoder.fit_transform(_y)[:, 0].astype(np.float64)

# feature matrix
X = np.array(__final_).astype(np.float64)

# Splitting into train and test maintaining the Imbalance Ratio:
# a single stratified shuffle split, with 25% of the passing plays in the test set
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25)
train_index, test_index = next(split.split(_y, _y['isDefensivePI']))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y_encoder[train_index], y_encoder[test_index]

### Balanced Random Forest

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Balanced Random Forest trained on the training split
brf = BalancedRandomForestClassifier().fit(X_train, y_train)

# predictions and scores on the test split
y_pred = brf.predict(X_test)
balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = round(precision_score(y_test, y_pred), 2)
recall = round(recall_score(y_test, y_pred), 2)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### RUSBoost

In [None]:
from imblearn.ensemble import RUSBoostClassifier

# RUSBoost trained on the training split
rusboost = RUSBoostClassifier(algorithm='SAMME.R').fit(X_train, y_train)

# predictions and scores on the test split
y_pred = rusboost.predict(X_test)
balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = round(precision_score(y_test, y_pred), 2)
recall = round(recall_score(y_test, y_pred), 2)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### Easy Ensemble classifier

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier

# Easy Ensemble classifier trained on the training split
eec = EasyEnsembleClassifier().fit(X_train, y_train)

# predictions and scores on the test split
y_pred = eec.predict(X_test)
balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = round(precision_score(y_test, y_pred), 2)
recall = round(recall_score(y_test, y_pred), 2)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### Balanced Bagging

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Balanced Bagging of decision trees trained on the training split
bbc = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    sampling_strategy='auto',
    replacement=False,
).fit(X_train, y_train)

# predictions and scores on the test split
y_pred = bbc.predict(X_test)
balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = round(precision_score(y_test, y_pred), 2)
recall = round(recall_score(y_test, y_pred), 2)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### Linear SVC

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Linear SVC on standardised features, trained on the training split
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=10, loss="hinge")),
]
svm_clf = Pipeline(pipeline_steps).fit(X_train, y_train)

# predictions and (unrounded) scores on the test split
y_pred = svm_clf.predict(X_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest trained on the training split
rfc = RandomForestClassifier().fit(X_train, y_train)

# predictions and (unrounded) scores on the test split
y_pred = rfc.predict(X_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### XGBoost

In [None]:
import xgboost

# XGBoost classifier trained on the training split
xgb_clf = xgboost.XGBClassifier()
xgb_clf.fit(X_train, y_train)

# predictions and scores on the test split
y_pred = xgb_clf.predict(X_test)
balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = round(precision_score(y_test, y_pred), 2)
recall = round(recall_score(y_test, y_pred), 2)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

### Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-1 decision trees, trained on the training split
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    algorithm='SAMME.R',
    learning_rate=0.5,
    n_estimators=200,
).fit(X_train, y_train)

# predictions and scores on the test split
y_pred = ada_clf.predict(X_test)
balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = round(precision_score(y_test, y_pred), 2)
recall = round(recall_score(y_test, y_pred), 2)

print(f'The balanced accuracy is: {balanced_acc}')
print(f'The confusion matrix is: \n {conf_matrix}')
print(f'The precision is: {precision}')
print(f'The recall is: {recall}')

Given the impact that DPI might have on the match, I would prefer to recognize as many DPI as possible to provide the treatment they deserve. Therefore, I would pick the model that has the **highest recall**, and the Easy Ensemble classifier is the right option.

In any case, each coach has his/her own opinion. Someone may prefer to be sure that a passing play predicted as a DPI foul actually is one, at the risk of not detecting all of them. In this case, XGBoost can be adopted since it has the **highest precision**.

The results obtained can be improved by:
- analyzing the optimal threshold
- hyperparameter optimization
- And ... **I am open to any suggestion!**

[Mattia Arsendi](https://mattia-arsendi.netlify.app/)