DESCRIPTION

More and more sports teams are trying to exploit the game data to their advantage. The NFL Next Gen Stats track the position and the speed of every player during each play. Any actionable insight from such massive data could provide coaches that extra yard needed to change the result of the game in their favor. Here a model is presented using Deep Learning techniques to predict the intended receiver before the quarterback releases the football.

METHODOLOGY
- The model discussed here uses data from the NFL Next Gen Stats, available on the Kaggle website. The data has information about each play from every game played in the 2018 season.

- As part of the data preparation for model training, the data from a single frame (snapshot) taken just before the quarterback releases the football is extracted and used as the input to the model. The frame contains the position, speed and direction of the offensive and defensive players at that moment.

- Based on the commentary provided for each play, the intended receiver of the quarterback's pass is obtained and used as the output of the model. For example, “J.Jones” is extracted as the intended receiver from the commentary “(2:01) M.Ryan pass deep left to J.Jones to …”. Offensive players are tagged beforehand so that the intended receiver can be predicted using a classification approach.

- Only pass plays (either complete or incomplete) of the games are considered.

In [None]:
import numpy as np
import pandas as pd
import os

#pd.set_option("display.max_columns", None)
#pd.set_option("display.max_rows", 200)
#pd.set_option('display.max_colwidth', None)

# Run-wide switches and parameters for the data-preparation cells.
DELETE = 1                      # truthy -> intermediate DataFrames are `del`-ed after being copied forward
para_weeks_considered = 17      # the loader reads week1.csv .. week<para_weeks_considered>.csv
para_frames_considered = 1      # not read by any code visible here (only mentioned in a comment)
para_frames_step = 1            # stride applied when sub-sampling frames inside the selected window
data_csv = "/kaggle/working/data_1_17_51.csv"   # where the prepared dataset is written

In [None]:
# Plays data
# Play-level table; later cells read passResult, playDescription, gameClock and
# the game-situation columns from it (presumably one row per play - verify).
df_plays = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2021/plays.csv")
df_plays.head()
#df_plays[(df_plays["gameId"] == 2018091700) & (df_plays["playId"] == 2560)]

In [None]:
# Weeks data
# Read the per-week tracking files, discard the rows whose displayName is
# "Football", and stack the weeks into one frame (original row labels kept).
df_weeks = []
for week in range(1, para_weeks_considered + 1):
    df_week = pd.read_csv(f"/kaggle/input/nfl-big-data-bowl-2021/week{week}.csv")
    is_ball = df_week["displayName"] == "Football"
    df_weeks.append(df_week[~is_ball])
    # Release the raw week before reading the next one.
    del df_week, is_ball
df_weeks = pd.concat(df_weeks)
df_weeks.head()

In [None]:
# Data Pre-processing (Getting relevant frames)
# For every play, keep only the tracking rows inside the frame window anchored
# on the quarterback's "pass_forward" event, renumber those frames from 1, and
# tag each player as offense ("O") or defense ("D") from the listed position.
df_w1 = df_weeks.copy()
if (DELETE):
    del df_weeks

#df_w1 = df_w1[["x","y","event","displayName","position","frameId","gameId","playId"]]         # FIX
df_w1 = df_w1[["x","y","s","o","dir","event","displayName","position","frameId","gameId","playId"]]

# Rows where a QB has the "pass_forward" event give the anchor frame of a play.
df_w1_pf = df_w1[(df_w1["position"] == "QB") & (df_w1["event"] == "pass_forward")][["gameId","playId","frameId"]].copy()
df_w1_pf["min_frameId"] = df_w1_pf["frameId"] - 0         # FIX  0 or para_frames_considered
df_w1_pf["max_frameId"] = df_w1_pf["frameId"] - 0         # FIX  0 or 1
df_w1 = df_w1.drop(columns=["event"])

df_w1 = df_w1.set_index(["gameId","playId"])
df_w1_pf = df_w1_pf.set_index(["gameId","playId"])
df_w1 = df_w1.join(df_w1_pf, on=["gameId","playId"], how='left', rsuffix='_play')

# Plays without an anchor have NaN bounds, so every comparison is False and
# their rows fall out here as well.
in_window = (df_w1["min_frameId"] <= df_w1["frameId"]) & \
            (df_w1["frameId"] <= df_w1["max_frameId"]) & \
            ((df_w1["frameId"] - df_w1["min_frameId"]) % para_frames_step == 0)
# .copy() so that the column added below is written to an independent frame
# rather than to a slice of the joined one.
df_w1 = df_w1[in_window].copy()
df_w1["new_frameId"] = df_w1["frameId"] - df_w1["min_frameId"] + 1
df_w1.drop(columns=["frameId","frameId_play","min_frameId","max_frameId"], inplace=True)

# Exact position codes. A substring/regex test is not used on purpose: the
# alternative "S" would also match "LS", and a missing position would yield a
# NaN mask entry; isin() matches whole values and treats NaN as "no match".
DEFENSE_POSITIONS = ["SS","FS","MLB","CB","LB","OLB","DL","DB","ILB","NT","S","DE"]
OFFENSE_POSITIONS = ["QB","WR","RB","TE","FB","HB"]
df_w1["Side"] = "Side"          # placeholder meaning "not tagged"
df_w1.loc[df_w1["position"].isin(DEFENSE_POSITIONS), "Side"] = "D"
df_w1.loc[df_w1["position"].isin(OFFENSE_POSITIONS), "Side"] = "O"

# NOTE: the frame is indexed by (gameId, playId) at this point, so dropping by
# index label removes every row of a play that contains an untagged player,
# not only that player's row.
untagged = df_w1["Side"] == "Side"
if untagged.any():
    df_w1.drop(df_w1[untagged].index, inplace=True)
df_w1.reset_index(inplace=True)

if (DELETE):
    del df_w1_pf
df_w1.head()

In [None]:
# Summary (column dtypes, non-null counts, memory) of the selected tracking rows.
df_w1.info()

In [None]:
# Data Pre-processing (Getting intended receiver from commentary)
# Keep complete ("C") and incomplete ("I") passes, parse the intended receiver
# out of the play description, and derive `timeLeft` from the game clock.
# Filtering and copying in one step gives an independent frame, so the column
# assignments below never write to a slice of df_plays.
df_p1 = df_plays[df_plays["passResult"].isin(["C", "I"])].copy()       # FIX
if (DELETE):
    del df_plays
#df_p1 = df_p1[(df_p1["passResult"] == "C")]

# The receiver is the first word after the first " to " in the commentary,
# e.g. "... pass deep left to J.Jones to ..." -> "J.Jones" -> "J Jones".
# regex=False makes "." a literal dot on every pandas version; as a regular
# expression it would match (and blank out) every character.
df_p1["intendedReciever"] = (
    df_p1["playDescription"]
    .str.split(" to ").str[1]
    .str.split(" ").str[0]
    .str.replace(".", " ", regex=False)
    .str.strip()
)

# Rows without a parsed receiver or without a clock value are unusable.
df_p1 = df_p1.dropna(subset=["intendedReciever", "gameClock"])
df_p1 = df_p1[['gameId','playId','quarter','down','yardsToGo','preSnapVisitorScore','preSnapHomeScore',
               'gameClock','absoluteYardlineNumber','intendedReciever']].copy()

# timeLeft = first clock field * 60 + second clock field
# (presumably minutes and seconds - confirm against the data dictionary).
clock_parts = df_p1['gameClock'].str.split(':', expand=True).astype(int)
df_p1['timeLeft'] = clock_parts[0] * 60 + clock_parts[1]

del clock_parts
df_p1.head()

In [None]:
# Summary (column dtypes, non-null counts, memory) of the filtered play table.
df_p1.info()

In [None]:
# Data pre-processing (Matching intended receiver with offensive players)
# Attach each play's intended receiver to its tracking rows (plays without one
# are discarded by the inner join) and flag rows whose player name agrees with
# it: same first character and same last space-separated word.
df_w2 = df_w1.copy()
if (DELETE):
    del df_w1

# df_p1 stays indexed by (gameId, playId) after this cell.
df_p1.set_index(["gameId","playId"], inplace=True)
df_w2 = (
    df_w2.set_index(["gameId","playId"])
         .join(df_p1["intendedReciever"], how='inner', on=["gameId","playId"])
         .reset_index()
)

same_initial = df_w2["displayName"].str[0] == df_w2["intendedReciever"].str[0]
same_last_word = (df_w2["displayName"].str.split(" ").str[-1]
                  == df_w2["intendedReciever"].str.split(" ").str[-1])
df_w2["recieverMatch"] = (same_initial & same_last_word).astype("int64")

df_w2 = df_w2.drop(columns=["intendedReciever"])
df_w2.head()

In [None]:
# Data clean up for intended receiver
# Count, per play, the flagged players other than QBs and keep only the plays
# where that count is non-zero.
qb_flagged = (df_w2["recieverMatch"] == 1) & (df_w2["position"] == "QB")
df_w2_1 = df_w2[~qb_flagged].groupby(["gameId","playId"])[["recieverMatch"]].sum()
df_w2_1 = df_w2_1[df_w2_1["recieverMatch"] != 0]

# The inner join acts as the filter; the per-play count it brings along
# (suffixed "_temp") is removed again right after.
df_w2 = (
    df_w2.set_index(["gameId","playId"])
         .join(df_w2_1, how="inner", on=["gameId","playId"], rsuffix="_temp")
         .reset_index()
)
df_w2 = df_w2.drop(columns=[name for name in df_w2.columns if "_temp" in name])



In [None]:
# Preview the first rows of the tracking rows after the receiver clean-up.
df_w2.head()

In [None]:
# Summary (column dtypes, non-null counts, memory) of df_w2.
df_w2.info()

In [None]:
# Tagging offensive and defensive players to be used for classification
# Every row gets a slot label "<Side><n>": rows with position "QB" are slot 0,
# all other players are numbered 1, 2, ... within their
# (gameId, playId, new_frameId, Side) group, in sorted row order.
df_w3 = df_w2.copy()
if (DELETE):
    del df_w2

# Side is sorted descending so that "O" rows come before "D" rows in a frame.
df_w3 = df_w3.sort_values(["gameId","playId","new_frameId","Side"], ascending=[True,True,True,False])
df_w3.reset_index(inplace=True)

# Vectorised numbering: cumcount() counts rows within each group in their
# current order, which is exactly what a row-by-row walk over the sorted frame
# produces, without per-row .loc access (and it also works on an empty frame).
is_qb = df_w3["position"] == "QB"
df_w3["playerId"] = 0
slot_keys = ["gameId","playId","new_frameId","Side"]
df_w3.loc[~is_qb, "playerId"] = df_w3[~is_qb].groupby(slot_keys).cumcount() + 1

df_w3["Side_playerId"] =  df_w3["Side"] + df_w3["playerId"].astype(str)
# "index" is the helper column created by reset_index() above.
df_w3.drop(columns=["index","playerId","Side","displayName","position"], inplace=True)
df_w3.head()

In [None]:
# Summary (column dtypes, non-null counts, memory) of the slot-tagged rows.
df_w3.info()

In [None]:
# Data cleaning
# Discard every play that contains the slot label "O6" or "D10" (i.e. more
# non-QB offensive or defensive slots than are kept later), then rename the
# receiver flag to "IR".
df_w4 = df_w3.copy()
if (DELETE):
    del df_w3

# FIX
#overflow labels previously tried: "O6" / "D8"
overflow_slots = df_w4["Side_playerId"].isin(["O6", "D10"])
df_w4_bad = df_w4.loc[overflow_slots, ["gameId","playId"]].copy()
df_w4_bad["dummy"] = 1

# Mark all rows of the offending plays through a left join on the play key.
df_w4 = df_w4.set_index(["gameId","playId"])
df_w4_bad = df_w4_bad.set_index(["gameId","playId"])
df_w4 = df_w4.join(df_w4_bad, on=["gameId","playId"], how='left', rsuffix='_bad')

bad_play_keys = df_w4[df_w4["dummy"] == 1].index
df_w4 = (
    df_w4.drop(bad_play_keys)
         .drop(columns=["dummy"])
         .rename(columns={"recieverMatch": "IR"})
)

del df_w4_bad, bad_play_keys
df_w4.head()

In [None]:
# Summary (column dtypes, non-null counts, memory) of the cleaned rows.
df_w4.info()

In [None]:
# Data Cleaning
# Reshape to one row per (gameId, playId, new_frameId) with one column per
# (measurement, slot) pair. pivot_table's default aggregation (mean) applies
# if a slot label occurs more than once in a frame.
df_data1 = pd.pivot_table(df_w4, values=["x","y","s","o","dir","IR"],
#df_data1 = pd.pivot_table(df_w4, values=["x","y","IR"], \
                          index=["gameId","playId","new_frameId"], columns=["Side_playerId"]).copy()

# Only the receiver slots O1..O5 are prediction targets. The IR columns of all
# other slots are derived from the columns that actually exist, so this keeps
# working when the slot limits in the previous cleaning step are changed.
receiver_slots = ["O1","O2","O3","O4","O5"]
non_target_ir = [col for col in df_data1.columns
                 if col[0] == "IR" and col[1] not in receiver_slots]
df_data1.drop(columns=non_target_ir, inplace=True)          # FIX

# Flatten ("x", "O1") -> "x_O1".
df_data1.columns = ['_'.join(col).strip() for col in df_data1.columns.values]
df_data1.reset_index(inplace=True)
if (DELETE):
    del df_w4
df_data1.head()

In [None]:
# Summary (column dtypes, non-null counts, memory) of the wide table.
df_data1.info()

In [None]:
# Data cleaning
# Add the play-level columns of df_p1, replace missing values with 0 and
# discard the rows in which none of the five receiver slots is flagged.
df_data2 = df_data1.join(df_p1, on=["gameId","playId"], how='left', rsuffix='_play').copy()
if (DELETE):
    del df_data1

#df_data2.reset_index(inplace=True)
#df_data2.fillna(-1, inplace=True)         # FIX
df_data2 = df_data2.drop(columns=["gameClock","intendedReciever"]).fillna(0)         # FIX

target_cols = ["IR_O1","IR_O2","IR_O3","IR_O4","IR_O5"]
no_receiver = (df_data2[target_cols] == 0).all(axis=1)
df_data2 = df_data2[~no_receiver]      # FIX

if (DELETE):
    del df_p1
df_data2.head()

In [None]:
# Saving data to csv file
# Writes the prepared table to `data_csv` without the row index, prints a
# summary of it, and releases the frame when DELETE is set.
df_data2.to_csv(data_csv, index=False)
df_data2.info()

if (DELETE):
    del df_data2

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Activation, Dense, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D, Flatten, Reshape
from tensorflow.keras.optimizers import Adam
from keras.optimizers import SGD, Adadelta
from tensorflow.keras.metrics import categorical_crossentropy
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Importing the training set
# Reads the prepared dataset from an attached input dataset rather than from
# the working directory (see the commented alternatives).
#path = "/kaggle/working/"
path = "/kaggle/input/dataandmodel/"
df_xy_data = pd.read_csv(path + "data_1_17_51.csv")
#df_xy_data = pd.read_csv(data_csv)
# Skip the first three columns (presumably the gameId / playId / new_frameId
# identifiers written by the preparation cells - verify against the CSV).
xy_data = df_xy_data.iloc[:,3:].values

df_xy_data.head()
#df_xy_data.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Split into targets (first five columns) and features (the rest), hold out
# 10% of the rows for testing, then min-max scale the features.
y_data = xy_data[:,0:5]
x_data = xy_data[:,5:]
#x_data = np.reshape(x_data, (x_data.shape[0], x_data.shape[1], x_data.shape[2]))
print (x_data.shape, y_data.shape)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, shuffle=True, random_state=0)

# The scaler is fitted on the training rows only and then applied to the test
# rows, so no statistics of the held-out data leak into preprocessing. The
# targets are left as they are: NOTE(review) they are expected to be 0/1
# indicators already, which [0, 1] min-max scaling would not change - confirm.
sc = MinMaxScaler(feature_range=(0,1))
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
# Data Augmentation
# Appends, after the original training rows, one copy of the training set for
# every non-identity reordering of the five receiver slots O1..O5 (119 copies,
# in lexicographic order of the permutation). In each copy the target columns
# and the x / y / dir / s / o feature columns of those slots are reordered the
# same way. Assumes the five columns of each group are contiguous, starting at
# the "<name>_O1" column.
from itertools import permutations

columns = list(df_xy_data.columns)
# Column positions inside y_train (3 leading columns skipped) and inside
# x_train (3 leading columns + 5 target columns skipped).
a = columns.index("IR_O1")-3
b = columns.index("x_O1")-8
c = columns.index("y_O1")-8
d = columns.index("dir_O1")-8
e = columns.index("s_O1")-8
f = columns.index("o_O1")-8
print (a,b,c,d,e,f)

slots = np.arange(5)
identity = tuple(range(5))
augmented_y = [y_train]
augmented_x = [x_train]
for order in permutations(range(5)):
    if order == identity:
        continue            # the unpermuted rows are already first in the lists
    order = np.array(order)
    permuted_y = y_train.copy()
    permuted_x = x_train.copy()
    # Fancy indexing on the right-hand side yields a copy, so reading and
    # writing the same five columns is safe.
    permuted_y[:, a + slots] = permuted_y[:, a + order]
    for start in (b, c, d, e, f):
        permuted_x[:, start + slots] = permuted_x[:, start + order]
    augmented_y.append(permuted_y)
    augmented_x.append(permuted_x)

print (y_train.shape, x_train.shape)
# Stack once at the end instead of re-copying a growing array per permutation.
y_train = np.vstack(augmented_y)
x_train = np.vstack(augmented_x)
del augmented_y, augmented_x
print (y_train.shape, x_train.shape)

In [None]:
#Model
# Fully connected classifier: eight hidden stages, each Dense(relu, he_uniform)
# -> BatchNormalization -> Dropout(0.4), followed by a 5-way softmax output.
hidden_widths = (128, 256, 512, 512, 512, 256, 128, 64)

regressor = Sequential()
for depth, width in enumerate(hidden_widths):
    # Only the first layer declares the input shape.
    first_layer_args = {"input_shape": (x_train.shape[1],)} if depth == 0 else {}
    regressor.add(Dense(units=width, activation="relu", kernel_initializer='he_uniform',
                        **first_layer_args))
    regressor.add(BatchNormalization())
    regressor.add(Dropout(0.4))

regressor.add(Dense(units=5, activation="softmax"))

regressor.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
#regressor.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error', metrics=['accuracy'])
#regressor.compile(optimizer="adam", loss='mean_squared_error', metrics=['accuracy'])
regressor.summary()

In [None]:
# Loading model weights for training
# Restores previously saved weights into `regressor` before training continues.
# NOTE: `path` is assigned with a trailing "/", so the joined name contains
# "//"; POSIX file systems treat that as a single separator.
regressor.load_weights(path + "/model_epochs1-20_dropout40_categorical.h5")
# regressor.load_weights("/kaggle/working/model_1_20_dropout40.h5")

In [None]:
# Train for one epoch (epochs=1) in batches of 32, holding out 10% of the
# supplied rows for validation.
# NOTE(review): Keras documents validation_split as taking the *last* fraction
# of the arrays before shuffling. x_train here is the augmented set, so the
# validation rows are presumably permuted copies of plays that are also
# trained on - confirm, and consider splitting before augmenting.
history=regressor.fit(x_train, y_train, epochs=1, batch_size=32, validation_split=0.1, shuffle=True, verbose=2)

In [None]:
# summarize history for loss
# Training loss first, validation loss second, matching the legend order.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper right')
#plt.ylim((0,0.16))
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Prediction
# Class indices go into their own names: y_test keeps its one-hot form, so
# this cell can be re-run without argmax being applied to 1-D labels.
y_pred = np.argmax(regressor.predict(x_test), axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Confusion Matrix
cm = metrics.confusion_matrix(y_test_labels, y_pred)
score = accuracy_score(y_test_labels, y_pred)*100

plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt=".0f", linewidth=0.5, square=True, cmap="Blues_r")
plt.ylabel("Actual")
plt.xlabel("Predicted")
all_sample_title = "TEST Accuracy Score: {0}".format(score)
plt.title(all_sample_title, size=15)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Prediction
# Class indices go into their own names: y_train keeps its one-hot form, so
# the training cell can still be run (and this cell re-run) afterwards.
y_pred_train = np.argmax(regressor.predict(x_train), axis=1)
y_train_labels = np.argmax(y_train, axis=1)

# Confusion Matrix
cm = metrics.confusion_matrix(y_train_labels, y_pred_train)
score = accuracy_score(y_train_labels, y_pred_train)*100

plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt=".0f", linewidth=0.5, square=True, cmap="Blues_r")
plt.ylabel("Actual")
plt.xlabel("Predicted")
all_sample_title = "TRAIN Accuracy Score: {0}".format(score)
plt.title(all_sample_title, size=15)

In [None]:
# Saving model
# regressor.save("/kaggle/working/model_epochs21-40_dropout40_categorical.h5")