# NFL strategy prediction

We all know that one of the main goals in an NFL game is to drive your team toward a touchdown. To score a touchdown, teams evaluate and execute various strategies.
In our code, we combine team data, player data, and the strategy used to build a model that helps evaluate how far a team can advance with a specific strategy.
By using this model, we hope teams can plan their strategies with the assistance of our system and achieve better performance in the coming season.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data pre-process

Data we use:
* plays.csv (mainly used to create dataset)
    * yardsToGo
    * kickerId (for referencing)
    * specialTeamsPlayType(One-Hot Encoded)
    * yardlineNumber
    * yardlineSidePossessionTeam
    * playResult (main target)
* players.csv (reference by kickerId)
    * height (change format to numeric type)
    * weight

Finally, we drop all rows that contain missing values (NAs).

In [None]:
import pandas as pd
import numpy as np
import math

import os

import random

from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler

def to_one_hot(df, additional_name, column_name):
	"""Return a copy of ``df`` with ``column_name`` replaced by 0.0/1.0 indicator columns.

	One column named ``f"{additional_name}_{value}"`` is appended for every
	distinct value found in ``df[column_name]`` (in ``np.unique`` order), and
	the source column is removed. The frame passed in is left untouched.
	"""
	# Work on a copy so the caller's frame does not silently gain the indicator columns.
	encoded = df.copy()
	for per_type in np.unique(encoded[column_name].to_numpy()):
		encoded[f"{additional_name}_{per_type}"] = (encoded[column_name] == per_type) * 1.0
	return encoded.drop(column_name, axis = 1)

def label_to_number(df, column_name):
	"""Encode the values of ``column_name`` as integer codes.

	Returns a 3-tuple: the array of distinct values (``np.unique`` order), the
	value -> code dictionary, and a copy of ``df`` in which ``column_name`` is
	replaced by ``f"{column_name}_number"`` holding the codes.
	"""
	categories = np.unique(df[column_name].to_numpy())
	code_of = dict(zip(categories, range(len(categories))))

	numbered = df.copy()
	numbered[f"{column_name}_number"] = numbered[column_name].map(code_of)
	numbered = numbered.drop(column_name, axis = 1)
	return categories, code_of, numbered

def number_to_label(df, column_name, column_dict):
	"""Turn integer codes in ``f"{column_name}_number"`` back into labels.

	``column_dict`` is iterated in order and its i-th element (for a dict, its
	i-th key) becomes the label for code ``i``. Returns a copy of ``df`` with
	the code column replaced by ``column_name``.
	"""
	code_column = f"{column_name}_number"
	label_of = dict(enumerate(column_dict))

	restored = df.copy()
	restored[f"{column_name}"] = restored[code_column].map(label_of)
	return restored.drop(code_column, axis = 1)

def combine_label(df, column_names, new_name):
	"""Pack several integer-coded columns into one composite key column.

	The i-th column in ``column_names`` is weighted by ``1000 ** i`` and the
	weighted values are summed into ``new_name``; the source columns are
	removed. Every code must therefore be in ``[0, 1000)`` for the packing to
	be reversible. A new frame is returned; ``df`` itself is not modified.
	"""
	# Copy first: assigning the new column directly would leak it into the caller's frame.
	combined = df.copy()
	# A scalar assignment also works on an empty frame, unlike a row-wise apply.
	combined[new_name] = 0
	weight = 1
	for column_name in column_names:
		combined[new_name] = combined[new_name] + combined[column_name] * weight
		combined = combined.drop(column_name, axis = 1)
		weight *= 1000
	return combined

def split_label(df, column_names, new_name):
	"""Unpack a composite key column back into its integer-coded columns.

	``new_name`` is read as a base-1000 number: the i-th entry of
	``column_names`` receives the i-th base-1000 digit (least significant
	first). The composite column is removed from the result. A new frame is
	returned; ``df`` itself is not modified.
	"""
	# Copy so neither the new columns nor the consumed key leak into the caller's frame.
	restored = df.copy()
	remaining = restored[new_name]
	for column_name in column_names:
		restored[column_name] = remaining % 1000
		remaining = remaining // 1000
	return restored.drop(new_name, axis = 1)

def _height_to_inches(height):
	"""Convert a height given as "feet-inches" (e.g. "6-2") or as plain inches to a float."""
	# str() lets numeric cells through: an all-numeric column is parsed as int by read_csv.
	text = str(height)
	if "-" in text:
		feet, inches = text.split("-")[:2]
		return float(feet) * 12 + float(inches)
	return float(text)


def get_players_dataframe(csv_path):
	"""Load the players CSV and keep only the columns needed for modelling.

	Drops ``birthDate``, ``collegeName``, ``Position`` and ``displayName``,
	and replaces ``height`` with a numeric ``height_number`` column.

	Processing is best-effort: if any step fails, the exception is printed and
	the frame is returned in whatever state it had reached.
	"""
	df = pd.read_csv(csv_path, keep_default_na=False)
	try:
		for unused in ("birthDate", "collegeName", "Position", "displayName"):
			df = df.drop(unused, axis = 1)

		df = df.reset_index(drop=True)
		# Convert the whole column at once instead of writing floats row by row
		# into an integer-initialised column.
		df["height_number"] = df["height"].map(_height_to_inches)
		df = df.drop("height", axis = 1)

	except Exception as e:
		print(e)
	return df


def get_play_dataframe(csv_path):
	"""Load the plays CSV and reduce it to the rows and columns used for modelling.

	Rows whose ``penaltyCodes`` is not the literal "NA" and rows whose
	``kickerId`` is "NA" are removed. A 0.0/1.0 column
	``yardlineSidePossessionTeam`` records whether ``yardlineSide`` equals
	``possessionTeam``, and ``playResult`` is cast to float.

	Processing is best-effort: if any step fails, the exception is printed and
	the frame is returned in whatever state it had reached.
	"""
	plays = pd.read_csv(csv_path, keep_default_na=False)
	try:
		# Columns are dropped one at a time, in a fixed order, so that a missing
		# column leaves the same partial result behind.
		for name in ("gameId", "playDescription", "playId", "passResult"):
			plays = plays.drop(columns=name)

		no_penalty = (plays["penaltyCodes"] == "NA").to_numpy()
		plays = plays.iloc[no_penalty]
		plays = plays.drop(columns="penaltyCodes")

		for name in ("penaltyJerseyNumbers", "penaltyYards", "returnerId", "kickBlockerId"):
			plays = plays.drop(columns=name)

		has_kicker = (plays["kickerId"] != "NA").to_numpy()
		plays = plays.iloc[has_kicker]

		same_side = plays["yardlineSide"] == plays["possessionTeam"]
		plays["yardlineSidePossessionTeam"] = same_side * 1.0
		for name in ("yardlineSide", "possessionTeam",
				"kickLength", "kickReturnYardage", "absoluteYardlineNumber"):
			plays = plays.drop(columns=name)

		plays["playResult"] = plays["playResult"].astype(float)

		# Game-situation columns that are currently excluded from the feature set.
		for name in ("quarter", "gameClock", "preSnapHomeScore", "preSnapVisitorScore", "down"):
			plays = plays.drop(columns=name)

	except Exception as e:
		print(e)
	return plays

def get_dataframe(csvs_path):
	"""Build the modelling frame from ``players.csv`` and ``plays.csv`` in ``csvs_path``.

	Each play gets ``kicker_height`` and ``kicker_weight`` looked up by its
	``kickerId`` (matched against the string form of the players' ``nflId``).
	Rows containing any missing value are dropped, then ``kickerId`` is removed.
	"""
	players = get_players_dataframe(os.path.join(csvs_path, "players.csv"))

	# Lookup tables keyed by the player id rendered as a string.
	height_by_id = {}
	weight_by_id = {}
	for row in range(len(players)):
		player_key = str(players.at[row, "nflId"])
		height_by_id[player_key] = players.at[row, "height_number"]
		weight_by_id[player_key] = float(players.at[row, "weight"])

	plays = get_play_dataframe(os.path.join(csvs_path, "plays.csv"))

	kicker_ids = plays["kickerId"]
	plays["kicker_height"] = kicker_ids.map(height_by_id)
	plays["kicker_weight"] = kicker_ids.map(weight_by_id)

	# Kickers absent from the lookup tables map to NaN and are removed here.
	complete = plays.dropna()
	return complete.drop("kickerId", axis = 1)

def undersample(df, column_name, random_state):
	"""Resample ``df`` with ``RandomUnderSampler``, using ``column_name`` as the class label.

	Returns the resampled feature frame with the resampled labels re-attached
	as ``column_name``.
	"""
	sampler = RandomUnderSampler(random_state=random_state)
	features = df.drop(column_name, axis = 1)
	target = df[column_name]
	resampled, resampled_target = sampler.fit_resample(features, target)
	resampled[column_name] = resampled_target
	return resampled

def undersample_with_multiple_labels(df, column_names, random_state):
	"""Undersample ``df`` using the combination of several label columns as the class.

	Each column in ``column_names`` is integer-coded, the codes are packed into
	one temporary ``new_label`` column, the frame is undersampled on that
	column, and the original label columns are then restored.
	"""
	mappings = []
	for column_name in column_names:
		_, mapping, df = label_to_number(df, column_name)
		mappings.append(mapping)

	number_columns = [f"{x}_number" for x in column_names]
	df = combine_label(df, number_columns, "new_label")
	df = undersample(df, "new_label", random_state)
	df = split_label(df, number_columns, "new_label")

	for column_name, mapping in zip(column_names, mappings):
		df = number_to_label(df, column_name, mapping)
	return df

def oversample(df, column_name, k_neighbors=5, random_state=None):
	"""Resample ``df`` with ``BorderlineSMOTE``, using ``column_name`` as the class label.

	Parameters:
		df: frame holding the features plus the label column.
		column_name: name of the label column.
		k_neighbors: forwarded to ``BorderlineSMOTE``.
		random_state: forwarded to ``BorderlineSMOTE`` so a run can be made
			reproducible, mirroring ``undersample``. The default ``None``
			keeps the previous unseeded behaviour.

	Returns the resampled feature frame with the resampled labels re-attached
	as ``column_name``.
	"""
	sm = BorderlineSMOTE(k_neighbors = k_neighbors, random_state = random_state)
	x, y = sm.fit_resample(df.drop(column_name, axis = 1), df[column_name])
	x[column_name] = y
	return x

def oversample_with_multiple_labels(df, column_names, k_neighbors=5):
	"""Oversample ``df`` using the combination of several label columns as the class.

	Each column in ``column_names`` is integer-coded, the codes are packed into
	one temporary ``new_label`` column, the frame is oversampled on that
	column, and the original label columns are then restored.
	"""
	mappings = []
	for column_name in column_names:
		_, mapping, df = label_to_number(df, column_name)
		mappings.append(mapping)

	number_columns = [f"{x}_number" for x in column_names]
	df = combine_label(df, number_columns, "new_label")
	df = oversample(df, "new_label", k_neighbors)
	df = split_label(df, number_columns, "new_label")

	for column_name, mapping in zip(column_names, mappings):
		df = number_to_label(df, column_name, mapping)
	return df

## Dataset Helper

In this helper class, we define a dataset wrapper that simplifies access to its samples and labels.

In [None]:
import torch
import numpy as np
import pandas as pd

class Dataset(torch.utils.data.Dataset):
	"""Dataset that pairs rows of a feature frame with rows of an answer frame."""

	def __init__(self, df, df_ans):
		# df: feature rows; df_ans: target rows, addressed by the same position.
		self.df, self.df_ans = df, df_ans

	def __len__(self):
		"""Number of rows in the feature frame."""
		return len(self.df)

	def __getitem__(self, idx):
		"""Return ``{"data": float32 tensor, "label": tensor}`` for position ``idx``."""
		features = np.array(self.df.iloc[idx]).astype("float32")
		target = np.array(self.df_ans.iloc[idx])
		return {
			"data": torch.as_tensor(features),
			"label": torch.as_tensor(target),
		}

We propose a neural network to predict the playResult with different specialTeamsPlayTypes.

In [None]:
import torch

class Model(torch.nn.Module):
	"""Fully connected network: three hidden Linear+PReLU stages and a Linear output.

	The hidden width is ``int(in_channels * 3)``.
	"""

	def __init__(self, in_channels, out_channels):
		super().__init__()
		hidden = int(in_channels * 3)
		widths = [in_channels, hidden, hidden, hidden]

		# Layers are created in forward order so module indices inside
		# ``process`` stay 0..6.
		layers = []
		for fan_in, fan_out in zip(widths, widths[1:]):
			layers.append(torch.nn.Linear(fan_in, fan_out))
			layers.append(torch.nn.PReLU())
		layers.append(torch.nn.Linear(hidden, out_channels))

		self.process = torch.nn.Sequential(*layers)

	def forward(self, x):
		"""Run ``x`` through the sequential stack and return its output."""
		prediction = self.process(x)
		return prediction

## Main Code

In our code, several steps are carried out to help evaluate the strategy:
* Data Pre-Process
* Model Build
* Environment Training
* Model Training

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import math
# Predictions and ground-truth values from the most recent evaluation pass.
# They are module-level so that later notebook cells can read them.
answers = []
labels = []


def main(csvs_path, model_path, learning_rate = 0.005, num_epochs = 3, num_traning = 30):
	"""Train the playResult regressor and evaluate it on a held-out tail split.

	Parameters:
		csvs_path: directory passed to ``get_dataframe``.
		model_path: file the model is loaded from (if loadable) and saved to
			after every training round.
		learning_rate: learning rate of the SGD optimizer (momentum 0.9).
		num_epochs: epochs per training round; each epoch is followed by an
			evaluation pass over the held-out rows.
		num_traning: number of training rounds; a fresh optimizer is created
			for each round.

	Side effects: prints progress, writes ``model_path``, and replaces the
	module-level ``answers`` / ``labels`` lists with the predictions and
	targets of the latest evaluation pass.
	"""
	global answers, labels

	df = get_dataframe(csvs_path)

	df = df.drop("specialTeamsResult", axis = 1)

	# Optional class balancing, currently disabled:
	#df = oversample_with_multiple_labels(df, ['specialTeamsPlayType'], 5)

	label_df = df[["playResult"]]

	df = df.drop("playResult", axis = 1)
	df = to_one_hot(df, "specialTeamsPlayType", "specialTeamsPlayType")

	print(df.columns)
	print(df)

	# training model
	model = Model(len(df.columns), len(label_df.columns))
	try:
		# NOTE(review): torch.load unpickles the file, so model_path must come
		# from a trusted source. Depending on the installed PyTorch version,
		# loading a whole pickled module may also need weights_only=False.
		model = torch.load(model_path)
		model.eval()
		print("Read model from previous model data")
	except Exception:
		# Best effort: any load failure falls back to the freshly built model.
		print(model_path)
		print("No previous model data")

	# training environment
	if torch.cuda.is_available():
		device = torch.device('cuda')
		print("cuda")
	else :
		device = torch.device('cpu')
		print("cpu")

	# Hold out the last eighth of the rows. Slicing by an explicit split index
	# keeps the training set intact when test_len is 0 (iloc[:-0] would be empty).
	test_len = len(df) // 8
	split_at = len(df) - test_len
	df_training = df.iloc[:split_at]
	label_df_training = label_df.iloc[:split_at]

	df_test = df.iloc[split_at:]
	label_df_test = label_df.iloc[split_at:]

	# The datasets and loaders do not change between rounds, so build them once.
	dataset = Dataset(df_training, label_df_training)
	data_loader = torch.utils.data.DataLoader(
		dataset, batch_size=1, shuffle=True, num_workers=4)

	dataset_test = Dataset(df_test, label_df_test)
	data_loader_test = torch.utils.data.DataLoader(
		dataset_test, batch_size=1, shuffle=True, num_workers=4)

	model.to(device)

	for _ in range(num_traning):

		print("new Round!!")

		# training setup
		criterion = torch.nn.MSELoss()
		optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
		for epoch in range(num_epochs):  # loop over the dataset multiple times
			print("train:")
			running_loss = 0.0
			for step, data_pairs in enumerate(data_loader, 0):
				data = data_pairs["data"].to(device)
				label = data_pairs["label"].to(device)

				# zero the parameter gradients
				optimizer.zero_grad()

				# forward + backward + optimize
				output = model(data)

				loss = criterion(output, label.float())
				loss.backward()
				optimizer.step()

				# print statistics
				running_loss += loss.item()
				if step % 2000 == 1999:    # print every 2000 mini-batches
					print('[%d, %5d] loss: %.3f' %
						(epoch + 1, step + 1, running_loss / 2000))
					running_loss = 0.0

			print("test:")
			answers = []
			labels = []
			# Start from zero so leftover training loss does not leak into the test average.
			running_loss = 0.0
			with torch.no_grad():
				for step, data_pairs in enumerate(data_loader_test, 0):
					data = data_pairs["data"].to(device)
					label = data_pairs["label"].to(device)

					# forward only
					output = model(data)
					answers.append(output.cpu().numpy()[0])
					labels.append(label[:, 0].cpu().numpy()[0])
					loss = criterion(output, label.float())
					# print statistics
					running_loss += loss.item()
					if step % 2000 == 1999:    # print every 2000 mini-batches
						print(output)
						print(label)
						print('[%d, %5d] loss: %.3f' %
							(epoch + 1, step + 1, running_loss / 2000))
						running_loss = 0.0
			# Show up to ten prediction/target pairs (fewer if the test set is small).
			for predicted, actual in zip(answers[:10], labels[:10]):
				print(predicted, actual)

		torch.save(model, model_path)

	print('Finished Training')

In [None]:
# Run one training round of three epochs on the Kaggle dataset with a very small learning rate.
main(
	"/kaggle/input/nfl-big-data-bowl-2022",
	model_path="playResult_model.pth",
	learning_rate=1e-7,
	num_epochs=3,
	num_traning=1,
)

## Results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Compare the held-out predictions with the true values collected by main().
answers_df = pd.DataFrame(answers).clip(-100, 100)
labels_df = pd.DataFrame(labels)

# Signed error, rounded to the nearest multiple of 5.
diff_df = ((labels_df - answers_df) / 5).round() * 5
counts = diff_df.value_counts().sort_index()
plt.plot(list(counts.index), counts)
plt.title("Error counts")
plt.show()

# Share of samples whose rounded error is within [-10, 10]. Selecting with a
# mask instead of fixed labels avoids a KeyError when one of the bins is empty.
bin_values = counts.index.get_level_values(0).to_numpy()
within_ten = counts[np.abs(bin_values) <= 10].sum()
print(f"Accuracy: {within_ten / counts.sum() * 100} %")

On our test dataset, the prediction error can be reduced to within $\pm 10$ yards. This means we can input the game conditions, and the neural network will predict how many yards the player can gain when he chooses a given specialTeamsPlayType. Although entering the game state during a live game is not allowed, we expect the predictions to help players choose which strategies to use in training.