In [23]:
!pip freeze

import numpy as np
import torch

aiohttp==3.9.1
aiosignal==1.3.1
alembic==1.13.0
altair==5.2.0
annotated-types==0.6.0
anyio==4.2.0
appnope==0.1.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
Babel==2.14.0
beautifulsoup4==4.12.2
bidict==0.22.1
bleach==6.1.0
blinker==1.7.0
cachetools==5.3.2
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
comm==0.2.0
contourpy==1.2.0
cycler==0.12.1
dataclasses-json==0.6.3
debugpy==1.8.0
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.14
dill==0.3.7
distro==1.8.0
entrypoints==0.4
exceptiongroup==1.2.0
executing==2.0.1
Faker==21.0.0
fastjsonschema==2.19.0
favicon==0.7.0
filelock==3.14.0
Flask==3.0.0
Flask-SocketIO==5.3.6
fonttools==4.47.0
fqdn==1.5.1
frozendict==2.3.10
frozenlist==1.4.1
fsspec==2023.12.2
gitdb==4.0.11
GitPython==3.1.40
google-ai-generativelanguage==0.4.0
google-api-core==2.15.0
google-auth==2.25.2
google-generativeai==0.3.1
googleapis-common-protos==1.62.0
green

# Load Data

We have downloaded the freely available datasets from https://www.fantasylife.com/tools/player-stats. These are season statistics for all players who played running back or wide receiver in the NFL from the years 2020 to 2023.

In this section, we load and combine the data. We also label each player as either a running back (`rb`) or a wide receiver (`wr`), based on which file the row came from, so that the label can be used for training and testing the model.


In [24]:
import pandas as pd

# Season-level CSV exports: one file per position per season (2020-2023).
file_paths_rb = [
    "data/rb_2020.csv",
    "data/rb_2021.csv",
    "data/rb_2022.csv",
    "data/rb_2023.csv",
]

file_paths_wr = [
    "data/wr_2020.csv",
    "data/wr_2021.csv",
    "data/wr_2022.csv",
    "data/wr_2023.csv",
]

# Load every file and tag each row with the position its source file
# represents. Running backs are loaded first, then wide receivers, so the row
# order of the combined frame is: all rb seasons, then all wr seasons.
dataframes = []
for position, file_paths in (("rb", file_paths_rb), ("wr", file_paths_wr)):
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        # add a column to the dataframe called 'position' holding the label
        df['position'] = position
        dataframes.append(df)

# Each CSV is read with its own 0..n-1 index. ignore_index=True renumbers the
# rows of the combined frame so every row label is unique; without it the same
# label would appear once per file and label-based lookups would be ambiguous.
combined_df = pd.concat(dataframes, ignore_index=True)

# print out the shape and column names of the combined dataframe
print(combined_df.shape)
print(combined_df.columns)

(1807, 17)
Index(['Player', 'Team', 'GP', 'Snaps/G', 'R Att', 'Ru Yds', 'Ru YPG',
       'Ru YPA', 'Ru TD', 'TAR', 'REC', 'Rec Yds', 'Rec YPC', 'Rec TD', 'FP',
       'PPG', 'position'],
      dtype='object')


# Cleaning the Data

Before we can train the model, we have to do some pre-processing. Here are some cleaning steps for our combined dataset:

1. Convert NaN values to zeros so that the affected rows can still be used by the model.

2. Remove players who did not gain enough yards for their position to be determined. There is no way for the model to pick up on these players. For example, a sixth-string wide receiver who catches only one pass does not have enough data to indicate whether they were a running back or a wide receiver; they simply didn't play enough. Our solution is to set a threshold on total yards gained (rushing yards plus receiving yards, currently 100). If a player is beneath the threshold, they are discarded from the training set.

In [25]:
from pathlib import Path

# Convert all NaN values of the combined dataframe to 0.
# fillna(0) is applied to every column, not only the yardage columns.
combined_df = combined_df.fillna(0)

# Threshold the dataset on total yards (rushing + receiving): rows below
# THRESHOLD are dropped.
THRESHOLD = 100
# 'Total Yds' is stored on combined_df itself (which is why its shape gains a
# column below) and is dropped again only from the filtered copy.
combined_df['Total Yds'] = combined_df['Ru Yds'] + combined_df['Rec Yds']
filtered_df = combined_df[combined_df['Total Yds'] >= THRESHOLD].drop('Total Yds', axis=1)

# print out size of filtered dataframe compared to size of combined dataframe
print("before filtering by yards")
print(combined_df.shape)
print("after filtering by yards")
print(filtered_df.shape)

# Save to a csv file. to_csv does not create missing parent directories, so
# make sure the output folder exists first (e.g. on a fresh checkout).
output_path = Path('cleaned_data/combined.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_df.to_csv(output_path, index=False)

before filtering by yards
(1807, 18)
after filtering by yards
(1016, 17)
