In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import os
import json

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
import nfl_data_py as nfl

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torchsummary import summary
from skorch import NeuralNetRegressor

In [2]:
# Fetch regular-season, season-level player stats for seasons 2000 through 2024.

year_range = [year for year in range(2000, 2025)]
season_df = nfl.import_seasonal_data(year_range, s_type='REG')

In [3]:
# Show dtype and non-null count for every column of the raw seasonal frame.
season_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14541 entries, 0 to 14540
Data columns (total 58 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_id                    14541 non-null  object 
 1   season                       14541 non-null  int64  
 2   season_type                  14541 non-null  object 
 3   completions                  14541 non-null  int32  
 4   attempts                     14541 non-null  int32  
 5   passing_yards                14541 non-null  float64
 6   passing_tds                  14541 non-null  int32  
 7   interceptions                14541 non-null  float64
 8   sacks                        14541 non-null  float64
 9   sack_yards                   14541 non-null  float64
 10  sack_fumbles                 14541 non-null  int32  
 11  sack_fumbles_lost            14541 non-null  int32  
 12  passing_air_yards            14541 non-null  float64
 13  passing_yards_af

In [4]:
# Preview the first rows of the raw seasonal frame.
season_df.head()

Unnamed: 0,player_id,season,season_type,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr_x,special_teams_tds,fantasy_points,fantasy_points_ppr,games,tgt_sh,ay_sh,yac_sh,wopr_y,ry_sh,rtd_sh,rfd_sh,rtdfd_sh,dom,w8dom,yptmpa,ppr_sh
0,00-0000003,2000,REG,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,1,-2.0,0,0.0,0.0,0.0,-0.549944,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,-0.2,-0.2,1,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.002418
1,00-0000007,2000,REG,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,16,70.0,0,0.0,0.0,5.0,-2.828506,0,2,2,14.0,0,0.0,0.0,0.0,0.0,1.0,0.297373,0,0.0,0.116883,0.0,0.0,0.0,8.4,10.4,4,0.028169,,,,0.03423,0.0,0.043478,0.041667,0.017115,0.027384,0.197183,0.053758
2,00-0000007,2001,REG,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,11,40.0,0,0.0,0.0,1.0,-0.837385,0,2,2,26.0,0,0.0,0.0,0.0,0.0,2.0,2.192011,0,0.0,0.064516,0.0,0.0,0.0,6.6,8.6,2,0.031746,,,,0.064838,0.0,0.095238,0.083333,0.032419,0.05187,0.412698,0.054108
3,00-0000007,2003,REG,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,18,37.0,0,0.0,0.0,4.0,-4.764046,0,8,8,55.0,0,0.0,0.0,0.0,0.0,3.0,-0.458145,0,0.0,0.461828,0.0,0.0,0.0,9.2,17.2,12,0.020253,,,,0.024091,0.0,0.027273,0.025424,0.012046,0.019273,0.139241,0.02178
4,00-0000007,2004,REG,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,13,13.0,1,0.0,0.0,3.0,-5.657374,0,1,1,9.0,0,0.0,0.0,0.0,0.0,0.0,-0.472901,0,0.0,0.052632,0.0,0.0,0.0,8.2,9.2,6,0.005682,,,,0.00716,0.0,0.0,0.0,0.00358,0.005728,0.051136,0.018849


In [5]:
# Columns kept from the seasonal frame, listed by stat family.
# Not selected: 'ry_sh', 'rtd_sh', 'rfd_sh', 'rtdfd_sh', 'yptmpa'
_id_cols = ['player_id', 'season']
_passing_cols = [
    'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions',
    'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost',
    'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs',
    'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota',
]
_rushing_cols = [
    'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles',
    'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa',
    'rushing_2pt_conversions',
]
_receiving_cols = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x',
]
_other_cols = ['special_teams_tds', 'games', 'dom', 'w8dom']

season_cols = _id_cols + _passing_cols + _rushing_cols + _receiving_cols + _other_cols

season_df = season_df[season_cols]

In [6]:
# Confirm which columns survived the selection above.
season_df.columns

Index(['player_id', 'season', 'completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles',
       'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch',
       'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr',
       'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles',
       'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa',
       'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards',
       'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost',
       'receiving_air_yards', 'receiving_yards_after_catch',
       'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions',
       'racr', 'target_share', 'air_yards_share', 'wopr_x',
       'special_teams_tds', 'games', 'dom', 'w8dom'],
      dtype='object')

In [7]:
# Load the player table from nfl_data_py (one row per player; ids, position, bio and draft fields).
players = nfl.import_players()

In [8]:
# Preview the first rows of the player table.
players.head()

Unnamed: 0,gsis_id,display_name,common_first_name,first_name,last_name,short_name,football_name,suffix,esb_id,nfl_id,pfr_id,pff_id,otc_id,espn_id,smart_id,birth_date,position_group,position,ngs_position_group,ngs_position,height,weight,headshot,college_name,college_conference,jersey_number,rookie_season,last_season,latest_team,status,ngs_status,ngs_status_short_description,years_of_experience,pff_position,pff_status,draft_year,draft_round,draft_pick,draft_team
0,00-0028830,Isaako Aaitui,Isaako,Isaako,Aaitui,,,,AAI622937,,AaitIs00,6998.0,2535.0,14856.0,32004141-4962-2937-61ff-017b1804dec6,1987-01-25,DL,NT,,,76.0,307.0,https://static.www.nfl.com/image/private/{form...,UNLV,,0.0,2011,2014,WAS,DEV,,,2,DI,,,,,
1,00-0038389,Israel Abanikanda,Israel,Israel,Abanikanda,I.Abanikanda,Israel,,ABA159567,56008.0,AbanIs00,122999.0,10967.0,4429202.0,32004142-4115-9567-2e24-0eab29f6a4b9,2002-10-05,RB,RB,,,70.0,216.0,https://static.www.nfl.com/image/private/{form...,Pittsburgh,Atlantic Coast Conference,,2023,2025,GB,ACT,ACT,Active,3,HB,A,2023.0,5.0,143.0,NYJ
2,00-0024644,Jon Abbate,Jon,Jon,Abbate,,,,ABB051371,,,,,10801.0,32004142-4205-1371-db95-1abc96313b69,1985-06-18,LB,LB,,,71.0,245.0,https://static.www.nfl.com/image/private/{form...,Wake Forest,,67.0,2007,2007,HOU,RES,,,0,,,,,,
3,ABB498348,Vince Abbott,Vince,Vincent,Abbott,,,,ABB498348,,abbotvin01,,,,32004142-4249-8348-e00f-5fbbe6a0c73c,1958-05-31,SPEC,K,,,71.0,207.0,https://static.www.nfl.com/image/private/{form...,California State-Fullerton; Washington,,0.0,1987,1988,LAC,ACT,,,2,,,,,,
4,00-0031021,Jared Abbrederis,Jared,Jared,Abbrederis,J.Abbrederis,Jared,,ABB650964,41405.0,AbbrJa00,8811.0,3115.0,16836.0,32004142-4265-0964-fc36-bb0ad76ff6e6,1990-12-17,WR,WR,WR,WR,73.0,195.0,https://static.www.nfl.com/image/private/{form...,Wisconsin,,10.0,2014,2017,DET,CUT,CUT,,4,WR,,2014.0,5.0,176.0,GB


In [9]:
# Keep only skill-position players whose status is 'ACT'.
positions = ['QB', 'RB', 'WR', 'TE']
_is_skill_position = players['position'].isin(positions)
_is_act_status = players['status'] == 'ACT'
skill_df = players[_is_skill_position & _is_act_status].copy()

In [10]:
# Player attributes carried into the modelling frame.
player_cols = [
    'gsis_id', 'position', 'display_name', 'rookie_season',
    'college_conference', 'draft_pick', 'status', 'height',
    'weight', 'college_name', 'birth_date', 'draft_team',
]

# Restrict to players whose rookie season is 2000 or later, then project the columns.
_rookie_2000_onward = skill_df['rookie_season'] >= 2000
skill_df = skill_df.loc[_rookie_2000_onward, player_cols]

In [11]:
# Preview the filtered player attributes.
skill_df.head()

Unnamed: 0,gsis_id,position,display_name,rookie_season,college_conference,draft_pick,status,height,weight,college_name,birth_date,draft_team
1,00-0038389,RB,Israel Abanikanda,2023,Atlantic Coast Conference,143.0,ACT,70.0,216.0,Pittsburgh,2002-10-05,NYJ
7,00-0032104,RB,Ameer Abdullah,2015,Big Ten Conference,54.0,ACT,69.0,203.0,Nebraska,1993-06-13,DET
33,00-0039040,RB,De'Von Achane,2023,Southeastern Conference,84.0,ACT,69.0,191.0,Texas A&M,2001-10-13,MIA
48,00-0029271,WR,Joe Adams,2012,,104.0,ACT,71.0,190.0,Arkansas,1989-11-22,CAR
55,00-0021006,WR,Charlie Adams,2003,,,ACT,74.0,190.0,Hofstra,1979-10-23,


In [12]:
# Attach player attributes to every season row (right join keeps all season rows;
# rows without a matching player get NaN attributes).
df = skill_df.merge(
    season_df,
    how='right',
    left_on='gsis_id',
    right_on='player_id',
)

In [13]:
# Discard season rows that found no player match in the merge (position is NaN there).
df = df[df['position'].notna()]

In [14]:
# Row/column count after the merge and the unmatched-row drop.
df.shape

(7038, 57)

In [15]:
# Preview the merged player-season frame.
df.head()

Unnamed: 0,gsis_id,position,display_name,rookie_season,college_conference,draft_pick,status,height,weight,college_name,birth_date,draft_team,player_id,season,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr_x,special_teams_tds,games,dom,w8dom
79,00-0000781,RB,Marlon Barnes,2000.0,,,ACT,69.0,215.0,Colorado,1976-03-13,,00-0000781,2000,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,15,81.0,0,1.0,1.0,4.0,-2.47792,0,1,3,7.0,0,0.0,0.0,0.0,0.0,0.0,-1.804374,0,0.0,0.102646,0.0,0.0,0.0,6,0.003038,0.004861
365,00-0003220,WR,Fred Coleman,2001.0,,160.0,ACT,72.0,192.0,Washington,1975-01-31,BUF,00-0003220,2001,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,2,4,50.0,0,0.0,0.0,0.0,0.0,1.0,2.833107,0,0.0,0.139163,0.0,0.0,0.0,2,0.060827,0.097324
458,00-0004055,WR,Thabiti Davis,2000.0,,,ACT,74.0,205.0,Wake Forest,1975-03-24,,00-0004055,2000,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,2,5,40.0,0,0.0,0.0,0.0,0.0,2.0,2.495312,0,0.0,0.139244,0.0,0.0,0.0,3,0.024125,0.038601
459,00-0004055,WR,Thabiti Davis,2000.0,,,ACT,74.0,205.0,Wake Forest,1975-03-24,,00-0004055,2001,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,3,8,34.0,0,0.0,0.0,0.0,0.0,2.0,0.646903,0,0.0,0.229505,0.0,0.0,0.0,5,0.01398,0.022368
514,00-0004508,RB,Scott Dragos,2000.0,,,ACT,74.0,245.0,Boston College,1975-10-28,,00-0004508,2000,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,4,6,28.0,0,0.0,0.0,0.0,0.0,1.0,0.368381,0,0.0,0.21142,0.0,0.0,0.0,5,0.01697,0.027152


In [16]:
# Check dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7038 entries, 79 to 14540
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gsis_id                      7038 non-null   object 
 1   position                     7038 non-null   object 
 2   display_name                 7038 non-null   object 
 3   rookie_season                7038 non-null   float64
 4   college_conference           4145 non-null   object 
 5   draft_pick                   5322 non-null   float64
 6   status                       7038 non-null   object 
 7   height                       7038 non-null   float64
 8   weight                       7038 non-null   float64
 9   college_name                 7038 non-null   object 
 10  birth_date                   7038 non-null   object 
 11  draft_team                   5322 non-null   object 
 12  player_id                    7038 non-null   object 
 13  season          

In [17]:
# Number of nulls per column, largest first.
_null_flags = df.isnull()
missing_counts = _null_flags.sum().sort_values(ascending=False)

missing_counts

college_conference             2893
draft_pick                     1716
draft_team                     1716
w8dom                           172
dom                             172
rushing_2pt_conversions           0
receiving_tds                     0
receiving_yards                   0
targets                           0
receptions                        0
rushing_first_downs               0
rushing_epa                       0
receiving_fumbles_lost            0
rushing_fumbles_lost              0
rushing_fumbles                   0
rushing_tds                       0
receiving_fumbles                 0
receiving_yards_after_catch       0
receiving_air_yards               0
carries                           0
receiving_first_downs             0
receiving_epa                     0
receiving_2pt_conversions         0
racr                              0
target_share                      0
air_yards_share                   0
wopr_x                            0
special_teams_tds           

In [18]:
def _pct_missing(values):
    """Return the percentage of null entries in one group's values."""
    return values.isnull().mean() * 100


# Percentage of rows lacking college_conference within each position, worst first.
_conference_by_position = df.groupby('position')['college_conference']
missing_by_position = _conference_by_position.apply(_pct_missing).sort_values(ascending=False)

missing_by_position

position
RB    52.701213
WR    41.082164
TE    38.095238
QB    27.958697
Name: college_conference, dtype: float64

In [19]:
# Spot-check one player (the first row of the merged frame) in the player table.
skill_df[skill_df['gsis_id'] == "00-0000781"]

Unnamed: 0,gsis_id,position,display_name,rookie_season,college_conference,draft_pick,status,height,weight,college_name,birth_date,draft_team
1125,00-0000781,RB,Marlon Barnes,2000,,,ACT,69.0,215.0,Colorado,1976-03-13,


In [24]:
# NOTE(review): this cell was executed out of order. Its saved output already shows the
# `age` column and the filled draft_pick/draft_team values that are only created by the
# cleaning cells further down, so on a fresh top-to-bottom run it will display the
# pre-cleaning frame instead. Consider moving it below those cells.
df.head()

Unnamed: 0,gsis_id,position,display_name,rookie_season,draft_pick,status,height,weight,college_name,birth_date,draft_team,player_id,season,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr_x,special_teams_tds,games,dom,w8dom,age
79,00-0000781,RB,Marlon Barnes,2000,300.0,ACT,69.0,215.0,Colorado,1976,Unknown,00-0000781,2000,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,15,81.0,0,1.0,1.0,4.0,-2.47792,0,1,3,7.0,0,0.0,0.0,0.0,0.0,0.0,-1.804374,0,0.0,0.102646,0.0,0.0,0.0,6,0.003038,0.004861,24
365,00-0003220,WR,Fred Coleman,2001,160.0,ACT,72.0,192.0,Washington,1975,BUF,00-0003220,2001,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,2,4,50.0,0,0.0,0.0,0.0,0.0,1.0,2.833107,0,0.0,0.139163,0.0,0.0,0.0,2,0.060827,0.097324,26
458,00-0004055,WR,Thabiti Davis,2000,300.0,ACT,74.0,205.0,Wake Forest,1975,Unknown,00-0004055,2000,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,2,5,40.0,0,0.0,0.0,0.0,0.0,2.0,2.495312,0,0.0,0.139244,0.0,0.0,0.0,3,0.024125,0.038601,25
459,00-0004055,WR,Thabiti Davis,2000,300.0,ACT,74.0,205.0,Wake Forest,1975,Unknown,00-0004055,2001,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,3,8,34.0,0,0.0,0.0,0.0,0.0,2.0,0.646903,0,0.0,0.229505,0.0,0.0,0.0,5,0.01398,0.022368,26
514,00-0004508,RB,Scott Dragos,2000,300.0,ACT,74.0,245.0,Boston College,1975,Unknown,00-0004508,2000,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,4,6,28.0,0,0.0,0.0,0.0,0.0,1.0,0.368381,0,0.0,0.21142,0.0,0.0,0.0,5,0.01697,0.027152,25


In [21]:
# Re-check dtypes and null counts right before the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7038 entries, 79 to 14540
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gsis_id                      7038 non-null   object 
 1   position                     7038 non-null   object 
 2   display_name                 7038 non-null   object 
 3   rookie_season                7038 non-null   float64
 4   college_conference           4145 non-null   object 
 5   draft_pick                   5322 non-null   float64
 6   status                       7038 non-null   object 
 7   height                       7038 non-null   float64
 8   weight                       7038 non-null   float64
 9   college_name                 7038 non-null   object 
 10  birth_date                   7038 non-null   object 
 11  draft_team                   5322 non-null   object 
 12  player_id                    7038 non-null   object 
 13  season          

In [22]:
# college_conference is null in 2893 of the 7038 rows, so it is dropped rather than imputed.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
df = df.drop(columns=['college_conference'], errors='ignore')

# Rows without a draft pick get the sentinel value 300.
# NOTE(review): presumably chosen to rank after every real pick number — confirm.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated chained mutation and does nothing under copy-on-write.
df['draft_pick'] = df['draft_pick'].fillna(300)

# Deliberately NOT named `stats`: that name is the scipy.stats module alias from the
# import cell and is needed later for stats.loguniform.
share_cols = ['w8dom', 'dom']
df[share_cols] = df[share_cols].fillna(0)

# Rows without a drafting team get the placeholder category 'Unknown'.
df['draft_team'] = df['draft_team'].fillna('Unknown')

In [23]:
# Make both season columns plain integers (rookie_season became float in the merge).
df['season'] = df['season'].astype(int)
df['rookie_season'] = df['rookie_season'].astype(int)

# Replace birth_date with the birth year. Guarded so the cell is safe to re-run:
# once the column holds integer years, pd.to_datetime would interpret them as
# nanoseconds since the epoch and silently turn every year into 1970.
if not pd.api.types.is_integer_dtype(df['birth_date']):
    df['birth_date'] = pd.to_datetime(df['birth_date']).dt.year

# Age is the difference of calendar years only (month/day of birth is ignored).
df['age'] = df['season'] - df['birth_date']

In [25]:
# Persist the cleaned player-season frame (all four positions) without the index column.
df.to_csv('fantasy_season_data.csv', index=False)

In [26]:
# One frame per position, in the order QB, RB, WR, TE.
df_qb, df_rb, df_wr, df_te = (
    df[df['position'] == position_code]
    for position_code in ('QB', 'RB', 'WR', 'TE')
)

In [27]:
# Inspect the last 20 quarterback rows.
df_qb.tail(20)

Unnamed: 0,gsis_id,position,display_name,rookie_season,draft_pick,status,height,weight,college_name,birth_date,draft_team,player_id,season,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr_x,special_teams_tds,games,dom,w8dom,age
14317,00-0038579,QB,Aidan O'Connell,2023,135.0,ACT,75.0,206.0,Purdue,1998,LV,00-0038579,2024,154,243,1612.0,8,4.0,10.0,82.0,0,0,1921.0,731.0,78.0,4.302664,0,7.274391,0.514999,21,30.0,1,2.0,2.0,10.0,-8.121734,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,9,0.0,0.0,26
14318,00-0038582,QB,Clayton Tune,2023,139.0,ACT,75.0,220.0,Houston,1999,ARI,00-0038582,2023,12,21,62.0,0,2.0,7.0,41.0,1,1,53.0,43.0,3.0,-34.749543,0,2.183673,-0.031238,8,30.0,1,1.0,0.0,3.0,-0.641286,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,6,0.0,0.0,24
14319,00-0038582,QB,Clayton Tune,2023,139.0,ACT,75.0,220.0,Houston,1999,ARI,00-0038582,2024,2,2,8.0,0,0.0,0.0,0.0,0,0,4.0,4.0,0.0,-1.128019,0,2.0,0.0,7,-4.0,0,1.0,1.0,2.0,-10.539624,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,5,0.0,0.0,25
14320,00-0038583,QB,Dorian Thompson-Robinson,2023,140.0,ACT,74.0,203.0,UCLA,1999,CLE,00-0038583,2023,60,112,440.0,1,4.0,6.0,60.0,1,0,662.0,204.0,26.0,-38.58013,0,2.150558,-0.047532,14,65.0,0,1.0,0.0,4.0,0.732106,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,24
14321,00-0038583,QB,Dorian Thompson-Robinson,2023,140.0,ACT,74.0,203.0,UCLA,1999,CLE,00-0038583,2024,61,118,440.0,0,6.0,8.0,47.0,2,1,710.0,251.0,20.0,-71.141147,0,2.766568,-0.1087,21,122.0,0,1.0,0.0,9.0,5.991398,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,6,0.0,0.0,25
14326,00-0038598,QB,Jaren Hall,2023,164.0,ACT,72.0,207.0,BYU,1998,MIN,00-0038598,2023,13,20,168.0,0,1.0,4.0,28.0,2,2,156.0,99.0,7.0,-12.676043,0,4.925433,0.261492,6,14.0,0,0.0,0.0,0.0,-1.96602,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,3,0.0,0.0,25
14405,00-0038998,QB,Jake Haener,2023,127.0,ACT,73.0,200.0,Fresno State; Washington,1999,NO,00-0038998,2024,18,39,226.0,1,1.0,6.0,55.0,0,0,345.0,97.0,10.0,-14.948479,0,3.059571,-0.231547,11,22.0,0,0.0,0.0,0.0,-1.556291,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,8,0.0,0.0,25
14439,00-0039150,QB,Bryce Young,2023,1.0,ACT,70.0,204.0,Alabama,2001,CAR,00-0039150,2023,315,527,2877.0,11,10.0,62.0,477.0,9,6,4009.0,1300.0,133.0,-160.326442,1,12.786642,0.672251,39,253.0,0,2.0,0.0,18.0,21.6163,1,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,16,0.0,0.0,22
14440,00-0039150,QB,Bryce Young,2023,1.0,ACT,70.0,204.0,Alabama,2001,CAR,00-0039150,2024,234,384,2403.0,15,9.0,29.0,186.0,4,1,3358.0,969.0,117.0,-24.919599,0,10.194872,1.048693,43,249.0,6,1.0,1.0,15.0,15.547885,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,14,0.0,0.0,23
14443,00-0039163,QB,C.J. Stroud,2023,2.0,ACT,75.0,218.0,Ohio State,2001,HOU,00-0039163,2023,319,499,4108.0,23,5.0,38.0,331.0,6,3,4481.0,1762.0,188.0,64.826173,0,13.869473,1.603286,39,157.0,3,2.0,1.0,17.0,6.004496,1,1,1,0.0,0,0.0,0.0,-1.0,1.0,0.0,-0.873244,0,0.0,0.023256,-0.003846,0.032191,0.0,15,0.0,0.0,22


In [28]:
# Row/column counts of the per-position frames (QB, RB, WR, TE).
df_qb.shape, df_rb.shape, df_wr.shape, df_te.shape

((1259, 57), (1814, 57), (2495, 57), (1470, 57))

In [29]:
# Order quarterback rows by player id, then season, so each player's seasons are contiguous and chronological.
df_qb = df_qb.sort_values(['gsis_id', 'season'])

In [30]:
# List the columns available for feature/target selection.
df_qb.columns

Index(['gsis_id', 'position', 'display_name', 'rookie_season', 'draft_pick',
       'status', 'height', 'weight', 'college_name', 'birth_date',
       'draft_team', 'player_id', 'season', 'completions', 'attempts',
       'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards',
       'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards',
       'passing_yards_after_catch', 'passing_first_downs', 'passing_epa',
       'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards',
       'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost',
       'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions',
       'receptions', 'targets', 'receiving_yards', 'receiving_tds',
       'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
       'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
       'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
       'wopr_x', 'special_teams_tds', 'games',

In [33]:
# Regression targets: one model output per column (10 in total).
target = [
    'passing_yards', 'passing_tds', 'interceptions',
    'rushing_yards', 'rushing_tds',
    'rushing_2pt_conversions', 'passing_2pt_conversions',
    'sack_fumbles_lost', 'rushing_fumbles_lost',
    'receiving_fumbles_lost'
]

# Candidate input columns.
candidate_features = [
    'rookie_season', 'draft_pick',
    'height', 'weight', 'college_name', 'birth_date',
    'draft_team', 'season', 'completions', 'attempts',
    'passing_tds', 'interceptions', 'sacks', 'sack_yards',
    'sack_fumbles', 'passing_air_yards',
    'passing_yards_after_catch', 'passing_first_downs', 'passing_epa',
    'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards',
    'rushing_tds', 'rushing_fumbles',
    'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions',
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_air_yards',
    'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share',
    'wopr_x', 'special_teams_tds', 'games', 'dom', 'w8dom'
]

# Final columns. Inputs and targets are read from the same player-season row, so any
# column that is also a target must not be fed to the model: it would simply copy
# its input to its output (target leakage).
# NOTE(review): the remaining same-season stats (attempts, completions, passing_epa, ...)
# still describe the season being predicted. If the goal is forecasting, build the
# targets from the following season per player instead — confirm intent.
features = [col for col in candidate_features if col not in target]

In [34]:
# Independent copies of the QB input and target columns.
X, y = df_qb[features].copy(), df_qb[target].copy()

In [None]:
# String-valued columns among the model inputs; these need one-hot encoding.
# (The previous list named 'team_abbr', 'college_conference' and 'status', none of
# which are in `features`, so the transformer could not find them, while the real
# text columns would have been passed through untouched to the scaler.)
cat_features = ['college_name', 'draft_team']
num_features = [col for col in features if col not in cat_features]

# One-hot encode the categorical columns and pass the numeric ones through unchanged.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# sparse_threshold=0 forces a dense array: the output is fed to StandardScaler,
# which cannot mean-centre a sparse matrix.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)],
    remainder='passthrough',  # passthrough keeps the numeric columns
    sparse_threshold=0,
)

In [None]:
# Share of QB rows falling into the train (2000-2022), validation (2023) and test (2024) seasons.
_qb_seasons = df_qb['season']
train_count = len(df_qb[_qb_seasons.between(2000, 2022)])
val_count = len(df_qb[_qb_seasons == 2023])
test_count = len(df_qb[_qb_seasons == 2024])
total_count = train_count + val_count + test_count
tuple(count / total_count for count in (train_count, val_count, test_count))

In [None]:
scaler = StandardScaler()

# Chronological split: train through 2022, validate on 2023, test on 2024 onward.
_season = df_qb['season']
train, val, test = df_qb[_season <= 2022], df_qb[_season == 2023], df_qb[_season >= 2024]

# Encoder and scaler are fitted on the training rows only, then reused for val/test.
X_train = scaler.fit_transform(preprocessor.fit_transform(train[features]))
X_val, X_test = (
    scaler.transform(preprocessor.transform(part[features]))
    for part in (val, test)
)

y_train, y_val, y_test = (part[target].values for part in (train, val, test))

In [None]:
# Cast the train/validation arrays to float32.
X_train, y_train, X_val, y_val = (
    array.astype(np.float32)
    for array in (X_train, y_train, X_val, y_val)
)

In [None]:
def _as_float_tensor(array):
    """Wrap an array in a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


def _batched(inputs, targets, shuffle=False):
    """Build a DataLoader over (inputs, targets) pairs in batches of 32."""
    return DataLoader(TensorDataset(inputs, targets), batch_size=32, shuffle=shuffle)


# Tensors for each split
X_train_tensor, y_train_tensor = _as_float_tensor(X_train), _as_float_tensor(y_train)
X_val_tensor, y_val_tensor = _as_float_tensor(X_val), _as_float_tensor(y_val)
X_test_tensor, y_test_tensor = _as_float_tensor(X_test), _as_float_tensor(y_test)

# Only the training loader reshuffles between epochs.
train_loader = _batched(X_train_tensor, y_train_tensor, shuffle=True)
val_loader = _batched(X_val_tensor, y_val_tensor)
test_loader = _batched(X_test_tensor, y_test_tensor)

In [None]:
class QBPerformanceModel(nn.Module):
    """Fully connected regressor with two hidden layers.

    Architecture: Linear -> ReLU -> Dropout, twice, followed by a linear output layer
    with one unit per regression target.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of both hidden layers.
        dropout_prob: Dropout probability applied after each hidden activation.
        output_dim: Number of regression outputs. Defaults to 10, the number of
            columns in this notebook's `target` list. (The layer was previously
            hard-coded to 8, which cannot be compared against 10-column targets
            in the loss.)
    """

    def __init__(self, input_dim, hidden_dim, dropout_prob, output_dim=10):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(hidden_dim, output_dim)  # one output per target column
        )

    def forward(self, x):
        """Return predictions of shape (batch, output_dim) for inputs of shape (batch, input_dim)."""
        return self.model(x)

In [None]:
# Layer sizes come from the prepared tensors; width and dropout are fixed starting values.
input_dim, output_dim = X_train_tensor.shape[1], y_train_tensor.shape[1]
hidden_dim, dropout_prob = 256, 0.3

model = QBPerformanceModel(input_dim, hidden_dim, dropout_prob)

In [None]:
# skorch wrapper so the network can be tuned with scikit-learn search utilities.
_net_settings = dict(
    module__input_dim=input_dim,
    module__hidden_dim=256,
    module__dropout_prob=0.3,
    max_epochs=20,
    lr=0.001,
    iterator_train__shuffle=True,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
net = NeuralNetRegressor(QBPerformanceModel, **_net_settings)

In [None]:
# Imported by name on purpose: an earlier cleaning cell rebinds the module alias `stats`
# to a plain list, after which `stats.loguniform` raises AttributeError.
from scipy.stats import loguniform

# Search space for the randomized hyperparameter search.
param_dist = {
    'lr': loguniform(1e-4, 1e-2),  # sampled log-uniformly between 1e-4 and 1e-2
    'module__hidden_dim': [64, 128, 256],
    'module__dropout_prob': [0.1, 0.3, 0.5],
    'max_epochs': [10, 20, 30]
}

In [None]:
# 10 random configurations, 3-fold CV each, scored by negative MSE.
_search_settings = dict(
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    error_score='raise',  # surface fit errors instead of recording them as NaN scores
)
random_search = RandomizedSearchCV(net, **_search_settings)
random_search.fit(X_train, y_train)

In [None]:
# Mean-squared-error loss, optimised with Adam at a learning rate of 1e-3.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train with early stopping on validation loss, then keep the checkpoint on disk only
# if this run beats the best validation loss recorded by any previous run.
n_epochs = 500
patience = 7  # epochs without validation improvement before stopping
save_path = "best_model.pth"
val_score_file = "best_val_loss.txt"
params_file = "best_model_params.json"

best_val_loss_current_run = float('inf')
epochs_no_improve = 0
best_model_state_dict = None

for epoch in range(1, n_epochs + 1):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_batches = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_batches += 1
    train_loss /= train_batches  # mean of per-batch losses

    # ---- validation pass (dropout disabled, no gradients) ----
    model.eval()
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            val_batches += 1
    val_loss /= val_batches

    print(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss_current_run:
        best_val_loss_current_run = val_loss
        # state_dict() hands back references to the live parameter tensors, which the
        # optimizer keeps updating in place. Clone them so the snapshot really is the
        # best epoch rather than whatever the weights are when training stops.
        best_model_state_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch} epochs with no improvement in current run.")
        break

# Best validation loss recorded by earlier runs (infinity when none was recorded).
previous_best_val_loss = float('inf')
if os.path.exists(val_score_file):
    with open(val_score_file, "r") as f:
        previous_best_val_loss = float(f.read().strip())
    print(f"Previous best val loss: {previous_best_val_loss:.4f}")

if best_model_state_dict is not None and best_val_loss_current_run < previous_best_val_loss:
    print(f"New overall best model! Saving model with val loss {best_val_loss_current_run:.4f}")

    hyperparams = {
        "input_dim": input_dim,
        "hidden_dim": hidden_dim,
        "dropout_prob": dropout_prob
    }

    with open(params_file, "w") as f:
        json.dump(hyperparams, f)

    torch.save(best_model_state_dict, save_path)

    with open(val_score_file, "w") as f:
        f.write(f"{best_val_loss_current_run}")
else:
    print(f"Current run val loss {best_val_loss_current_run:.4f} did not beat previous best {previous_best_val_loss:.4f}. Not saving.")

# Reload the overall-best checkpoint. It may come from an earlier run with different
# layer sizes, so rebuild the network from the hyperparameters saved next to it and
# fall back to the current ones only when no hyperparameter file exists.
saved_hyperparams = {
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "dropout_prob": dropout_prob
}
if os.path.exists(params_file):
    with open(params_file, "r") as f:
        saved_hyperparams = json.load(f)

model = QBPerformanceModel(**saved_hyperparams)
model.load_state_dict(torch.load(save_path))
model.eval()

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")

In [None]:
# Print torchsummary's per-layer summary for a single input vector of length input_dim.
summary(model, input_size=(input_dim,))

In [None]:
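# Reload a saved model (disabled). NOTE: the call below passes (input_dim, output_dim), but QBPerformanceModel's second and third arguments are hidden_dim and dropout_prob, and "qb_model.pth" is not the path the training cell saves to ("best_model.pth").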
#model = QBPerformanceModel(input_dim, output_dim)
#model.load_state_dict(torch.load("qb_model.pth"))
#model.eval()

In [None]:
# Score the reloaded model on the full validation set in a single forward pass.
model.eval()
with torch.no_grad():
    preds = model(X_val_tensor)
    mse = criterion(preds, y_val_tensor).item()
print(f"Final MSE on validation set: {mse:.2f}")

In [None]:
# Compare the first target column (passing_yards) against its prediction.
_actual_first_target = y_val_tensor[:, 0].numpy()
_predicted_first_target = preds[:, 0].numpy()
actual_vs_pred = pd.DataFrame({
    'Actual': _actual_first_target,
    'Predicted': _predicted_first_target,
})

print(actual_vs_pred.sample(10))  # ten random rows

In [None]:
# Scatter of actual vs predicted values with a dashed y = x reference line.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(actual_vs_pred['Actual'], actual_vs_pred['Predicted'], alpha=0.6)
_diagonal_end = max(actual_vs_pred['Actual'])
ax.plot([0, _diagonal_end], [0, _diagonal_end], 'r--')
ax.set_xlabel("Actual Passing Yards")
ax.set_ylabel("Predicted Passing Yards")
ax.set_title("Actual vs Predicted")
ax.grid(True)
plt.show()