In [1]:
import pandas as pd

In [2]:
# Train with 2022-23 data, test with 2024 data.
# Rookie tables first (one per draft class), read in year order.
df_22, df_23, df_24 = [
    pd.read_csv(path)
    for path in (
        '2022rookiestats_QB.csv',
        '2023rookiestats_QB.csv',
        '2024rookiestats_QB.csv',
    )
]
# Matching per-year player stat tables (named "playerstats" on disk).
df_22_season, df_23_season, df_24_season = [
    pd.read_csv(path)
    for path in (
        '2022playerstats_QB.csv',
        '2023playerstats_QB.csv',
        '2024playerstats_QB.csv',
    )
]
# Preview the first rows of each rookie table, one after another.
print(df_22.head(), df_23.head(), df_24.head(), sep='\n')

   Unnamed: 0  Pick   Tm          Player Pos  Age      To  AP1  PB  St  ...  \
0          21    20  PIT   Kenny Pickett  QB   24  2024.0    0   0   2  ...   
1          75    74  ATL  Desmond Ridder  QB   23  2024.0    0   0   1  ...   
2          87    86  TEN    Malik Willis  QB   23  2024.0    0   0   0  ...   
3          95    94  CAR     Matt Corral  QB   23     NaN    0   0   0  ...   
4         138   137  NWE    Bailey Zappe  QB   23  2024.0    0   0   0  ...   

   Coll_pass_yds  Coll_pass_td  Coll_pass_td_pct  Coll_pass_int  \
0        12303.0          81.0               4.8           32.0   
1        10239.0          87.0               6.7           28.0   
2         5191.0          48.0               7.8           18.0   
3         8287.0          57.0               6.3           23.0   
4         5967.0          62.0               9.0           11.0   

   Coll_pass_int_pct  Coll_pass_yds_per_att  Coll_pass_adj_yds_per_att  \
0                1.9                    7.3     

# Rookie QB Model:
- Games played
- Completion percentage
- Passing yards per game
- Passing touchdowns per game
- Interceptions per game

In [3]:
def compute_features(df):
    """Add engineered college-stat feature columns to ``df`` in place.

    Reads the ``Coll_*`` columns and writes these columns (in this order):
    ``GamesPlayed``, ``CompletionPct``, ``PassAttPerGame``, ``PassYdsPerGame``,
    ``PassTDsPerGame``, ``PassIntPerGame``, ``RushYdsPerGame``,
    ``RushTDsPerGame``.

    Missing inputs are filled with 0. Per-game rates that are undefined
    (zero or missing ``Coll_games``) are also set to 0.

    Args:
        df: DataFrame containing ``Coll_games``, ``Coll_pass_cmp_pct``,
            ``Coll_pass_att``, ``Coll_pass_yds_per_g``, ``Coll_pass_td``,
            ``Coll_pass_int``, ``Coll_rush_yds_per_g`` and ``Coll_rush_td``.

    Returns:
        None. ``df`` is modified in place.
    """
    games = df['Coll_games']
    # Blank out zero-game rows with a float NaN so the division below yields
    # NaN rather than inf. Using Series.where (instead of replace(0, pd.NA))
    # keeps the divisor numeric; pd.NA can upcast it to object dtype, which
    # then makes every per-game column object dtype as well.
    games_played = games.where(games != 0)

    def per_game(total_column):
        # Rate per game played; undefined rates (NaN) become 0.
        return (df[total_column] / games_played).fillna(0)

    df['GamesPlayed'] = games.fillna(0)
    df['CompletionPct'] = df['Coll_pass_cmp_pct'].fillna(0)
    df['PassAttPerGame'] = per_game('Coll_pass_att')
    df['PassYdsPerGame'] = df['Coll_pass_yds_per_g'].fillna(0)
    df['PassTDsPerGame'] = per_game('Coll_pass_td')
    df['PassIntPerGame'] = per_game('Coll_pass_int')
    df['RushYdsPerGame'] = df['Coll_rush_yds_per_g'].fillna(0)
    df['RushTDsPerGame'] = per_game('Coll_rush_td')

In [4]:
# Add the engineered feature columns to every rookie table (in place, year order).
for rookie_frame in (df_22, df_23, df_24):
    compute_features(rookie_frame)

In [5]:
from helpers import append_total_fantasy_points

# Pair each rookie table with the same year's season table and combine them,
# one year at a time (2022, 2023, 2024).
# NOTE(review): append_total_fantasy_points is defined in helpers.py, which is
# not shown here; presumably it supplies the 'FantasyPtsPPR' column that the
# later train/test split reads as the target -- confirm against the helper.
df_22_merged, df_23_merged, df_24_merged = [
    append_total_fantasy_points(rookies, season)
    for rookies, season in (
        (df_22, df_22_season),
        (df_23, df_23_season),
        (df_24, df_24_season),
    )
]

# Preview the first rows of each merged table.
print(df_22_merged.head(), df_23_merged.head(), df_24_merged.head(), sep='\n')

   Unnamed: 0  Pick   Tm          Player Pos  Age      To  AP1  PB  St  ...  \
0          21    20  PIT   Kenny Pickett  QB   24  2024.0    0   0   2  ...   
1          75    74  ATL  Desmond Ridder  QB   23  2024.0    0   0   1  ...   
2          87    86  TEN    Malik Willis  QB   23  2024.0    0   0   0  ...   
3          95    94  CAR     Matt Corral  QB   23     NaN    0   0   0  ...   
4         138   137  NWE    Bailey Zappe  QB   23  2024.0    0   0   0  ...   

   Coll_pass_rating  GamesPlayed  CompletionPct  PassAttPerGame  \
0             136.3         52.0           62.4       32.192308   
1             145.8         50.0           62.1       26.080000   
2             153.1         38.0           62.8       16.263158   
3             159.2         37.0           67.3       24.648649   
4             168.7         14.0           69.1       49.071429   

   PassYdsPerGame  PassTDsPerGame  PassIntPerGame  RushYdsPerGame  \
0           236.6        1.557692        0.615385    

In [6]:
# Keep only the features we want
features = [
    'GamesPlayed',
    'CompletionPct',
    'PassYdsPerGame',
    'PassTDsPerGame',
    'PassIntPerGame'
]

df_22_23_merged = pd.concat([df_22_merged, df_23_merged], ignore_index=True)

print(df_22_23_merged)

    Unnamed: 0  Pick   Tm                    Player Pos  Age      To  AP1  PB  \
0           21    20  PIT             Kenny Pickett  QB   24  2024.0    0   0   
1           75    74  ATL            Desmond Ridder  QB   23  2024.0    0   0   
2           87    86  TEN              Malik Willis  QB   23  2024.0    0   0   
3           95    94  CAR               Matt Corral  QB   23     NaN    0   0   
4          138   137  NWE              Bailey Zappe  QB   23  2024.0    0   0   
5          145   144  WAS                Sam Howell  QB   21  2024.0    0   0   
6          248   247  MIA           Skylar Thompson  QB   25  2024.0    0   0   
7          263   262  SFO               Brock Purdy  QB   22  2024.0    0   1   
8            2     1  CAR               Bryce Young  QB   22  2024.0    0   0   
9            3     2  HOU               C.J. Stroud  QB   21  2024.0    0   1   
10           5     4  IND        Anthony Richardson  QB   21  2024.0    0   0   
11          34    33  TEN   

In [7]:
# Train on the combined 2022-23 classes; hold out the 2024 class for testing.
# The target is the 'FantasyPtsPPR' column of the merged tables.
X_train = df_22_23_merged[features]
y_train = df_22_23_merged['FantasyPtsPPR']
X_test = df_24_merged[features]
y_test = df_24_merged['FantasyPtsPPR']

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly: wrapping it in print() emits a stray "None" line after
# each summary.
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GamesPlayed     22 non-null     float64
 1   CompletionPct   22 non-null     float64
 2   PassYdsPerGame  22 non-null     float64
 3   PassTDsPerGame  22 non-null     float64
 4   PassIntPerGame  22 non-null     float64
dtypes: float64(5)
memory usage: 1012.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GamesPlayed     11 non-null     float64
 1   CompletionPct   11 non-null     float64
 2   PassYdsPerGame  11 non-null     float64
 3   PassTDsPerGame  11 non-null     float64
 4   PassIntPerGame  11 non-null     float64
dtypes: float64(5)
memory usage: 572.0 bytes
None


In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test set does not influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Show the first five scaled rows of each split.
print(X_train[:5], X_test[:5], sep='\n')

[[ 1.29294703 -0.6078865   0.15371951 -0.3832293   0.21119702]
 [ 1.0981194  -0.69990602 -0.34975506 -0.14013487 -0.12666069]
 [-0.07084641 -0.48519381 -1.42953385 -0.77597013 -0.65320503]
 [-0.16826023  0.89509893 -0.04577042 -0.40609997  0.24924406]
 [-2.40877803  1.44721603  3.15556788  3.44488516  1.25024357]]
[[-0.16826023  0.77240625  0.72210747  0.89129075 -1.2345905 ]
 [ 1.58518848  0.58836721  0.07772335 -0.30257081 -1.32451987]
 [-0.85015695  0.1589428   0.63977829  0.33989969 -0.28933293]
 [ 0.90329176 -0.33182795  0.94059643  0.20655676  0.77820361]
 [ 0.12398122  0.98711845 -1.12713246 -0.82685098 -1.86522019]]


In [9]:
from helpers import print_diagnostics
# Sanity-check the split before fitting. print_diagnostics is defined in
# helpers.py (not shown here); per the recorded output it reports the number
# of training/testing samples, the feature count, samples per feature, and the
# mean/std of each target vector.
print_diagnostics(X_train, X_test, features, y_train, y_test)

Training samples: 22
Testing samples: 11
Features: 5
Samples per feature: 4.4
Training target stats: mean=52.5, std=72.3
Testing target stats: mean=112.3, std=138.4


In [10]:
from sklearn.linear_model import LinearRegression
from helpers import evaluate_model

# Fit a linear regression on the training features and target.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on both splits so training and testing error can be compared.
train_preds, preds = model.predict(X_train), model.predict(X_test)

# evaluate_model comes from helpers.py (not shown here); per the recorded
# output it prints RMSE, R^2 and MAE for the training and testing sets.
evaluate_model(y_train, train_preds, y_test, preds)

Training RMSE: 66.70171164420151
Training R^2: 0.10905713547455653
Training MAE: 55.7807326386107
Testing RMSE: 145.9211519318059
Testing R^2: -0.22319046413859422
Testing MAE: 101.25694306065473


In [11]:
# Eyeball the first five test predictions against the first five actual targets.
print(preds[:5], y_test.head(), sep='\n')

[72.34967921 39.14784574 54.41897175 30.61142528 61.27585457]
0    254.5
1    355.8
2    177.1
3     44.1
4      0.0
Name: FantasyPtsPPR, dtype: float64


In [12]:
from helpers import print_model_coefficients
# print_model_coefficients is defined in helpers.py (not shown here); per the
# recorded output it prints the fitted intercept followed by one coefficient
# per name in `features`. The model was fit on the scaled feature matrix, so
# the coefficients apply to the scaled inputs, not the raw stat values.
print_model_coefficients(model, features)

Intercept: 52.52272727272729
Feature Coefficients:
GamesPlayed: -4.8987
CompletionPct: 13.0533
PassYdsPerGame: -22.1430
PassTDsPerGame: 30.4206
PassIntPerGame: 1.7850


In [13]:
from helpers import compute_rank_squared_error, build_results_df
# Build a per-player table of actual vs. predicted points for the 2024 class.
# Both helpers are defined in helpers.py (not shown here).
results = build_results_df(y_test, preds, df_24_merged['Player'])
# NOTE(review): computed before printing on purpose -- it is not visible from
# here whether compute_rank_squared_error adds columns to `results`, so keep
# this statement order unless the helper is confirmed to be read-only.
rank_squared_error = compute_rank_squared_error(results)

# to_string() renders every row and column without pandas' display truncation.
print(results.to_string())
print("2024 Rank Squared Error:", rank_squared_error)

             Player  Actual  Predicted  ActualRank  PredictedRank  RankError
0    Caleb Williams   254.5  72.349679           3              1         -2
1    Jayden Daniels   355.8  39.147846           1              7          6
2        Drake Maye   177.1  54.418972           4              4          0
3     Michael Penix    44.1  30.611425           6             11          5
4     J.J. McCarthy     0.0  61.275855           8              2         -6
5            Bo Nix   317.2  39.893264           2              6          4
6   Spencer Rattler    67.3  56.376851           5              3         -2
7     Jordan Travis     0.0  33.172840           8              9          1
8        Joe Milton    19.2  31.763281           7             10          3
9       Devin Leary     0.0  38.974120           8              8          0
10    Michael Pratt     0.0  44.638315           8              5         -3
2024 Rank Squared Error: 140
