In [25]:
import pandas as pd

# Season-level QB stat tables. Training pairs 2022 stats with 2023 outcomes;
# testing pairs 2023 stats with 2024 outcomes.
df_22 = pd.read_csv('data/2022playerstats_QB.csv')
df_23 = pd.read_csv('data/2023playerstats_QB.csv')
df_24 = pd.read_csv('data/2024playerstats_QB.csv')

# Preview the first rows of each season, in load order.
for season_df in (df_22, df_23, df_24):
    print(season_df.head())

            Player FantPos  Age  GamesPlayed  GamesStarted  PassCmp  PassAtt  \
0  Patrick Mahomes      QB   27           17            17      435      648   
1       Josh Allen      QB   26           16            16      359      567   
2      Jalen Hurts      QB   24           15            15      306      460   
3       Joe Burrow      QB   26           16            16      414      606   
4       Geno Smith      QB   32           17            17      399      572   

   PassYds  PassTD  PassInt  ...  Team_NWE  Team_NYG  Team_NYJ  Team_PHI  \
0     5250      41       12  ...     False     False     False     False   
1     4283      35       14  ...     False     False     False     False   
2     3701      22        6  ...     False     False     False      True   
3     4475      35       12  ...     False     False     False     False   
4     4282      30       11  ...     False     False     False     False   

   Team_PIT  Team_SEA  Team_SFO  Team_TAM  Team_TEN  Team_WAS 

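# QB model

Inputs: a quarterback's per-game rates from one season, used to predict that
player's PPR fantasy points in the following season.
- pass attempts per game
- passing yards per game
- passing touchdowns per game
- interceptions per game
- rushing yards per game
- rushing touchdowns per game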

In [26]:
def compute_features(df):
    """Add per-game rate columns to ``df`` in place.

    For each season total below, a ``<Stat>PerGame`` column is written as
    ``total / GamesPlayed``:

        PassAtt -> PassAttPerGame    PassYds -> PassYdsPerGame
        PassTD  -> PassTDsPerGame    PassInt -> PassIntPerGame
        RushYds -> RushYdsPerGame    RushTD  -> RushTDsPerGame

    Rows with ``GamesPlayed == 0`` (or a missing value on either side of the
    division) get a rate of 0.0. Every new column is float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``GamesPlayed`` and the six total columns listed above.
        The frame is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # Mask zero-game rows with NaN instead of pd.NA: this keeps the divisor a
    # float Series, so the quotient never degrades to object dtype and never
    # contains inf.
    games_played = df['GamesPlayed'].where(df['GamesPlayed'] != 0)

    # New feature column -> season-total column it is derived from.
    per_game_sources = {
        'PassAttPerGame': 'PassAtt',
        'PassYdsPerGame': 'PassYds',
        'PassTDsPerGame': 'PassTD',
        'PassIntPerGame': 'PassInt',
        'RushYdsPerGame': 'RushYds',
        'RushTDsPerGame': 'RushTD',
    }

    for feature_name, total_col in per_game_sources.items():
        rate = (df[total_col] / games_played).fillna(0.0)
        df[feature_name] = rate.astype('float64')

In [27]:
# Only the 2022 and 2023 frames supply model inputs, so only they need the
# per-game columns; compute_features adds them in place.
feature_seasons = (df_22, df_23)
for season_df in feature_seasons:
    compute_features(season_df)

for season_df in feature_seasons:
    print(season_df.head())

            Player FantPos  Age  GamesPlayed  GamesStarted  PassCmp  PassAtt  \
0  Patrick Mahomes      QB   27           17            17      435      648   
1       Josh Allen      QB   26           16            16      359      567   
2      Jalen Hurts      QB   24           15            15      306      460   
3       Joe Burrow      QB   26           16            16      414      606   
4       Geno Smith      QB   32           17            17      399      572   

   PassYds  PassTD  PassInt  ...  Team_SFO  Team_TAM  Team_TEN  Team_WAS  \
0     5250      41       12  ...     False     False     False     False   
1     4283      35       14  ...     False     False     False     False   
2     3701      22        6  ...     False     False     False     False   
3     4475      35       12  ...     False     False     False     False   
4     4282      30       11  ...     False     False     False     False   

   PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerG

In [28]:
# Per-game rate columns used as model inputs.
features = [
    'PassAttPerGame',
    'PassYdsPerGame',
    'PassTDsPerGame',
    'PassIntPerGame',
    'RushYdsPerGame',
    'RushTDsPerGame',
]

# Inner-join consecutive seasons on player name so only players present in
# both seasons remain. The earlier season keeps its column names; overlapping
# columns from the later season receive the year suffix.
df_22_23_merged = df_22.merge(
    df_23, on='Player', how='inner', suffixes=('', '_23')
)
df_23_24_merged = df_23.merge(
    df_24, on='Player', how='inner', suffixes=('', '_24')
)

# Inputs are the earlier season's rates; the target is the later season's
# PPR fantasy points.
X_train = df_22_23_merged[features]
y_train = df_22_23_merged['FantasyPtsPPR_23']
X_test = df_23_24_merged[features]
y_test = df_23_24_merged['FantasyPtsPPR_24']

for preview in (y_train, X_train, y_test, X_test):
    print(preview.head())

0    280.2
1    392.6
2    356.8
3    147.2
4    227.3
Name: FantasyPtsPPR_23, dtype: float64
   PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
0       38.117647      308.823529        2.411765        0.705882   
1       35.437500      267.687500        2.187500        0.875000   
2       30.666667      246.733333        1.466667        0.400000   
3       37.875000      279.687500        2.187500        0.750000   
4       33.647059      251.882353        1.764706        0.647059   

   RushYdsPerGame  RushTDsPerGame  
0       21.058824        0.235294  
1       47.625000        0.437500  
2       50.666667        0.866667  
3       16.062500        0.312500  
4       21.529412        0.058824  
0    379.0
1    315.1
2    116.5
3    430.4
4    233.9
Name: FantasyPtsPPR_24, dtype: float64
   PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
0       34.058824      253.294118        1.705882        1.058824   
1       31.647059      226.941176        1.

In [29]:
# Normalize the data
from sklearn.preprocessing import StandardScaler

# Standardize from the unscaled merged frames rather than from X_train / X_test
# themselves. Those two names are overwritten with scaled arrays below, so
# fitting on X_train would make a second run of this cell fit the scaler on
# already-standardized values, and the scaler saved alongside the model would
# no longer match raw feature inputs. Reading from the merged frames keeps the
# cell idempotent.
# The scaler statistics come from the training rows only; the test rows are
# transformed with those same statistics.
scaler = StandardScaler().fit(df_22_23_merged[features])
X_train = scaler.transform(df_22_23_merged[features])
X_test = scaler.transform(df_23_24_merged[features])

print(X_train[:5])
print(X_test[:5])

[[ 1.22416271  1.68110358  2.43432444  0.18402528  0.47726583  0.53708119]
 [ 0.99070487  1.19255375  2.05701     0.59718102  2.15211566  1.58457786]
 [ 0.57513497  0.94369272  0.84424304 -0.56324773  2.34387585  3.80781018]
 [ 1.20302661  1.3350711   2.05701     0.29180504  0.16227539  0.93703446]
 [ 0.83474607  1.00484478  1.3456795   0.04031893  0.50693381 -0.37709772]]
[[ 0.87061339  1.02161152  1.24671178  1.04626335  1.09287629  3.88907053]
 [ 0.66053336  0.70863224  0.65290544  0.61514432  1.39326452  3.88907053]
 [ 0.92697633  1.16832056  1.93948583 -0.24709376  0.04708021 -0.07237142]
 [ 0.39184868  0.74347564  0.90032475 -0.47163493  2.38459205  0.93703446]
 [ 0.87061339  0.9189152   1.54361494  0.04031893  0.0656227   0.53708119]]


In [30]:
# Sanity-check the train/test sets before fitting. Judging by the recorded
# output, this reports sample counts, feature count, samples per feature and
# target mean/std. print_diagnostics lives in the helpers module, whose
# implementation is not visible in this notebook.
from helpers import print_diagnostics
print_diagnostics(X_train, X_test, features, y_train, y_test)

Training samples: 65
Testing samples: 64
Features: 6
Samples per feature: 10.8
Training target stats: mean=115.4, std=118.3
Testing target stats: mean=122.5, std=120.6


In [31]:
# Train and evaluate
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, y_train)

train_preds = model.predict(X_train)
preds = model.predict(X_test)

from helpers import evaluate_model
evaluate_model(y_train, train_preds, y_test, preds)

Training RMSE: 88.4336996362048
Training R^2: 0.4324899853720058
Training MAE: 71.62223895711469
Testing RMSE: 100.52369007353049
Testing R^2: 0.2947701279509122
Testing MAE: 77.75720657749756


In [32]:
# Spot check: the first five test-set predictions, followed by the matching
# actual 2024 PPR totals (y_test holds the FantasyPtsPPR_24 column).
print(preds[:5])
print(y_test.head())

[235.63082223 212.19285062 239.95854635 261.84944888 207.64678751]
0    379.0
1    315.1
2    116.5
3    430.4
4    233.9
Name: FantasyPtsPPR_24, dtype: float64


In [33]:
# Model coefficients
# The model was fit on StandardScaler output, so each coefficient is expressed
# per one standard deviation of its feature, not per raw unit. model.coef_ is
# paired with `features` by position, which matches the column order used to
# build X_train.
print("Intercept:", model.intercept_)
print("Feature Coefficients:")
for feat, coef in zip(features, model.coef_):
    print(f"{feat}: {coef:.4f}")

Intercept: 115.37999999999998
Feature Coefficients:
PassAttPerGame: -56.8784
PassYdsPerGame: 68.2831
PassTDsPerGame: 47.9280
PassIntPerGame: -15.8836
RushYdsPerGame: 25.2887
RushTDsPerGame: 7.5184


In [34]:
from helpers import compute_rank_squared_error, build_results_df

In [35]:
# Build a per-player comparison of actual vs. predicted 2024 points. Per the
# recorded output, the table has Player, Actual, Predicted, ActualRank,
# PredictedRank and RankError columns.
# NOTE(review): build_results_df and compute_rank_squared_error come from the
# helpers module; how ranks are assigned and how the error is aggregated is
# not visible here -- confirm against helpers.
results = build_results_df(y_test, preds, df_23_24_merged['Player'])
rank_squared_error = compute_rank_squared_error(results)

# Print every row (to_string avoids pandas' row truncation).
print(results.to_string())
print("2024 Rank Squared Error:", rank_squared_error)
# print(results.head())

                      Player  Actual   Predicted  ActualRank  PredictedRank  RankError
0                 Josh Allen   379.0  235.630822           2              4          2
1                Jalen Hurts   315.1  212.192851           6              8          2
2               Dak Prescott   116.5  239.958546          28              3        -25
3              Lamar Jackson   430.4  261.849449           1              1          0
4                Jordan Love   233.9  207.646788          14              9         -5
5                Brock Purdy   266.9  247.202759          11              2         -9
6                 Jared Goff   324.5  182.367690           5             14          9
7            Patrick Mahomes   283.0  187.018688          10             13          3
8                C.J. Stroud   220.4  212.508414          15              7         -8
9             Baker Mayfield   365.8  174.507880           4             18         14
10            Tua Tagovailoa   181.6  188.8

In [None]:
import joblib
import os

# Persist everything needed to score new data as one artifact: the fitted
# model, the scaler fitted on the training features, and the feature column
# order the two expect.
os.makedirs('models', exist_ok=True)
filepath = os.path.join('models', 'qb_model.joblib')

model_package = dict(model=model, scaler=scaler, features=features)
joblib.dump(model_package, filepath)

print("Model saved as 'qb_model.joblib'")

Model saved as 'qb_model.joblib'
