In [25]:
import pandas as pd

In [26]:
# Seasons used to fit the model vs. seasons held out for evaluation.
# The training seasons are materialised as a list so they print readably.
train_years = [*range(2020, 2023)]  # 2020, 2021, 2022
test_years = (2023, 2024)

# Per-season tables keyed by year; they start empty and are filled when the CSVs are read.
train_data, test_data = {}, {}

print("Train years:", train_years)
print("Test years:", test_years)

Train years: [2020, 2021, 2022]
Test years: (2023, 2024)


In [27]:
# Read one QB stats CSV per season into the matching dict (training seasons
# first, then test seasons), keyed by the season year.
for season_years, season_frames in ((train_years, train_data), (test_years, test_data)):
    for year in season_years:
        season_frames[year] = pd.read_csv(f"data/{year}playerstats_QB.csv")

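# QB model

Input features (per-game averages for a season):
- pass attempts per game
- passing yards per game
- passing touchdowns per game
- interceptions per game
- rushing yards per game
- rushing touchdowns per game

Target: the player's PPR fantasy points per game in the following season.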

In [28]:
from helpers import compute_qb_features

In [29]:
# Run the feature helper on every season's table, training seasons first.
# NOTE(review): the return value of compute_qb_features is discarded, so it
# presumably adds the per-game columns to the frame in place - confirm in helpers.
for year in train_years:
    print(f"Computing features for training year {year}")
    training_frame = train_data[year]
    compute_qb_features(training_frame)

for year in test_years:
    print(f"Computing features for test year {year}")
    holdout_frame = test_data[year]
    compute_qb_features(holdout_frame)

Computing features for training year 2020
Computing features for training year 2021
Computing features for training year 2022
Computing features for test year 2023
Computing features for test year 2024


In [46]:
# Build the modelling table: each row holds one player's per-game features for
# a season, and the target is that player's PPR points per game in the
# *following* season. Rows are then split into train/test by season.

# Model inputs (per-game columns expected to be present on every season table).
features = [
    'PassAttPerGame',
    'PassYdsPerGame',
    'PassTDsPerGame',
    'PassIntPerGame',
    'RushYdsPerGame',
    'RushTDsPerGame'
]

# Stack all seasons (train + test) so consecutive seasons of a player can be paired.
all_dfs = list(train_data.values()) + list(test_data.values())
combined_df = pd.concat(all_dfs, ignore_index=True)

# Per-game PPR points. Zero-game seasons are masked before dividing: a plain
# division would give +/-inf for non-zero points, which fillna(0) does not
# catch. With the mask they become NaN and are then set to 0.
games_played = combined_df['GamesPlayed'].where(combined_df['GamesPlayed'] != 0)
combined_df['FantasyPtsPPR_PerGame'] = (combined_df['FantasyPtsPPR'] / games_played).fillna(0)

combined_df = combined_df.sort_values(by=['Player', 'Year'])

# Target = the same player's per-game points in the next season. shift(-1)
# alone pairs a row with the player's next *available* season, so a player who
# skipped a season would get a target from two or more years later. Keep the
# shifted value only when the following row really is Year + 1.
by_player = combined_df.groupby('Player')
following_year = by_player['Year'].shift(-1)
following_pts = by_player['FantasyPtsPPR_PerGame'].shift(-1)
is_consecutive = following_year == combined_df['Year'] + 1
combined_df['NextYearFantasyPtsPPR_PerGame'] = following_pts.where(is_consecutive)

# Rows without a next-season target (last season, or a gap) cannot be used.
final_df = combined_df.dropna(subset=['NextYearFantasyPtsPPR_PerGame'])

# Train on seasons before the first test season; evaluate on the first test
# season (whose target comes from the last season in the data).
train_mask = final_df['Year'] < test_years[0]
test_mask = final_df['Year'] == test_years[0]

X_train = final_df.loc[train_mask, features]
y_train = final_df.loc[train_mask, 'NextYearFantasyPtsPPR_PerGame']

X_test = final_df.loc[test_mask, features]
y_test = final_df.loc[test_mask, 'NextYearFantasyPtsPPR_PerGame']

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print(X_train.head())
print(y_train.head())
print(X_test.head())
print(y_test.head())


X_train shape: (210, 6)
y_train shape: (210,)
X_test shape: (64, 6)
y_test shape: (64,)
     PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
74         0.500000       10.000000        0.000000        0.000000   
1         32.875000      268.687500        3.000000        0.312500   
86        33.187500      257.187500        2.312500        0.250000   
177       31.882353      217.352941        1.529412        0.705882   
28        30.272727      197.272727        1.272727        0.727273   

     RushYdsPerGame  RushTDsPerGame  
74         0.000000        0.000000  
1          9.312500        0.187500  
86         6.312500        0.187500  
177        5.529412        0.058824  
28        10.363636        0.000000  
74      0.400000
1      20.831250
86     14.070588
177     0.000000
28     10.525000
Name: NextYearFantasyPtsPPR_PerGame, dtype: float64
     PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
328        1.000000        0.000000        0.0000

In [35]:
# Standardise the features: the scaler is fitted on the training rows only and
# then applied to both splits.
# NOTE: X_train / X_test are rebound to the scaled arrays here, so re-running
# this cell without first re-running the split would fit the scaler on
# already-scaled data.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Preview the first five scaled rows of each split.
for scaled_preview in (X_train[:5], X_test[:5]):
    print(scaled_preview)

[[-1.67406575 -1.51344561 -1.28742162 -1.33372768 -0.80672685 -0.66389049]
 [ 0.81517309  1.17496572  2.83730432 -0.56852486 -0.19532316  0.36162408]
 [ 0.83920049  1.0554519   1.89205462 -0.72156542 -0.39228542  0.36162408]
 [ 0.73885075  0.64147102  0.81537984  0.39473046 -0.44369836 -0.34216043]
 [ 0.61509036  0.43278728  0.46246211  0.44710798 -0.12631176 -0.66389049]]
[[-1.63562191 -1.61737067 -1.28742162 -1.33372768 -0.80672685 -0.66389049]
 [ 0.68498826  0.4781364   0.21247872  0.22450352 -0.74107276 -0.16667131]
 [-0.22601433 -0.36680581 -0.37081586 -1.33372768 -0.5441105  -0.66389049]
 [-0.09786818 -0.11825171 -0.25624014 -0.72156542  1.42551213  4.80552056]
 [-0.08249065 -0.29544394 -0.46247643  0.87005645 -0.26179792 -0.11694939]]


In [36]:
# Print dataset diagnostics for the train/test split using the project helper.
# NOTE(review): print_diagnostics is defined in helpers, which is not visible
# here; what it reports is inferred from the arguments passed - confirm there.
from helpers import print_diagnostics
print_diagnostics(X_train, X_test, features, y_train, y_test)

Training samples: 210
Testing samples: 64
Features: 6
Samples per feature: 35.0
Training target stats: mean=9.9, std=7.1
Testing target stats: mean=10.5, std=7.0


In [38]:
# Fit an ordinary least-squares model on the training split, predict on both
# splits, and hand actuals + predictions to the project's evaluation helper.
from sklearn.linear_model import LinearRegression
from helpers import evaluate_model

model = LinearRegression()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)  # in-sample predictions
preds = model.predict(X_test)         # held-out predictions

evaluate_model(y_train, train_preds, y_test, preds)

Training RMSE: 5.375059537790496
Training R^2: 0.43198488668524837
Training MAE: 4.2279393499823845
Testing RMSE: 5.83303452624775
Testing R^2: 0.29715597907273106
Testing MAE: 4.628119814930369


In [47]:
# Spot check: the first five test predictions next to the first rows of the
# actual test targets.
print(preds[:5])
print(y_test.head())

[ 3.86698765 10.06691121  8.73041598 14.66024457  6.66964346]
328    15.094118
276    10.388889
301     9.833333
286    14.854545
287     7.000000
Name: NextYearFantasyPtsPPR_PerGame, dtype: float64


In [48]:
# Show the fitted linear model: the intercept, then one coefficient per
# feature, paired with the feature names in order.
print("Intercept:", model.intercept_)
print("Feature Coefficients:")
coefficient_lines = [f"{feat}: {coef:.4f}" for feat, coef in zip(features, model.coef_)]
for coefficient_line in coefficient_lines:
    print(coefficient_line)

Intercept: 9.924433047305948
Feature Coefficients:
PassAttPerGame: -0.5948
PassYdsPerGame: 2.4072
PassTDsPerGame: 2.6333
PassIntPerGame: -1.1700
RushYdsPerGame: 1.0579
RushTDsPerGame: 0.6835


In [49]:
from helpers import compute_rank_squared_error, build_results_df

In [50]:
# Player names for the evaluation season, selected with the same Year filter
# that defines the test split.
is_test_season = final_df['Year'] == test_years[0]
test_players = final_df.loc[is_test_season, 'Player']

# Combine actuals, predictions and names into a results table, then score it
# with the project's rank-error helper.
results = build_results_df(y_test, preds, test_players)
rank_squared_error = compute_rank_squared_error(results)

print(results.to_string())
print("2024 Rank Squared Error:", rank_squared_error)

                       Player     Actual  Predicted  ActualRank  PredictedRank  RankError
328             Aaron Rodgers  15.094118   3.866988          19             60         41
276           Aidan O'Connell  10.388889  10.066911          35             36          1
301               Andy Dalton   9.833333   8.730416          39             39          0
286        Anthony Richardson  14.854545  14.660245          21             12         -9
287              Bailey Zappe   7.000000   6.669643          44             51          7
258            Baker Mayfield  21.517647  13.505068           4             17         13
254               Brock Purdy  17.793333  15.473277           9              6         -3
271               Bryce Young  13.928571   8.701596          24             40         16
257               C.J. Stroud  12.964706  15.360128          27              7        -20
300              Carson Wentz   1.566667  11.404984          53             27        -26
313       

In [36]:
import joblib
import os

# Make sure the output directory exists before writing into it.
os.makedirs('models', exist_ok=True)
filepath = os.path.join('models', 'qb_model.joblib')

# Bundle the fitted model with the scaler and the feature list it was trained
# with, and write the bundle to a single file.
model_package = dict(model=model, scaler=scaler, features=features)
joblib.dump(model_package, filepath)

print("Model saved as 'qb_model.joblib'")

Model saved as 'qb_model.joblib'
