In [9]:
import pandas as pd

# Train with 2023 data, Test with 2024 data
df_23 = pd.read_csv('2023playerstats_QB.csv')
df_24 = pd.read_csv('2024playerstats_QB.csv')
print(df_23.head())
print(df_23.info())
print(df_24.head())
print(df_24.info())

          Player FantPos  Age  GamesPlayed  GamesStarted  PassCmp  PassAtt  \
0     Josh Allen      QB   27           17            17      385      579   
1    Jalen Hurts      QB   25           17            17      352      538   
2   Dak Prescott      QB   30           17            17      410      590   
3  Lamar Jackson      QB   26           16            16      307      457   
4    Jordan Love      QB   25           17            17      372      579   

   PassYds  PassTD  PassInt  ...  Team_NWE  Team_NYG  Team_NYJ  Team_PHI  \
0     4306      29       18  ...     False     False     False     False   
1     3858      23       15  ...     False     False     False      True   
2     4516      36        9  ...     False     False     False     False   
3     3678      24        7  ...     False     False     False     False   
4     4159      32       11  ...     False     False     False     False   

   Team_PIT  Team_SEA  Team_SFO  Team_TAM  Team_TEN  Team_WAS  
0     Fals

# QB model:
- pass attempts per game
- passing yds per game
- passing tds per game
- interceptions per game
- rush yds per game
- rush tds per game

In [10]:
# Compute the features needed
df_23['PassAttPerGame'] = df_23['PassAtt'] / df_23['GamesPlayed'].replace(0, pd.NA)
df_23['PassAttPerGame'] = df_23['PassAttPerGame'].fillna(0)

df_23['PassYdsPerGame'] = df_23['PassYds'] / df_23['GamesPlayed'].replace(0, pd.NA)
df_23['PassYdsPerGame'] = df_23['PassYdsPerGame'].fillna(0)

df_23['PassTDsPerGame'] = df_23['PassTD'] / df_23['GamesPlayed'].replace(0, pd.NA)
df_23['PassTDsPerGame'] = df_23['PassTDsPerGame'].fillna(0)

df_23['PassIntPerGame'] = df_23['PassInt'] / df_23['GamesPlayed'].replace(0, pd.NA)
df_23['PassIntPerGame'] = df_23['PassIntPerGame'].fillna(0)

df_23['RushYdsPerGame'] = df_23['RushYds'] / df_23['GamesPlayed'].replace(0, pd.NA)
df_23['RushYdsPerGame'] = df_23['RushYdsPerGame'].fillna(0)

df_23['RushTDsPerGame'] = df_23['RushTD'] / df_23['GamesPlayed'].replace(0, pd.NA)
df_23['RushTDsPerGame'] = df_23['RushTDsPerGame'].fillna(0)

print(df_23.head())
print(df_23.tail())

          Player FantPos  Age  GamesPlayed  GamesStarted  PassCmp  PassAtt  \
0     Josh Allen      QB   27           17            17      385      579   
1    Jalen Hurts      QB   25           17            17      352      538   
2   Dak Prescott      QB   30           17            17      410      590   
3  Lamar Jackson      QB   26           16            16      307      457   
4    Jordan Love      QB   25           17            17      372      579   

   PassYds  PassTD  PassInt  ...  Team_SFO  Team_TAM  Team_TEN  Team_WAS  \
0     4306      29       18  ...     False     False     False     False   
1     3858      23       15  ...     False     False     False     False   
2     4516      36        9  ...     False     False     False     False   
3     3678      24        7  ...     False     False     False     False   
4     4159      32       11  ...     False     False     False     False   

   PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
0    

In [None]:
# Keep only the features we want
features = [
    'PassAttPerGame',
    'PassYdsPerGame',
    'PassTDsPerGame',
    'PassIntPerGame',
    'RushYdsPerGame',
    'RushTDsPerGame'
]

# Merge the data frames to remove players that are not in both
df_merged = df_23.merge(df_24, on='Player', how='inner', suffixes=('_23', '_24'))

y = df_merged['FantasyPtsPPR_24']
X = df_merged[features]

print(y.head())
print(X.head())

0    379.0
1    315.1
2    116.5
3    430.4
4    233.9
Name: FantasyPtsPPR_24, dtype: float64
   PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
0       34.058824      253.294118        1.705882        1.058824   
1       31.647059      226.941176        1.352941        0.882353   
2       34.705882      265.647059        2.117647        0.529412   
3       28.562500      229.875000        1.500000        0.437500   
4       34.058824      244.647059        1.882353        0.647059   

   RushYdsPerGame  RushTDsPerGame  
0       30.823529        0.882353  
1       35.588235        0.882353  
2       14.235294        0.117647  
3       51.312500        0.312500  
4       14.529412        0.235294  


In [24]:
# Generate train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=514)

In [25]:
# Normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train[:5])
print(X_test[:5])

[[ 0.81400645  0.93617242  0.98211025  1.05237491  1.38586195  3.64467918]
 [ 1.07938255  1.025509    0.95422998  0.620229    0.87407164 -0.71447722]
 [ 0.72254915  0.1300459  -0.56245706  0.03251056  0.20594109 -0.71447722]
 [ 0.66527589  0.65997517  0.62524272 -0.33049201 -0.8520286  -0.71447722]
 [ 0.29944669  1.09201736  1.33340174  0.17944017 -0.32954589 -0.09693006]]
[[-0.21847151 -0.45591334 -0.39182977  0.91408822 -0.18805942  1.26167368]
 [ 0.45033624  1.02290352  0.51818245  2.32461248 -0.64396026 -0.71447722]
 [-0.25109628 -0.26009078 -0.46766412 -0.8490671   1.63554395  4.22590003]
 [ 0.36061813  0.19609716  0.16428881  0.13046363  0.82330682  0.10891899]
 [-1.8823347  -1.70727618 -1.35239823 -1.04497325 -1.15488362 -0.71447722]]


In [26]:
# Train and evaluate
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train, y_train)

train_preds = model.predict(X_train)
preds = model.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score
print("Training MSE:", mean_squared_error(y_train, train_preds))
print("Training R^2:", r2_score(y_train, train_preds))
print("Testing MSE:", mean_squared_error(y_test, preds))
print("Testing R^2:", r2_score(y_test, preds))

Training MSE: 9938.868842610786
Training R^2: 0.3828000860698577
Testing MSE: 5222.336466758379
Testing R^2: -0.4456863508418143


In [27]:
print(preds[:5])
print(y_test.head())

[ 64.24992474  51.28549131 241.41469879 153.66879434  36.21490573]
42     15.3
37      1.3
35    163.4
34     76.7
60      1.1
Name: FantasyPtsPPR_24, dtype: float64


In [28]:
# Model coefficients
print("Intercept:", model.intercept_)
print("Feature Coefficients:")
for feat, coef in zip(features, model.coef_):
    print(f"{feat}: {coef:.4f}")

Intercept: 136.39803921568625
Feature Coefficients:
PassAttPerGame: 7.8618
PassYdsPerGame: 52.2079
PassTDsPerGame: 29.4253
PassIntPerGame: -60.6027
RushYdsPerGame: 6.5723
RushTDsPerGame: 17.0675


In [29]:
# actual - 1d data frame
# predicted - 1d numpy array
# player - 1d data frame that matches the predictions to the player
# display - T/F for displaying a preview of the results
def compute_rank_squared_error(actual, predicted, player, display):
    results = pd.DataFrame({
        'Player': player,
        'Actual': actual.values,
        'Predicted': predicted
    }, index=actual.index)

    results['ActualRank'] = results['Actual'].rank(ascending=False, method='min').astype(int)
    results['PredictedRank'] = results['Predicted'].rank(ascending=False, method='min').astype(int)

    results['RankError'] = results['PredictedRank'] - results['ActualRank']

    rank_squared_error = (results['RankError'] ** 2).sum()

    if display:
        print(results.head())

    return rank_squared_error

In [30]:
rank_squared_error = compute_rank_squared_error(y_test, preds, df_test['Player'], True)

print("2024 Rank Squared Error:", rank_squared_error)
# print(results.head())


                Player  Actual   Predicted  ActualRank  PredictedRank  \
42         Andy Dalton    15.3   64.249925          11              9   
37     Aidan O'Connell     1.3   51.285491          12             11   
35  Gardner Minshew II   163.4  241.414699           2              1   
34           Mac Jones    76.7  153.668794           6              5   
60         Davis Mills     1.1   36.214906          13             12   

    RankError  
42         -2  
37         -1  
35         -1  
34         -1  
60         -1  
2024 Rank Squared Error: 86


In [31]:
results = pd.DataFrame({
    'Player': df_merged['Player'],
    'Actual': y_test.values,
    'Predicted': preds
}, index=y_test.index)

print(results.to_string())

                      Player  Actual   Predicted
42         Mitchell Trubisky    15.3   64.249925
37              Nick Mullens     1.3   51.285491
35        Anthony Richardson   163.4  241.414699
34            Deshaun Watson    76.7  153.668794
60                Mike White     1.1   36.214906
27           Aidan O'Connell    93.5  147.278156
33              Tyrod Taylor    20.1  116.589631
21            Desmond Ridder    21.9  126.946228
48               Andy Dalton    59.0  158.727821
40              Daniel Jones   135.3   53.658893
10            Tua Tagovailoa   181.6  183.553679
51  Dorian Thompson-Robinson    15.8   24.610994
43             Mason Rudolph    95.8  194.909921
