In [1]:
import pandas as pd

# Train with 2023 data, test with 2024 data.
df_train = pd.read_csv('2023playerstats_QB.csv')
df_test = pd.read_csv('2024playerstats_QB.csv')

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
print(df_train.head())
df_train.info()
print(df_test.head())
df_test.info()

          Player FantPos  Age  GamesPlayed  GamesStarted  PassCmp  PassAtt  \
0     Josh Allen      QB   27           17            17      385      579   
1    Jalen Hurts      QB   25           17            17      352      538   
2   Dak Prescott      QB   30           17            17      410      590   
3  Lamar Jackson      QB   26           16            16      307      457   
4    Jordan Love      QB   25           17            17      372      579   

   PassYds  PassTD  PassInt  ...  Team_NWE  Team_NYG  Team_NYJ  Team_PHI  \
0     4306      29       18  ...     False     False     False     False   
1     3858      23       15  ...     False     False     False      True   
2     4516      36        9  ...     False     False     False     False   
3     3678      24        7  ...     False     False     False     False   
4     4159      32       11  ...     False     False     False     False   

   Team_PIT  Team_SEA  Team_SFO  Team_TAM  Team_TEN  Team_WAS  
0     Fals

# QB model

Features used to predict `FantasyPtsPPR`:
- pass attempts per game
- passing yards per game
- passing touchdowns per game
- interceptions per game
- rushing yards per game
- rushing touchdowns per game

In [2]:
# Compute the features needed

# Raw counting stat -> name of the per-game feature derived from it.
PER_GAME_FEATURES = {
    'PassAtt': 'PassAttPerGame',
    'PassYds': 'PassYdsPerGame',
    'PassTD': 'PassTDsPerGame',
    'PassInt': 'PassIntPerGame',
    'RushYds': 'RushYdsPerGame',
    'RushTD': 'RushTDsPerGame',
}


def add_per_game_features(df, stat_to_feature, games_col='GamesPlayed'):
    """Add one per-game column to ``df`` for each stat in ``stat_to_feature``.

    Each new column is ``df[stat] / df[games_col]``. Rows where the games
    count is 0 (or the result is otherwise missing) get 0 instead of
    NaN/inf.

    Args:
        df: DataFrame holding the raw stat columns and ``games_col``.
            It is modified in place.
        stat_to_feature: Mapping of source column name to the name of the
            per-game column to create.
        games_col: Name of the column used as the divisor.

    Returns:
        The same ``df`` object, to allow chaining.
    """
    # Mask zero-game rows to NaN so the division cannot divide by zero.
    # Series.where keeps the divisor numeric (float), so the resulting
    # feature columns are float rather than object dtype.
    games = df[games_col].where(df[games_col] != 0)
    for stat_col, feature_col in stat_to_feature.items():
        df[feature_col] = (df[stat_col] / games).fillna(0)
    return df


add_per_game_features(df_train, PER_GAME_FEATURES)
add_per_game_features(df_test, PER_GAME_FEATURES)

print(df_train.head())
print(df_train.tail())
print(df_test.head())
print(df_test.tail())

          Player FantPos  Age  GamesPlayed  GamesStarted  PassCmp  PassAtt  \
0     Josh Allen      QB   27           17            17      385      579   
1    Jalen Hurts      QB   25           17            17      352      538   
2   Dak Prescott      QB   30           17            17      410      590   
3  Lamar Jackson      QB   26           16            16      307      457   
4    Jordan Love      QB   25           17            17      372      579   

   PassYds  PassTD  PassInt  ...  Team_SFO  Team_TAM  Team_TEN  Team_WAS  \
0     4306      29       18  ...     False     False     False     False   
1     3858      23       15  ...     False     False     False     False   
2     4516      36        9  ...     False     False     False     False   
3     3678      24        7  ...     False     False     False     False   
4     4159      32       11  ...     False     False     False     False   

   PassAttPerGame  PassYdsPerGame  PassTDsPerGame  PassIntPerGame  \
0    

In [5]:
# Per-game columns used as model inputs; all other columns are left out of X.
features = [
    'PassAttPerGame',
    'PassYdsPerGame',
    'PassTDsPerGame',
    'PassIntPerGame',
    'RushYdsPerGame',
    'RushTDsPerGame'
]

# Replace NaN in 'FantasyPtsPPR' with 0. The filled values must be written
# back to 'FantasyPtsPPR' itself, because the targets below read that column.
df_test['FantasyPtsPPR'] = df_test['FantasyPtsPPR'].fillna(0)
df_train['FantasyPtsPPR'] = df_train['FantasyPtsPPR'].fillna(0)


y_test = df_test['FantasyPtsPPR']
X_test = df_test[features]

y_train = df_train['FantasyPtsPPR']
X_train = df_train[features]

print(y_train.to_string())
print(X_train.head())

0     392.6
1     356.8
2     342.8
3     331.2
4     319.1
5     295.6
6     289.1
7     280.2
8     276.0
9     274.1
10    270.4
11    262.5
12    257.5
13    256.9
14    243.1
15    241.1
16    234.2
17    230.2
18    227.3
19    200.7
20    196.2
21    177.1
22    156.4
23    149.7
24    147.2
25    146.4
26    144.4
27    125.8
28    119.9
29    110.2
30    106.4
31    102.0
32    100.8
33     93.5
34     87.3
35     86.8
36     80.0
37     72.7
38     69.2
39     67.6
40     66.7
41     66.0
42     64.1
43     57.0
44     53.3
45     48.7
46     39.6
47     29.1
48     27.0
49     26.6
50     25.6
51     26.1
52     23.6
53     22.9
54     23.4
55     22.1
56     21.5
57     20.0
58     17.8
59     13.9
60     11.8
61     12.0
62     10.9
63      9.7
64      7.5
65      5.9
66      4.9
67      4.8
68      3.3
69      3.1
70      2.1
71      2.1
72      1.3
73      0.8
74     -0.3
75     -0.2
76      0.0
77      0.0
78     -0.4
79      0.0
80     -0.1
81     -0.3
82     -1.3
   P

In [6]:
# Standardize the features: the scaler is fitted on the training set only,
# and that same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Preview the first five scaled rows of each split.
print(X_train[0:5])
print(X_test[0:5])

[[ 1.02924367  1.18054938  1.34183296  1.08316539  1.61274314  3.78749885]
 [ 0.83789294  0.89492849  0.81440061  0.69026841  2.00570684  3.78749885]
 [ 1.08058167  1.31443417  1.95717071 -0.09552553  0.24464729 -0.00409135]
 [ 0.59316236  0.92672612  1.03416409 -0.30015938  3.3025477   0.962035  ]
 [ 1.02924367  1.08683002  1.60554914  0.16640578  0.26890431  0.57923022]]
[[ 0.53919912  1.09511813  2.39669768 -0.75035382  3.50964199  0.57923022]
 [ 0.58120293  0.81396006  1.25392757 -0.48842251  1.64670297  2.9125165 ]
 [ 1.36994132  1.57072791  2.57250846 -0.09552553  0.04573974 -0.00409135]
 [ 0.98723985  1.30423342  2.39669768  0.82123407  0.9044382   0.28756944]
 [ 0.56720166  0.71003996  0.99021139 -0.09552553  3.3932083   1.16255179]]


In [7]:
# Fit a linear regression on the training split, then report MSE and R^2
# on both the training split and the 2024 split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
preds = model.predict(X_test)

print("Training MSE:", mean_squared_error(y_train, train_preds))
print("Training R^2:", r2_score(y_train, train_preds))

print("2024 MSE:", mean_squared_error(y_test, preds))
print("2024 R^2:", r2_score(y_test, preds))

Training MSE: 3000.71582665461
Training R^2: 0.7564396820899209
2024 MSE: 4997.999655728513
2024 R^2: 0.6565093454279787


In [8]:
# Show the first five predictions next to the first five actual values.
print(preds[0:5])
print(y_test.iloc[:5])

[315.73689869 286.3966851  296.07879397 260.76066175 234.39925536]
0    430.4
1    379.0
2    372.8
3    365.8
4    355.8
Name: FantasyPtsPPR, dtype: float64


In [9]:
# Report the fitted intercept and one coefficient per feature.
# The model was fitted on the scaled features, so the coefficients
# apply to those scaled values.
print("Intercept:", model.intercept_)
print("Feature Coefficients:")
for feature_name, weight in zip(features, model.coef_):
    line = f"{feature_name}: {weight:.4f}"
    print(line)

Intercept: 104.7421686746988
Feature Coefficients:
PassAttPerGame: -8.2534
PassYdsPerGame: 72.9993
PassTDsPerGame: 33.0993
PassIntPerGame: -27.3508
RushYdsPerGame: 6.6928
RushTDsPerGame: 20.9952


In [10]:
def compute_rank_squared_error(actual, predicted, player, display):
    """Sum the squared differences between predicted and actual rankings.

    Both the actual and the predicted values are ranked in descending order
    (largest value gets rank 1; ties share the smallest rank of their group).
    The per-row rank error is ``PredictedRank - ActualRank``.

    Args:
        actual: pandas Series of true values; its index is used for the
            intermediate results table.
        predicted: 1-D array-like of predictions, in the same row order as
            ``actual``.
        player: Player labels for the rows, stored in the ``Player`` column
            of the results table.
        display: If truthy, print the first rows of the results table.

    Returns:
        The sum over all rows of the squared rank error.
    """
    results = pd.DataFrame(
        {'Player': player, 'Actual': actual.values, 'Predicted': predicted},
        index=actual.index,
    )

    # Rank each value column; rank 1 is the highest value.
    for value_col, rank_col in (('Actual', 'ActualRank'), ('Predicted', 'PredictedRank')):
        ranks = results[value_col].rank(ascending=False, method='min')
        results[rank_col] = ranks.astype(int)

    results['RankError'] = results['PredictedRank'] - results['ActualRank']

    if display:
        print(results.head())

    return results['RankError'].pow(2).sum()

In [11]:
# Score the 2024 predictions by how far each player's predicted rank is from
# the actual rank; display=True also prints a preview of the per-player table.
rank_squared_error = compute_rank_squared_error(
    actual=y_test,
    predicted=preds,
    player=df_test['Player'],
    display=True,
)

print("2024 Rank Squared Error:", rank_squared_error)


           Player  Actual   Predicted  ActualRank  PredictedRank  RankError
0   Lamar Jackson   430.4  315.736899           1              1          0
1      Josh Allen   379.0  286.396685           2              4          2
2      Joe Burrow   372.8  296.078794           3              3          0
3  Baker Mayfield   365.8  260.760662           4              6          2
4  Jayden Daniels   355.8  234.399255           5              9          4
2024 Rank Squared Error: 10089
