In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
from tqdm import tqdm

In [2]:
df = pd.read_csv(f"processed data/qb_proc_data.csv")

In [3]:
df = df[df["Rookie"] == 0]

In [4]:
corrs = df.corr(numeric_only=True)

In [5]:
corrs[["FPTS_TG"]]

Unnamed: 0,FPTS_TG
pid,0.129003
Season,0.202124
AVG,-0.760771
FPTS_TG,1.0
Y/A_PASSING_avg,0.403851
FPTS/G_MISC_avg,0.587862
CMP_PASSING/G_avg,0.447485
ATT_PASSING/G_avg,0.436051
PCT_PASSING/G_avg,-0.318341
YDS_PASSING/G_avg,0.500064


In [6]:
rel_cols = ["Season"]
for i, vals in corrs[["FPTS_TG"]].iterrows():
    val = vals["FPTS_TG"]
    if (val > 0.4 or val < -0.4) and "_ls" not in i:
        rel_cols.append(i)

In [7]:
rel_df = df[rel_cols]

In [8]:
for col in rel_df.columns[2:]:
    if "FPTS" not in col:
        if "_avg" in col:
            new_c = rel_df["FPTS/G_MISC_avg"]*rel_df[col]
        elif "_ewm" in col:
            new_c = rel_df["FPTS/G_MISC_ewm"]*rel_df[col]
        
        new_name = col+"_FP"

        rel_df[new_name] = new_c

In [9]:
corrs = rel_df.corr()
corrs[["FPTS_TG"]]

Unnamed: 0,FPTS_TG
Season,0.202124
AVG,-0.760771
FPTS_TG,1.0
Y/A_PASSING_avg,0.403851
FPTS/G_MISC_avg,0.587862
CMP_PASSING/G_avg,0.447485
ATT_PASSING/G_avg,0.436051
YDS_PASSING/G_avg,0.500064
TD_PASSING/G_avg,0.516905
Y/A_PASSING_ewm,0.421689


In [10]:
model_cols = ["Season"]
for i, vals in corrs[["FPTS_TG"]].iterrows():
    val = vals["FPTS_TG"]
    if val > 0.5 or val < -0.5:
        model_cols.append(i)

In [11]:
import pickle
with open("model features/qb_feats", "wb") as fp:   #Pickling
    pickle.dump(model_cols, fp)

In [12]:
model_df = rel_df[model_cols]

In [13]:
model_df

Unnamed: 0,Season,AVG,FPTS_TG,FPTS/G_MISC_avg,YDS_PASSING/G_avg,TD_PASSING/G_avg,FPTS/G_MISC_ewm,YDS_PASSING/G_ewm,TD_PASSING/G_ewm,Y/A_PASSING_avg_FP,CMP_PASSING/G_avg_FP,ATT_PASSING/G_avg_FP,YDS_PASSING/G_avg_FP,TD_PASSING/G_avg_FP,Y/A_PASSING_ewm_FP,CMP_PASSING/G_ewm_FP,ATT_PASSING/G_ewm_FP,YDS_PASSING/G_ewm_FP,TD_PASSING/G_ewm_FP
0,2018,118.0,26.1,10.400000,284.000000,0.000000,10.400000,284.000000,0.000000,84.240000,228.800000,364.000000,2953.600000,0.000000,84.240000,228.800000,364.000000,2953.600000,0.000000
1,2018,110.5,22.2,16.866667,283.958333,1.645833,16.738754,279.960286,1.624583,137.744444,394.258333,589.279167,4789.430556,27.759722,136.710347,383.586177,575.245966,4686.186315,27.193498
2,2018,111.5,21.3,18.133333,294.784127,1.896032,17.954692,290.140906,1.905838,142.044444,447.360847,682.057989,5345.418836,34.381376,139.134408,438.133227,672.353585,5209.390555,34.218733
3,2018,48.0,20.7,24.100000,242.714286,2.714286,24.100000,242.714286,2.714286,200.030000,433.800000,702.342857,5849.414286,65.414286,200.030000,433.800000,702.342857,5849.414286,65.414286
4,2018,97.0,20.5,19.600000,275.690476,2.104762,19.754416,276.887401,2.098226,139.160000,452.853333,766.266667,5403.533333,41.253333,142.628876,456.291952,762.942597,5469.748795,41.449224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,2022,139.0,15.2,17.033333,226.786356,1.536356,16.780566,222.097078,1.529058,111.284444,363.990087,594.503690,3862.927601,26.169267,110.187107,348.645527,570.771377,3726.914637,25.658459
216,2022,103.0,12.9,19.566667,285.012255,2.137255,19.505221,282.763062,2.151374,159.142222,445.645221,683.058905,5576.739788,41.818954,157.702040,445.708791,679.379521,5515.355886,41.963034
217,2022,186.5,15.1,22.850000,279.118750,1.897917,22.961522,282.948042,1.926155,190.797500,526.406875,765.475000,6377.863438,43.367396,193.895478,532.275465,771.180787,6496.917798,44.227444
218,2022,187.0,16.3,12.933333,168.401786,1.354167,12.742593,154.073814,1.373404,96.137778,168.441270,279.683333,2177.996429,17.513889,93.577760,154.057632,256.958640,1963.299947,17.500733


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [15]:
X = model_df.drop(["FPTS_TG", "Season"], axis=1)
y = model_df[["FPTS_TG"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lr = LinearRegression()
sc = preprocessing.StandardScaler()

X_train_sc = sc.fit_transform(X_train)

X_test_sc = sc.transform(X_test)

lr.fit(X_train_sc, y_train)
y_pred = lr.predict(X_test_sc)

In [16]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [17]:
def evaluate(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print("mae: ", mae)
    print("mse: ", mse)
    print("r2: ", r2)

In [18]:
evaluate(y_test, y_pred)

mae:  3.824538027698082
mse:  26.867818123053603
r2:  0.4194781508590467


In [19]:
X_22 = model_df[model_df.Season == 2022].drop(["FPTS_TG", "Season"], axis=1)
mdd = df[df.Season == 2022][["FPTS_TG", "Player"]].copy()
mdd["comp"] = lr.predict(sc.transform(X_22))

evaluate(mdd.FPTS_TG, mdd.comp)

mae:  3.0247652384830377
mse:  15.430294619276449
r2:  0.18748972307477352


In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
random_grid = {'n_estimators': [300, 500, 700],
               'max_features': ["sqrt"],
               'max_depth': [2, 5, 10, 20],
               'min_samples_split': [2, 5, 10],
              }

In [23]:
rf = RandomForestRegressor()

In [24]:
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,n_iter = 100, cv = 5, verbose=99, random_state=21, n_jobs = -1)

In [25]:
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 36 candidates, totalling 180 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [26]:
rf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 5,
 'max_features': 'sqrt',
 'max_depth': 10}

In [27]:
rf_opt = RandomForestRegressor(max_depth=20, max_features="sqrt", min_samples_split = 5, n_estimators=700, random_state=0)

In [28]:
rf_opt.fit(X_train, y_train)

  rf_opt.fit(X_train, y_train)


In [29]:
y_pred = rf_opt.predict(X_test)

In [30]:
evaluate(y_train, rf_opt.predict(X_train))

mae:  1.7034679548681035
mse:  5.911226665113667
r2:  0.8844804383471017


In [31]:
X_22 = model_df[model_df.Season == 2022].drop(["FPTS_TG", "Season"], axis=1)
mdd = df[df.Season == 2022][["FPTS_TG", "Player"]].copy()
mdd["comp"] = rf_opt.predict((X_22))

evaluate(mdd.FPTS_TG, mdd.comp)

mae:  1.9265991569277279
mse:  6.009878405916277
r2:  0.6835389026352272


In [32]:
mdd

Unnamed: 0,FPTS_TG,Player,comp
192,25.2,Patrick Mahomes II (KC),22.517688
193,24.3,Josh Allen (BUF),22.747505
194,25.6,Jalen Hurts (PHI),21.303472
195,21.7,Joe Burrow (CIN),20.898445
196,20.5,Justin Fields (CHI),16.811231
197,18.0,Kirk Cousins (MIN),17.29584
198,17.9,Trevor Lawrence (JAC),17.267422
199,18.4,Daniel Jones (NYG),13.694272
200,17.1,Jared Goff (DET),16.948338
201,17.1,Justin Herbert (LAC),19.768614
