In [15]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
from tqdm import tqdm

In [16]:
# Load the pre-processed data set. The path is a plain string literal (the
# original used an f-string with no placeholders).
df = pd.read_csv("processed data/wr_proc_data.csv")

In [17]:
# Keep only the rows whose "Rookie" flag equals 0.
df = df.loc[df["Rookie"].eq(0)]

In [18]:
# Number of remaining rows per season.
df["Season"].value_counts()

2018    128
2021    123
2019    117
2020    101
2022     73
Name: Season, dtype: int64

In [19]:
# Pairwise Pearson correlations between the numeric columns of df.
corrs = df.corr(method="pearson", numeric_only=True)

In [20]:
# Correlation of every numeric column with the target column FPTS_TG.
corrs.loc[:, ["FPTS_TG"]]

Unnamed: 0,FPTS_TG
pid,-0.028053
Season,0.10732
AVG,-0.715617
FPTS_TG,1.0
FPTS/G_MISC_avg,0.67611
ATT_RUSHING/G_avg,-0.042159
YDS_RUSHING/G_avg,0.017107
TD_RUSHING/G_avg,0.023284
REC_RECEIVING/G_avg,0.638836
TGT_RECEIVING/G_avg,0.617874


In [21]:
# Candidate columns: "Season" first, then every row label of the correlation
# table whose correlation with FPTS_TG is stronger than the cutoff in either
# direction, skipping labels that contain "_ls".
# FPTS_TG itself (correlation 1.0) passes the cutoff and is kept; later cells
# slice this list by position.
REL_CORR_CUTOFF = 0.4
rel_cols = ["Season"] + [
    label
    for label, strength in corrs["FPTS_TG"].items()
    # "Season" is already seeded above; appending it again would create a
    # duplicate column label and break single-column lookups downstream.
    if label != "Season" and abs(strength) > REL_CORR_CUTOFF and "_ls" not in label
]

In [22]:
# Work on an independent copy: the next cell adds columns to rel_df, and
# assigning into a bare slice of df is only silent because the
# chained-assignment warning is disabled at the top of the notebook.
rel_df = df[rel_cols].copy()

In [23]:
# Display the column names selected by the correlation cutoff.
rel_cols

['Season',
 'AVG',
 'FPTS_TG',
 'FPTS/G_MISC_avg',
 'REC_RECEIVING/G_avg',
 'TGT_RECEIVING/G_avg',
 'YDS_RECEIVING/G_avg',
 'TD_RECEIVING/G_avg',
 'FPTS/G_MISC_ewm',
 'REC_RECEIVING/G_ewm',
 'TGT_RECEIVING/G_ewm',
 'YDS_RECEIVING/G_ewm',
 'TD_RECEIVING/G_ewm']

In [24]:
# Ratio features: each candidate column divided by the "AVG" column, stored
# under "<column>_ADP".
# rel_cols[2:] skips "Season" and "AVG" but begins at the target, so FPTS_TG is
# filtered out explicitly -- a ratio built from the target would leak it into
# the correlation screening that follows.
adp_ratios = {
    f"{col}_ADP": rel_df[col] / rel_df["AVG"]
    for col in rel_cols[2:]
    if col != "FPTS_TG"
}
# assign() returns a new frame, so nothing is written into a slice of df.
rel_df = rel_df.assign(**adp_ratios)

In [25]:
# Recompute the Pearson correlation table, now including the "_ADP" ratios.
corrs = rel_df.corr(method="pearson")

In [26]:
# Correlation of each column (including the ratio features) with FPTS_TG.
corrs.loc[:, ["FPTS_TG"]]

Unnamed: 0,FPTS_TG
Season,0.10732
AVG,-0.715617
FPTS_TG,1.0
FPTS/G_MISC_avg,0.67611
REC_RECEIVING/G_avg,0.638836
TGT_RECEIVING/G_avg,0.617874
YDS_RECEIVING/G_avg,0.668811
TD_RECEIVING/G_avg,0.569134
FPTS/G_MISC_ewm,0.690826
REC_RECEIVING/G_ewm,0.652924


In [27]:
# Rebuild rel_df from df, which discards any columns added to the previous
# rel_df. Take an explicit copy because the next cell adds columns to it.
rel_df = df[rel_cols].copy()

In [28]:
# Interaction features: multiply each non-FPTS column by the fantasy-points
# column that shares its suffix ("_avg" or "_ewm") and store the product under
# "<column>_FP".
fp_products = {}
for col in rel_df.columns[2:]:
    if "FPTS" in col:
        # Skips the target and the FPTS/G_MISC_* baselines themselves.
        continue
    if "_avg" in col:
        baseline = "FPTS/G_MISC_avg"
    elif "_ewm" in col:
        baseline = "FPTS/G_MISC_ewm"
    else:
        # No matching baseline. The original fell through here and either
        # reused the previous iteration's product or raised NameError.
        continue
    fp_products[f"{col}_FP"] = rel_df[baseline] * rel_df[col]

# assign() returns a new frame, so nothing is written into a slice of df.
rel_df = rel_df.assign(**fp_products)

In [29]:
# Recompute the Pearson correlation table, now including the "_FP" products.
corrs = rel_df.corr(method="pearson")

In [30]:
# Correlation of each column (including the interaction features) with FPTS_TG.
corrs.loc[:, ["FPTS_TG"]]

Unnamed: 0,FPTS_TG
Season,0.10732
AVG,-0.715617
FPTS_TG,1.0
FPTS/G_MISC_avg,0.67611
REC_RECEIVING/G_avg,0.638836
TGT_RECEIVING/G_avg,0.617874
YDS_RECEIVING/G_avg,0.668811
TD_RECEIVING/G_avg,0.569134
FPTS/G_MISC_ewm,0.690826
REC_RECEIVING/G_ewm,0.652924


In [31]:
# Modelling columns: "Season" first, then every row label of the correlation
# table whose correlation with FPTS_TG is stronger than the cutoff in either
# direction. FPTS_TG itself (correlation 1.0) is part of the list.
MODEL_CORR_CUTOFF = 0.6
model_cols = ["Season"] + [
    label
    for label, strength in corrs["FPTS_TG"].items()
    # "Season" is already seeded above; appending it again would create a
    # duplicate column label and break the `Season == 2022` filters later on.
    if label != "Season" and abs(strength) > MODEL_CORR_CUTOFF
]

In [32]:
# Display the column names that passed the stricter 0.6 cutoff.
model_cols

['Season',
 'AVG',
 'FPTS_TG',
 'FPTS/G_MISC_avg',
 'REC_RECEIVING/G_avg',
 'TGT_RECEIVING/G_avg',
 'YDS_RECEIVING/G_avg',
 'FPTS/G_MISC_ewm',
 'REC_RECEIVING/G_ewm',
 'TGT_RECEIVING/G_ewm',
 'YDS_RECEIVING/G_ewm',
 'REC_RECEIVING/G_avg_FP',
 'TGT_RECEIVING/G_avg_FP',
 'YDS_RECEIVING/G_avg_FP',
 'TD_RECEIVING/G_avg_FP',
 'REC_RECEIVING/G_ewm_FP',
 'TGT_RECEIVING/G_ewm_FP',
 'YDS_RECEIVING/G_ewm_FP',
 'TD_RECEIVING/G_ewm_FP']

In [33]:
import pickle

# Pickle the list of modelling column names to disk.
# Note: the list as built above still contains "Season" and "FPTS_TG".
with open("model features/wr_feats", "wb") as feats_file:
    pickle.dump(model_cols, feats_file)

In [34]:
# Restrict the feature frame to the columns chosen for modelling.
model_df = rel_df.loc[:, model_cols]

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [36]:
# Target is kept as a one-column DataFrame; the features are every modelling
# column except the target and the season label.
y = model_df[["FPTS_TG"]]
X = model_df.drop(columns=["FPTS_TG", "Season"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardise with statistics fitted on the training split only, then apply
# the same transformation to the test split.
sc = preprocessing.StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

# Ordinary least squares on the scaled features.
lr = LinearRegression()
lr.fit(X_train_sc, y_train)
y_pred = lr.predict(X_test_sc)

In [37]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [38]:
def evaluate(y_test, y_pred):
    """Print MAE, MSE and R^2 for predictions against the true values.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.

    Returns:
        None. The three scores are written to stdout.
    """
    scorers = (
        ("mae: ", mean_absolute_error),
        ("mse: ", mean_squared_error),
        ("r2: ", r2_score),
    )
    # Compute every score before printing, so a failing metric prints nothing.
    results = [(label, scorer(y_test, y_pred)) for label, scorer in scorers]
    for label, score in results:
        print(label, score)

In [39]:
# Scores of the linear model on the random test split.
evaluate(y_test=y_test, y_pred=y_pred)

mae:  2.418968325006274
mse:  10.68335051758811
r2:  0.5257893431326928


In [40]:
# Score the linear model on the 2022 season only.
# NOTE(review): the train/test split above is random across all seasons, so
# some 2022 rows were probably used for fitting -- this is not a clean hold-out.
is_2022 = model_df["Season"] == 2022
X_22 = model_df.loc[is_2022].drop(columns=["FPTS_TG", "Season"])
mdd = df.loc[df["Season"] == 2022, ["FPTS_TG", "Player"]].copy()
mdd["comp"] = lr.predict(sc.transform(X_22))

evaluate(mdd["FPTS_TG"], mdd["comp"])

mae:  1.8011197607726082
mse:  5.041973192301104
r2:  0.6481234357710106


In [41]:
from sklearn.ensemble import RandomForestRegressor

In [50]:
from sklearn.model_selection import RandomizedSearchCV

In [51]:
# Hyper-parameter candidates for the random-forest search.
random_grid = dict(
    n_estimators=[300, 500, 700],
    max_features=["sqrt"],
    max_depth=[2, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)

In [52]:
# Base estimator for the hyper-parameter search. Seeded so that the search
# result is reproducible across runs (same seed as the final model, rf_opt).
rf = RandomForestRegressor(random_state=0)

In [53]:
# The grid holds only lists, so the search space is finite. Asking for more
# iterations than there are combinations makes scikit-learn warn and fall back
# to the grid size, so cap n_iter explicitly.
grid_size = int(np.prod([len(options) for options in random_grid.values()]))
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=min(100, grid_size),
    cv=5,
    verbose=99,
    random_state=21,
    n_jobs=-1,
)

In [54]:
# y_train is a one-column frame; flatten it to 1-D so the forest does not emit
# a column-vector DataConversionWarning during the search and the final refit.
rf_random.fit(X_train, np.ravel(y_train))



Fitting 5 folds for each of 36 candidates, totalling 180 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [55]:
# Hyper-parameter combination with the best cross-validated score.
rf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 5,
 'max_features': 'sqrt',
 'max_depth': 5}

In [56]:
# Final forest, using the hyper-parameter values reported by the search above.
rf_opt = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_split=5,
    max_features="sqrt",
    random_state=0,
)

In [57]:
# y_train is a one-column frame; flatten it to 1-D so the forest does not emit
# a column-vector DataConversionWarning.
rf_opt.fit(X_train, np.ravel(y_train))

  rf_opt.fit(X_train, y_train)


In [58]:
# Replaces the linear model's y_pred with the forest's test-split predictions.
y_pred = rf_opt.predict(X=X_test)

In [59]:
# Training-split scores, for comparison with the test-split scores.
train_pred = rf_opt.predict(X_train)
evaluate(y_train, train_pred)

mae:  1.8890061760921162
mse:  5.598100376981032
r2:  0.7161771318513714


In [60]:
# Scores of the random forest on the random test split.
evaluate(y_test=y_test, y_pred=y_pred)

mae:  2.232325274756466
mse:  8.501854077671544
r2:  0.6226212179292221


In [61]:
# Score the random forest on the 2022 season only (unscaled features, matching
# how rf_opt was fitted).
# NOTE(review): the train/test split is random across all seasons, so some 2022
# rows were probably used for fitting -- this is not a clean hold-out.
season_mask = model_df["Season"].eq(2022)
X_22 = model_df[season_mask].drop(columns=["FPTS_TG", "Season"])
mdd = df.loc[df["Season"].eq(2022), ["FPTS_TG", "Player"]].copy()
mdd["comp"] = rf_opt.predict(X_22)

evaluate(mdd["FPTS_TG"], mdd["comp"])

mae:  1.7143629873458839
mse:  4.253691517369805
r2:  0.7031371847221244


In [62]:
# Show actual FPTS_TG next to the forest's prediction ("comp") for each 2022 row.
mdd

Unnamed: 0,FPTS_TG,Player,comp
560,17.9,Justin Jefferson (MIN),16.002079
561,16.8,Davante Adams (LV),16.539148
562,16.6,Tyreek Hill (MIA),13.914949
563,15.7,Stefon Diggs (BUF),14.923685
564,15.0,A.J. Brown (PHI),12.598064
...,...,...,...
637,3.5,Robbie Chosen (MIA),5.111030
639,4.3,Sammy Watkins (FA),6.660404
642,3.2,KJ Hamler (FA),5.798845
643,2.9,Kenny Golladay (FA),5.933813
