In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
from tqdm import tqdm

In [2]:
df = pd.read_csv(f"processed data/te_proc_data.csv")

In [3]:
df = df[df["Rookie"] == 0]

In [4]:
corrs = df.corr(numeric_only=True)
corrs[["FPTS_TG"]]

Unnamed: 0,FPTS_TG
pid,-0.001598
Season,0.190213
AVG,-0.648411
FPTS_TG,1.0
FPTS/G_MISC_avg,0.640463
ATT_RUSHING/G_avg,-0.010697
YDS_RUSHING/G_avg,-0.004092
TD_RUSHING/G_avg,0.017059
REC_RECEIVING/G_avg,0.55785
TGT_RECEIVING/G_avg,0.538935


In [5]:
rel_cols = ["Season"]
for i, vals in corrs[["FPTS_TG"]].iterrows():
    val = vals["FPTS_TG"]
    if (val > 0.4 or val < -0.4) and "_ls" not in i:
        rel_cols.append(i)

In [6]:
rel_df = df[rel_cols]

In [7]:
for col in rel_df.columns[2:]:
    if "FPTS" not in col:
        if "_avg" in col:
            new_c = rel_df["FPTS/G_MISC_avg"]*rel_df[col]
        elif "_ewm" in col:
            new_c = rel_df["FPTS/G_MISC_ewm"]*rel_df[col]
        
        new_name = col+"_FP"

        rel_df[new_name] = new_c

In [8]:
corrs = rel_df.corr()
corrs[["FPTS_TG"]]

Unnamed: 0,FPTS_TG
Season,0.190213
AVG,-0.648411
FPTS_TG,1.0
FPTS/G_MISC_avg,0.640463
REC_RECEIVING/G_avg,0.55785
TGT_RECEIVING/G_avg,0.538935
YDS_RECEIVING/G_avg,0.606269
TD_RECEIVING/G_avg,0.419335
FPTS/G_MISC_ewm,0.662162
REC_RECEIVING/G_ewm,0.58005


In [9]:
model_cols = ["Season"]
for i, vals in corrs[["FPTS_TG"]].iterrows():
    val = vals["FPTS_TG"]
    if val > 0.55 or val < -0.55:
        model_cols.append(i)

In [10]:
model_cols

['Season',
 'AVG',
 'FPTS_TG',
 'FPTS/G_MISC_avg',
 'REC_RECEIVING/G_avg',
 'YDS_RECEIVING/G_avg',
 'FPTS/G_MISC_ewm',
 'REC_RECEIVING/G_ewm',
 'TGT_RECEIVING/G_ewm',
 'YDS_RECEIVING/G_ewm',
 'REC_RECEIVING/G_avg_FP',
 'TGT_RECEIVING/G_avg_FP',
 'YDS_RECEIVING/G_avg_FP',
 'REC_RECEIVING/G_ewm_FP',
 'TGT_RECEIVING/G_ewm_FP',
 'YDS_RECEIVING/G_ewm_FP',
 'TD_RECEIVING/G_ewm_FP']

In [11]:
import pickle
with open("model features/te_feats", "wb") as fp:   #Pickling
    pickle.dump(model_cols, fp)

In [12]:
model_df = rel_df[model_cols]

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [14]:
X = model_df.drop(["FPTS_TG", "Season"], axis=1)
y = model_df[["FPTS_TG"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lr = LinearRegression()
sc = preprocessing.StandardScaler()

X_train_sc = sc.fit_transform(X_train)

X_test_sc = sc.transform(X_test)

lr.fit(X_train_sc, y_train)
y_pred = lr.predict(X_test_sc)

In [15]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [16]:
def evaluate(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print("mae: ", mae)
    print("mse: ", mse)
    print("r2: ", r2)

In [17]:
evaluate(y_test, y_pred)

mae:  1.731131218465299
mse:  4.827488031453941
r2:  0.5755354568326221


In [18]:
X_22 = model_df[model_df.Season == 2022].drop(["FPTS_TG", "Season"], axis=1)
mdd = df[df.Season == 2022][["FPTS_TG", "Player"]].copy()
mdd["comp"] = lr.predict(sc.transform(X_22))

evaluate(mdd.FPTS_TG, mdd.comp)

mae:  1.74028136425031
mse:  4.601485068558475
r2:  0.4081061452601005


In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [21]:
random_grid = {'n_estimators': [300, 500, 700],
               'max_features': ["sqrt"],
               'max_depth': [2, 5, 10, 20],
               'min_samples_split': [2, 5, 10],
              }

In [22]:
rf = RandomForestRegressor()

In [23]:
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,n_iter = 100, cv = 5, verbose=99, random_state=21, n_jobs = -1)

In [24]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [25]:
rf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 10,
 'max_features': 'sqrt',
 'max_depth': 10}

In [26]:
rf_opt = RandomForestRegressor(max_depth=20, max_features="sqrt", min_samples_split = 10, n_estimators=300, random_state=0)
rf_opt.fit(X_train, y_train)

  rf_opt.fit(X_train, y_train)


In [27]:
y_pred = rf_opt.predict(X_test)

In [28]:
evaluate(y_train, rf_opt.predict(X_train))

mae:  1.2965943821170698
mse:  2.7478401567547017
r2:  0.7849981759505931


In [29]:
X_22 = model_df[model_df.Season == 2022].drop(["FPTS_TG", "Season"], axis=1)
mdd = df[df.Season == 2022][["FPTS_TG", "Player"]].copy()
mdd["comp"] = rf_opt.predict((X_22))

evaluate(mdd.FPTS_TG, mdd.comp)

mae:  1.4119440232268943
mse:  3.2349904783610386
r2:  0.5838797788636888


In [30]:
mdd

Unnamed: 0,FPTS_TG,Player,comp
244,15.4,Travis Kelce (KC),13.569053
245,10.1,T.J. Hockenson (MIN),7.188012
246,11.4,George Kittle (SF),11.419083
247,10.3,Mark Andrews (BAL),11.142138
248,8.8,Taysom Hill (NO),3.91581
249,8.3,Evan Engram (JAC),6.237784
250,7.2,Cole Kmet (CHI),6.715887
251,7.8,Pat Freiermuth (PIT),7.591573
252,7.3,Tyler Higbee (LAR),6.208232
253,7.6,Dalton Schultz (HOU),8.228522
