In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
pd.options.mode.chained_assignment = None  # default='warn'


In [2]:
df = pd.read_csv(f"processed data/rb_proc_data.csv")

In [3]:
df = df[df["Rookie"] == 0]

In [4]:
df.Season.value_counts()

2018    96
2021    95
2019    86
2020    84
2022    59
Name: Season, dtype: int64

In [5]:
corrs = df.corr()

  corrs = df.corr()


In [6]:
rel_cols = ["Season"]
for i, vals in corrs[["FPTS_TG"]].iterrows():
    val = vals["FPTS_TG"]
    if (val > 0.17 or val < -0.17) and "_ls" not in i:
        rel_cols.append(i)
rel_cols.append("Career_Years")

In [7]:
rel_df = df[rel_cols]

In [8]:
for col in rel_cols[2:]:
    new_c = rel_df[col]/rel_df["AVG"]
    
    new_name = col+"_ADP"
    
    rel_df[new_name] = new_c

In [9]:
corrs = rel_df.corr()

In [10]:
corrs[["FPTS_TG"]]

Unnamed: 0,FPTS_TG
Season,0.096831
AVG,-0.722483
FPTS_TG,1.0
FPTS/G_MISC_avg,0.643941
ATT_RUSHING/G_avg,0.529244
YDS_RUSHING/G_avg,0.565597
TD_RUSHING/G_avg,0.547808
REC_RECEIVING/G_avg,0.475149
TGT_RECEIVING/G_avg,0.4688
YDS_RECEIVING/G_avg,0.486112


In [11]:
rel_df = df[rel_cols]

In [12]:
for col in rel_df.columns[2:]:
    if "FPTS" not in col:
        if "_avg" in col:
            new_c = rel_df["FPTS/G_MISC_avg"]*rel_df[col]
        elif "_ewm" in col:
            new_c = rel_df["FPTS/G_MISC_ewm"]*rel_df[col]
        
        new_name = col+"_FP"

        rel_df[new_name] = new_c

In [13]:
corrs = rel_df.corr()

In [14]:
corrs[["FPTS_TG"]]#.tail(30)

Unnamed: 0,FPTS_TG
Season,0.096831
AVG,-0.722483
FPTS_TG,1.0
FPTS/G_MISC_avg,0.643941
ATT_RUSHING/G_avg,0.529244
YDS_RUSHING/G_avg,0.565597
TD_RUSHING/G_avg,0.547808
REC_RECEIVING/G_avg,0.475149
TGT_RECEIVING/G_avg,0.4688
YDS_RECEIVING/G_avg,0.486112


In [15]:
model_cols = ["Season"]
for i, vals in corrs[["FPTS_TG"]].iterrows():
    val = vals["FPTS_TG"]
    if val > 0.55 or val < -0.55:
        model_cols.append(i)

In [16]:
import pickle
with open("model features/rb_feats", "wb") as fp:   #Pickling
    pickle.dump(model_cols, fp)

In [17]:
model_cols

['Season',
 'AVG',
 'FPTS_TG',
 'FPTS/G_MISC_avg',
 'YDS_RUSHING/G_avg',
 'FPTS/G_MISC_ewm',
 'YDS_RUSHING/G_ewm',
 'TD_RUSHING/G_ewm',
 'ATT_RUSHING/G_avg_FP',
 'YDS_RUSHING/G_avg_FP',
 'TD_RUSHING/G_avg_FP',
 'REC_RECEIVING/G_avg_FP',
 'TGT_RECEIVING/G_avg_FP',
 'YDS_RECEIVING/G_avg_FP',
 'YDS/ATT_RUSHING_avg_FP',
 'ATT_RUSHING/G_ewm_FP',
 'YDS_RUSHING/G_ewm_FP',
 'TD_RUSHING/G_ewm_FP',
 'REC_RECEIVING/G_ewm_FP',
 'TGT_RECEIVING/G_ewm_FP',
 'YDS_RECEIVING/G_ewm_FP',
 'YDS/ATT_RUSHING_ewm_FP',
 'Career_Years_FP']

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
model_df = rel_df[model_cols]

In [20]:
model_df.columns

Index(['Season', 'AVG', 'FPTS_TG', 'FPTS/G_MISC_avg', 'YDS_RUSHING/G_avg',
       'FPTS/G_MISC_ewm', 'YDS_RUSHING/G_ewm', 'TD_RUSHING/G_ewm',
       'ATT_RUSHING/G_avg_FP', 'YDS_RUSHING/G_avg_FP', 'TD_RUSHING/G_avg_FP',
       'REC_RECEIVING/G_avg_FP', 'TGT_RECEIVING/G_avg_FP',
       'YDS_RECEIVING/G_avg_FP', 'YDS/ATT_RUSHING_avg_FP',
       'ATT_RUSHING/G_ewm_FP', 'YDS_RUSHING/G_ewm_FP', 'TD_RUSHING/G_ewm_FP',
       'REC_RECEIVING/G_ewm_FP', 'TGT_RECEIVING/G_ewm_FP',
       'YDS_RECEIVING/G_ewm_FP', 'YDS/ATT_RUSHING_ewm_FP', 'Career_Years_FP'],
      dtype='object')

In [21]:
#plt.figure(figsize=(15, 10))
#sns.regplot(model_df["AVG"], model_df["FPTS_TG"])

In [22]:
#plt.figure(figsize=(15, 10))
#sns.regplot(model_df["FPTS_TG"], model_df["REC_RECEIVING_avg_FP"])

In [23]:

plt.figure(figsize=(15, 10))
#sns.regplot(model_df["FPTS_TG"], model_df["REC_RECEIVING/G_ls_FP"])

<Figure size 1500x1000 with 0 Axes>

<Figure size 1500x1000 with 0 Axes>

In [24]:
model_df = model_df.fillna(0)

In [25]:
# Linear Regression

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [27]:
X = model_df.drop(["FPTS_TG", "Season"], axis=1)
y = model_df[["FPTS_TG"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lr = LinearRegression()
sc = preprocessing.StandardScaler()

X_train_sc = sc.fit_transform(X_train)

X_test_sc = sc.transform(X_test)

lr.fit(X_train_sc, y_train)
y_pred = lr.predict(X_test_sc)

In [28]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [29]:
def evaluate(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print("mae: ", mae)
    print("mse: ", mse)
    print("r2: ", r2)

In [30]:
evaluate(y_test, y_pred)

mae:  2.885988602756518
mse:  14.000469248783693
r2:  0.574638755523613


In [31]:
X_22 = model_df[model_df.Season == 2022].drop(["FPTS_TG", "Season"], axis=1)
mdd = df[df.Season == 2022][["FPTS_TG", "Player"]].copy()
mdd["comp"] = lr.predict(sc.transform(X_22))

evaluate(mdd.FPTS_TG, mdd.comp)

mae:  2.3624924713727964
mse:  8.683329365983994
r2:  0.49586322923604464


In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
rf = RandomForestRegressor(max_depth=2, random_state=0)

In [34]:
random_grid = {'n_estimators': [200, 300, 400],
               'max_features': ["sqrt"],
               'max_depth': [2, 5, 10, 20],
               'min_samples_split': [3, 5, 10, 15],
              }

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [36]:
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,n_iter = 100, cv = 5, verbose=99, random_state=21, n_jobs = -1)

In [37]:
rf_random.fit(X_train, y_train)



Fitting 5 folds for each of 48 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [38]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'max_features': 'sqrt',
 'max_depth': 10}

In [39]:
rf_opt = RandomForestRegressor(max_depth=10, max_features="sqrt", min_samples_split = 5, n_estimators=400, random_state=0)

In [40]:
rf_opt.fit(X_train, y_train)

  rf_opt.fit(X_train, y_train)


In [41]:
y_pred = rf_opt.predict(X_test)

In [42]:
evaluate(y_train, rf_opt.predict(X_train))

mae:  1.2602867600571739
mse:  2.7456700681547677
r2:  0.8998354815172858


In [43]:
evaluate(y_test, y_pred)

mae:  2.7639209168940972
mse:  14.426460486930342
r2:  0.5616963205255976


In [45]:
X_22 = model_df[model_df.Season == 2022].drop(["FPTS_TG", "Season"], axis=1)
mdd = df[df.Season == 2022][["FPTS_TG", "Player", "pid"]].copy()
mdd["comp"] = rf_opt.predict((X_22))

evaluate(mdd.FPTS_TG, mdd.comp)

mae:  1.33620977830087
mse:  3.6921127548902177
r2:  0.7856433030355472


In [47]:
mdd.to_csv("projected data/base_rbs.csv", index=False)

In [104]:
df[df["pid"] == 16483][["pid", "Player", "FPTS/G_MISC_avg", "FPTS/G_MISC_ewm"]]

Unnamed: 0,pid,Player,FPTS/G_MISC_avg,FPTS/G_MISC_ewm
23,16483,Austin Ekeler (LAC),5.8,5.8
122,16483,Austin Ekeler (LAC),8.25,8.670354
252,16483,Austin Ekeler (LAC),10.966667,12.172516
326,16483,Austin Ekeler (LAC),13.633333,13.930704
440,16483,Austin Ekeler (LAC),16.5,16.880957


In [43]:
df[df["pid"] == 16393][["pid", "Player", "FPTS/G_MISC_avg", "FPTS/G_MISC_ewm"]]

Unnamed: 0,pid,Player,FPTS/G_MISC_avg,FPTS/G_MISC_ewm
2,16393,Christian McCaffrey (SF),11.8,11.8
117,16393,Christian McCaffrey (SF),16.3,17.072078
275,16393,Christian McCaffrey (SF),19.466667,21.026541
362,16393,Christian McCaffrey (SF),24.633333,25.346918
441,16393,Christian McCaffrey (SF),22.9,21.659131
