In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

In [10]:
def create_features(rel_df, pos):
    """Add fantasy-point interaction columns to ``rel_df`` (in place).

    Every column from the fifth one onward whose name does not contain
    ``"FPTS"`` is considered:

    * if the name contains ``"_avg"``, it is multiplied by ``FPTS/G_MISC_avg``;
    * otherwise, if the name contains ``"_ewm"``, it is multiplied by
      ``FPTS/G_MISC_ewm``;
    * otherwise the column is skipped.

    The product is stored under ``<original name>_FP``.

    Parameters
    ----------
    rel_df : pandas.DataFrame
        Frame to extend. Its first four columns are never used as inputs.
        It must contain ``FPTS/G_MISC_avg`` / ``FPTS/G_MISC_ewm`` whenever a
        matching ``_avg`` / ``_ewm`` column is present.
    pos : str
        Position code. Features are only generated for ``"rb"``, ``"wr"``,
        ``"qb"`` and ``"te"``; for any other value the frame is returned
        untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    if pos not in ("rb", "wr", "qb", "te"):
        return rel_df

    # Snapshot the candidate columns first: new columns are appended to the
    # frame while we iterate.
    for col in list(rel_df.columns[4:]):
        if "FPTS" in col:
            continue

        if "_avg" in col:
            weight_col = "FPTS/G_MISC_avg"
        elif "_ewm" in col:
            weight_col = "FPTS/G_MISC_ewm"
        else:
            # Neither marker present: there is no matching weight. Previously
            # this fell through and either raised UnboundLocalError or wrote
            # the product left over from the preceding column.
            continue

        rel_df[col + "_FP"] = rel_df[weight_col] * rel_df[col]

    return rel_df

In [22]:
def train(pos):
    """Fit the random-forest regressor for one position on the processed data.

    Reads ``processed data/{pos}_proc_data.csv``, keeps the rows with
    ``Rookie == 0``, drops the ``Rookie`` and ``Career_Years`` columns, adds
    the ``*_FP`` interaction columns via ``create_features`` and restricts the
    frame to the column list unpickled from ``model features/{pos}_feats``.
    The model is fitted on every selected column except ``FPTS_TG`` (the
    target) and ``Season``.

    Parameters
    ----------
    pos : str
        Position code; one of ``"wr"``, ``"rb"``, ``"qb"``, ``"te"``.

    Returns
    -------
    tuple
        ``(model, scaler)`` where ``model`` is the fitted
        ``RandomForestRegressor`` and ``scaler`` is a ``StandardScaler`` that
        has NOT been fitted (feature scaling is not applied here; the object
        is returned only to keep the two-value return shape).

    Raises
    ------
    ValueError
        If ``pos`` has no hyper-parameter entry.
    """
    # Hyper-parameters per position. max_features="sqrt" and random_state=0
    # are shared by all four models and applied below.
    rf_params = {
        "wr": dict(max_depth=5, min_samples_split=5, n_estimators=300),
        "rb": dict(max_depth=20, min_samples_split=2, n_estimators=300),
        "qb": dict(max_depth=20, min_samples_split=5, n_estimators=700),
        "te": dict(max_depth=20, min_samples_split=10, n_estimators=300),
    }
    if pos not in rf_params:
        # Fail before any I/O instead of hitting an unbound `rf_opt` at fit time.
        raise ValueError(
            f"unsupported position {pos!r}; expected one of {sorted(rf_params)}"
        )

    df = pd.read_csv(f"processed data/{pos}_proc_data.csv")
    df = df[df["Rookie"] == 0]
    df = df.drop(["Rookie", "Career_Years"], axis=1)
    df_feats = create_features(df, pos)

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # feature lists produced by this project.
    with open(f"model features/{pos}_feats", "rb") as fp:
        model_cols = pickle.load(fp)

    model_df = df_feats[model_cols]

    X = model_df.drop(["FPTS_TG", "Season"], axis=1)
    y = model_df[["FPTS_TG"]]

    rf_opt = RandomForestRegressor(
        max_features="sqrt", random_state=0, **rf_params[pos]
    )

    # Created but intentionally left unfitted: X is passed to the model unscaled.
    sc = preprocessing.StandardScaler()

    # sklearn expects a 1-D target, hence the ravel().
    rf_opt.fit(X, y.values.ravel())

    return rf_opt, sc

In [31]:
def predict_ns(pos, model, sc):
    """Score the next-season frame for one position and build its board.

    Reads ``processed data/{pos}_ns_proc.csv``, keeps rows with
    ``Rookie == 0`` and a non-missing ``YDS_RUSHING/G_avg``, drops the
    ``Rookie`` and ``Career_Years`` columns, adds the ``*_FP`` interaction
    columns via ``create_features`` and predicts with ``model`` on the columns
    listed in ``model features/{pos}_feats`` (minus ``FPTS_TG`` and
    ``Season``).

    Parameters
    ----------
    pos : str
        Position code used to build both file paths.
    model : object
        Fitted estimator exposing ``predict``.
    sc : object
        Unused. Kept so existing callers that pass the scaler returned by
        ``train`` keep working; no scaling is applied to the features.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid``, ``Player``, ``Season``, ``AVG``, ``FPTS_TG_pred``
        followed by every model column after the first three.
    """
    df = pd.read_csv(f"processed data/{pos}_ns_proc.csv")

    keep = (df["Rookie"] == 0) & df["YDS_RUSHING/G_avg"].notna()
    # Explicit copy: columns are assigned on this frame below (and inside
    # create_features); assigning on a boolean-mask slice raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df = df[keep].drop(["Rookie", "Career_Years"], axis=1).copy()

    df_feats = create_features(df, pos)

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # feature lists produced by this project.
    with open(f"model features/{pos}_feats", "rb") as fp:
        model_cols = pickle.load(fp)

    X = df_feats[model_cols].drop(["FPTS_TG", "Season"], axis=1)
    df_feats["FPTS_TG_pred"] = model.predict(X)

    # The first three model columns are left out of the board — presumably
    # they are non-feature columns such as Season / FPTS_TG; TODO confirm
    # against the pickled list.
    board_cols = ["pid", "Player", "Season", "AVG", "FPTS_TG_pred"] + list(model_cols[3:])
    return df_feats[board_cols]

In [36]:
# Write the combined board to board.csv, omitting the DataFrame index.
# NOTE(review): `full_board` is assigned in the In [32] cell further down, so on
# a fresh kernel this cell raises NameError unless that cell has been run first.
full_board.to_csv("board.csv", index=False)

In [32]:
# Position codes to process, in the order their boards are stacked.
pos = ["wr", "rb", "te", "qb"]

# Collect one board per position and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies the accumulated rows on
# every pass and starts from an empty frame, which pandas has deprecated as a
# concat input.
boards = []

for p in tqdm(pos):
    model, scaler = train(p)

    pred_board = predict_ns(p, model, scaler)

    boards.append(pred_board)

# Row indexes are kept as produced by each position's board (no ignore_index),
# and columns are the union across positions in first-seen order.
full_board = pd.concat(boards)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.25it/s]


In [26]:
#pred_board.sort_values("FPTS_TG_pred", ascending=False).head(20)

In [27]:
#pred_board.sort_values("FPTS_TG_pred", ascending=False).head(20)

In [28]:
# Show the 20 rows with the smallest "AVG" values (ascending is the default).
full_board.sort_values(by="AVG").head(20)

Unnamed: 0,pid,Player,Season,AVG,FPTS_TG_pred,FPTS/G_MISC_avg,REC_RECEIVING/G_avg,TGT_RECEIVING/G_avg,YDS_RECEIVING/G_avg,FPTS/G_MISC_ewm,...,Y/A_PASSING_avg_FP,CMP_PASSING/G_avg_FP,ATT_PASSING/G_avg_FP,YDS_PASSING/G_avg_FP,TD_PASSING/G_avg_FP,Y/A_PASSING_ewm_FP,CMP_PASSING/G_ewm_FP,ATT_PASSING/G_ewm_FP,YDS_PASSING/G_ewm_FP,TD_PASSING/G_ewm_FP
0,19236,Justin Jefferson MIN (13),2023,1.3,15.177861,16.2,6.460784,9.48652,96.323529,16.594503,...,,,,,,,,,,
0,16393,Christian McCaffrey SF (9),2023,1.7,16.658,21.8,,,,20.425414,...,,,,,,,,,,
1,19788,Ja'Marr Chase CIN (7),2023,3.0,15.619821,15.4,5.728507,8.918552,83.024887,15.382843,...,,,,,,,,,,
1,16483,Austin Ekeler LAC (5),2023,4.0,19.344333,17.075,,,,17.63039,...,,,,,,,,,,
2,16433,Cooper Kupp LAR (10),2023,5.3,15.829172,16.866667,7.665359,10.130283,89.894989,17.61288,...,,,,,,,,,,
0,11594,Travis Kelce KC (10),2023,5.7,13.660302,15.466667,6.406863,,81.139461,15.276403,...,,,,,,,,,,
3,15802,Tyreek Hill MIA (10),2023,7.3,15.518191,16.6,6.443137,9.45098,86.179085,16.374793,...,,,,,,,,,,
2,17240,Saquon Barkley NYG (13),2023,8.3,13.138333,12.05,,,,12.31912,...,,,,,,,,,,
4,17246,Nick Chubb CLE (5),2023,11.3,15.572,15.475,,,,15.514382,...,,,,,,,,,,
4,13981,Stefon Diggs BUF (13),2023,11.3,16.067451,15.366667,6.822304,9.732843,84.528186,15.29517,...,,,,,,,,,,


In [37]:
import scipy.stats

In [69]:
# Upper-tail probability P(X > 300) for X ~ Normal(loc=11.3, scale=1.51).
# Uses the survival function instead of `1 - cdf(...)`: far in the tail the
# cdf rounds to exactly 1.0 and the subtraction loses all precision.
# NOTE(review): the saved output below (~4.17e-09) does not correspond to
# these arguments (300 is ~191 standard deviations above the mean, which
# evaluates to 0.0); re-run the cell to refresh it.
scipy.stats.norm(11.3, 1.51).sf(300)

4.166275324735125e-09