In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
from tqdm import tqdm

In [2]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

In [3]:
def create_features(rel_df, pos):
    """Add derived feature columns to ``rel_df`` and return it.

    Only columns from positional index 4 onward are used as feature sources
    (the first four columns are left untouched). The frame is modified in
    place; the returned object is the same frame that was passed in.

    Two families of columns are added:

    * ``<col>_ADP`` (only when ``pos == "wr"``): every non-object column whose
      name does not contain ``"FPTS_TG"``, divided by the ``"AVG"`` column.
    * ``<col>_FP`` (for ``"rb"``, ``"wr"``, ``"qb"``, ``"te"``): every column
      whose name contains neither ``"FPTS"`` nor ``"_ADP"``, multiplied by
      ``"FPTS/G_MISC_avg"`` when the name contains ``"_avg"`` or by
      ``"FPTS/G_MISC_ewm"`` when it contains ``"_ewm"``.

    Parameters
    ----------
    rel_df : pandas.DataFrame
        Input frame. Must contain ``"AVG"`` when ``pos == "wr"``, and the
        matching ``"FPTS/G_MISC_avg"`` / ``"FPTS/G_MISC_ewm"`` column whenever
        an eligible ``_avg`` / ``_ewm`` column is present.
    pos : str
        Position code. Any value other than the four listed above returns
        the frame unchanged.

    Returns
    -------
    pandas.DataFrame
        ``rel_df`` itself, with the new columns appended.
    """
    if pos == "wr":
        # ``rel_df.columns[4:]`` is evaluated once, so the columns appended
        # inside the loop are not revisited.
        for col in rel_df.columns[4:]:
            if "FPTS_TG" not in col and rel_df[col].dtype != "O":
                rel_df[col + "_ADP"] = rel_df[col] / rel_df["AVG"]

    if pos in ("rb", "wr", "qb", "te"):
        for col in rel_df.columns[4:]:
            if "FPTS" in col or "_ADP" in col:
                continue

            if "_avg" in col:
                scale = rel_df["FPTS/G_MISC_avg"]
            elif "_ewm" in col:
                scale = rel_df["FPTS/G_MISC_ewm"]
            else:
                # Columns with neither suffix (identifiers, labels, ...) have
                # no matching fantasy-point series. Previously they either
                # raised UnboundLocalError or silently reused the product
                # computed for the preceding column.
                continue

            rel_df[col + "_FP"] = scale * rel_df[col]
    return rel_df

In [4]:
# Per-position RandomForestRegressor settings. ``max_features="sqrt"`` and
# ``random_state=0`` are shared by every position and applied in ``train``.
_RF_PARAMS = {
    "wr": {"max_depth": 5, "min_samples_split": 10, "n_estimators": 400},
    "rb": {"max_depth": 10, "min_samples_split": 5, "n_estimators": 400},
    "qb": {"max_depth": 20, "min_samples_split": 5, "n_estimators": 700},
    "te": {"max_depth": 20, "min_samples_split": 10, "n_estimators": 300},
}


def train(pos):
    """Fit a random-forest regressor for one position.

    Reads ``processed data/{pos}_proc_data.csv``, keeps rows where
    ``Rookie == 0``, builds features with ``create_features``, selects the
    columns listed in the pickled file ``model features/{pos}_feats``, drops
    rows with missing values and fits the model with ``FPTS_TG`` as target
    (``Season`` is excluded from the predictors).

    Parameters
    ----------
    pos : str
        One of ``"wr"``, ``"rb"``, ``"qb"``, ``"te"``.

    Returns
    -------
    tuple
        ``(model, scaler)`` where ``model`` is the fitted
        ``RandomForestRegressor`` and ``scaler`` is a ``StandardScaler`` that
        has NOT been fitted (feature scaling is currently disabled); it is
        returned only to keep the call signature stable.

    Raises
    ------
    ValueError
        If ``pos`` is not a supported position.
    """
    # Fail fast with a clear message instead of an UnboundLocalError at fit
    # time when no model configuration exists for ``pos``.
    if pos not in _RF_PARAMS:
        raise ValueError(
            f"Unsupported position {pos!r}; expected one of {sorted(_RF_PARAMS)}"
        )

    df = pd.read_csv(f"processed data/{pos}_proc_data.csv")
    df = df[df["Rookie"] == 0]
    df = df.drop(["Rookie"], axis=1)
    df_feats = create_features(df, pos)

    # NOTE(review): pickle.load can execute arbitrary code; only load feature
    # lists that were produced by this project.
    with open(f"model features/{pos}_feats", "rb") as fp:
        model_cols = pickle.load(fp)

    model_df = df_feats[model_cols].dropna()

    X = model_df.drop(["FPTS_TG", "Season"], axis=1)
    y = model_df[["FPTS_TG"]]

    rf_opt = RandomForestRegressor(
        max_features="sqrt", random_state=0, **_RF_PARAMS[pos]
    )
    # Scaling is intentionally not applied; the scaler stays unfitted.
    sc = preprocessing.StandardScaler()

    rf_opt.fit(X, y.values.ravel())

    return rf_opt, sc

In [10]:
def predict_ns(pos, model, sc):
    """Predict ``FPTS_TG`` for the players in ``processed data/{pos}_rookie.csv``.

    The file is loaded, tagged with a ``Position`` column, restricted to rows
    with ``Rookie == 0`` and a non-missing ``YDS_RUSHING/G_avg``, and run
    through ``create_features``. The columns listed in the pickled file
    ``model features/{pos}_feats`` (minus ``FPTS_TG`` and ``Season``) are fed
    to ``model.predict`` after missing values are replaced by 0.

    Parameters
    ----------
    pos : str
        Position code used to locate the input files and passed on to
        ``create_features``.
    model : object
        Fitted estimator exposing ``predict``.
    sc : object
        Accepted for signature compatibility; not used because feature
        scaling is disabled.

    Returns
    -------
    pandas.DataFrame
        Columns ``pid``, ``Player``, ``Season``, ``AVG``, ``FPTS_TG_pred``,
        ``STD`` and ``Position``.
    """
    players = pd.read_csv(f"processed data/{pos}_rookie.csv").assign(Position=pos)

    players = players.loc[players["Rookie"] == 0].drop(columns=["Rookie"])
    players = players.loc[players["YDS_RUSHING/G_avg"].notna()]

    feats = create_features(players, pos)

    with open(f"model features/{pos}_feats", "rb") as fp:
        model_cols = pickle.load(fp)

    predictors = feats[model_cols].drop(columns=["FPTS_TG", "Season"])

    # Missing predictor values are imputed with 0 before scoring.
    feats["FPTS_TG_pred"] = model.predict(predictors.fillna(0))

    board_cols = ["pid", "Player", "Season", "AVG", "FPTS_TG_pred", "STD", "Position"]
    return feats[board_cols]

In [15]:
# Single-position run: fit the 'rb' model and build its prediction board.
# `model`, `scaler` and `pred_board` are notebook-level names.
model, scaler = train('rb')
    
pred_board = predict_ns('rb', model, scaler)

In [17]:
# Positions to model; one board is produced per position and stacked below.
pos = ["wr", "rb", "te", "qb"]

# Collect the per-position boards and concatenate once: repeated pd.concat
# inside the loop is quadratic, and seeding with an empty DataFrame relies on
# concat behaviour that recent pandas versions deprecate.
position_boards = []

for p in tqdm(pos):
    model, scaler = train(p)

    pred_board = predict_ns(p, model, scaler)

    position_boards.append(pred_board)

# Original row indices are kept (no ignore_index), as before.
full_board = pd.concat(position_boards)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.09it/s]


In [85]:
#pred_board.sort_values("FPTS_TG_pred", ascending=False).head(20)

In [86]:
#pred_board.sort_values("FPTS_TG_pred", ascending=False).head(20)

In [18]:
# Order the combined board by the "AVG" column, ascending (re-running is idempotent).
full_board = full_board.sort_values("AVG")

In [19]:
# Display the 30 rows with the highest predicted FPTS_TG; full_board itself is not reordered.
full_board.sort_values("FPTS_TG_pred", ascending=False).head(30)

Unnamed: 0,pid,Player,Season,AVG,FPTS_TG_pred,STD,Position
1,17298,Josh Allen BUF (13),2023,20.0,21.682469,4.358899,qb
0,16413,Patrick Mahomes II KC (10),2023,15.0,20.733982,1.0,qb
2,19275,Jalen Hurts PHI (10),2023,22.7,20.41146,1.154701,qb
6,19781,Justin Fields CHI (13),2023,47.0,20.300229,2.0,qb
3,17233,Lamar Jackson BAL (13),2023,34.3,20.15394,3.05505,qb
4,19196,Joe Burrow CIN (7),2023,35.0,20.153888,3.605551,qb
5,18635,Justin Herbert LAC (5),2023,46.0,19.279383,5.0,qb
8,15600,Dak Prescott DAL (7),2023,77.7,19.254327,9.451631,qb
9,16398,Deshaun Watson CLE (5),2023,82.7,18.36293,4.50925,qb
11,11177,Kirk Cousins MIN (13),2023,107.0,18.292942,4.358899,qb


In [20]:
# Write the board to board.csv in the working directory, without the row index.
full_board.to_csv("board.csv", index=False)

In [37]:
import scipy.stats

In [89]:
# P(X > 4) for X ~ Normal(mean=11.3, std=1.51), computed as 1 - CDF(4).
# NOTE(review): the origin of 11.3, 1.51 and the threshold 4 is not shown in this notebook — confirm.
1- scipy.stats.norm(11.3, 1.51).cdf(4)

0.9999993323853829