In [1]:
import os
import sys
src_dir = os.path.join(os.getcwd())
abs_path = os.path.abspath(os.path.join(src_dir, os.pardir, 'src'))
sys.path.append(abs_path)

from utils import GLOBAL, functions

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
df = pd.read_csv(GLOBAL.CLEANED_DATA_2014_2019)

In [4]:
df = df.iloc[:, 1:]

In [5]:
df = df.groupby(['player_id', 'tm', 'player', 'pos', 'season'], as_index=False)\
    .agg({
        'offensive_snapcount': np.sum,
        'offensive_snapcount_percentage': np.mean,
        'passing_rating': np.mean,
        'passing_yds': np.sum,
        'passing_td': np.sum,
        'passing_att': np.sum,
        'receiving_yds': np.sum,
        'receiving_td': np.sum,
        'receiving_rec': np.sum,
        'receiving_tar': np.sum,
        'rushing_att': np.sum,
        'standard_fantasy_points': np.sum,
        'ppr_fantasy_points': np.sum,
        'half_ppr_fantasy_points': np.sum
})

In [6]:
df = df.loc[df['season'] >= 2012]

In [7]:
#Change based on scoring format: "half_ppr", "ppr", or "standard"
scoring_format = "half_ppr"

pd.set_option('chained_assignment', None)

lag_features = [
    'rushing_att', 
    'receiving_tar', 
    'offensive_snapcount', 
    'offensive_snapcount_percentage',
    f'{scoring_format}_fantasy_points',
    'passing_rating',
    'passing_att', 
    'passing_td'
]

for lag in range(1, 6):
    shifted = df.groupby('player_id').shift(lag)
    for column in lag_features:
        df[f'lag_{column}_{lag}'] = shifted[column]

df = df.fillna(-1)

In [10]:
#Serparate by pos
wr_df = df.loc[df['pos'] == 'WR']
rb_df = df.loc[df['pos'] == 'RB']
te_df = df.loc[df['pos'] == 'TE']
qb_df = df.loc[df['pos'] == 'QB']

In [None]:
#Correlation matrices to find best correlation for fantasy points, change based on [pos]_df for different pos
wr_df.corr()[[f'{scoring_format}_fantasy_points']]

Wide Receivers

In [16]:
#Use 2014-108 to train model
wr_train_df = wr_df.loc[(wr_df['season'] < 2019) & (wr_df['lag_offensive_snapcount_1'] > 50)]

In [17]:
WR_X = wr_train_df[[
    'lag_receiving_tar_1', 'lag_offensive_snapcount_1', f'lag_{scoring_format}_fantasy_points_1'
]].values

In [19]:
wr_y = wr_train_df[f'{scoring_format}_fantasy_points'].values

In [23]:
WR_X_train, WR_X_test, wr_y_train, wr_y_test = train_test_split(WR_X, wr_y, test_size=0.2, random_state=10)

wr_lr = LinearRegression()

wr_lr.fit(WR_X_train, wr_y_train);

In [25]:
wr_y_predict = wr_lr.predict(WR_X_test)

In [26]:
mean_absolute_error(wr_y_test, wr_y_predict)

39.62662611510035

In [None]:
pd.set_option('display.max_rows', None)

wr_df_predict = wr_df.loc[
    (wr_df['season'] == 2019) & (wr_df['offensive_snapcount'] > 50), ['player', 'receiving_tar', 'offensive_snapcount', f'{scoring_format}_fantasy_points']
]

wr_df_predict[f'{scoring_format}_predicted_2020'] = wr_lr.predict(wr_df_predict[['receiving_tar', 'offensive_snapcount', f'{scoring_format}_fantasy_points']].values)

wr_df_predict.sort_values(by=f'{scoring_format}_predicted_2020', ascending=False).head(100)