In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from helpers import *

In [2]:
# Path to the project directory; data files are located relative to it
path = Path('./')

In [3]:
# Read in the training dataset, using the first CSV column as the index.
# `season` is forced to str so that season codes compare equal to string
# literals such as '1920' in the filters used later in this notebook.
train_df = pd.read_csv(path/'data/train_v4.csv', index_col=0, dtype={'season':str})

In [4]:
# Add lagged points-scored features to the training set.
# player_lag_features comes from helpers; it is used here as returning the
# updated frame plus the list of lag column names it created.
# NOTE(review): presumably ['all', 1, 2] yields total_points_pg_last_all,
# total_points_pg_last_1 and total_points_pg_last_2 — confirm against helpers.
train_df, player_lag_vars = player_lag_features(train_df, ['total_points'], ['all', 1, 2])

In [5]:
# Let's define a validation window starting at gameweek 20 of the 2019/20 season.
# valid_season: season code, a string because `season` is read as str
# valid_gw:     first gameweek of the validation window
# valid_len:    window length passed to validation_season_idx
valid_season = '1920'
valid_gw = 20
valid_len = 6

In [6]:
def simple_model(df, valid_season, valid_gw, valid_len):
    """Baseline model: scale each player's lagged points-per-game by minutes played.

    For every row in the validation window the prediction is
    ``total_points_pg_last_all * minutes / 90``, where the per-game value is
    the one each player has at the first validation gameweek.

    Args:
        df: frame with at least the columns 'player', 'gw', 'season',
            'minutes', 'total_points' and 'total_points_pg_last_all'.
        valid_season: season code of the validation window.
        valid_gw: first gameweek of the validation window.
        valid_len: window length, forwarded to ``validation_season_idx``.

    Returns:
        (preds, targs): two Series aligned row-for-row over the validation
        window - the predicted and the actual total points.

    Note:
        ``validation_season_idx`` (from helpers) is used as returning a list
        of (start, end) positional index pairs with an inclusive end.
    """
    valid_start, valid_end = validation_season_idx(df, valid_season, [valid_gw], valid_len)[0]

    # validation rows are the positional slice [valid_start, valid_end]
    valid = df[['player', 'gw', 'season', 'minutes', 'total_points']].iloc[valid_start:valid_end + 1]

    season_point = valid['season'].iloc[0]
    gw_point = valid['gw'].iloc[0]

    # get player total per game average at validation point
    # (looked up in the frame passed in, not in a notebook-level global)
    at_point = (df['season'] == season_point) & (df['gw'] == gw_point)
    player_points_pg = df.loc[at_point, ['player', 'total_points_pg_last_all']]

    # players with no per-game value at the validation point get 0
    pred_df = valid.merge(player_points_pg, on='player', how='left').fillna(0)
    preds = pred_df['total_points_pg_last_all'] * pred_df['minutes'] / 90
    targs = pred_df['total_points']

    return preds, targs

In [7]:
# Baseline predictions and actual points over the validation window
preds, targs = simple_model(train_df, valid_season, valid_gw, valid_len)

In [8]:
# Score the baseline with r_mse from helpers
# NOTE(review): presumably root mean squared error — confirm against helpers
r_mse(preds, targs)

1.904038

In [9]:
# We can calculate the same predictions directly using the season and gameweek
# to check the above function.
# Uses valid_season / valid_len from the config cell rather than repeating
# the literals, so the check stays in step with simple_model.

# get player total per game average at validation point
at_valid_start = (train_df['season'] == valid_season) & (train_df['gw'] == valid_gw)
player_points_pg = train_df.loc[at_valid_start, ['player', 'total_points_pg_last_all']]

# rows inside the validation window: valid_len gameweeks starting at valid_gw
in_valid_window = ((train_df['season'] == valid_season)
                   & (train_df['gw'] > valid_gw - 1)
                   & (train_df['gw'] < valid_gw + valid_len))
valid_df = train_df.loc[in_valid_window, ['player', 'minutes', 'total_points']]

pred_df = valid_df.merge(player_points_pg, on='player', how='left')
pred_df['pred'] = pred_df['total_points_pg_last_all'] * pred_df['minutes'] / 90
# players with no per-game value at the validation point get a prediction of 0
pred_df = pred_df.fillna(0)

In [10]:
# Should equal the r_mse obtained from simple_model above
r_mse(pred_df['pred'], pred_df['total_points'])

1.904038

In [11]:
# player_df = train_df[(train_df['season'] == valid_season) & 
#          (train_df['gw'] <= valid_gw)][['player', 'gw'] + player_lag_vars]

# player_vals = player_df[player_df['gw'] == player_df.groupby('player')['gw'].transform('max')]

# player_vals = player_vals.drop('gw', axis=1)

# player_vals

In [12]:
# cat_vars = ['gw', 'season']
# cont_vars = ['minutes']
# dep_var = ['total_points']

# valid_start, valid_end = validation_season_idx(train_df, valid_season, [valid_gw], valid_len)[0]

# train_idx = range(valid_start)
# valid_idx = range(valid_start, valid_end + 1)    

# train = train_df[['player'] + cat_vars + cont_vars + player_lag_vars + dep_var].iloc[train_idx]
# valid = train_df[['player'] + cat_vars + cont_vars + dep_var].iloc[valid_idx]

# valid = valid.merge(player_vals, on='player', how='left')

# ## NEXT: create the proper validation set with replaced values

In [13]:
# lag_train_df = pd.concat([train, valid]).reset_index()
# lag_train_df

In [16]:
# We can adapt this approach to also create validation sets with lag features.
# When making predictions for gw +2 and beyond we cannot use those weeks' lag features;
# doing so would be leakage.
# Instead, each subsequent validation week should have the same lag values as the first.
def create_lag_train(df, cat_vars, cont_vars, lag_vars, valid_season, valid_gw, valid_len):
    """Build a train+validation frame whose validation rows carry frozen lag values.

    Every validation row gets the lag values its player had at the most recent
    gameweek on or before ``valid_gw`` in ``valid_season``. This avoids leaking
    lag features computed from later validation gameweeks.

    Args:
        df: full dataset; must contain 'player', 'gw', 'season' plus every
            column named in cat_vars, cont_vars, lag_vars and dep_var.
        cat_vars: list of categorical feature column names.
        cont_vars: list of continuous feature column names.
        lag_vars: list of lag feature column names to freeze for validation.
        valid_season: season code of the validation window.
        valid_gw: first gameweek of the validation window.
        valid_len: window length, forwarded to ``validation_season_idx``.

    Returns:
        DataFrame of training rows followed by validation rows. ``reset_index()``
        keeps the previous index as an 'index' column.

    Note:
        ``dep_var`` is not a parameter; it is read from the enclosing
        (notebook) namespace and must be defined before this is called.
        ``validation_season_idx`` (from helpers) is used as returning a list
        of (start, end) positional index pairs with an inclusive end.
    """
    lag_cols = list(lag_vars)

    # get all the lag data for the current season up to the first validation gameweek
    in_window = (df['season'] == valid_season) & (df['gw'] <= valid_gw)
    player_lag_vals = df.loc[in_window, ['player', 'gw'] + lag_cols]

    # get the last available lag data for each player
    # for most it will be the first validation week
    # but sometimes teams have blank gameweeks
    # in these cases it will be the previous gameweek
    is_latest = player_lag_vals['gw'] == player_lag_vals.groupby('player')['gw'].transform('max')
    player_lag_vals = player_lag_vals[is_latest].drop('gw', axis=1)

    # get the validation start and end indexes from the frame passed in
    valid_start, valid_end = validation_season_idx(df, valid_season, [valid_gw], valid_len)[0]

    # split out train and validation sets
    # do not include lag vars in validation set
    train = df[['player'] + cat_vars + cont_vars + lag_cols + dep_var].iloc[:valid_start]
    valid = df[['player'] + cat_vars + cont_vars + dep_var].iloc[valid_start:valid_end + 1]

    # add in lag vars
    # will be the same for all validation gameweeks
    valid = valid.merge(player_lag_vals, on='player', how='left')

    # concatenate train and validation again
    lag_train_df = pd.concat([train, valid]).reset_index()

    return lag_train_df

In [17]:
# Column roles for the lag training set
cat_vars = ['gw', 'season']
cont_vars = ['minutes']
# NOTE: create_lag_train reads dep_var from this namespace rather than taking
# it as an argument, so it must be defined before the call below
dep_var = ['total_points']

# As the cell's last expression, the combined frame is displayed
create_lag_train(train_df, cat_vars, cont_vars, player_lag_vars, valid_season, valid_gw, valid_len)

Unnamed: 0,index,player,gw,season,minutes,total_points_pg_last_all,total_points_pg_last_1,total_points_pg_last_2,total_points
0,0,Aaron_Cresswell,1,1617,0,,,,0
1,1,Aaron_Lennon,1,1617,15,,,,1
2,2,Aaron_Ramsey,1,1617,60,,,,2
3,3,Abdoulaye_Doucouré,1,1617,0,,,,0
4,4,Abdul Rahman_Baba,1,1617,0,,,,0
...,...,...,...,...,...,...,...,...,...
82049,3659,Yoshinori_Muto,25,1920,0,4.320000,,,0
82050,3660,Youri_Tielemans,25,1920,79,4.447674,2.000000,2.168675,9
82051,3661,Yves_Bissouma,25,1920,0,2.302017,12.857143,2.195122,0
82052,3662,Çaglar_Söyüncü,25,1920,90,3.546372,0.000000,0.000000,1
