In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from helpers import *

In [2]:
# Project root directory; data file paths below are built relative to it
path = Path('./')

In [3]:
# Load the training dataset: the first CSV column becomes the index and
# season codes are kept as strings (e.g. '1920')
train_df = pd.read_csv(
    path / 'data/train_v4.csv',
    dtype={'season': str},
    index_col=0,
)

In [4]:
# Add lagged points-scored features to the training set. The helper returns the
# augmented frame plus the list of new lag column names (player_lag_vars), which
# later cells use to select those columns.
# NOTE(review): this rebinds train_df, so re-running the cell passes an already
# augmented frame back into player_lag_features — confirm that is harmless.
train_df, player_lag_vars = player_lag_features(train_df, ['total_points'], ['all', 1, 2])

In [53]:
# Validation window settings: start at gameweek 20 of the 2019/20 season
# and cover 6 gameweeks
valid_season, valid_gw, valid_len = '1920', 20, 6

In [54]:
def simple_model(df, valid_season, valid_gw, valid_len):
    """Baseline forecast: points-per-game average scaled by minutes played.

    For every row in the validation window the prediction is the player's
    ``total_points_pg_last_all`` value taken at the first validation gameweek,
    multiplied by ``minutes / 90``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``player``, ``gw``, ``season``, ``minutes``,
        ``total_points`` and ``total_points_pg_last_all``.
    valid_season, valid_gw, valid_len
        Passed through to ``validation_season_idx`` to locate the validation
        window (season code, first gameweek, number of gameweeks).

    Returns
    -------
    (preds, targs) : tuple of pandas.Series
        Predicted points and actual ``total_points`` for each validation row.

    Notes
    -----
    The prediction uses the ``minutes`` recorded on the validation rows
    themselves. Players with no lag value at the validation point get a
    prediction of 0.
    """
    valid_start, valid_end = validation_season_idx(df, valid_season, [valid_gw], valid_len)[0]

    # valid_end is treated as an inclusive positional bound, hence the + 1
    valid = df[['player', 'gw', 'season', 'minutes', 'total_points']].iloc[valid_start:valid_end + 1]

    season_point = valid['season'].iloc[0]
    gw_point = valid['gw'].iloc[0]

    # Per-game averages at the validation point, read from the frame that was
    # passed in (not from a notebook-level global) so the function only depends
    # on its arguments.
    at_valid_point = (df['season'] == season_point) & (df['gw'] == gw_point)
    player_points_pg = df.loc[at_valid_point, ['player', 'total_points_pg_last_all']]

    # Left merge keeps every validation row; unmatched players become 0
    pred_df = valid.merge(player_points_pg, on='player', how='left').fillna(0)
    preds = pred_df['total_points_pg_last_all'] * pred_df['minutes'] / 90
    targs = pred_df['total_points']

    return preds, targs

In [51]:
# Run the baseline over the validation window configured above
preds, targs = simple_model(train_df, valid_season, valid_gw, valid_len)

In [52]:
# Score the baseline predictions against actual points using the r_mse helper
# (presumably root mean squared error — confirm in helpers)
r_mse(preds, targs)

1.904038

In [43]:
# We can calculate the same predictions directly from the season and gameweek
# to cross-check simple_model. The window is driven by the same
# valid_season / valid_gw / valid_len settings so both stay in sync.

# Player points-per-game averages at the first validation gameweek
player_points_pg = train_df[(train_df['season'] == valid_season) &
                            (train_df['gw'] == valid_gw)][['player', 'total_points_pg_last_all']]

# Rows for the whole validation window: valid_len gameweeks starting at valid_gw
# (gw is assumed to be integer-valued)
valid_df = train_df[(train_df['season'] == valid_season)
                    & (train_df['gw'] >= valid_gw)
                    & (train_df['gw'] < valid_gw + valid_len)][['player', 'minutes', 'total_points']]

pred_df = valid_df.merge(player_points_pg, on='player', how='left')
pred_df['pred'] = pred_df['total_points_pg_last_all'] * pred_df['minutes'] / 90
# Players with no lag value at valid_gw end up with a prediction of 0
pred_df = pred_df.fillna(0)

In [44]:
# Score the directly computed predictions; this should match the simple_model result
r_mse(pred_df['pred'], pred_df['total_points'])

1.904038

In [47]:
# Lag features for the validation season, up to and including the validation gameweek
player_df = train_df.loc[
    (train_df['season'] == valid_season) & (train_df['gw'] <= valid_gw),
    ['player', 'gw'] + player_lag_vars,
]

# Keep, for each player, only the row(s) at their latest gameweek in that range
player_vals = player_df.loc[
    lambda d: d['gw'] == d.groupby('player')['gw'].transform('max')
]

player_vals

Unnamed: 0,player,gw,total_points_pg_last_all,total_points_pg_last_1,total_points_pg_last_2
78390,Ederson_Santana de Moraes,20,4.237371,-24.545455,-0.891089
78391,Xherdan_Shaqiri,20,5.071273,,3.913043
78392,Hugo_Lloris,20,4.194837,,
78393,Cenk_Tosun,20,4.779793,,2.647059
78394,Callum_Robinson,20,4.687500,,
...,...,...,...,...,...
78969,Grant_Hanley,20,0.600000,2.000000,1.000000
78970,Roberto_Pereyra,20,3.879065,2.000000,3.103448
78971,Sead_Kolasinac,20,3.919355,,
78972,Stuart_Armstrong,20,4.460506,3.176471,2.812500


In [58]:
# Column roles used to assemble the validation frame
cat_vars, cont_vars, dep_var = ['gw', 'season'], ['minutes'], ['total_points']

# Positional bounds of the validation window (valid_end is used as inclusive)
valid_start, valid_end = validation_season_idx(
    train_df, valid_season, [valid_gw], valid_len
)[0]

train_idx = range(valid_start)
valid_idx = range(valid_start, valid_end + 1)

train = train_df.iloc[train_idx]
valid = train_df.iloc[valid_idx][['player', *cat_vars, *cont_vars, *dep_var]]

# Attach each player's lag values from the validation point to every validation row
valid.merge(player_vals, how='left', on='player')

## NEXT: create the proper validation set with replaced values

Unnamed: 0,player,gw_x,season,minutes,total_points,gw_y,total_points_pg_last_all,total_points_pg_last_1,total_points_pg_last_2
0,Ederson_Santana de Moraes,20,1920,0,0,20.0,4.237371,-24.545455,-0.891089
1,Xherdan_Shaqiri,20,1920,0,0,20.0,5.071273,,3.913043
2,Hugo_Lloris,20,1920,0,0,20.0,4.194837,,
3,Cenk_Tosun,20,1920,0,0,20.0,4.779793,,2.647059
4,Callum_Robinson,20,1920,58,1,20.0,4.687500,,
...,...,...,...,...,...,...,...,...,...
3659,Yoshinori_Muto,25,1920,0,0,20.0,4.320000,,
3660,Youri_Tielemans,25,1920,79,9,20.0,4.447674,2.000000,2.168675
3661,Yves_Bissouma,25,1920,0,0,20.0,2.302017,12.857143,2.195122
3662,Çaglar_Söyüncü,25,1920,90,1,20.0,3.546372,0.000000,0.000000


In [13]:
# We can adapt this approach to also create validation sets with lag features.
# When making predictions for gw +2 and beyond we cannot use those weeks' lag
# features: that would be leakage.
# Instead, each subsequent validation week should have the same lag values as the first.
# TODO: wrap this up as create_lag_train(df, lag_vars, valid_gw, valid_len)
#       returning (df, valid_idxs)

# Per-game averages at the first validation gameweek
player_points_pg = train_df[(train_df['season'] == valid_season) &
                            (train_df['gw'] == valid_gw)][['player', 'total_points_pg_last_all']]

# Lag columns for every row in the validation window. The positional bounds are
# valid_start / valid_end from validation_season_idx (valid_end inclusive), so
# the cell no longer relies on a leftover kernel variable.
valid_df = train_df.iloc[valid_start:valid_end + 1][['player'] + player_lag_vars]
valid_df

Unnamed: 0,player,total_points_pg_last_all,total_points_pg_last_1,total_points_pg_last_2
78390,Ederson_Santana de Moraes,4.237371,-24.545455,-0.891089
78391,Xherdan_Shaqiri,5.071273,,3.913043
78392,Hugo_Lloris,4.194837,,
78393,Cenk_Tosun,4.779793,,2.647059
78394,Callum_Robinson,4.687500,,
...,...,...,...,...
82049,Yoshinori_Muto,4.027972,,
82050,Youri_Tielemans,4.316109,2.000000,2.547170
82051,Yves_Bissouma,2.326870,,
82052,Çaglar_Söyüncü,3.465793,2.000000,1.500000
