In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from helpers import *

In [2]:
# Path to the project directory. './' is relative, so it resolves against the
# working directory the notebook kernel was started in.
path = Path('./')

In [3]:
# Load the training dataset. The first CSV column becomes the index and
# 'season' is forced to str so codes such as '1920' are compared as text.
train_df = pd.read_csv(
    path / 'data/train_v4.csv',
    index_col=0,
    dtype={'season': str},
)

In [4]:
# Add lagged points-scored features to the training set.
# NOTE(review): player_lag_features comes from helpers; later cells read the
# 'total_points_pg_last_all' column, which is presumably created here — confirm.
train_df = player_lag_features(
    train_df,
    ['total_points'],
    ['all'],
)

In [5]:
# Let's grab the row positions for a validation set starting at gw 20 of the
# 2019/20 season. Only the first entry of the helper's result is needed
# (one entry per requested gameweek, presumably — confirm in helpers).
season_idxs = validation_season_idx(train_df, '1920', [20])
idxs = season_idxs[0]
idxs

(78390, 78973, 80156, 82053)

In [6]:
def simple_model(df, idxs):
    """Baseline forecast: lagged points average scaled by minutes played.

    The validation window is the positional row range
    ``valid_start .. valid_end_gw6`` (inclusive) of ``df``. Each player's
    ``total_points_pg_last_all`` value is looked up at the season/gameweek of
    the first validation row and reused for every validation row of that
    player. Players with no row at that gameweek get a prediction of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'player', 'gw', 'season', 'minutes',
        'total_points' and 'total_points_pg_last_all'.
    idxs : sequence of 4 ints
        ``(valid_start, valid_end_gw1, valid_end_gw3, valid_end_gw6)`` row
        positions; only the first and last are used here.

    Returns
    -------
    (preds, targs) : tuple of pandas.Series
        Predicted and actual points, aligned row-for-row with the
        validation window.

    Notes
    -----
    The prediction multiplies by the minutes recorded in the validation rows
    themselves and divides by 90.
    NOTE(review): this presumably treats 'total_points_pg_last_all' as a
    per-90-minutes rate — confirm against the helper that builds it.
    """
    valid_start, _, _, valid_end_gw6 = idxs

    valid_idx = range(valid_start, valid_end_gw6 + 1)
    valid = df[['player', 'gw', 'season', 'minutes', 'total_points']].iloc[valid_idx]

    # season / gameweek at which the validation window begins
    season_point = valid['season'].iloc[0]
    gw_point = valid['gw'].iloc[0]

    # Player averages at the validation point. Read from the `df` argument
    # (not a notebook global) so the function works on any frame passed in.
    at_valid_point = (df['season'] == season_point) & (df['gw'] == gw_point)
    player_points_pg = df.loc[at_valid_point, ['player', 'total_points_pg_last_all']]

    pred_df = valid.merge(player_points_pg, on='player', how='left')

    # Only the numeric columns used below are filled; missing values
    # (e.g. players absent at the validation point) count as 0.
    filled = pred_df[['total_points_pg_last_all', 'minutes', 'total_points']].fillna(0)

    preds = filled['total_points_pg_last_all'] * filled['minutes'] / 90
    targs = filled['total_points']

    return preds, targs

In [7]:
preds, targs = simple_model(df=train_df, idxs=idxs)

In [8]:
# Score the baseline predictions against the validation targets.
# NOTE(review): r_mse comes from helpers — presumably root mean squared error; confirm.
r_mse(preds, targs)

1.904038

In [9]:
# Cross-check of simple_model: rebuild the same predictions by filtering
# directly on season and gameweek rather than on row positions.
in_season = train_df['season'] == '1920'

# each player's total-points-per-game average at the validation point (gw 20)
player_points_pg = train_df.loc[
    in_season & (train_df['gw'] == 20),
    ['player', 'total_points_pg_last_all'],
]

# validation rows: gameweeks strictly after 19 and strictly before 26
valid_df = train_df.loc[
    in_season & (train_df['gw'] > 19) & (train_df['gw'] < 26),
    ['player', 'minutes', 'total_points'],
]

# attach the averages, scale by minutes played, then zero-fill anything missing
pred_df = (
    valid_df
    .merge(player_points_pg, on='player', how='left')
    .assign(pred=lambda d: d['total_points_pg_last_all'] * d['minutes'] / 90)
    .fillna(0)
)

In [10]:
# Score the directly computed predictions; this is the value to compare with
# the simple_model score.
r_mse(pred_df['pred'], pred_df['total_points'])

1.904038