In [1]:
# import required packages
# from fastai2.tabular.all import *
import xgboost as xgb
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from helpers import *

# show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [2]:
# path to project directory; the data CSVs below are read relative to it
path = Path('./')

In [3]:
# load the training dataset, using the first CSV column as the index;
# season is forced to str so its values are not parsed as numbers
train_csv = path / 'data/train_v5.csv'
train_df = pd.read_csv(train_csv, dtype={'season': str}, index_col=0)

In [4]:
# (rows, columns) of the training set as loaded
train_df.shape

(97847, 37)

In [5]:
# model configuration: season to predict plus feature / target column names
season = '2021'
cat_vars = ['season', 'position', 'was_home']
cont_vars = ['gw', 'minutes']
dep_var = ['total_points']

# latest gameweek present in the training data for that season
in_season = train_df['season'] == season
last_gw = train_df.loc[in_season, 'gw'].max()

# a NaN maximum is treated as "no gameweeks yet", so start from gameweek 1
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

14

In [6]:
# read in remaining_season.csv and sort by gameweek, then team
remaining_season_df = pd.read_csv(path/'data/remaining_season.csv',
                                  index_col=0).sort_values(by=['gw', 'team'])

# set the season and fill total_points / kickoff_time with placeholder values
remaining_season_df['season'] = season
remaining_season_df['total_points'] = 0
remaining_season_df['kickoff_time'] = '9999-99-99'

# append remaining season to end of training set
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and gives the same result
# NOTE: this cell rebinds train_df, so re-running it without first re-running
# the cell that reads train_v5.csv appends the remaining-season rows again
train_df = pd.concat([train_df, remaining_season_df], sort=False).reset_index(drop=True)

In [7]:
# add team and player lag features for total_points
lag_targets = ['total_points']
lag_windows = ['all', 1, 2, 3, 4, 5, 10]

# each helper gets its own list objects, exactly as with separate literals
lag_train_df, team_lag_vars = team_lag_features(
    train_df, list(lag_targets), list(lag_windows))
lag_train_df, player_lag_vars = player_lag_features(
    lag_train_df, list(lag_targets), list(lag_windows))

In [8]:
# we want to set gw and season as ordered categorical variables
# need lists with ordered categories
ordered_gws = list(range(1,39))
ordered_seasons = ['1617', '1718', '1819', '1920', '2021']

# convert to categoricals, then impose the explicit category order.
# set_categories(..., inplace=True) was deprecated in pandas 1.3 and removed
# in 2.0, so the re-ordered result is assigned back to the column instead.
# values that are not in the category lists become NaN
lag_train_df['gw'] = (lag_train_df['gw']
                      .astype('category')
                      .cat.set_categories(ordered_gws, ordered=True))
lag_train_df['season'] = (lag_train_df['season']
                          .astype('category')
                          .cat.set_categories(ordered_seasons, ordered=True))

In [9]:
# create dataset with latest lag numbers for all future gameweeks
# reuse `season` (the value next_gw was derived from) instead of a second
# hard-coded literal, so the two cannot drift apart when the season changes
valid_season = season
valid_gw = next_gw
# number of gameweeks from next_gw through gameweek 38, inclusive
valid_len = 38 - next_gw + 1

train_predict_df, train_idx, valid_idx = create_lag_train(lag_train_df, 
                                                      cat_vars, cont_vars, 
                                                      player_lag_vars, team_lag_vars, dep_var,
                                                      valid_season, valid_gw, valid_len)

In [10]:
# take a look at the dataframe: column names, non-null counts and dtypes
train_predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113211 entries, 0 to 113210
Data columns (total 37 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   gw                                      113211 non-null  category
 1   minutes                                 113211 non-null  float64 
 2   minutes_last_1                          113211 non-null  float64 
 3   minutes_last_10                         113211 non-null  float64 
 4   minutes_last_2                          113211 non-null  float64 
 5   minutes_last_3                          113211 non-null  float64 
 6   minutes_last_4                          113211 non-null  float64 
 7   minutes_last_5                          113211 non-null  float64 
 8   minutes_last_all                        113211 non-null  float64 
 9   opponent_team                           113211 non-null  object  
 10  player                          

In [11]:
# separate the model inputs (X) from the dependent variable (y)
feature_cols = cat_vars + cont_vars + player_lag_vars + team_lag_vars
X = train_predict_df[feature_cols].copy()
y = train_predict_df[dep_var].copy()

In [12]:
# position is categorical, so store it as strings
X['position'] = X['position'].map(str)

# replace the season labels with integer codes
enc = LabelEncoder()
enc.fit(X['season'])
X['season'] = enc.transform(X['season'])

# one dict per row, which is the input format DictVectorizer takes
X_dict = X.to_dict(orient="records")

# dense (non-sparse) output so it can be wrapped in a DataFrame below
dv = DictVectorizer(separator='_', sparse=False)

# fit the vectorizer on the row dicts and encode them
X_encoded = dv.fit_transform(X_dict)

# label the encoded columns with the names the vectorizer produced
X_df = pd.DataFrame(data=X_encoded, columns=dv.feature_names_)

In [13]:
# positional selection of the rows to fit on and the rows to predict
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [14]:
# instantiate and train the XGB regressor
xgb_params = {
    'objective': "reg:squarederror",
    'gamma': 0.42,
    'learning_rate': 0.047,
    'max_depth': 4,
    'n_estimators': 171,
    'subsample': 0.6,
}
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

# predictions for the prediction-set rows
preds = xg_reg.predict(X_test)

In [15]:
# model output for X_test
preds

array([0.03454611, 0.01281273, 0.99044394, ..., 0.00515348, 1.4613855 ,
       1.2209891 ], dtype=float32)

In [16]:
# attach the predictions to the remaining-season rows; preds is a plain array,
# so values are assigned by position rather than by index label
# NOTE(review): this assumes the valid_idx rows are in the same order as
# remaining_season_df - confirm against create_lag_train
remaining_season_df['predicted_points'] = preds

In [17]:
# remaining-season rows, now including the predicted_points column
remaining_season_df

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
7894,14,Arsenal,Everton,Mesut_Özil,3,6.8,0.0,Özil,False,1.317898,1.137628,2021,0.0,0,9999-99-99,0.034546
7895,14,Arsenal,Everton,Sokratis_Papastathopoulos,2,4.8,0.0,Sokratis,False,1.317898,1.137628,2021,0.0,0,9999-99-99,0.012813
7896,14,Arsenal,Everton,David_Luiz Moreira Marinho,2,5.4,1.0,David Luiz,False,1.317898,1.137628,2021,22.5,0,9999-99-99,0.990444
7897,14,Arsenal,Everton,Pierre-Emerick_Aubameyang,3,11.4,1.0,Aubameyang,False,1.317898,1.137628,2021,90.0,0,9999-99-99,4.523584
7898,14,Arsenal,Everton,Cédric_Soares,2,4.6,1.0,Cédric,False,1.317898,1.137628,2021,1.0,0,9999-99-99,0.901083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7673,38,Wolverhampton Wanderers,Manchester United,Ki-Jana_Hoever,2,4.4,1.0,Hoever,True,0.911137,1.696657,2021,0.0,0,9999-99-99,0.007410
7674,38,Wolverhampton Wanderers,Manchester United,Nélson_Cabral Semedo,2,5.5,1.0,Semedo,True,0.911137,1.696657,2021,90.0,0,9999-99-99,3.050200
7675,38,Wolverhampton Wanderers,Manchester United,Meritan_Shabani,3,4.5,1.0,Shabani,True,0.911137,1.696657,2021,0.0,0,9999-99-99,0.005153
7676,38,Wolverhampton Wanderers,Manchester United,Rayan_Ait Nouri,2,5.0,1.0,Ait Nouri,True,0.911137,1.696657,2021,21.8,0,9999-99-99,1.461385


In [18]:
# 50 highest predicted scores for the upcoming gameweek
is_next_gw = remaining_season_df['gw'] == next_gw
(remaining_season_df
    .loc[is_next_gw]
    .sort_values(by='predicted_points', ascending=False)
    .head(50))

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
339,14,Tottenham Hotspur,Leicester City,Harry_Kane,4,10.9,1.0,Kane,True,1.670117,1.091969,2021,90.0,0,9999-99-99,7.064682
341,14,Tottenham Hotspur,Leicester City,Heung-Min_Son,3,9.6,1.0,Son,True,1.670117,1.091969,2021,88.6,0,9999-99-99,6.737853
7876,14,Liverpool,Crystal Palace,Mohamed_Salah,3,12.4,1.0,Salah,False,2.474081,0.474529,2021,81.5,0,9999-99-99,6.629431
7997,14,Manchester City,Southampton,Kevin_De Bruyne,3,11.9,1.0,De Bruyne,False,2.384115,0.535526,2021,90.0,0,9999-99-99,6.303384
301,14,Newcastle United,Fulham,Callum_Wilson,4,6.6,1.0,Wilson,True,0.602889,0.486494,2021,90.0,0,9999-99-99,6.296814
8007,14,Manchester City,Southampton,Gabriel Fernando_de Jesus,4,9.3,1.0,Jesus,False,2.384115,0.535526,2021,90.0,0,9999-99-99,6.276495
7873,14,Liverpool,Crystal Palace,Sadio_Mané,3,11.9,1.0,Mané,False,2.474081,0.474529,2021,76.8,0,9999-99-99,6.261296
250,14,Manchester United,Leeds,Bruno Miguel_Borges Fernandes,3,11.0,1.0,Fernandes,True,1.696657,0.439779,2021,78.8,0,9999-99-99,6.257599
7871,14,Liverpool,Crystal Palace,Roberto_Firmino,4,9.2,1.0,Firmino,False,2.474081,0.474529,2021,86.4,0,9999-99-99,6.046278
163,14,Chelsea,West Ham United,Timo_Werner,4,9.4,1.0,Werner,True,1.860172,0.663391,2021,83.6,0,9999-99-99,5.691399


In [19]:
# rename previous week's predictions file
# generate previous week's filename
last_gw = next_gw - 1
filename = 'history/2020-21/predictions_gw' + str(last_gw) + '.csv'

# pathlib replaces the OS-specific `move` / `mv` shell commands, so the same
# cell runs unchanged on Windows and Linux
current_predictions = Path('predictions.csv')
archive_path = Path(filename)

if archive_path.exists():
    # never overwrite an existing archive: on a re-run predictions.csv may
    # already hold the new week's predictions rather than last week's
    print(f'{archive_path} already exists - predictions.csv left in place')
elif current_predictions.exists():
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    current_predictions.rename(archive_path)
    print(f'moved predictions.csv to {archive_path}')
else:
    print('predictions.csv not found - nothing to archive')

        1 file(s) moved.


In [20]:
# write to predictions.csv
## RENAME PREVIOUS WEEK FIRST
output_cols = [
    'gw', 'player', 'team', 'opponent_team',
    'position', 'price', 'play_proba', 'short_name',
    'was_home', 'predicted_points', 'minutes',
]
predictions = remaining_season_df[output_cols]
predictions.to_csv('predictions.csv')