In [1]:
# import required packages
# from fastai2.tabular.all import *
import xgboost as xgb
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from helpers import *

# show every column when a DataFrame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

In [2]:
# path to project directory; the data files read below are resolved relative
# to it, so it must point at the directory that contains `data/`
path = Path('./')

In [3]:
# load the training set; the first CSV column becomes the index and `season`
# is kept as a string so codes such as '1617' are not parsed as integers
train_df = pd.read_csv(
    path/'data/train_v5.csv',
    index_col=0,
    dtype={'season': str},
)

In [4]:
# sanity check: (rows, columns) of the training set that was just loaded
train_df.shape

(98458, 37)

In [5]:
# model inputs: current season plus categorical, continuous and target columns
season = '2021'
cat_vars = ['season', 'position', 'was_home']
cont_vars = ['gw', 'minutes']
dep_var = ['total_points']

# latest gameweek present for the current season
# (max() of an empty selection is NaN, i.e. no gameweek has been played yet)
is_current_season = train_df['season'] == season
last_gw = train_df.loc[is_current_season, 'gw'].max()

# predictions start at the gameweek after the latest one in the data
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

15

In [6]:
# read in the remaining fixtures, ordered by gameweek and then team
remaining_season_df = (
    pd.read_csv(path/'data/remaining_season.csv', index_col=0)
    .sort_values(by=['gw', 'team'])
)

# fill in the columns the training set has but future fixtures lack;
# these are placeholders for games that have not been played yet
remaining_season_df['season'] = season
remaining_season_df['total_points'] = 0
remaining_season_df['kickoff_time'] = '9999-99-99'

# append remaining season to end of training set.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and yields the same stacked frame.
# NOTE: this cell is not idempotent - running it twice appends the fixtures
# twice, so re-run from the cell that loads `train_df` before repeating it.
train_df = pd.concat([train_df, remaining_season_df], sort=False).reset_index(drop=True)

In [7]:
# add team and player lag features for total_points
# (both helpers are expected to come from the `helpers` star-import)
# window spec shared by both calls; presumably 'all' means the full history
# and the integers mean the last N appearances - confirm in helpers
lag_windows = ('all', 1, 2, 3, 4, 5, 10)

# each helper receives its own fresh list, exactly as in the original calls
lag_train_df, team_lag_vars = team_lag_features(
    train_df, ['total_points'], list(lag_windows))
lag_train_df, player_lag_vars = player_lag_features(
    lag_train_df, ['total_points'], list(lag_windows))

In [8]:
# gw and season are treated as ordered categorical variables,
# so build the category lists in their natural order first
ordered_gws = list(range(1, 39))          # gameweeks 1..38
ordered_seasons = ['1617', '1718', '1819', '1920', '2021']

# cast straight to an ordered CategoricalDtype.
# The previous `astype('category')` + `.cat.set_categories(..., inplace=True)`
# relied on the `inplace` argument, which was removed in pandas 2.0.
# The result is the same: values outside the listed categories become NaN.
lag_train_df['gw'] = lag_train_df['gw'].astype(
    pd.CategoricalDtype(categories=ordered_gws, ordered=True))
lag_train_df['season'] = lag_train_df['season'].astype(
    pd.CategoricalDtype(categories=ordered_seasons, ordered=True))

In [9]:
# build the modelling frame: every gameweek from next_gw to the end of the
# season forms the prediction window
final_gw = 38
valid_season = '2021'
valid_gw = next_gw
valid_len = final_gw - valid_gw + 1   # number of gameweeks still to predict

# returns the combined frame plus the row positions of the training rows
# and of the rows to predict
train_predict_df, train_idx, valid_idx = create_lag_train(
    lag_train_df,
    cat_vars, cont_vars,
    player_lag_vars, team_lag_vars, dep_var,
    valid_season, valid_gw, valid_len,
)

In [10]:
# take a look at the dataframe: column names, dtypes and non-null counts,
# to confirm the lag features were added and contain no missing values
train_predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113313 entries, 0 to 113312
Data columns (total 37 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   gw                                      113313 non-null  category
 1   minutes                                 113313 non-null  float64 
 2   minutes_last_1                          113313 non-null  float64 
 3   minutes_last_10                         113313 non-null  float64 
 4   minutes_last_2                          113313 non-null  float64 
 5   minutes_last_3                          113313 non-null  float64 
 6   minutes_last_4                          113313 non-null  float64 
 7   minutes_last_5                          113313 non-null  float64 
 8   minutes_last_all                        113313 non-null  float64 
 9   opponent_team                           113313 non-null  object  
 10  player                          

In [11]:
# separate the model inputs from the dependent variable
# (copies, so the encoding done later does not touch train_predict_df)
feature_cols = cat_vars + cont_vars + player_lag_vars + team_lag_vars
X = train_predict_df[feature_cols].copy()
y = train_predict_df[dep_var].copy()

In [12]:
# position is a categorical code, so turn it into a string;
# DictVectorizer one-hot encodes string values and passes numbers through
X['position'] = X['position'].map(str)

# season becomes an integer code via LabelEncoder
season_encoder = LabelEncoder()
X['season'] = season_encoder.fit_transform(X['season'])

# one dict per row, which is the input format DictVectorizer expects
row_dicts = X.to_dict("records")

# dense output; one-hot columns are named "<feature>_<value>"
dv = DictVectorizer(sparse=False, separator='_')
encoded = dv.fit_transform(row_dicts)

# back to a DataFrame with the generated feature names as columns
X_df = pd.DataFrame(encoded, columns=dv.feature_names_)

In [13]:
# positional split: train_idx rows are used for fitting, valid_idx rows are
# the fixtures to predict
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [14]:
# instantiate and train the XGB regressor
# NOTE(review): these hyperparameters look like the result of an earlier
# tuning run that is not part of this notebook - confirm their origin
xgb_params = dict(
    objective="reg:squarederror",
    gamma=0.42,
    learning_rate=0.047,
    max_depth=4,
    n_estimators=171,
    subsample=0.6,
)
xg_reg = xgb.XGBRegressor(**xgb_params)

# fit on the training rows, then predict points for the remaining fixtures
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)

In [15]:
# quick look at the raw predictions (one value per row of X_test)
preds

array([0.02676913, 0.01692116, 1.9646041 , ..., 1.1928705 , 0.01639271,
       0.01639271], dtype=float32)

In [16]:
# attach predictions to the fixture rows by position
# NOTE(review): assumes the valid_idx rows are in the same order as
# remaining_season_df (sorted by gw, team) - confirm against create_lag_train
remaining_season_df = remaining_season_df.assign(predicted_points=preds)

In [17]:
# inspect the fixtures together with their new predicted_points column
remaining_season_df

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
91,15,Arsenal,Chelsea,Mesut_Özil,3,6.8,0.00,Özil,True,1.317380,1.85944,2021,0.0,0,9999-99-99,0.026769
92,15,Arsenal,Chelsea,Sokratis_Papastathopoulos,2,4.8,0.00,Sokratis,True,1.317380,1.85944,2021,0.0,0,9999-99-99,0.016921
93,15,Arsenal,Chelsea,David_Luiz Moreira Marinho,2,5.4,1.00,David Luiz,True,1.317380,1.85944,2021,67.5,0,9999-99-99,1.964604
94,15,Arsenal,Chelsea,Pierre-Emerick_Aubameyang,3,11.4,0.75,Aubameyang,True,1.317380,1.85944,2021,54.0,0,9999-99-99,1.745235
95,15,Arsenal,Chelsea,Cédric_Soares,2,4.6,1.00,Cédric,True,1.317380,1.85944,2021,1.0,0,9999-99-99,0.870770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7418,38,Wolverhampton Wanderers,Manchester United,Meritan_Shabani,3,4.5,1.00,Shabani,True,0.910778,1.69599,2021,0.0,0,9999-99-99,0.016393
7419,38,Wolverhampton Wanderers,Manchester United,Rayan_Ait Nouri,2,5.0,1.00,Ait Nouri,True,0.910778,1.69599,2021,15.6,0,9999-99-99,1.168844
7420,38,Wolverhampton Wanderers,Manchester United,Owen_Otasowie,3,4.5,1.00,Otasowie,True,0.910778,1.69599,2021,20.8,0,9999-99-99,1.192870
7421,38,Wolverhampton Wanderers,Manchester United,Theo_Corbeanu,3,4.5,1.00,Corbeanu,True,0.910778,1.69599,2021,0.0,0,9999-99-99,0.016393


In [18]:
# top 50 predicted scorers for the upcoming gameweek
next_gw_rows = remaining_season_df[remaining_season_df['gw'] == next_gw]
next_gw_rows.sort_values(by='predicted_points', ascending=False).head(50)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
261,15,Liverpool,West Bromwich Albion,Mohamed_Salah,3,12.5,1.0,Salah,True,2.473108,0.255967,2021,71.8,0,9999-99-99,6.822416
285,15,Manchester City,Newcastle United,Kevin_De Bruyne,3,11.8,1.0,De Bruyne,True,2.383177,0.602651,2021,90.0,0,9999-99-99,6.604663
7802,15,Tottenham Hotspur,Wolverhampton Wanderers,Harry_Kane,4,11.0,1.0,Kane,False,1.66946,0.910778,2021,90.0,0,9999-99-99,6.585641
256,15,Liverpool,West Bromwich Albion,Roberto_Firmino,4,9.3,1.0,Firmino,True,2.473108,0.255967,2021,83.2,0,9999-99-99,6.289239
289,15,Manchester City,Newcastle United,Raheem_Sterling,3,11.4,1.0,Sterling,True,2.383177,0.602651,2021,72.0,0,9999-99-99,6.164451
7804,15,Tottenham Hotspur,Wolverhampton Wanderers,Heung-Min_Son,3,9.7,1.0,Son,False,1.66946,0.910778,2021,88.6,0,9999-99-99,6.141796
227,15,Leicester City,Manchester United,Jamie_Vardy,4,10.2,1.0,Vardy,True,1.09154,1.69599,2021,89.4,0,9999-99-99,6.096792
258,15,Liverpool,West Bromwich Albion,Sadio_Mané,3,11.9,1.0,Mané,True,2.473108,0.255967,2021,70.4,0,9999-99-99,6.069838
7597,15,Southampton,Fulham,Che_Adams,4,6.1,1.0,Adams,False,0.535315,0.486303,2021,86.8,0,9999-99-99,5.876266
198,15,Leeds,Burnley,Patrick_Bamford,4,6.4,1.0,Bamford,True,0.439606,0.341918,2021,85.2,0,9999-99-99,5.747579


In [19]:
# archive the previous gameweek's predictions before predictions.csv is
# overwritten by the cell that writes the new file
last_gw = next_gw - 1
filename = 'history/2020-21/predictions_gw' + str(last_gw) + '.csv'

# pathlib instead of a shell command, so the same cell works on Windows
# (`move`) and Linux/macOS (`mv`) without editing
current_predictions = Path('predictions.csv')
archive_path = Path(filename)

if not current_predictions.exists():
    # nothing to archive, e.g. first run or the cell was already executed
    print('predictions.csv not found - nothing to archive')
elif archive_path.exists():
    # never clobber an existing archive; a re-run could otherwise replace
    # last week's file with the current predictions
    print(str(archive_path) + ' already exists - predictions.csv left in place')
else:
    # create the archive directory on demand, then move the file
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    current_predictions.replace(archive_path)
    print('moved predictions.csv to ' + str(archive_path))

        1 file(s) moved.


In [20]:
# write the new predictions to predictions.csv
## RENAME PREVIOUS WEEK FIRST (this overwrites any existing predictions.csv)
output_cols = [
    'gw', 'player', 'team', 'opponent_team',
    'position', 'price', 'play_proba', 'short_name',
    'was_home', 'predicted_points', 'minutes',
]
predictions = remaining_season_df[output_cols]
predictions.to_csv('predictions.csv')