In [1]:
# import required packages
# from fastai2.tabular.all import *
import xgboost as xgb
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from helpers import *

# show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)

In [2]:
# project root: the current working directory
path = Path('.')

In [7]:
# load the training set; season, squad and comp are forced to str so that
# labels such as '1617' are not parsed as numbers
train_df = pd.read_csv(
    path / 'data/train_v7.csv',
    index_col=0,
    dtype=dict.fromkeys(['season', 'squad', 'comp'], str),
)

In [8]:
# dimensions of the loaded training set as (rows, columns)
train_df.shape

(99563, 60)

In [9]:
# model configuration: target season plus the categorical, continuous and
# dependent columns
season = '2021'
cat_vars = ['season', 'position', 'was_home']
cont_vars = ['gw', 'minutes']
dep_var = ['total_points']

# latest gameweek recorded for the target season (NaN when it has no rows yet)
last_gw = train_df.loc[train_df['season'] == season, 'gw'].max()

# predictions start at the gameweek after the last recorded one,
# or at gameweek 1 when the season has not started
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

17

In [10]:
# read the remaining fixtures and order them by gameweek, then team
remaining_season_df = pd.read_csv(path/'data/remaining_season.csv',
                                  index_col=0).sort_values(by=['gw', 'team'])

# placeholder values for fixtures that have not been played yet
remaining_season_df['season'] = season
remaining_season_df['total_points'] = 0
# NOTE(review): '9999-99-99' looks like a sentinel meant to sort after every
# real kickoff time — confirm against the lag helpers
remaining_season_df['kickoff_time'] = '9999-99-99'

# append the remaining season to the end of the training set.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and produces the same stacked frame.
# NOTE: this rebinds train_df, so re-running this cell on its own appends the
# fixtures a second time — re-run from the cell that loads train_df instead.
train_df = pd.concat([train_df, remaining_season_df], sort=False).reset_index(drop=True)

In [118]:
# add team and player lag features
# both helpers come from helpers.py (not visible here); each call returns the
# frame plus a list of variable names that later cells use as model columns
# (an earlier configuration used total_points only, with windows
#  ['all', 1, 2, 3, 4, 5, 10] for both teams and players)

# team-level lags of total_points and xg for windows 5, 10 and 20
lag_train_df, team_lag_vars = team_lag_features(
    train_df,
    ['total_points', 'xg'],
    [5, 10, 20],
)

# player-level lags of total_points for 'all' and windows 3, 5, 10, 20 and 38
lag_train_df, player_lag_vars = player_lag_features(
    lag_train_df,
    ['total_points'],
    ['all', 3, 5, 10, 20, 38],
)

In [119]:
# manually add the team "conceded" counterparts of the xg lag variables.
# Names that already contain 'team_conceded' are skipped and names already in
# the list are not added again, so re-running this cell neither duplicates
# entries nor creates 'team_conceded_conceded_*' names.
conceded_xg_vars = [x.replace('team', 'team_conceded')
                    for x in team_lag_vars
                    if 'xg' in x and 'team_conceded' not in x]
# += extends the existing list in place, as before
team_lag_vars += [x for x in conceded_xg_vars if x not in team_lag_vars]

In [120]:
# gw and season are stored as ordered categorical variables,
# so we need lists holding the categories in their intended order
ordered_gws = list(range(1,39))
ordered_seasons = ['1617', '1718', '1819', '1920', '2021']

# convert to category and apply the explicit ordering.
# set_categories(..., inplace=True) was deprecated in pandas 1.3 and removed in
# 2.0, so the re-categorised Series is assigned back to the column instead.
lag_train_df['gw'] = (lag_train_df['gw']
                      .astype('category')
                      .cat.set_categories(ordered_gws, ordered=True))
lag_train_df['season'] = (lag_train_df['season']
                          .astype('category')
                          .cat.set_categories(ordered_seasons, ordered=True))

In [121]:
# validation window: starts at next_gw of season '2021' and spans every
# gameweek up to and including gameweek 38
valid_season = '2021'
valid_gw = next_gw
valid_len = 38 - next_gw + 1

# build the modelling frame plus the train / validation row indexers
# (create_lag_train comes from helpers.py and is not visible here)
train_predict_df, train_idx, valid_idx = create_lag_train(
    lag_train_df,
    cat_vars, cont_vars,
    player_lag_vars, team_lag_vars, dep_var,
    valid_season, valid_gw, valid_len,
)

In [122]:
# summarise the modelling frame: column names, non-null counts and dtypes
train_predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113433 entries, 0 to 113432
Data columns (total 39 columns):
 #   Column                                 Non-Null Count   Dtype   
---  ------                                 --------------   -----   
 0   gw                                     113433 non-null  category
 1   minutes                                113433 non-null  float64 
 2   minutes_last_10                        113433 non-null  float64 
 3   minutes_last_20                        113433 non-null  float64 
 4   minutes_last_3                         113433 non-null  float64 
 5   minutes_last_38                        113433 non-null  float64 
 6   minutes_last_5                         113433 non-null  float64 
 7   minutes_last_all                       113433 non-null  float64 
 8   opponent_team                          113433 non-null  object  
 9   player                                 113433 non-null  object  
 10  position                               11343

In [123]:
# feature matrix: a copy of every categorical, continuous and lag column
X = train_predict_df[cat_vars + cont_vars + player_lag_vars + team_lag_vars].copy()
# target: a copy of the dependent variable column(s)
y = train_predict_df[dep_var].copy()

In [124]:
# position is a category label rather than a quantity, so convert it to a
# string before vectorising
X['position'] = X['position'].apply(str)

# replace each season label with an integer code
enc = LabelEncoder()
X['season'] = enc.fit_transform(X['season'])
# one dict per row, which is the input format DictVectorizer takes
X_dict = X.to_dict("records")

# create the DictVectorizer: dense output, '_' as the separator used when
# building feature names for string-valued features
dv = DictVectorizer(sparse=False, separator='_')

# fit the vectorizer on the row dicts and encode them in one step
X_encoded = dv.fit_transform(X_dict)

# wrap the encoded array in a DataFrame labelled with the vectorizer's feature names
X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)

In [125]:
# rows at train_idx are used for fitting; rows at valid_idx are the ones to predict
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [126]:
# instantiate and train the XGBoost regressor
# (an earlier configuration used gamma=0.42, learning_rate=0.047, max_depth=4,
#  n_estimators=171, subsample=0.6)
xg_reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    gamma=0.34,
    learning_rate=0.088,
    max_depth=5,
    n_estimators=55,
    subsample=0.78,
)

# fit on the training rows, then predict the held-out rows
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)

In [127]:
# inspect the raw predictions returned by the regressor
preds

array([ 5.8003366e-03,  1.0671347e-02,  1.0525670e+00, ...,
       -5.3465366e-04, -5.3465366e-04, -5.3465366e-04], dtype=float32)

In [128]:
# attach the predictions as a new column; preds is a NumPy array, so values are
# assigned by position rather than aligned on the index
# NOTE(review): assumes the rows selected by valid_idx are in the same order as
# remaining_season_df (sorted by gw, team) — confirm against create_lag_train
remaining_season_df['predicted_points'] = preds

In [129]:
# display the remaining fixtures together with their predicted points
remaining_season_df

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
7370,17,Arsenal,West Bromwich Albion,Mesut Özil,3,6.8,0.00,Özil,False,1.316625,0.255820,2021,0.00,0,9999-99-99,0.005800
7371,17,Arsenal,West Bromwich Albion,Sokratis Papastathopoulos,2,4.8,0.00,Sokratis,False,1.316625,0.255820,2021,0.00,0,9999-99-99,0.010671
7372,17,Arsenal,West Bromwich Albion,David Luiz Moreira Marinho,2,5.4,0.75,David Luiz,False,1.316625,0.255820,2021,33.75,0,9999-99-99,1.052567
7373,17,Arsenal,West Bromwich Albion,Pierre-Emerick Aubameyang,3,11.4,1.00,Aubameyang,False,1.316625,0.255820,2021,67.50,0,9999-99-99,4.255922
7374,17,Arsenal,West Bromwich Albion,Cédric Soares,2,4.6,1.00,Cédric,False,1.316625,0.255820,2021,1.00,0,9999-99-99,0.931585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6924,38,Wolverhampton Wanderers,Manchester United,Theo Corbeanu,3,4.5,1.00,Corbeanu,True,0.910256,1.695018,2021,0.00,0,9999-99-99,-0.000535
6925,38,Wolverhampton Wanderers,Manchester United,Taylor Perry,3,4.5,1.00,Perry,True,0.910256,1.695018,2021,0.00,0,9999-99-99,-0.000535
6926,38,Wolverhampton Wanderers,Manchester United,Luke Cundle,3,4.5,1.00,Cundle,True,0.910256,1.695018,2021,0.00,0,9999-99-99,-0.000535
6927,38,Wolverhampton Wanderers,Manchester United,Lewis Richards,2,4.0,1.00,Richards,True,0.910256,1.695018,2021,0.00,0,9999-99-99,-0.000535


In [None]:
# top 50 predicted scorers for the next gameweek
# NOTE(review): this cell has no execution count and repeats the query of the
# executed cell that follows it — likely a leftover duplicate that can be removed
remaining_season_df[remaining_season_df['gw'] == next_gw].sort_values(by='predicted_points', ascending=False).head(50)

In [130]:
# top 50 predicted scorers for the next gameweek, highest prediction first
(
    remaining_season_df
    .loc[remaining_season_df['gw'] == next_gw]
    .sort_values(by='predicted_points', ascending=False)
    .head(50)
)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
7288,17,Leicester City,Newcastle United,Jamie Vardy,4,10.2,1.0,Vardy,False,1.090914,0.602306,2021,76.2,0,9999-99-99,8.645499
7322,17,Liverpool,Southampton,Mohamed Salah,3,12.6,1.0,Salah,False,2.471691,0.535009,2021,77.2,0,9999-99-99,6.898355
7319,17,Liverpool,Southampton,Sadio Mané,3,11.9,1.0,Mané,False,2.471691,0.535009,2021,83.2,0,9999-99-99,6.638753
407,17,Tottenham Hotspur,Leeds,Heung-Min Son,3,9.7,1.0,Son,True,1.668504,0.439354,2021,87.25,0,9999-99-99,5.716894
7327,17,Liverpool,Southampton,Trent Alexander-Arnold,2,7.2,1.0,Alexander-Arnold,False,2.471691,0.535009,2021,85.4,0,9999-99-99,5.636406
196,17,Burnley,Fulham,Chris Wood,4,6.2,1.0,Wood,True,0.341722,0.486024,2021,84.0,0,9999-99-99,5.481269
7173,17,Manchester City,Chelsea,Raheem Sterling,3,11.4,1.0,Sterling,False,2.381811,1.858374,2021,90.0,0,9999-99-99,5.403155
251,17,Crystal Palace,Sheffield United,Christian Benteke,4,5.5,1.0,Benteke,True,0.47407,0.386932,2021,84.75,0,9999-99-99,5.393681
282,17,Everton,West Ham United,Jordan Pickford,1,4.8,1.0,Pickford,True,1.144393,0.666121,2021,67.5,0,9999-99-99,5.34807
7169,17,Manchester City,Chelsea,Kevin De Bruyne,3,11.7,1.0,De Bruyne,False,2.381811,1.858374,2021,90.0,0,9999-99-99,5.31653


In [131]:
# archive the previous week's predictions file before a new one is written.
# Done with pathlib instead of a shell command so the same cell works on
# Windows, Linux and macOS.

# generate previous week's filename
last_gw = next_gw - 1
filename = 'history/2020-21/predictions_gw' + str(last_gw) + '.csv'

current_file = Path('predictions.csv')
archive_file = Path(filename)

if not current_file.exists():
    print('predictions.csv not found - nothing to archive')
elif archive_file.exists():
    # on a re-run predictions.csv already holds the current week's output;
    # moving it would overwrite the archived week with the wrong data
    print(filename + ' already exists - predictions.csv left in place')
else:
    # create the history folder on first use, then move the file
    archive_file.parent.mkdir(parents=True, exist_ok=True)
    current_file.rename(archive_file)
    print('moved predictions.csv to ' + filename)

        1 file(s) moved.


In [132]:
# write the prediction columns to predictions.csv
## RENAME PREVIOUS WEEK FIRST (the file is overwritten here)
predictions = remaining_season_df.loc[:, [
    'gw', 'player', 'team', 'opponent_team',
    'position', 'price', 'play_proba', 'short_name',
    'was_home', 'predicted_points', 'minutes',
]]
predictions.to_csv('predictions.csv')