In [7]:
# import required packages
# from fastai2.tabular.all import *
import xgboost as xgb
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from helpers import *

pd.options.display.max_columns = None

In [8]:
# path to project directory; the data files loaded below are read relative to it
path = Path('./')

In [9]:
# load the training set, forcing the identifier-like columns to be parsed as
# strings (so a season code such as '2021' is not read as a number)
string_columns = {'season': str, 'squad': str, 'comp': str}
train_df = pd.read_csv(path/'data/train_v7.csv', index_col=0, dtype=string_columns)

In [10]:
# (rows, columns) of the training data as loaded
train_df.shape

(100123, 58)

In [11]:
# modelling configuration: target season plus the feature / target column groups
season = '2021'
cat_vars = ['season', 'position', 'was_home']
cont_vars = ['gw', 'minutes']
dep_var = ['total_points']

# latest gameweek present for the target season
# (max() of an empty selection is NaN, i.e. the season has no rows yet)
season_mask = train_df['season'] == season
last_gw = train_df.loc[season_mask, 'gw'].max()

# next gameweek to predict; start from 1 when nothing exists for the season
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

18

In [12]:
# read in remaining_season.csv and sort by gameweek, then team
remaining_season_df = pd.read_csv(path/'data/remaining_season.csv', 
                                  index_col=0).sort_values(by=['gw', 'team'])

# fill the columns the training set expects with placeholder values
# (presumably these rows are fixtures not yet played, so there is no real
# points total or kickoff time - confirm against how the csv is generated)
remaining_season_df['season'] = season
remaining_season_df['total_points'] = 0
remaining_season_df['kickoff_time'] = '9999-99-99'

# stack the remaining-season rows underneath the training rows and renumber
# the index from 0. DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0; pd.concat is the drop-in equivalent.
# NOTE: this rebinds train_df, so re-running the cell without re-running the
# load cell appends the remaining-season rows a second time.
train_df = pd.concat([train_df, remaining_season_df], sort=False, ignore_index=True)

In [13]:
# add team-level, then player-level, lag features for total_points
# (both helpers return the augmented frame and the list of new column names)
lag_train_df, team_lag_vars = team_lag_features(train_df, ['total_points'], ['all', 3, 5, 10, 20])
lag_train_df, player_lag_vars = player_lag_features(lag_train_df, ['total_points'], ['all', 1, 2, 3, 4, 5, 10])

# NOTE(review): `re` is not imported in this notebook's import cell; presumably
# it arrives via `from helpers import *` - confirm
pattern = re.compile('total_points_team_pg_last_.*_opponent')

# columns describing the opponent's points per game
opponent_vars = [name for name in team_lag_vars if pattern.match(name)]

# matching "conceded" column names for the opponent
conceded_vars = [name.replace('team', 'team_conceded') for name in opponent_vars]

# keep the remaining team columns plus the conceded ones; drop every column
# that matches the opponent pattern
team_lag_vars = [name for name in team_lag_vars + conceded_vars if not pattern.match(name)]

In [15]:
# gw and season are treated as ordered categorical variables, so the category
# order is spelled out explicitly rather than taken from the data
ordered_gws = list(range(1,39))
ordered_seasons = ['1617', '1718', '1819', '1920', '2021']

# cast to category and impose the order, assigning the result back.
# Series.cat.set_categories(..., inplace=True) was deprecated in pandas 1.3 and
# removed in 2.0, so the in-place form fails on current pandas.
# Values not present in the category list become NaN.
lag_train_df['gw'] = (lag_train_df['gw']
                      .astype('category')
                      .cat.set_categories(ordered_gws, ordered=True))
lag_train_df['season'] = (lag_train_df['season']
                          .astype('category')
                          .cat.set_categories(ordered_seasons, ordered=True))

In [16]:
# build the combined train/predict frame: the validation window starts at the
# next gameweek of the 2021 season and runs through gameweek 38 inclusive
valid_season, valid_gw = '2021', next_gw
valid_len = 38 - next_gw + 1

lag_split = create_lag_train(
    lag_train_df,
    cat_vars, cont_vars,
    player_lag_vars, team_lag_vars, dep_var,
    valid_season, valid_gw, valid_len,
)

# the helper hands back the frame plus the row positions of each partition
train_predict_df, train_idx, valid_idx = lag_split

In [17]:
# take a look at the dataframe: column names, non-null counts and dtypes
train_predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113698 entries, 0 to 113697
Data columns (total 33 columns):
 #   Column                                           Non-Null Count   Dtype   
---  ------                                           --------------   -----   
 0   gw                                               113698 non-null  category
 1   minutes                                          113698 non-null  float64 
 2   minutes_last_1                                   113698 non-null  float64 
 3   minutes_last_10                                  113698 non-null  float64 
 4   minutes_last_2                                   113698 non-null  float64 
 5   minutes_last_3                                   113698 non-null  float64 
 6   minutes_last_4                                   113698 non-null  float64 
 7   minutes_last_5                                   113698 non-null  float64 
 8   minutes_last_all                                 113698 non-null  float64 
 9   oppo

In [18]:
# separate the model inputs from the dependent variable (independent copies,
# so later edits to X do not touch train_predict_df)
feature_cols = cat_vars + cont_vars + player_lag_vars + team_lag_vars
X = train_predict_df[feature_cols].copy()
y = train_predict_df[dep_var].copy()

In [19]:
# position is categorical, so it is turned into a string and DictVectorizer
# one-hot encodes it; season is turned into integer labels
enc = LabelEncoder()
X = X.assign(position=X['position'].apply(str),
             season=enc.fit_transform(X['season']))

# one dict per row, the input format DictVectorizer expects
X_dict = X.to_dict("records")

# dense output; one-hot column names are built as <feature>_<value>
dv = DictVectorizer(sparse=False, separator='_')
X_encoded = dv.fit_transform(X_dict)

# wrap the encoded array back into a frame with the vectorizer's feature names
X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)

In [20]:
# positional split: rows to fit on vs. rows to predict
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [22]:
# instantiate and train the XGB regressor
# hyperparameters are kept in one dict so they can be changed in a single place
model_params = {
    "gamma": 0.42,
    "learning_rate": 0.047,
    "max_depth": 4,
    "n_estimators": 171,
    "subsample": 0.6,
}

# squared-error objective; the remaining settings are unpacked from the dict
xg_reg = xgb.XGBRegressor(objective="reg:squarederror", **model_params)

# fit() returns the fitted estimator, so prediction can be chained
preds = xg_reg.fit(X_train, y_train).predict(X_test)

In [23]:
# inspect the raw prediction array
preds

array([ 1.2301207e-02, -1.2629926e-02,  1.1176634e+00, ...,
        1.7628074e-04,  1.7628074e-04,  5.8135092e-03], dtype=float32)

In [24]:
# attach predictions to the remaining-season rows by position
# NOTE(review): assumes the rows selected by valid_idx are in the same order
# (and number) as remaining_season_df - confirm create_lag_train preserves row order
remaining_season_df['predicted_points'] = preds

In [25]:
# display the remaining-season rows with the new predicted_points column
remaining_season_df

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
186,18,Arsenal,Crystal Palace,Mesut Özil,3,6.8,0.0,Özil,True,1.261632,0.474726,2021,0.0,0,9999-99-99,0.012301
187,18,Arsenal,Crystal Palace,Sokratis Papastathopoulos,2,4.8,0.0,Sokratis,True,1.261632,0.474726,2021,0.0,0,9999-99-99,-0.012630
188,18,Arsenal,Crystal Palace,David Luiz Moreira Marinho,2,5.4,1.0,David Luiz,True,1.261632,0.474726,2021,45.0,0,9999-99-99,1.117663
189,18,Arsenal,Crystal Palace,Pierre-Emerick Aubameyang,3,11.3,1.0,Aubameyang,True,1.261632,0.474726,2021,67.5,0,9999-99-99,3.890360
190,18,Arsenal,Crystal Palace,Cédric Soares,2,4.6,1.0,Cédric,True,1.261632,0.474726,2021,1.0,0,9999-99-99,1.039865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6776,38,Wolverhampton Wanderers,Manchester United,Luke Cundle,3,4.5,1.0,Cundle,True,0.950893,1.731115,2021,0.0,0,9999-99-99,-0.003113
6777,38,Wolverhampton Wanderers,Manchester United,Lewis Richards,2,4.0,1.0,Richards,True,0.950893,1.731115,2021,0.0,0,9999-99-99,0.000176
6778,38,Wolverhampton Wanderers,Manchester United,Christian Marques,2,4.0,1.0,Marques,True,0.950893,1.731115,2021,0.0,0,9999-99-99,0.000176
6779,38,Wolverhampton Wanderers,Manchester United,Andreas Söndergaard,1,4.0,1.0,Söndergaard,True,0.950893,1.731115,2021,0.0,0,9999-99-99,0.000176


In [26]:
# 50 highest predicted scorers for the upcoming gameweek
is_next_gw = remaining_season_df['gw'] == next_gw
(remaining_season_df[is_next_gw]
 .sort_values(by='predicted_points', ascending=False)
 .head(50))

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
131,18,Tottenham Hotspur,Fulham,Heung-Min Son,3,9.8,1.0,Son,True,1.670812,0.495697,2021,87.25,0,9999-99-99,7.73966
129,18,Tottenham Hotspur,Fulham,Harry Kane,4,11.0,1.0,Kane,True,1.670812,0.495697,2021,89.0,0,9999-99-99,7.075349
478,18,Manchester City,Brighton and Hove Albion,Raheem Sterling,3,11.4,0.75,Sterling,True,2.385107,0.549587,2021,67.5,0,9999-99-99,7.066947
474,18,Manchester City,Brighton and Hove Albion,Kevin De Bruyne,3,11.8,1.0,De Bruyne,True,2.385107,0.549587,2021,88.75,0,9999-99-99,6.637831
6794,18,Manchester United,Burnley,Bruno Miguel Borges Fernandes,3,11.3,1.0,Fernandes,False,1.731115,0.342195,2021,83.0,0,9999-99-99,6.423244
6795,18,Manchester United,Burnley,Anthony Martial,4,8.8,1.0,Martial,False,1.731115,0.342195,2021,73.8,0,9999-99-99,5.61629
6798,18,Manchester United,Burnley,Marcus Rashford,3,9.6,1.0,Rashford,False,1.731115,0.342195,2021,85.0,0,9999-99-99,5.308924
419,18,Wolverhampton Wanderers,Everton,Pedro Lomba Neto,3,6.0,1.0,Neto,True,0.950893,1.145976,2021,85.4,0,9999-99-99,5.073385
124,18,Tottenham Hotspur,Fulham,Hugo Lloris,1,5.6,1.0,Lloris,True,1.670812,0.495697,2021,90.0,0,9999-99-99,5.033155
473,18,Manchester City,Brighton and Hove Albion,Ilkay Gündogan,3,5.4,1.0,Gündogan,True,2.385107,0.549587,2021,90.0,0,9999-99-99,5.014047


In [27]:
# archive the previous week's predictions file before a new one is written
# generate previous week's filename
last_gw = next_gw - 1
filename = 'history/2020-21/predictions_gw' + str(last_gw) + '.csv'

# pathlib replaces the shell `move` / `mv` commands so the same cell runs on
# Windows and Linux without editing
current_file = Path('predictions.csv')
archive_file = Path(filename)

if not current_file.exists():
    # nothing to archive; carry on, as the shell command did when it failed
    print('predictions.csv not found - nothing to archive')
elif archive_file.exists():
    # re-running the notebook within the same gameweek would otherwise replace
    # the archived file with the current week's predictions
    print(str(archive_file) + ' already exists - not overwriting')
else:
    current_file.rename(archive_file)
    print('moved predictions.csv to ' + str(archive_file))

        1 file(s) moved.


In [28]:
# write the new predictions to predictions.csv
## RENAME PREVIOUS WEEK FIRST
output_columns = [
    'gw', 'player', 'team', 'opponent_team',
    'position', 'price', 'play_proba', 'short_name',
    'was_home', 'predicted_points', 'minutes',
]
predictions = remaining_season_df[output_columns]
predictions.to_csv('predictions.csv')