In [1]:
# import required packages
# from fastai2.tabular.all import *
import xgboost as xgb
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from helpers import *

# show every column when a wide DataFrame is displayed (None lifts the column limit)
pd.options.display.max_columns = None

In [2]:
# project root directory; data files are loaded relative to it
path = Path('.')

In [3]:
# load the training dataset; season is forced to str so its codes are not parsed as numbers
train_csv = path / 'data' / 'train_v5.csv'
train_df = pd.read_csv(train_csv, index_col=0, dtype={'season': str})

In [4]:
# preview the last rows of the training data
train_df.tail()

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,own_goals,penalties_missed,penalties_saved,red_cards,saves,selected,team_a_score,team_h_score,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
93686,Yves_Bissouma,6,3,90,Brighton and Hove Albion,West Bromwich Albion,0.517388,0.238855,True,2,0,0,9,0,3.3,1,0,0.8,4.2,0,0,0,0,0,275399,1,1,0.0,14690,38217,23527,0,2020-10-26T17:30:00Z,2021,1.0,,
93687,Zack_Steffen,6,1,0,Manchester City,West Ham United,2.421052,0.645502,False,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0,0,1519,1,1,0.0,-27,105,132,0,2020-10-24T11:30:00Z,2021,1.0,,
93688,Zeze Steven_Sessegnon,6,2,0,Fulham,Crystal Palace,0.490375,0.50322,True,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0,0,4005,2,1,0.0,-299,21,320,0,2020-10-24T14:00:00Z,2021,0.0,,
93689,Çaglar_Söyüncü,6,2,0,Leicester City,Arsenal,1.051364,1.416652,False,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0,0,408583,1,0,0.0,-190759,254,191013,0,2020-10-25T19:15:00Z,2021,0.0,,
93690,Ørjan_Nyland,6,1,0,Aston Villa,Leeds,0.698967,0.40649,True,0,0,0,0,0,0.0,0,0,0.0,0.0,0,0,0,0,0,739239,3,0,0.0,-97731,608,98339,0,2020-10-23T19:00:00Z,2021,0.0,,


In [5]:
# (rows, columns) of the training data
train_df.shape

(93691, 37)

In [6]:
# model configuration: current season plus the feature / target column groups
season = '2021'
cat_vars = ['season', 'position', 'was_home']
cont_vars = ['gw', 'minutes']
dep_var = ['total_points']

# latest gameweek present for the current season (NaN when the season has no rows yet)
current_season_rows = train_df['season'] == season
last_gw = train_df.loc[current_season_rows, 'gw'].max()

# predictions start at the following gameweek, or at gameweek 1 if nothing has been played
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

7

In [7]:
# read in remaining_season.csv and sort by gameweek, then team
remaining_season_df = pd.read_csv(path/'data/remaining_season.csv',
                                  index_col=0).sort_values(by=['gw', 'team'])

# placeholder values so the future fixtures carry the same key columns as the training rows
remaining_season_df['season'] = season
remaining_season_df['total_points'] = 0
# sentinel kickoff time for unplayed fixtures
# NOTE(review): presumably chosen to sort after every real timestamp - confirm in helpers
remaining_season_df['kickoff_time'] = '9999-99-99'

# append remaining season to end of training set
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0
# NOTE: re-running this cell appends the fixtures again; re-run the train_df load cell first
train_df = pd.concat([train_df, remaining_season_df], sort=False).reset_index(drop=True)

In [8]:
# add team-level, then player-level, lag features of total_points for each listed window
# each call gets its own copy of the window list
lag_windows = ['all', 1, 2, 3, 4, 5, 10]
lag_train_df, team_lag_vars = team_lag_features(train_df, ['total_points'], list(lag_windows))
lag_train_df, player_lag_vars = player_lag_features(lag_train_df, ['total_points'], list(lag_windows))

In [9]:
# we want to set gw and season as ordered categorical variables
# need lists with ordered categories
ordered_gws = list(range(1,39))
ordered_seasons = ['1617', '1718', '1819', '1920', '2021']

# set as categories with correct order
# astype(CategoricalDtype) replaces cat.set_categories(..., inplace=True), whose inplace
# argument was deprecated in pandas 1.3 and removed in 2.0; values outside the listed
# categories become NaN, exactly as with set_categories
gw_dtype = pd.CategoricalDtype(categories=ordered_gws, ordered=True)
season_dtype = pd.CategoricalDtype(categories=ordered_seasons, ordered=True)

lag_train_df['gw'] = lag_train_df['gw'].astype(gw_dtype)
lag_train_df['season'] = lag_train_df['season'].astype(season_dtype)

In [10]:
# build the modelling frame: the validation window covers every gameweek still to be played
valid_season = '2021'
valid_gw = next_gw
valid_len = 38 - valid_gw + 1  # gameweeks valid_gw..38 inclusive

(train_predict_df,
 train_idx,
 valid_idx) = create_lag_train(lag_train_df, cat_vars, cont_vars,
                               player_lag_vars, team_lag_vars, dep_var,
                               valid_season, valid_gw, valid_len)

In [13]:
# sanity check: the last (up to 50) rows for a single player, including the lag columns
train_predict_df[train_predict_df['player'] == 'Kevin_De Bruyne'].tail(50)

Unnamed: 0,gw,minutes,minutes_last_1,minutes_last_10,minutes_last_2,minutes_last_3,minutes_last_4,minutes_last_5,minutes_last_all,opponent_team,player,position,season,team,total_points,total_points_pg_last_1,total_points_pg_last_10,total_points_pg_last_2,total_points_pg_last_3,total_points_pg_last_4,total_points_pg_last_5,total_points_pg_last_all,total_points_team_pg_last_1,total_points_team_pg_last_10,total_points_team_pg_last_10_opponent,total_points_team_pg_last_1_opponent,total_points_team_pg_last_2,total_points_team_pg_last_2_opponent,total_points_team_pg_last_3,total_points_team_pg_last_3_opponent,total_points_team_pg_last_4,total_points_team_pg_last_4_opponent,total_points_team_pg_last_5,total_points_team_pg_last_5_opponent,total_points_team_pg_last_all,total_points_team_pg_last_all_opponent,was_home
83051,27,90.0,78.0,836.0,168.0,258.0,348.0,410.0,8976.0,Leicester City,Kevin_De Bruyne,3,1920,Manchester City,3,16.153846,8.397129,8.571429,8.023256,6.465517,7.463415,6.437166,72.0,51.0,40.0,50.0,42.0,42.5,46.666667,48.666667,44.25,43.0,50.8,41.0,54.928571,40.842857,False
84180,29,0.0,90.0,836.0,168.0,258.0,348.0,438.0,9066.0,Manchester United,Kevin_De Bruyne,3,1920,Manchester City,0,3.0,6.674641,9.107143,6.627907,6.724138,5.753425,6.403044,50.0,48.8,49.1,35.0,61.0,54.0,44.666667,60.666667,47.5,58.5,45.4,50.6,54.893617,45.535211,False
84881,30,69.0,0.0,746.0,90.0,168.0,258.0,348.0,9066.0,Arsenal,Kevin_De Bruyne,3,1920,Manchester City,14,,6.997319,3.0,9.107143,6.627907,6.724138,6.403044,17.0,45.6,45.2,60.0,33.5,54.5,46.333333,63.0,37.75,58.75,41.4,54.4,54.626761,45.584507,True
84882,30,29.0,69.0,749.0,69.0,159.0,237.0,327.0,9135.0,Burnley,Kevin_De Bruyne,3,1920,Manchester City,1,18.26087,8.050734,18.26087,9.622642,11.772152,9.082569,6.492611,81.0,51.2,40.5,31.0,49.0,39.5,49.333333,50.333333,55.0,47.25,46.4,47.0,54.811189,37.335664,True
85576,31,90.0,29.0,688.0,98.0,98.0,188.0,266.0,9164.0,Chelsea,Kevin_De Bruyne,3,1920,Manchester City,8,3.103448,7.063953,13.77551,13.77551,8.617021,10.827068,6.481886,93.0,53.8,45.0,45.0,87.0,64.5,63.666667,56.0,60.25,53.0,62.6,45.6,55.076389,49.305556,False
86221,32,90.0,90.0,688.0,119.0,188.0,188.0,278.0,9254.0,Liverpool,Kevin_De Bruyne,3,1920,Manchester City,14,8.0,7.848837,6.806723,11.010638,11.010638,8.417266,6.49665,29.0,52.4,58.9,91.0,61.0,69.5,67.666667,61.666667,55.0,51.25,54.0,50.6,54.896552,54.744828,True
86872,33,31.0,90.0,716.0,180.0,209.0,278.0,278.0,9344.0,Southampton,Kevin_De Bruyne,3,1920,Manchester City,1,14.0,8.170391,11.0,9.904306,11.978417,11.978417,6.568921,82.0,52.9,40.8,48.0,55.5,33.0,68.0,48.333333,71.25,43.75,60.4,40.4,55.082192,37.315068,False
87525,34,90.0,31.0,657.0,121.0,211.0,240.0,309.0,9375.0,Newcastle United,Kevin_De Bruyne,3,1920,Manchester City,7,2.903226,8.767123,11.157025,9.810427,9.0,11.067961,6.5568,21.0,51.3,46.1,41.0,51.5,55.0,44.0,50.333333,56.25,58.5,61.2,58.4,54.85034,39.055046,True
88178,35,63.0,90.0,657.0,121.0,211.0,301.0,330.0,9465.0,Brighton and Hove Albion,Kevin_De Bruyne,3,1920,Manchester City,3,7.0,8.767123,5.950413,9.383886,8.9701,8.454545,6.561014,83.0,54.0,36.4,28.0,52.0,45.5,62.0,37.0,53.75,38.25,61.6,39.2,55.040541,35.5,False
88837,36,0.0,63.0,630.0,153.0,184.0,274.0,364.0,9528.0,Bournemouth,Kevin_De Bruyne,3,1920,Manchester City,0,4.285714,9.285714,5.882353,5.380435,8.211679,8.159341,6.54597,94.0,62.2,29.8,50.0,88.5,49.0,66.0,42.333333,70.0,37.5,61.8,34.6,55.302013,37.261745,True


In [14]:
# take a look at the dataframe: column dtypes and non-null counts
train_predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112692 entries, 0 to 112691
Data columns (total 37 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   gw                                      112692 non-null  category
 1   minutes                                 112692 non-null  float64 
 2   minutes_last_1                          112692 non-null  float64 
 3   minutes_last_10                         112692 non-null  float64 
 4   minutes_last_2                          112692 non-null  float64 
 5   minutes_last_3                          112692 non-null  float64 
 6   minutes_last_4                          112692 non-null  float64 
 7   minutes_last_5                          112692 non-null  float64 
 8   minutes_last_all                        112692 non-null  float64 
 9   opponent_team                           112692 non-null  object  
 10  player                          

In [15]:
# separate the model inputs (X) from the dependent variable (y); both are independent copies
feature_cols = cat_vars + cont_vars + player_lag_vars + team_lag_vars
X = train_predict_df[feature_cols].copy()
y = train_predict_df[dep_var].copy()

In [16]:
# position is a label rather than a quantity, so store it as a string
X['position'] = X['position'].map(str)

# replace each season label with an integer code
enc = LabelEncoder()
X['season'] = enc.fit_transform(X['season'])

# DictVectorizer works on one dict per row; '_' joins feature name and value in generated columns
dv = DictVectorizer(sparse=False, separator='_')
X_dict = X.to_dict(orient="records")
X_encoded = dv.fit_transform(X_dict)

# back to a DataFrame, labelled with the vectorizer's feature names
X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)

In [17]:
# positional split using the indices returned by create_lag_train:
# train_idx rows are fitted on, valid_idx rows are predicted
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [18]:
# instantiate and train the XGB regressor, then predict the held-out rows
xgb_params = dict(
    objective="reg:squarederror",
    gamma=0.42,
    learning_rate=0.047,
    max_depth=4,
    n_estimators=171,
    subsample=0.6,
)
xg_reg = xgb.XGBRegressor(**xgb_params)

xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)

In [19]:
# raw model output for the prediction rows
preds

array([-3.5387278e-03,  1.0962486e-03,  2.0101147e+00, ...,
       -6.4302683e-03, -6.4302683e-03, -6.4302683e-03], dtype=float32)

In [20]:
# attach the predictions positionally to the fixture rows
# NOTE(review): assumes preds is in the same row order as remaining_season_df
# (sorted by gw, team when loaded) - confirm against create_lag_train / valid_idx
remaining_season_df['predicted_points'] = preds

In [21]:
# fixtures with their predicted points (pandas truncates the display)
remaining_season_df

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
9698,7,Arsenal,Manchester United,Mesut_Özil,3,6.8,0.00,Özil,False,1.416652,1.793036,2021,0.000,0,9999-99-99,-0.003539
9699,7,Arsenal,Manchester United,Sokratis_Papastathopoulos,2,4.9,0.00,Sokratis,False,1.416652,1.793036,2021,0.000,0,9999-99-99,0.001096
9700,7,Arsenal,Manchester United,David_Luiz Moreira Marinho,2,5.5,0.75,David Luiz,False,1.416652,1.793036,2021,59.625,0,9999-99-99,2.010115
9701,7,Arsenal,Manchester United,Pierre-Emerick_Aubameyang,3,11.7,1.00,Aubameyang,False,1.416652,1.793036,2021,90.000,0,9999-99-99,5.128233
9702,7,Arsenal,Manchester United,Cédric_Soares,2,4.7,1.00,Cédric,False,1.416652,1.793036,2021,0.000,0,9999-99-99,-0.003539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9490,38,Wolverhampton Wanderers,Manchester United,Ki-Jana_Hoever,2,4.5,1.00,Hoever,True,0.869561,1.793036,2021,7.250,0,9999-99-99,1.047197
9491,38,Wolverhampton Wanderers,Manchester United,Nélson_Cabral Semedo,2,5.5,1.00,Semedo,True,0.869561,1.793036,2021,82.250,0,9999-99-99,3.744982
9492,38,Wolverhampton Wanderers,Manchester United,Meritan_Shabani,3,4.5,1.00,Shabani,True,0.869561,1.793036,2021,0.000,0,9999-99-99,-0.006430
9493,38,Wolverhampton Wanderers,Manchester United,Rayan_Ait Nouri,2,5.0,1.00,Ait Nouri,True,0.869561,1.793036,2021,0.000,0,9999-99-99,-0.006430


In [22]:
# top 50 predicted scorers for the upcoming gameweek
next_gw_rows = remaining_season_df[remaining_season_df['gw'] == next_gw]
next_gw_rows.sort_values(by='predicted_points', ascending=False).head(50)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,short_name,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,total_points,kickoff_time,predicted_points
9769,7,Manchester City,Sheffield United,Raheem_Sterling,3,11.6,1.0,Sterling,False,2.421052,,2021,87.75,0,9999-99-99,7.642272
301,7,Tottenham Hotspur,Brighton and Hove Albion,Harry_Kane,4,10.9,1.0,Kane,True,1.655619,0.517388,2021,88.6,0,9999-99-99,7.203769
303,7,Tottenham Hotspur,Brighton and Hove Albion,Heung-Min_Son,3,9.5,1.0,Son,True,1.655619,0.517388,2021,76.0,0,9999-99-99,7.13995
189,7,Liverpool,West Ham United,Mohamed_Salah,3,12.4,1.0,Salah,True,2.421052,0.645502,2021,90.0,0,9999-99-99,6.779417
9765,7,Manchester City,Sheffield United,Kevin_De Bruyne,3,11.5,1.0,De Bruyne,False,2.421052,,2021,90.0,0,9999-99-99,6.708321
184,7,Liverpool,West Ham United,Roberto_Firmino,4,9.3,1.0,Firmino,True,2.421052,0.645502,2021,80.2,0,9999-99-99,6.344937
9649,7,Leicester City,Leeds,Jamie_Vardy,4,10.0,1.0,Vardy,False,1.051364,0.40649,2021,87.0,0,9999-99-99,6.079337
186,7,Liverpool,West Ham United,Sadio_Mané,3,12.0,1.0,Mané,True,2.421052,0.645502,2021,64.75,0,9999-99-99,6.067736
9603,7,Chelsea,Burnley,Christian_Pulisic,3,8.3,1.0,Pulisic,False,1.880575,0.34092,2021,83.0,0,9999-99-99,5.890337
156,7,Leeds,Leicester City,Patrick_Bamford,4,5.9,1.0,Bamford,True,0.40649,1.051364,2021,85.8,0,9999-99-99,5.7782


In [23]:
# rename previous week's predictions file
# generate previous week's filename
last_gw = next_gw - 1
filename = 'history/2020-21/predictions_gw' + str(last_gw) + '.csv'

# pathlib replaces the shell `move` / `mv` so the cell runs unchanged on Windows, Linux and macOS
current_predictions = Path('predictions.csv')
archive_path = Path(filename)
if current_predictions.exists():
    # create the history folder on first use; replace() overwrites like the shell commands did
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    current_predictions.replace(archive_path)
    print(f'moved {current_predictions} -> {archive_path}')
else:
    # nothing to archive (e.g. first run of the season) - keep going, as the shell version did
    print('predictions.csv not found, nothing to archive')

        1 file(s) moved.


In [24]:
# write to predictions.csv
## RENAME PREVIOUS WEEK FIRST (otherwise last week's file is overwritten)
output_cols = [
    'gw', 'player', 'team', 'opponent_team',
    'position', 'price', 'play_proba', 'short_name',
    'was_home', 'predicted_points', 'minutes',
]
predictions = remaining_season_df[output_cols]
predictions.to_csv('predictions.csv')

In [108]:
# debug: call the validation-window helper directly with the current settings
# NOTE(review): the returned pair looks like (start, end) row positions - confirm in helpers
validation_gw_idx(lag_train_df, valid_season, valid_gw, valid_len)

(91401, 111239)

In [51]:
# debug: smallest index label among rows of the validation gameweek and season
# (a NaN result means no row matched the filter)
lag_train_df[(lag_train_df['gw'] == valid_gw) & (lag_train_df['season'] == valid_season)].index.min()

nan

In [55]:
# debug: last rows of the lag-feature frame
lag_train_df.tail()

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,own_goals,penalties_missed,penalties_saved,red_cards,saves,selected,team_a_score,team_h_score,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season,total_points_team,total_points_team_last_all,total_points_team_pg_last_all,total_points_team_last_1,total_points_team_pg_last_1,total_points_team_last_2,total_points_team_pg_last_2,total_points_team_last_3,total_points_team_pg_last_3,total_points_team_last_4,total_points_team_pg_last_4,total_points_team_last_5,total_points_team_pg_last_5,total_points_team_last_10,total_points_team_pg_last_10,total_points_team_opponent,total_points_team_last_all_opponent,total_points_team_pg_last_all_opponent,total_points_team_last_1_opponent,total_points_team_pg_last_1_opponent,total_points_team_last_2_opponent,total_points_team_pg_last_2_opponent,total_points_team_last_3_opponent,total_points_team_pg_last_3_opponent,total_points_team_last_4_opponent,total_points_team_pg_last_4_opponent,total_points_team_last_5_opponent,total_points_team_pg_last_5_opponent,total_points_team_last_10_opponent,total_points_team_pg_last_10_opponent,minutes_last_all,minutes_last_1,minutes_last_2,minutes_last_3,minutes_last_4,minutes_last_5,minutes_last_10,total_points_last_all,total_points_pg_last_all,total_points_last_1,total_points_pg_last_1,total_points_last_2,total_points_pg_last_2,total_points_last_3,total_points_pg_last_3,total_points_last_4,total_points_pg_last_4,total_points_last_5,total_points_pg_last_5,total_points_last_10,total_points_pg_last_10,price
150913,Fabio_Silva,38,4,6.0,Wolverhampton Wanderers,Manchester United,0.798432,1.840445,True,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.5
150914,Fernando_Marçal,38,2,36.375,Wolverhampton Wanderers,Manchester United,0.798432,1.840445,True,,,,,,,,,,,,,,,,,,,,,,,,,,0.75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0
150915,Vitor_Ferreira,38,3,0.0,Wolverhampton Wanderers,Manchester United,0.798432,1.840445,True,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0
150916,Ki-Jana_Hoever,38,2,0.0,Wolverhampton Wanderers,Manchester United,0.798432,1.840445,True,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.5
150917,Nélson_Cabral Semedo,38,2,0.0,Wolverhampton Wanderers,Manchester United,0.798432,1.840445,True,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.5


In [6]:
# parameters for model: current season, categorical / continuous inputs and the target
season = '2021'
cat_vars = [
    'gw',
    'season',
    'position',
    'team',
    'opponent_team',
    'was_home',
]
cont_vars = [
    'minutes',
    'relative_market_value_team_season',
    'relative_market_value_opponent_team_season',
]
dep_var = ['total_points']

In [7]:
# gw and season will be ordered categoricals, so spell out their category order:
# gameweeks 1..38, and seasons from oldest to newest
ordered_gws = [*range(1, 39)]
ordered_seasons = [
    '1617',
    '1718',
    '1819',
    '1920',
    '2021',
]

In [8]:
# set as categories with correct order
# astype(CategoricalDtype) replaces cat.set_categories(..., inplace=True), whose inplace
# argument was deprecated in pandas 1.3 and removed in 2.0; values outside the listed
# categories become NaN, exactly as with set_categories
gw_dtype = pd.CategoricalDtype(categories=ordered_gws, ordered=True)
season_dtype = pd.CategoricalDtype(categories=ordered_seasons, ordered=True)

train_df['gw'] = train_df['gw'].astype(gw_dtype)
train_df['season'] = train_df['season'].astype(season_dtype)

In [9]:
# check that season is now an ordered categorical
train_df['season']

0        1617
1        1617
2        1617
3        1617
4        1617
         ... 
90857    2021
90858    2021
90859    2021
90860    2021
90861    2021
Name: season, Length: 90862, dtype: category
Categories (5, object): [1617 < 1718 < 1819 < 1920 < 2021]

In [10]:
# the target column must be floating point
points_as_float = train_df['total_points'].astype(float)
train_df['total_points'] = points_as_float

In [11]:
# spot-check one player's first (up to 50) training rows
train_df[(train_df['team'] == 'Manchester United') & (train_df['player'] == 'David_de Gea')].head(50)

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
132,David_de Gea,1,1,90,Manchester United,Bournemouth,,,False,2.0,...,0.0,0,0,0,0,2016-08-14T12:30:00Z,1617,,1.983179,0.384921
658,David_de Gea,2,1,90,Manchester United,Southampton,,,True,6.0,...,0.0,20807,48379,27572,0,2016-08-19T19:00:00Z,1617,,1.983179,0.796805
1198,David_de Gea,3,1,90,Manchester United,Hull City,,,False,6.0,...,0.0,8827,47432,38605,0,2016-08-27T16:30:00Z,1617,,1.983179,0.494447
1751,David_de Gea,4,1,90,Manchester United,Manchester City,,,True,2.0,...,0.0,-14166,39801,53967,0,2016-09-10T11:30:00Z,1617,,1.983179,2.311012
2333,David_de Gea,5,1,90,Manchester United,Watford,,,False,1.0,...,0.0,-8058,19085,27143,0,2016-09-18T11:00:00Z,1617,,1.983179,0.7042
2918,David_de Gea,6,1,90,Manchester United,Leicester City,,,True,2.0,...,0.0,-75576,13288,88864,0,2016-09-24T11:30:00Z,1617,,1.983179,0.650832
3504,David_de Gea,7,1,90,Manchester United,Stoke City,,,True,3.0,...,0.0,-35363,12586,47949,0,2016-10-02T11:00:00Z,1617,,1.983179,0.718705
4093,David_de Gea,8,1,90,Manchester United,Liverpool,,,False,8.0,...,0.0,-72768,2892,75660,0,2016-10-17T19:00:00Z,1617,,1.983179,1.46586
4686,David_de Gea,9,1,90,Manchester United,Chelsea,,,False,0.0,...,0.0,-20024,8617,28641,0,2016-10-23T15:00:00Z,1617,,1.983179,2.243698
5281,David_de Gea,10,1,90,Manchester United,Burnley,,,True,6.0,...,0.0,-24962,8417,33379,0,2016-10-29T14:00:00Z,1617,,1.983179,0.304218


In [12]:
# latest gameweek present for the current season (NaN when the season has no rows yet)
current_season_rows = train_df['season'] == season
last_gw = train_df.loc[current_season_rows, 'gw'].max()

# predict the following gameweek, or gameweek 1 if nothing has been played
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

2

In [13]:
# load the fixtures still to be played
remaining_csv = path / 'data' / 'remaining_season.csv'
remaining_season_df = pd.read_csv(remaining_csv, index_col=0)

In [14]:
# give the market-value columns the *_season names that the model's cont_vars use
season_value_names = {
    "relative_market_value_team": "relative_market_value_team_season",
    "relative_market_value_opponent_team": "relative_market_value_opponent_team_season",
}
remaining_season_df.rename(columns=season_value_names, inplace=True)

In [15]:
# # set as categories with correct order 
# remaining_season_df['gw'] = train_df['gw'].astype('category')
# remaining_season_df['season'] = train_df['season'].astype('category')

# remaining_season_df['gw'].cat.set_categories(ordered_gws, ordered=True, inplace=True)
# remaining_season_df['season'].cat.set_categories(ordered_seasons, ordered=True, inplace=True)

In [16]:
# spot-check one player's remaining fixtures after the column rename
remaining_season_df[(remaining_season_df['team'] == 'Manchester United') & (remaining_season_df['player'] == 'David_de Gea')]

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team_season,relative_market_value_opponent_team_season,season,minutes
124,2,Manchester United,Crystal Palace,David_de Gea,1,5.5,1.0,True,1.883222,0.476734,2021,90.0
739,4,Manchester United,Tottenham Hotspur,David_de Gea,1,5.5,1.0,True,1.883222,1.694101,2021,90.0
1275,6,Manchester United,Chelsea,David_de Gea,1,5.5,1.0,True,1.883222,2.184688,2021,90.0
1510,7,Manchester United,Arsenal,David_de Gea,1,5.5,1.0,True,1.883222,1.516838,2021,90.0
2040,9,Manchester United,West Bromwich Albion,David_de Gea,1,5.5,1.0,True,1.883222,0.185012,2021,90.0
2802,12,Manchester United,Manchester City,David_de Gea,1,5.5,1.0,True,1.883222,2.441779,2021,90.0
3363,14,Manchester United,Leeds,David_de Gea,1,5.5,1.0,True,1.883222,0.300409,2021,90.0
3893,16,Manchester United,Wolverhampton Wanderers,David_de Gea,1,5.5,1.0,True,1.883222,0.788413,2021,90.0
4162,17,Manchester United,Aston Villa,David_de Gea,1,5.5,1.0,True,1.883222,0.594244,2021,90.0
4900,20,Manchester United,Sheffield United,David_de Gea,1,5.5,1.0,True,1.883222,0.361923,2021,90.0


In [17]:
# processors - categorify categorical variables and normalize continuous variables
# fill missing not used because new teams are almost certainly well below the league average
procs = [Categorify, Normalize]

# player is added as an extra categorical input; no splits and no lag features are passed
categorical_cols = cat_vars + ['player']
to_nn = TabularPandas(train_df, procs, categorical_cols, cont_vars, y_names=dep_var)

In [18]:
# build the dataloaders from the processed tabular data
dls = to_nn.dataloaders()

In [19]:
# display a sample batch as a check on the processed inputs
dls.show_batch()

Unnamed: 0,gw,season,position,team,opponent_team,was_home,player,minutes,relative_market_value_team_season,relative_market_value_opponent_team_season,total_points
0,25,1718,3,Newcastle United,Burnley,True,Siem_de Jong,-6.982541e-07,0.547176,0.316798,0.0
1,13,1718,1,Chelsea,Liverpool,False,Willy_Caballero,-6.982541e-07,2.125018,1.619155,0.0
2,16,1819,2,West Ham United,Crystal Palace,True,Issa_Diop,90.0,0.749352,0.634856,0.0
3,35,1718,3,Stoke City,Burnley,True,Joe_Allen,90.0,0.581587,0.316798,2.0
4,34,1617,3,Sunderland,Middlesbrough,False,Adnan_Januzaj,14.0,0.418392,0.452793,1.0
5,11,1617,3,Chelsea,Everton,True,Cesc_Fàbregas,-6.982541e-07,2.243698,1.057509,0.0
6,7,1617,3,Hull City,Chelsea,True,Ryan_Mason,90.0,0.494447,2.243698,2.0
7,4,1819,3,Newcastle United,Manchester City,False,Matt_Ritchie,-6.982541e-07,0.483921,2.540586,0.0
8,38,1718,2,West Bromwich Albion,Crystal Palace,False,Nathan_Ferguson,-6.982541e-07,0.541354,0.635984,0.0
9,7,1617,4,West Bromwich Albion,Sunderland,False,Hal_Robson-Kanu,0.9999996,0.428062,0.418392,1.0


In [20]:
# set range of predictions: a fixed floor of -1 up to the highest score seen so far
observed_points = train_df['total_points']
max_y = observed_points.max()
min_y = observed_points.min()  # kept for reference; the lower bound used below is the fixed -1
y_range = (-1, max_y)

In [21]:
# tabular network: two hidden layers, light dropout, single regression output bounded by y_range
learn = tabular_learner(
    dls,
    y_range=y_range,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    n_out=1,
    loss_func=F.mse_loss,
    metrics=rmse,
)

In [22]:
# train for 7 epochs with weight decay 0.2
# NOTE(review): the 1e-2 argument is presumably the maximum learning rate - confirm against fastai
learn.fit_one_cycle(7, 1e-2, wd=0.2)

epoch,train_loss,valid_loss,_rmse,time
0,4.333718,,,00:50
1,3.973915,,,00:53
2,3.771501,,,00:55
3,3.608671,,,00:53
4,3.530642,,,00:52
5,3.563715,,,00:56
6,3.111614,,,01:01


  warn("Your generator is empty.")


In [23]:
# cont_vars = ['minutes', 'relative_market_value_team', 'relative_market_value_opponent_team']

In [54]:
# adjustments
# remaining_season_df.loc[remaining_season_df['player'] == 'Raheem_Sterling', 'minutes'] = 90

In [55]:
# process the fixtures with the same processors and column groups (no target column)
# NOTE(review): to_test is not referenced by the following cells shown here - the test
# dataloader below is built from remaining_season_df directly; confirm whether this is still needed
to_test = TabularPandas(remaining_season_df, procs, cat_vars + ['player'], cont_vars)#, y_names=dep_var)

In [56]:
# test dataloader for the remaining fixtures, derived from the training dataloaders
test_dl = dls.test_dl(remaining_season_df)

In [57]:
# run the trained learner over the test dataloader
preds = learn.get_preds(dl=test_dl)

In [58]:
# inspect the raw result: a (predictions, targets) pair, with targets None here
preds

(tensor([[ 3.1442e+00],
         [ 2.7156e+00],
         [ 2.6822e+00],
         ...,
         [ 1.7940e+00],
         [-1.2281e-03],
         [ 2.1926e-01]]),
 None)

In [59]:
# flatten the single-column prediction tensor into a 1-D array and attach it to the fixtures
predicted_column = preds[0].data.numpy()[:, 0]
remaining_season_df['predicted_points'] = predicted_column

In [60]:
# top 50 predicted scorers for the upcoming gameweek
next_gw_rows = remaining_season_df[remaining_season_df['gw'] == next_gw]
next_gw_rows.sort_values(by='predicted_points', ascending=False).head(50)
# remaining_season_df[remaining_season_df['team'] == 'Sergio_Aguero']

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team_season,relative_market_value_opponent_team_season,season,minutes,predicted_points
135,2,Manchester United,Crystal Palace,Bruno Miguel_Borges Fernandes,3,10.5,1.0,True,1.883222,0.476734,2021,83.6,7.071314
10140,2,Liverpool,Chelsea,Mohamed_Salah,3,12.0,1.0,False,2.394822,2.184688,2021,73.2,6.489123
136,2,Manchester United,Crystal Palace,Anthony_Martial,4,9.0,1.0,True,1.883222,0.476734,2021,87.6,6.333624
10027,2,Tottenham Hotspur,Southampton,Harry_Kane,4,10.5,1.0,False,1.694101,0.495869,2021,90.0,6.146582
104,2,Leicester City,Burnley,Jamie_Vardy,4,10.0,1.0,True,1.029069,0.351663,2021,90.0,6.081945
63,2,Everton,West Bromwich Albion,Richarlison_de Andrade,4,8.0,1.0,True,1.152919,0.185012,2021,80.8,6.066139
181,2,Arsenal,West Ham United,Pierre-Emerick_Aubameyang,3,12.0,1.0,True,1.516838,0.682407,2021,78.4,5.97019
10029,2,Tottenham Hotspur,Southampton,Heung-Min_Son,3,9.0,1.0,False,1.694101,0.495869,2021,78.4,5.647571
10162,2,Manchester City,Wolverhampton Wanderers,Raheem_Sterling,3,11.5,1.0,False,2.441779,0.788413,2021,90.0,5.647542
138,2,Manchester United,Crystal Palace,Dean_Henderson,1,5.5,1.0,True,1.883222,0.476734,2021,90.0,5.599158


In [62]:
# rename previous week's predictions file
# generate previous week's filename
last_gw = next_gw - 1
filename = 'predictions_gw' + str(last_gw) + '.csv'

# pathlib replaces the shell `mv` so the cell also runs on Windows
current_predictions = Path('predictions.csv')
if current_predictions.exists():
    # replace() overwrites an existing target, as `mv` did
    current_predictions.replace(Path(filename))
    print(f'moved {current_predictions} -> {filename}')
else:
    # nothing to archive (e.g. first run of the season) - keep going, as the shell version did
    print('predictions.csv not found, nothing to archive')

In [63]:
# write to predictions.csv
## RENAME PREVIOUS WEEK FIRST (otherwise last week's file is overwritten)
output_cols = [
    'gw', 'player', 'team', 'position',
    'price', 'play_proba', 'predicted_points',
]
predictions = remaining_season_df[output_cols]
predictions.to_csv('predictions.csv')