In [1]:
# import required packages
from fastai2.tabular.all import *
from helpers import *

In [2]:
# Project root; the data files read below are addressed relative to it
# (path/'data/...'), so the notebook is expected to run from the project directory.
path = Path('./')

In [3]:
# Load the historical per-player, per-gameweek training table.
train_df = pd.read_csv(
    path/'data/train_v5.csv',
    index_col=0,            # first CSV column holds the row index
    dtype={'season':str},   # keep season codes such as '1617' as strings, not ints
)

In [4]:
# Peek at the last rows to check that the most recent gameweek was loaded.
train_df.tail()

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
90857,Yoshinori_Muto,1,4,0,Newcastle United,West Ham United,0.626058,0.682407,False,0,...,0.0,0,0,0,0,2020-09-12T19:00:00Z,2021,1.0,0.626058,0.682407
90858,Youri_Tielemans,1,3,90,Leicester City,West Bromwich Albion,1.029069,0.185012,False,3,...,9.0,0,0,0,0,2020-09-13T13:00:00Z,2021,1.0,1.029069,0.185012
90859,Yves_Bissouma,1,3,90,Brighton and Hove Albion,Chelsea,0.466873,2.184688,True,2,...,5.0,0,0,0,0,2020-09-14T19:15:00Z,2021,1.0,0.466873,2.184688
90860,Zeze Steven_Sessegnon,1,2,0,Fulham,Arsenal,0.372958,1.516838,True,0,...,0.0,0,0,0,0,2020-09-12T11:30:00Z,2021,0.0,0.372958,1.516838
90861,Çaglar_Söyüncü,1,2,90,Leicester City,West Bromwich Albion,1.029069,0.185012,False,6,...,6.0,0,0,0,0,2020-09-13T13:00:00Z,2021,1.0,1.029069,0.185012


In [5]:
# (rows, columns) of the training table.
train_df.shape

(90862, 37)

In [6]:
# Model configuration: which season is current and which columns feed the model.

# Season code, written in the same string form as the values of the season column.
season = '2021'

# Target column (kept as a list, which is how it is passed to y_names).
dep_var = ['total_points']

# Columns treated as categorical inputs.
cat_vars = [
    'gw',
    'season',
    'position',
    'team',
    'opponent_team',
    'was_home',
]

# Columns treated as continuous inputs.
cont_vars = [
    'minutes',
    'relative_market_value_team_season',
    'relative_market_value_opponent_team_season',
]

In [7]:
# gw and season are later turned into *ordered* categoricals, which requires
# the full list of categories in ascending order.
ordered_seasons = ['1617', '1718', '1819', '1920', '2021']
ordered_gws = [*range(1, 39)]   # gameweeks 1..38 inclusive

In [8]:
# Convert gw and season to ordered categoricals with an explicit category order.
#
# The dtype is built up front and applied with astype() instead of calling
# Series.cat.set_categories(..., inplace=True): the `inplace` argument was
# deprecated in pandas 1.3 and removed in pandas 2.0, so the old form raises a
# TypeError on current pandas. The result is the same: values outside the given
# category list become NaN, and the cell can be re-run safely.
gw_dtype = pd.CategoricalDtype(categories=ordered_gws, ordered=True)
season_dtype = pd.CategoricalDtype(categories=ordered_seasons, ordered=True)

train_df['gw'] = train_df['gw'].astype(gw_dtype)
train_df['season'] = train_df['season'].astype(season_dtype)

In [9]:
# Confirm the season column is now an ordered categorical (see the dtype footer).
train_df['season']

0        1617
1        1617
2        1617
3        1617
4        1617
         ... 
90857    2021
90858    2021
90859    2021
90860    2021
90861    2021
Name: season, Length: 90862, dtype: category
Categories (5, object): [1617 < 1718 < 1819 < 1920 < 2021]

In [10]:
# The target must be floating point, so cast total_points to float64.
train_df = train_df.astype({'total_points': 'float64'})

In [11]:
# Spot check: first 50 training rows for one known player at one club.
train_df.loc[
    train_df['team'].eq('Manchester United') & train_df['player'].eq('David_de Gea')
].head(50)

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
132,David_de Gea,1,1,90,Manchester United,Bournemouth,,,False,2.0,...,0.0,0,0,0,0,2016-08-14T12:30:00Z,1617,,1.983179,0.384921
658,David_de Gea,2,1,90,Manchester United,Southampton,,,True,6.0,...,0.0,20807,48379,27572,0,2016-08-19T19:00:00Z,1617,,1.983179,0.796805
1198,David_de Gea,3,1,90,Manchester United,Hull City,,,False,6.0,...,0.0,8827,47432,38605,0,2016-08-27T16:30:00Z,1617,,1.983179,0.494447
1751,David_de Gea,4,1,90,Manchester United,Manchester City,,,True,2.0,...,0.0,-14166,39801,53967,0,2016-09-10T11:30:00Z,1617,,1.983179,2.311012
2333,David_de Gea,5,1,90,Manchester United,Watford,,,False,1.0,...,0.0,-8058,19085,27143,0,2016-09-18T11:00:00Z,1617,,1.983179,0.7042
2918,David_de Gea,6,1,90,Manchester United,Leicester City,,,True,2.0,...,0.0,-75576,13288,88864,0,2016-09-24T11:30:00Z,1617,,1.983179,0.650832
3504,David_de Gea,7,1,90,Manchester United,Stoke City,,,True,3.0,...,0.0,-35363,12586,47949,0,2016-10-02T11:00:00Z,1617,,1.983179,0.718705
4093,David_de Gea,8,1,90,Manchester United,Liverpool,,,False,8.0,...,0.0,-72768,2892,75660,0,2016-10-17T19:00:00Z,1617,,1.983179,1.46586
4686,David_de Gea,9,1,90,Manchester United,Chelsea,,,False,0.0,...,0.0,-20024,8617,28641,0,2016-10-23T15:00:00Z,1617,,1.983179,2.243698
5281,David_de Gea,10,1,90,Manchester United,Burnley,,,True,6.0,...,0.0,-24962,8417,33379,0,2016-10-29T14:00:00Z,1617,,1.983179,0.304218


In [12]:
# Latest gameweek present in the training data for the current season.
last_gw = train_df.loc[train_df['season'] == season, 'gw'].max()

# If the maximum comes back as NaN, fall back to gameweek 1;
# otherwise predict for the gameweek after the latest one.
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

2

In [13]:
# Load the rows to predict on (remaining_season.csv); first CSV column is the index.
remaining_season_path = path/'data/remaining_season.csv'
remaining_season_df = pd.read_csv(remaining_season_path, index_col=0)

In [14]:
# Rename the market-value columns to the *_season names listed in cont_vars,
# so the prediction frame uses the same feature names as the training frame.
season_column_names = {
    "relative_market_value_team": "relative_market_value_team_season",
    "relative_market_value_opponent_team": "relative_market_value_opponent_team_season",
}
remaining_season_df = remaining_season_df.rename(columns=season_column_names)

In [15]:
# # set as categories with correct order 
# remaining_season_df['gw'] = train_df['gw'].astype('category')
# remaining_season_df['season'] = train_df['season'].astype('category')

# remaining_season_df['gw'].cat.set_categories(ordered_gws, ordered=True, inplace=True)
# remaining_season_df['season'].cat.set_categories(ordered_seasons, ordered=True, inplace=True)

In [16]:
# Spot check: the prediction rows for one known player at one club.
remaining_season_df.loc[
    remaining_season_df['team'].eq('Manchester United')
    & remaining_season_df['player'].eq('David_de Gea')
]

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team_season,relative_market_value_opponent_team_season,season,minutes
124,2,Manchester United,Crystal Palace,David_de Gea,1,5.5,1.0,True,1.883222,0.476734,2021,90.0
739,4,Manchester United,Tottenham Hotspur,David_de Gea,1,5.5,1.0,True,1.883222,1.694101,2021,90.0
1275,6,Manchester United,Chelsea,David_de Gea,1,5.5,1.0,True,1.883222,2.184688,2021,90.0
1510,7,Manchester United,Arsenal,David_de Gea,1,5.5,1.0,True,1.883222,1.516838,2021,90.0
2040,9,Manchester United,West Bromwich Albion,David_de Gea,1,5.5,1.0,True,1.883222,0.185012,2021,90.0
2802,12,Manchester United,Manchester City,David_de Gea,1,5.5,1.0,True,1.883222,2.441779,2021,90.0
3363,14,Manchester United,Leeds,David_de Gea,1,5.5,1.0,True,1.883222,0.300409,2021,90.0
3893,16,Manchester United,Wolverhampton Wanderers,David_de Gea,1,5.5,1.0,True,1.883222,0.788413,2021,90.0
4162,17,Manchester United,Aston Villa,David_de Gea,1,5.5,1.0,True,1.883222,0.594244,2021,90.0
4900,20,Manchester United,Sheffield United,David_de Gea,1,5.5,1.0,True,1.883222,0.361923,2021,90.0


In [17]:
# Pre-processing: Categorify the categorical columns, Normalize the continuous ones.
# FillMissing is deliberately left out because new teams are almost certainly
# well below the league average.
procs = [Categorify, Normalize]

# 'player' is added as an extra categorical on top of cat_vars.
# No splits are passed, so no rows are set aside here for validation.
to_nn = TabularPandas(
    train_df,
    procs=procs,
    cat_names=cat_vars + ['player'],
    cont_names=cont_vars,
    y_names=dep_var,
)

In [18]:
# Build the DataLoaders from the processed training table (default arguments).
dls = to_nn.dataloaders()

In [19]:
# Display a sample batch as a sanity check on the inputs and target.
dls.show_batch()

Unnamed: 0,gw,season,position,team,opponent_team,was_home,player,minutes,relative_market_value_team_season,relative_market_value_opponent_team_season,total_points
0,25,1718,3,Newcastle United,Burnley,True,Siem_de Jong,-6.982541e-07,0.547176,0.316798,0.0
1,13,1718,1,Chelsea,Liverpool,False,Willy_Caballero,-6.982541e-07,2.125018,1.619155,0.0
2,16,1819,2,West Ham United,Crystal Palace,True,Issa_Diop,90.0,0.749352,0.634856,0.0
3,35,1718,3,Stoke City,Burnley,True,Joe_Allen,90.0,0.581587,0.316798,2.0
4,34,1617,3,Sunderland,Middlesbrough,False,Adnan_Januzaj,14.0,0.418392,0.452793,1.0
5,11,1617,3,Chelsea,Everton,True,Cesc_Fàbregas,-6.982541e-07,2.243698,1.057509,0.0
6,7,1617,3,Hull City,Chelsea,True,Ryan_Mason,90.0,0.494447,2.243698,2.0
7,4,1819,3,Newcastle United,Manchester City,False,Matt_Ritchie,-6.982541e-07,0.483921,2.540586,0.0
8,38,1718,2,West Bromwich Albion,Crystal Palace,False,Nathan_Ferguson,-6.982541e-07,0.541354,0.635984,0.0
9,7,1617,4,West Bromwich Albion,Sunderland,False,Hal_Robson-Kanu,0.9999996,0.428062,0.418392,1.0


In [20]:
# Output range for the model: from -1 up to the highest score seen in training.
# NOTE(review): min_y is computed but not used for the range; the lower bound
# is the literal -1. Confirm that this is intended.
max_y = train_df['total_points'].max()
min_y = train_df['total_points'].min()
y_range = (-1, max_y)

In [21]:
# Tabular learner: two hidden layers (1000, 500) with light dropout, a single
# regression output limited to y_range, trained with MSE and reported as RMSE.
learn = tabular_learner(
    dls,
    y_range=y_range,
    layers=[1000,500],
    ps=[0.001,0.01],
    emb_drop=0.04,
    n_out=1,
    loss_func=F.mse_loss,
    metrics=rmse,
)

In [22]:
# Train for 7 epochs with the one-cycle policy: peak learning rate 1e-2, weight decay 0.2.
learn.fit_one_cycle(n_epoch=7, lr_max=1e-2, wd=0.2)

epoch,train_loss,valid_loss,_rmse,time
0,4.333718,,,00:50
1,3.973915,,,00:53
2,3.771501,,,00:55
3,3.608671,,,00:53
4,3.530642,,,00:52
5,3.563715,,,00:56
6,3.111614,,,01:01


  warn("Your generator is empty.")


In [23]:
# cont_vars = ['minutes', 'relative_market_value_team', 'relative_market_value_opponent_team']

In [54]:
# adjustments
# remaining_season_df.loc[remaining_season_df['player'] == 'Raheem_Sterling', 'minutes'] = 90

In [55]:
# Process the prediction rows with the same column roles as training; there is
# no y_names because these rows have no target yet.
# NOTE(review): to_test is not referenced by the later cells visible here; the
# test DataLoader is built straight from remaining_season_df. Confirm whether
# this object is still needed.
to_test = TabularPandas(
    remaining_season_df,
    procs=procs,
    cat_names=cat_vars + ['player'],
    cont_names=cont_vars,
)

In [56]:
# Build a test DataLoader for the prediction rows from the training DataLoaders.
# NOTE(review): presumably test_dl applies the processing fitted on the training
# data to this frame; confirm against the installed fastai version.
test_dl = dls.test_dl(remaining_season_df)

In [57]:
# Run the trained model over the test DataLoader. The result is a tuple whose
# first element holds the predictions (the second is None here, see output below).
preds = learn.get_preds(dl=test_dl)

In [58]:
# Inspect the raw prediction tuple.
preds

(tensor([[ 3.1442e+00],
         [ 2.7156e+00],
         [ 2.6822e+00],
         ...,
         [ 1.7940e+00],
         [-1.2281e-03],
         [ 2.1926e-01]]),
 None)

In [59]:
# preds[0] has one column of predictions; take that column as a flat array and
# attach it to the prediction rows.
remaining_season_df['predicted_points'] = preds[0].data.numpy()[:, 0]

In [60]:
# Top 50 predicted scorers for the upcoming gameweek, best first.
(
    remaining_season_df
    .loc[remaining_season_df['gw'].eq(next_gw)]
    .sort_values(by='predicted_points', ascending=False)
    .head(50)
)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team_season,relative_market_value_opponent_team_season,season,minutes,predicted_points
135,2,Manchester United,Crystal Palace,Bruno Miguel_Borges Fernandes,3,10.5,1.0,True,1.883222,0.476734,2021,83.6,7.071314
10140,2,Liverpool,Chelsea,Mohamed_Salah,3,12.0,1.0,False,2.394822,2.184688,2021,73.2,6.489123
136,2,Manchester United,Crystal Palace,Anthony_Martial,4,9.0,1.0,True,1.883222,0.476734,2021,87.6,6.333624
10027,2,Tottenham Hotspur,Southampton,Harry_Kane,4,10.5,1.0,False,1.694101,0.495869,2021,90.0,6.146582
104,2,Leicester City,Burnley,Jamie_Vardy,4,10.0,1.0,True,1.029069,0.351663,2021,90.0,6.081945
63,2,Everton,West Bromwich Albion,Richarlison_de Andrade,4,8.0,1.0,True,1.152919,0.185012,2021,80.8,6.066139
181,2,Arsenal,West Ham United,Pierre-Emerick_Aubameyang,3,12.0,1.0,True,1.516838,0.682407,2021,78.4,5.97019
10029,2,Tottenham Hotspur,Southampton,Heung-Min_Son,3,9.0,1.0,False,1.694101,0.495869,2021,78.4,5.647571
10162,2,Manchester City,Wolverhampton Wanderers,Raheem_Sterling,3,11.5,1.0,False,2.441779,0.788413,2021,90.0,5.647542
138,2,Manchester United,Crystal Palace,Dean_Henderson,1,5.5,1.0,True,1.883222,0.476734,2021,90.0,5.599158


In [62]:
# Archive last week's predictions before a fresh predictions.csv is written.
# The archive is named after the gameweek that precedes next_gw.
last_gw = next_gw - 1
filename = 'predictions_gw' + str(last_gw) + '.csv'

# Done in Python rather than with `! mv` so the cell does not depend on a POSIX
# shell and is skipped quietly when there is no predictions.csv to archive yet
# (e.g. the first run). replace() overwrites an existing archive of the same
# name, which matches what plain `mv` does.
previous_predictions = Path('predictions.csv')
if previous_predictions.exists():
    previous_predictions.replace(filename)

In [63]:
# Write the current predictions to predictions.csv.
# Run the archiving cell for the previous week's file first, otherwise that
# file is overwritten here.
output_columns = ['gw', 'player', 'team', 'position', 'price', 'play_proba', 'predicted_points']
predictions = remaining_season_df[output_columns]
predictions.to_csv('predictions.csv')