In [83]:
# import required packages
from fastai2.tabular.all import *
from helpers import *

In [84]:
# project root directory; the data CSVs are read relative to this path
path = Path('./')

In [85]:
# load the historical training data
# season codes such as '1617' are kept as strings rather than parsed as integers
train_df = pd.read_csv(
    path / 'data/train_v4.csv',
    dtype={'season': str},
    index_col=0,
)

In [86]:
# preview the last rows of the training data (the most recent season/gameweek)
train_df.tail()

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
90432,Tommy_Doyle,38,3,0,Manchester City,Norwich,2.430397,0.327574,True,0,...,0.0,-2,22,24,0,2020-07-26T15:00:00Z,1920,1.0,2.727025,0.1983
90433,Joseph_Anang,38,1,0,West Ham United,Aston Villa,0.709989,0.553818,True,0,...,0.0,70,270,200,0,2020-07-26T15:00:00Z,1920,1.0,0.739196,0.338194
90434,Erik_Pieters,38,2,90,Burnley,Brighton and Hove Albion,0.370648,0.541184,True,3,...,2.0,139816,144388,4572,1,2020-07-26T15:00:00Z,1920,1.0,0.441799,0.476156
90435,Japhet_Tanganga,38,2,0,Tottenham Hotspur,Crystal Palace,1.604904,0.430493,False,0,...,0.0,7999,14840,6841,0,2020-07-26T15:00:00Z,1920,1.0,2.113981,0.495374
90436,Ravel_Morrison,38,3,0,Sheffield United,Southampton,0.348678,0.526987,False,0,...,0.0,-23,7,30,0,2020-07-26T15:00:00Z,1920,0.0,0.150631,0.62421


In [87]:
# (rows, columns) of the training frame
train_df.shape

(90437, 37)

In [88]:
# model configuration
# season to predict, as a four-character code ('2021' = 2020/21)
season = '2021'

# categorical features (the player name is appended where the model data is built)
cat_vars = [
    'gw',
    'season',
    'position',
    'team',
    'opponent_team',
    'was_home',
]

# continuous features
cont_vars = [
    'minutes',
    'relative_market_value_team_season',
    'relative_market_value_opponent_team_season',
]

# regression target
dep_var = ['total_points']

In [89]:
# gw and season are treated as ordered categorical variables,
# so each needs its full list of categories in ascending order
ordered_gws = [*range(1, 39)]  # gameweeks 1..38
ordered_seasons = [
    '1617',
    '1718',
    '1819',
    '1920',
    '2021',
]

In [90]:
# Convert gw and season to ordered categoricals with the category order defined above.
# The dtype is built explicitly and assigned back instead of calling
# `.cat.set_categories(..., inplace=True)`: the `inplace` argument was deprecated in
# pandas 1.3 and removed in pandas 2.0, so the in-place form fails on current pandas.
# Values that are not in the category list become NaN, as they did with set_categories.
# The cell is safe to re-run: casting an already-categorical column to the same dtype
# leaves it unchanged.
train_df['gw'] = train_df['gw'].astype(
    pd.CategoricalDtype(categories=ordered_gws, ordered=True)
)
train_df['season'] = train_df['season'].astype(
    pd.CategoricalDtype(categories=ordered_seasons, ordered=True)
)

In [91]:
# check the conversion: season should now be an ordered categorical
train_df['season']

0        1617
1        1617
2        1617
3        1617
4        1617
         ... 
90432    1920
90433    1920
90434    1920
90435    1920
90436    1920
Name: season, Length: 90437, dtype: category
Categories (5, object): [1617 < 1718 < 1819 < 1920 < 2021]

In [92]:
# the regression target must be floating point
train_df = train_df.astype({'total_points': 'float64'})

In [93]:
# spot-check one player's training rows (first 50)
train_df.loc[
    train_df['team'].eq('Manchester United') & train_df['player'].eq('David_de Gea')
].head(50)

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
132,David_de Gea,1,1,90,Manchester United,Bournemouth,,,False,2.0,...,0.0,0,0,0,0,2016-08-14T12:30:00Z,1617,,1.983179,0.384921
658,David_de Gea,2,1,90,Manchester United,Southampton,,,True,6.0,...,0.0,20807,48379,27572,0,2016-08-19T19:00:00Z,1617,,1.983179,0.796805
1198,David_de Gea,3,1,90,Manchester United,Hull City,,,False,6.0,...,0.0,8827,47432,38605,0,2016-08-27T16:30:00Z,1617,,1.983179,0.494447
1751,David_de Gea,4,1,90,Manchester United,Manchester City,,,True,2.0,...,0.0,-14166,39801,53967,0,2016-09-10T11:30:00Z,1617,,1.983179,2.311012
2333,David_de Gea,5,1,90,Manchester United,Watford,,,False,1.0,...,0.0,-8058,19085,27143,0,2016-09-18T11:00:00Z,1617,,1.983179,0.7042
2918,David_de Gea,6,1,90,Manchester United,Leicester City,,,True,2.0,...,0.0,-75576,13288,88864,0,2016-09-24T11:30:00Z,1617,,1.983179,0.650832
3504,David_de Gea,7,1,90,Manchester United,Stoke City,,,True,3.0,...,0.0,-35363,12586,47949,0,2016-10-02T11:00:00Z,1617,,1.983179,0.718705
4093,David_de Gea,8,1,90,Manchester United,Liverpool,,,False,8.0,...,0.0,-72768,2892,75660,0,2016-10-17T19:00:00Z,1617,,1.983179,1.46586
4686,David_de Gea,9,1,90,Manchester United,Chelsea,,,False,0.0,...,0.0,-20024,8617,28641,0,2016-10-23T15:00:00Z,1617,,1.983179,2.243698
5281,David_de Gea,10,1,90,Manchester United,Burnley,,,True,6.0,...,0.0,-24962,8417,33379,0,2016-10-29T14:00:00Z,1617,,1.983179,0.304218


In [94]:
# latest gameweek already present for the target season
# (NaN when the training data has no rows for that season yet)
last_gw = train_df.loc[train_df['season'] == season, 'gw'].max()

# start at gameweek 1 if nothing has been played, otherwise predict the following one
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

1

In [95]:
# load the rows to predict on (data/remaining_season.csv)
remaining_season_df = pd.read_csv(
    path / 'data/remaining_season.csv',
    index_col=0,
)

In [96]:
# align the market-value column names with the *_season names listed in cont_vars
remaining_season_df = remaining_season_df.rename(
    columns={
        "relative_market_value_team": "relative_market_value_team_season",
        "relative_market_value_opponent_team": "relative_market_value_opponent_team_season",
    }
)

In [97]:
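# NOTE(review): this cell is disabled. If it is re-enabled, the two astype lines must
# read from remaining_season_df, not train_df; as written they would copy the training
# frame's gw/season columns (aligned on index) into remaining_season_df.
# # set as categories with correct order 
# remaining_season_df['gw'] = train_df['gw'].astype('category')
# remaining_season_df['season'] = train_df['season'].astype('category')

# remaining_season_df['gw'].cat.set_categories(ordered_gws, ordered=True, inplace=True)
# remaining_season_df['season'].cat.set_categories(ordered_seasons, ordered=True, inplace=True)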

In [98]:
# spot-check the same player's rows in the prediction data
remaining_season_df.loc[
    remaining_season_df['team'].eq('Manchester United')
    & remaining_season_df['player'].eq('David_de Gea')
]

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team_season,relative_market_value_opponent_team_season,season,minutes
322,2,Manchester United,Crystal Palace,David_de Gea,1,5.5,1.0,True,1.88289,0.442494,2021,90.0
920,4,Manchester United,Tottenham Hotspur,David_de Gea,1,5.5,1.0,True,1.88289,1.693803,2021,90.0
1440,6,Manchester United,Chelsea,David_de Gea,1,5.5,1.0,True,1.88289,2.229139,2021,90.0
1666,7,Manchester United,Arsenal,David_de Gea,1,5.5,1.0,True,1.88289,1.529012,2021,90.0
2180,9,Manchester United,West Bromwich Albion,David_de Gea,1,5.5,1.0,True,1.88289,0.18228,2021,90.0
2921,12,Manchester United,Manchester City,David_de Gea,1,5.5,1.0,True,1.88289,2.441349,2021,90.0
3465,14,Manchester United,Leeds,David_de Gea,1,5.5,1.0,True,1.88289,0.300356,2021,90.0
3979,16,Manchester United,Wolverhampton Wanderers,David_de Gea,1,5.5,1.0,True,1.88289,0.788274,2021,90.0
4240,17,Manchester United,Aston Villa,David_de Gea,1,5.5,1.0,True,1.88289,0.582872,2021,90.0
4955,20,Manchester United,Sheffield United,David_de Gea,1,5.5,1.0,True,1.88289,0.36186,2021,90.0


In [99]:
# preprocessing: categorify the categorical variables and normalize the continuous ones
# fill-missing is deliberately left out because new teams are almost certainly
# well below the league average
# no splits are passed, so no rows are held out for validation
procs = [Categorify, Normalize]
to_nn = TabularPandas(
    train_df,
    procs=procs,
    cat_names=cat_vars + ['player'],  # the player name is an extra categorical feature
    cont_names=cont_vars,
    y_names=dep_var,
)

In [100]:
# build the DataLoaders used for training from the processed TabularPandas object
dls = to_nn.dataloaders()

In [101]:
# show a sample batch to sanity-check the inputs and the target
dls.show_batch()

Unnamed: 0,gw,season,position,team,opponent_team,was_home,player,minutes,relative_market_value_team_season,relative_market_value_opponent_team_season,total_points
0,20,1617,2,Manchester United,West Ham United,False,Phil_Jones,90.0,1.983179,0.895471,6.0
1,13,1718,4,Watford,Newcastle United,False,Troy_Deeney,5.823472e-08,0.547242,0.547176,0.0
2,22,1617,1,Tottenham Hotspur,Manchester City,False,Pau_López Sabata,5.823472e-08,1.43369,2.311012,0.0
3,27,1920,2,Arsenal,Everton,True,David_Luiz Moreira Marinho,90.0,1.448866,1.125166,4.0
4,3,1920,2,Tottenham Hotspur,Newcastle United,True,Jan_Vertonghen,5.823472e-08,2.113981,0.542356,0.0
5,3,1718,3,Arsenal,Liverpool,False,Alex_Iwobi,5.823472e-08,2.0735,1.619155,0.0
6,2,1819,2,Fulham,Tottenham Hotspur,False,Joe_Bryan,84.0,0.405284,1.983109,1.0
7,23,1920,1,West Ham United,Everton,True,Roberto_Jimenez Gago,5.823472e-08,0.739196,1.125166,0.0
8,9,1617,3,West Bromwich Albion,Liverpool,False,Sam_Field,5.823472e-08,0.428062,1.46586,0.0
9,3,1617,2,Bournemouth,Crystal Palace,False,Adam_Smith,90.0,0.384921,0.802197,2.0


In [102]:
# observed bounds of the target
max_y = train_df['total_points'].max()
min_y = train_df['total_points'].min()

# range of predictions: the lower bound is fixed at -1 and the upper bound is the
# current maximum (min_y is computed above but is not used for the range)
y_range = (-1, max_y)

In [114]:
# regression learner: a single output limited to y_range, trained with MSE loss
# and reported with RMSE
learn = tabular_learner(
    dls,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    y_range=y_range,
    n_out=1,
    loss_func=F.mse_loss,
    metrics=rmse,
)

In [115]:
# one-cycle training: 7 epochs, maximum learning rate 1e-2, weight decay 0.2
learn.fit_one_cycle(n_epoch=7, lr_max=1e-2, wd=0.2)

epoch,train_loss,valid_loss,_rmse,time
0,4.064017,,,00:25
1,3.502268,,,00:25
2,3.328249,,,00:26
3,3.526649,,,00:26
4,3.358534,,,00:26
5,3.175627,,,00:26
6,3.383806,,,00:27


  warn("Your generator is empty.")


In [116]:
# cont_vars = ['minutes', 'relative_market_value_team', 'relative_market_value_opponent_team']

In [117]:
# NOTE(review): to_test is not referenced by any later cell in this notebook;
# the predictions are made from dls.test_dl(remaining_season_df) instead
to_test = TabularPandas(
    remaining_season_df,
    procs=procs,
    cat_names=cat_vars + ['player'],
    cont_names=cont_vars,
)

In [118]:
# build a test DataLoader for the rows to predict from the training DataLoaders
# (presumably this reuses the preprocessing fitted on train_df; confirm in the fastai docs)
test_dl = dls.test_dl(remaining_season_df)

In [119]:
# run the model over the test DataLoader; the result is a (predictions, targets) tuple
# and targets is None here because the test rows carry no labels
preds = learn.get_preds(dl=test_dl)

In [120]:
# inspect the raw output: an (n, 1) prediction tensor and None for the targets
preds

(tensor([[3.2586],
         [2.3434],
         [2.0939],
         ...,
         [2.9563],
         [2.7505],
         [4.1722]]),
 None)

In [121]:
# take the single output column of the (n, 1) prediction tensor as a flat array
# and store it alongside the rows it was predicted from
remaining_season_df['predicted_points'] = preds[0].detach().numpy()[:, 0]

In [122]:
# 20 highest predicted scores for gameweek 1
(
    remaining_season_df
    .loc[remaining_season_df['gw'].eq(1)]
    .sort_values(by='predicted_points', ascending=False)
    .head(20)
)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team_season,relative_market_value_opponent_team_season,season,minutes,predicted_points
88,1,Liverpool,Leeds,Mohamed_Salah,3,12.0,1.0,True,2.3944,0.300356,2021,90.0,8.124766
85,1,Liverpool,Leeds,Sadio_Mané,3,12.0,1.0,True,2.3944,0.300356,2021,90.0,7.411784
93,1,Liverpool,Leeds,Trent_Alexander-Arnold,2,7.5,1.0,True,2.3944,0.300356,2021,90.0,6.808165
81,1,Liverpool,Leeds,Xherdan_Shaqiri,3,6.5,1.0,True,2.3944,0.300356,2021,90.0,6.692647
9769,1,Arsenal,Fulham,Pierre-Emerick_Aubameyang,3,12.0,1.0,False,1.529012,0.347305,2021,90.0,6.596535
99,1,Liverpool,Leeds,Harvey_Elliott,3,4.5,1.0,True,2.3944,0.300356,2021,90.0,6.514181
96,1,Liverpool,Leeds,Rhian_Brewster,4,4.5,1.0,True,2.3944,0.300356,2021,90.0,6.455549
91,1,Liverpool,Leeds,Harry_Wilson,3,5.5,1.0,True,2.3944,0.300356,2021,90.0,6.40475
131,1,Tottenham Hotspur,Everton,Heung-Min_Son,3,9.0,1.0,True,1.693803,1.152716,2021,90.0,6.364668
89,1,Liverpool,Leeds,Andrew_Robertson,2,7.0,1.0,True,2.3944,0.300356,2021,90.0,6.364141


In [123]:
# save the predictions to predictions.csv in the current working directory
## RENAME PREVIOUS WEEK FIRST - this overwrites any existing predictions.csv
predictions = remaining_season_df.loc[
    :,
    ['gw', 'player', 'team', 'position', 'price', 'play_proba', 'predicted_points'],
]
predictions.to_csv('predictions.csv')