In [2]:
# import required packages
from fastai2.tabular.all import *
from helpers import *

In [3]:
# path to project directory
# (relative, so every data path below resolves against the notebook's working directory)
path = Path('./')

In [4]:
# load the historical training data; the first CSV column becomes the index
# and season codes (e.g. '1617') are kept as strings rather than parsed as numbers
train_csv = path/'data/train_v4.csv'
train_df = pd.read_csv(train_csv, dtype={'season': str}, index_col=0)

In [9]:
# sanity check: show the last five rows of the training frame
train_df.tail()

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,threat,transfers_balance,transfers_in,transfers_out,yellow_cards,kickoff_time,season,play_proba,relative_market_value_team_season,relative_market_value_opponent_team_season
90432,Tommy_Doyle,38,3,0,Manchester City,Norwich,2.430397,0.327574,True,0,...,0.0,-2,22,24,0,2020-07-26T15:00:00Z,1920,1.0,2.727025,0.1983
90433,Joseph_Anang,38,1,0,West Ham United,Aston Villa,0.709989,0.553818,True,0,...,0.0,70,270,200,0,2020-07-26T15:00:00Z,1920,1.0,0.739196,0.338194
90434,Erik_Pieters,38,2,90,Burnley,Brighton and Hove Albion,0.370648,0.541184,True,3,...,2.0,139816,144388,4572,1,2020-07-26T15:00:00Z,1920,1.0,0.441799,0.476156
90435,Japhet_Tanganga,38,2,0,Tottenham Hotspur,Crystal Palace,1.604904,0.430493,False,0,...,0.0,7999,14840,6841,0,2020-07-26T15:00:00Z,1920,1.0,2.113981,0.495374
90436,Ravel_Morrison,38,3,0,Sheffield United,Southampton,0.348678,0.526987,False,0,...,0.0,-23,7,30,0,2020-07-26T15:00:00Z,1920,0.0,0.150631,0.62421


In [10]:
# (number of rows, number of columns) of the training frame
train_df.shape

(90437, 37)

In [14]:
# model configuration
# season code that is looked up in train_df['season'] when finding the latest gameweek
season = '2021'

# target column
dep_var = ['total_points']

# categorical inputs
cat_vars = [
    'gw',
    'season',
    'position',
    'team',
    'opponent_team',
    'was_home',
]

# continuous inputs
cont_vars = [
    'minutes',
    'relative_market_value_team_season',
    'relative_market_value_opponent_team_season',
]

In [17]:
# gw and season are treated as ordered categorical variables,
# so the full category lists are spelled out here in their intended order
# gameweeks 1..38
ordered_gws = [*range(1, 39)]
# season codes '1617' .. '1920' (two-digit start year followed by two-digit end year)
ordered_seasons = [f'{start:02d}{start + 1:02d}' for start in range(16, 20)]

In [18]:
# Convert gw and season to ordered categoricals with the category order defined above.
#
# Each column is cast to an explicit CategoricalDtype and assigned back, instead of
# calling Series.cat.set_categories(..., inplace=True): the `inplace` argument was
# deprecated in pandas 1.3 and removed in pandas 2.0, and calling it on a column
# selected from the frame is not guaranteed to modify the frame itself.
# Values that are not present in the category lists become NaN, exactly as
# set_categories would have made them.
#
# NOTE(review): `lag_train_df` is not assigned in any cell visible above - confirm
# where it is created (it may be leftover kernel state from an earlier version).
gw_dtype = pd.CategoricalDtype(categories=ordered_gws, ordered=True)
season_dtype = pd.CategoricalDtype(categories=ordered_seasons, ordered=True)

lag_train_df['gw'] = lag_train_df['gw'].astype(gw_dtype)
lag_train_df['season'] = lag_train_df['season'].astype(season_dtype)

In [19]:
# the regression target has to be stored as floating point
points_as_float = lag_train_df['total_points'].astype(np.float64)
lag_train_df['total_points'] = points_as_float

In [15]:
# work out the next gameweek for the configured season
season_gws = train_df.loc[train_df['season'] == season, 'gw']
last_gw = season_gws.max()

# a NaN maximum is treated as "no gameweek recorded for this season yet" -> start at 1
next_gw = 1 if np.isnan(last_gw) else last_gw + 1

next_gw

1

In [20]:
# load the fixtures still to be played (remaining_season.csv); first CSV column is the index
remaining_csv = path/'data/remaining_season.csv'
remaining_season_df = pd.read_csv(remaining_csv, index_col=0)

In [21]:
# preview the first ten rows of the remaining-fixtures frame
remaining_season_df.head(10)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes
0,1,Fulham,Arsenal,Fabricio_Agosto Ramírez,1,4.0,1.0,True,0.318038,1.535042,1920,90.0
1,1,Fulham,Arsenal,Kevin_McDonald,3,5.0,1.0,True,0.318038,1.535042,1920,90.0
2,1,Fulham,Arsenal,Maxime_Le Marchand,2,4.5,1.0,True,0.318038,1.535042,1920,90.0
3,1,Fulham,Arsenal,Stefan_Johansen,3,5.0,1.0,True,0.318038,1.535042,1920,90.0
4,1,Fulham,Arsenal,Tom_Cairney,3,5.5,1.0,True,0.318038,1.535042,1920,90.0
5,1,Fulham,Arsenal,Tim_Ream,2,4.5,1.0,True,0.318038,1.535042,1920,90.0
6,1,Fulham,Arsenal,Anthony_Knockaert,3,5.5,1.0,True,0.318038,1.535042,1920,90.0
7,1,Fulham,Arsenal,Michael_Hector,2,4.5,1.0,True,0.318038,1.535042,1920,90.0
8,1,Fulham,Arsenal,Cyrus_Christie,2,4.5,1.0,True,0.318038,1.535042,1920,90.0
9,1,Fulham,Arsenal,Neeskens_Kebano,3,5.0,1.0,True,0.318038,1.535042,1920,90.0


In [37]:
# preprocessing steps: Categorify for the categorical columns, Normalize for the continuous ones
# FillMissing is deliberately left out - new teams are almost certainly well below the
# league average, so filling their missing values is not wanted
procs = [Categorify, Normalize]

# 'player' is added as an extra categorical input on top of cat_vars;
# no splits are passed and no lag variables are included in the continuous inputs
to_nn = TabularPandas(
    lag_train_df,
    procs=procs,
    cat_names=cat_vars + ['player'],
    cont_names=cont_vars,
    y_names=dep_var,
)

In [38]:
# build the data loaders from the processed tabular object, using default settings
dls = to_nn.dataloaders()

In [39]:
# preview one batch from the loaders to sanity-check the model inputs
dls.show_batch()

Unnamed: 0,gw,season,position,team,opponent_team,was_home,player,minutes,relative_market_value_team,relative_market_value_opponent_team,total_points
0,21,1617,3,Manchester United,Liverpool,True,Juan_Mata,26.0,,,1.0
1,25,1920,3,West Ham United,Brighton and Hove Albion,True,Manuel_Lanzini,4.999999,0.740968,0.462137,1.0
2,24,1819,1,Southampton,Crystal Palace,True,Fraser_Forster,5.823472e-08,,,0.0
3,38,1920,2,Arsenal,Watford,True,Calum_Chambers,5.823472e-08,1.452343,0.472917,0.0
4,20,1718,2,Everton,West Bromwich Albion,False,Michael_Keane,90.0,,,8.0
5,18,1819,3,Brighton and Hove Albion,Bournemouth,False,Solomon_March,90.0,,,2.0
6,1,1819,4,Brighton and Hove Albion,Watford,False,Florin_Andone,5.823472e-08,,,0.0
7,27,1718,3,Manchester United,Newcastle United,False,Juan_Mata,24.0,,,1.0
8,22,1718,3,Leicester City,Huddersfield Town,True,Adrien Sebastian_Perruchet Silva,4.0,,,1.0
9,31,1617,4,Bournemouth,Liverpool,False,Callum_Wilson,5.823472e-08,,,0.0


In [28]:
# output range handed to the learner
points = lag_train_df['total_points']
max_y = points.max()
min_y = points.min()
# the lower bound is fixed at -1; min_y is computed but not used for the range
y_range = (-1, max_y)

In [29]:
# learner settings: two hidden layers (1000 and 500 units), the ps / emb_drop
# regularisation values, a single output bounded by y_range,
# MSE as the loss and rmse as the reported metric
learner_kwargs = dict(
    y_range=y_range,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    n_out=1,
    loss_func=F.mse_loss,
    metrics=rmse,
)
learn = tabular_learner(dls, **learner_kwargs)

In [30]:
# train with the one-cycle schedule: 7 epochs, 1e-2 as the second positional argument
# (the learning rate), weight decay 0.2
# NOTE(review): the recorded output has blank valid_loss / metric columns and a
# "Your generator is empty" warning - presumably because no validation split was
# supplied; confirm this is intended
learn.fit_one_cycle(7, 1e-2, wd=0.2)

epoch,train_loss,valid_loss,_rmse,time
0,4.036372,,,00:18
1,3.49245,,,00:19
2,3.81205,,,00:19
3,3.534576,,,00:18
4,3.763046,,,00:18
5,3.230725,,,00:18
6,3.044583,,,00:18


  warn("Your generator is empty.")


In [33]:
# continuous columns as they are named in remaining_season_df (no '_season' suffix)
# NOTE(review): this rebinds cont_vars, which was defined earlier with the '_season'
# column names; any cell re-run after this point (e.g. the training TabularPandas cell)
# will silently pick up these names instead - confirm which set the model should use
cont_vars = ['minutes', 'relative_market_value_team', 'relative_market_value_opponent_team']

In [34]:
# tabular object for the remaining fixtures; no y_names because there is no target yet
to_test = TabularPandas(
    remaining_season_df,
    procs=procs,
    cat_names=cat_vars + ['player'],
    cont_names=cont_vars,
)

In [40]:
# create a test loader for the remaining fixtures from the training data loaders
# (presumably this reuses the preprocessing fitted on the training data - confirm)
test_dl = dls.test_dl(remaining_season_df)

In [41]:
# run the trained learner over the test loader
preds = learn.get_preds(dl=test_dl)

In [42]:
# inspect the raw result: a tuple whose first element holds the predictions
# (the second element is None in the recorded output)
preds

(tensor([[3.4369],
         [2.2753],
         [2.0237],
         ...,
         [2.9834],
         [2.7172],
         [4.2082]]),
 None)

In [44]:
# take the single column of the (n, 1) prediction tensor as a flat array
# and attach it to the fixtures frame, one value per row
pred_matrix = preds[0].data.numpy()
remaining_season_df['predicted_points'] = pred_matrix[:, 0]

In [45]:
# show the 20 rows with the highest predicted points, best first
remaining_season_df.sort_values(by='predicted_points', ascending=False).head(20)

Unnamed: 0,gw,team,opponent_team,player,position,price,play_proba,was_home,relative_market_value_team,relative_market_value_opponent_team,season,minutes,predicted_points
6684,27,Liverpool,Fulham,Trent_Alexander-Arnold,2,7.5,1.0,True,2.403843,0.318038,1920,90.0,9.126164
6679,27,Liverpool,Fulham,Mohamed_Salah,3,12.0,1.0,True,2.403843,0.318038,1920,90.0,8.965622
3679,15,Liverpool,West Bromwich Albion,Mohamed_Salah,3,12.0,1.0,True,2.403843,0.179463,1920,90.0,8.675791
6676,27,Liverpool,Fulham,Sadio_Mané,3,12.0,1.0,True,2.403843,0.318038,1920,90.0,8.618359
9490,38,Liverpool,Crystal Palace,Mohamed_Salah,3,12.0,1.0,True,2.403843,0.44424,1920,90.0,8.484838
88,1,Liverpool,Leeds,Mohamed_Salah,3,12.0,1.0,True,2.403843,0.301777,1920,90.0,8.404057
1624,7,Liverpool,West Ham United,Mohamed_Salah,3,12.0,1.0,True,2.403843,0.684978,1920,90.0,8.356295
3676,15,Liverpool,West Bromwich Albion,Sadio_Mané,3,12.0,1.0,True,2.403843,0.179463,1920,90.0,8.33329
12447,12,Liverpool,Fulham,Mohamed_Salah,3,12.0,1.0,False,2.403843,0.318038,1920,90.0,8.274717
7686,31,Liverpool,Aston Villa,Mohamed_Salah,3,12.0,1.0,True,2.403843,0.55689,1920,90.0,8.247001


In [46]:
# export the predictions to predictions.csv
## RENAME THE PREVIOUS WEEK'S FILE FIRST - this call overwrites predictions.csv
output_cols = ['gw', 'player', 'team', 'position', 'price', 'play_proba', 'predicted_points']
predictions = remaining_season_df[output_cols]
predictions.to_csv('predictions.csv')