In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, PredefinedSplit
from sklearn.feature_extraction import DictVectorizer

In [6]:
# path to project directory
path = Path('/home/jupyter/fpl-prediction')

In [7]:
# read in training dataset
# created using fpl_data_clean notebook
train_df = pd.read_csv(path/'data/train.csv', index_col=0, dtype={'season':str})

In [8]:
train_df.head()

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,season,play_proba
0,Aaron_Cresswell,1,2,0,West Ham United,Chelsea,0.895471,2.243698,False,0,1617,
1,Aaron_Lennon,1,3,15,Everton,Tottenham Hotspur,1.057509,1.43369,True,1,1617,
2,Aaron_Ramsey,1,3,60,Arsenal,Liverpool,1.944129,1.46586,True,2,1617,
3,Abdoulaye_Doucouré,1,3,0,Watford,Southampton,0.7042,0.796805,False,0,1617,
4,Abdul Rahman_Baba,1,2,0,Chelsea,West Ham United,2.243698,0.895471,True,0,1617,


In [26]:
aaron = train_df[train_df['player'] == 'Aaron_Cresswell']

In [29]:
aaron.total_points.cumsum()

0          0
524        0
1061       0
1609       0
2190       0
2774       0
3360       0
3949       6
4541       6
5135       7
5733       9
6332      10
6933      11
7537      11
8142      12
8752      18
9366      25
9982      27
10600     29
11222     30
11846     36
12477     38
13111     38
13751     39
14400     40
15050     42
15576     43
16292     44
16547     45
17201     50
        ... 
60066    202
60666    204
61150    204
61755    205
62361    206
62969    210
63275    211
64177    212
64550    212
65223    212
66081    212
66696    212
67312    212
67936    212
68462    212
68991    212
69521    212
70053    212
70592    226
71133    233
71679    235
72230    235
72785    237
73341    237
73898    238
74457    239
75017    253
75581    253
76146    253
77149    259
Name: total_points, Length: 131, dtype: int64

In [36]:
train_df['points_cumulative'] = train_df.groupby(['player'])['total_points'].apply(lambda x: x.cumsum() - x)
train_df

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,season,play_proba,points_cumulative,points_last_5
0,Aaron_Cresswell,1,2,0,West Ham United,Chelsea,0.895471,2.243698,False,0,1617,,0,0.0
1,Aaron_Lennon,1,3,15,Everton,Tottenham Hotspur,1.057509,1.433690,True,1,1617,,0,1.0
2,Aaron_Ramsey,1,3,60,Arsenal,Liverpool,1.944129,1.465860,True,2,1617,,0,2.0
3,Abdoulaye_Doucouré,1,3,0,Watford,Southampton,0.704200,0.796805,False,0,1617,,0,0.0
4,Abdul Rahman_Baba,1,2,0,Chelsea,West Ham United,2.243698,0.895471,True,0,1617,,0,0.0
5,Abel_Hernández,1,4,90,Hull City,Leicester City,0.494447,0.650832,True,5,1617,,0,5.0
6,Adama_Diomande,1,4,90,Hull City,Leicester City,0.494447,0.650832,True,8,1617,,0,8.0
7,Adam_Clayton,1,3,90,Middlesbrough,Stoke City,0.452793,0.718705,True,2,1617,,0,2.0
8,Adam_Federici,1,1,0,Bournemouth,Manchester United,0.384921,1.983179,True,0,1617,,0,0.0
9,Adam_Forshaw,1,3,69,Middlesbrough,Stoke City,0.452793,0.718705,True,1,1617,,0,1.0


In [37]:
train_df[train_df['player'] == 'Aaron_Cresswell']

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,season,play_proba,points_cumulative,points_last_5
0,Aaron_Cresswell,1,2,0,West Ham United,Chelsea,0.895471,2.243698,False,0,1617,,0,0.0
524,Aaron_Cresswell,2,2,0,West Ham United,Bournemouth,0.895471,0.384921,True,0,1617,,0,0.0
1061,Aaron_Cresswell,3,2,0,West Ham United,Manchester City,0.895471,2.311012,False,0,1617,,0,0.0
1609,Aaron_Cresswell,4,2,0,West Ham United,Watford,0.895471,0.704200,True,0,1617,,0,0.0
2190,Aaron_Cresswell,5,2,0,West Ham United,West Bromwich Albion,0.895471,0.428062,False,0,1617,,0,0.0
2774,Aaron_Cresswell,6,2,0,West Ham United,Southampton,0.895471,0.796805,True,0,1617,,0,0.0
3360,Aaron_Cresswell,7,2,0,West Ham United,Middlesbrough,0.895471,0.452793,True,0,1617,,0,0.0
3949,Aaron_Cresswell,8,2,74,West Ham United,Crystal Palace,0.895471,0.802197,False,6,1617,,0,6.0
4541,Aaron_Cresswell,9,2,0,West Ham United,Sunderland,0.895471,0.418392,True,0,1617,,6,6.0
5135,Aaron_Cresswell,10,2,90,West Ham United,Everton,0.895471,1.057509,False,1,1617,,6,7.0


In [40]:
train_df['points_last_5'] = train_df.groupby(['player'])['total_points'].apply(lambda x: x.rolling(min_periods=1, window=6).sum() - x)
train_df

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,season,play_proba,points_cumulative,points_last_5
0,Aaron_Cresswell,1,2,0,West Ham United,Chelsea,0.895471,2.243698,False,0,1617,,0,0.0
1,Aaron_Lennon,1,3,15,Everton,Tottenham Hotspur,1.057509,1.433690,True,1,1617,,0,0.0
2,Aaron_Ramsey,1,3,60,Arsenal,Liverpool,1.944129,1.465860,True,2,1617,,0,0.0
3,Abdoulaye_Doucouré,1,3,0,Watford,Southampton,0.704200,0.796805,False,0,1617,,0,0.0
4,Abdul Rahman_Baba,1,2,0,Chelsea,West Ham United,2.243698,0.895471,True,0,1617,,0,0.0
5,Abel_Hernández,1,4,90,Hull City,Leicester City,0.494447,0.650832,True,5,1617,,0,0.0
6,Adama_Diomande,1,4,90,Hull City,Leicester City,0.494447,0.650832,True,8,1617,,0,0.0
7,Adam_Clayton,1,3,90,Middlesbrough,Stoke City,0.452793,0.718705,True,2,1617,,0,0.0
8,Adam_Federici,1,1,0,Bournemouth,Manchester United,0.384921,1.983179,True,0,1617,,0,0.0
9,Adam_Forshaw,1,3,69,Middlesbrough,Stoke City,0.452793,0.718705,True,1,1617,,0,0.0


In [42]:
train_df[train_df['player'] == 'Aaron_Cresswell']

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,season,play_proba,points_cumulative,points_last_5
0,Aaron_Cresswell,1,2,0,West Ham United,Chelsea,0.895471,2.243698,False,0,1617,,0,0.0
524,Aaron_Cresswell,2,2,0,West Ham United,Bournemouth,0.895471,0.384921,True,0,1617,,0,0.0
1061,Aaron_Cresswell,3,2,0,West Ham United,Manchester City,0.895471,2.311012,False,0,1617,,0,0.0
1609,Aaron_Cresswell,4,2,0,West Ham United,Watford,0.895471,0.704200,True,0,1617,,0,0.0
2190,Aaron_Cresswell,5,2,0,West Ham United,West Bromwich Albion,0.895471,0.428062,False,0,1617,,0,0.0
2774,Aaron_Cresswell,6,2,0,West Ham United,Southampton,0.895471,0.796805,True,0,1617,,0,0.0
3360,Aaron_Cresswell,7,2,0,West Ham United,Middlesbrough,0.895471,0.452793,True,0,1617,,0,0.0
3949,Aaron_Cresswell,8,2,74,West Ham United,Crystal Palace,0.895471,0.802197,False,6,1617,,0,0.0
4541,Aaron_Cresswell,9,2,0,West Ham United,Sunderland,0.895471,0.418392,True,0,1617,,6,6.0
5135,Aaron_Cresswell,10,2,90,West Ham United,Everton,0.895471,1.057509,False,1,1617,,6,6.0


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77810 entries, 0 to 77809
Data columns (total 12 columns):
player                                 77810 non-null object
gw                                     77810 non-null int64
position                               77810 non-null int64
minutes                                77810 non-null int64
team                                   77810 non-null object
opponent_team                          77810 non-null object
relative_market_value_team             77810 non-null float64
relative_market_value_opponent_team    77810 non-null float64
was_home                               77810 non-null bool
total_points                           77810 non-null int64
season                                 77810 non-null object
play_proba                             9807 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 7.2+ MB


In [10]:
# set categorical, continuous variables
# and dependent variable
cat_vars = ['season', 'position', 'team', 'opponent_team', 'was_home']
cont_vars = ['gw', 'minutes', 'relative_market_value_team', 'relative_market_value_opponent_team']
dep_var = 'total_points'

In [11]:
X, y = train_df[cat_vars + cont_vars].copy(), train_df['total_points'].copy()

In [12]:
# find validation cut point - index for 18-19 season second half (gw 19)
#valid_start = train_df[(train_df['gw'] > 18) & (train_df['season'] == '1819')].index.min()
# or take validation as same as current prediction set
valid_start = train_df[(train_df['season'] == '1819') & (train_df['gw'] == 20)].index.min()
valid_end = train_df[(train_df['season'] == '1920') & (train_df['gw'] == 1)].index.min()

In [13]:
X['position'] = X['position'].apply(str)

In [14]:
X_dict = X.to_dict("records")

# Create the DictVectorizer object: dv
dv = DictVectorizer(sparse=False, separator='_')

# Apply dv on df: df_encoded
X_encoded = dv.fit_transform(X_dict)

In [15]:
X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)
X_df.head()

Unnamed: 0,gw,minutes,opponent_team_Arsenal,opponent_team_Aston Villa,opponent_team_Bournemouth,opponent_team_Brighton and Hove Albion,opponent_team_Burnley,opponent_team_Cardiff City,opponent_team_Chelsea,opponent_team_Crystal Palace,...,team_Southampton,team_Stoke City,team_Sunderland,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,was_home
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
# training and validation sets
X_train = X_df[0:valid_start]
y_train = y[0:valid_start]
X_test = X_df[valid_start:valid_end]
y_test = y[valid_start:valid_end]

In [16]:
X_train = X_df[0:valid_end]
y_train = y[0:valid_end]

In [17]:
test_fold = np.repeat([-1, 0], [valid_start, valid_end - valid_start])

In [18]:
len(test_fold)

67936

In [19]:
ps = PredefinedSplit(test_fold)

In [20]:
# Create the parameter grid: gbm_param_grid 
gbm_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 3, 5],
    #'colsample_bytree': [0.1, 0.5, 0.8, 1],
    'learning_rate': [0.1]
}

In [21]:
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

In [22]:
# Perform random search: grid_mse
randomized_mse = RandomizedSearchCV(estimator=gbm, 
                                    param_distributions=gbm_param_grid, 
                                    scoring="neg_mean_squared_error", 
                                    n_iter=10, 
                                    cv=ps, 
                                    verbose=1)

In [2]:
xgb.__version__

'0.90'

In [23]:
# Fit randomized_mse to the data
randomized_mse.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
          error_score='raise-deprecating',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': [50, 100, 200], 'max_depth': [2, 3, 5], 'learning_rate': [0.1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [24]:
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Best parameters found:  {'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.1}
Lowest RMSE found:  1.865529273027009


In [20]:
xg_reg = xgb.XGBRegressor()

xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(preds, y_test))
print("RMSE: %f" % (rmse))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


RMSE: 1.867267


In [21]:
max(preds)

7.584905

In [22]:
results = X_test.merge(y_test.to_frame(), left_index=True, right_index=True)
results['preds'] = preds
results

Unnamed: 0,gw,minutes,opponent_team_Arsenal,opponent_team_Aston Villa,opponent_team_Bournemouth,opponent_team_Brighton and Hove Albion,opponent_team_Burnley,opponent_team_Cardiff City,opponent_team_Chelsea,opponent_team_Crystal Palace,...,team_Sunderland,team_Swansea City,team_Tottenham Hotspur,team_Watford,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,was_home,total_points,preds
56539,20.0,90.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,3.174152
56540,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.029738
56541,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.027991
56542,20.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2.892542
56543,20.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,2.021116
56544,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.018358
56545,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.027991
56546,20.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,7,1.363438
56547,20.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1,1.608021
56548,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0.005136
