In [6]:
%matplotlib inline

In [35]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, PredefinedSplit
from sklearn.feature_extraction import DictVectorizer
from matplotlib import pyplot as plt
from scipy.stats import uniform, randint
from helpers import *

In [36]:
# path to project directory
path = Path('./')

In [37]:
# read in training dataset
train_df = pd.read_csv(path/'data/train_v4.csv', index_col=0, dtype={'season':str})

In [38]:
# add a bunch of player lag features
lag_train_df, player_lag_vars = player_lag_features(train_df, ['total_points'], ['all', 1, 2, 3, 4, 5, 10])

In [39]:
# take a look at the dataframe
lag_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90437 entries, 0 to 90436
Data columns (total 58 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   player                                      90437 non-null  object 
 1   gw                                          90437 non-null  int64  
 2   position                                    90437 non-null  int64  
 3   minutes                                     90437 non-null  int64  
 4   team                                        90437 non-null  object 
 5   opponent_team                               90437 non-null  object 
 6   relative_market_value_team                  22501 non-null  float64
 7   relative_market_value_opponent_team         22501 non-null  float64
 8   was_home                                    90437 non-null  bool   
 9   total_points                                90437 non-null  int64  
 10  assists   

In [40]:
# Set the validation point/length and the categorical/continuous variables.
# The three valid_* values are passed to create_lag_train() in the next cell;
# presumably they mean "validate on `valid_len` gameweeks starting at gameweek
# `valid_gw` of season `valid_season`" -- TODO confirm against helpers.
valid_season = '1920'
valid_gw = 20
valid_len = 6
# Feature groups used to select the model inputs.
cat_vars = ['season', 'position', 'team', 'opponent_team', 'was_home']
cont_vars = ['gw', 'minutes']
# dep_var is a list, so lag_train_df[dep_var] is a one-column DataFrame
# (which is why the scoring cell reads y_test['total_points']).
dep_var = ['total_points']

In [41]:
# create dataset with adjusted post-validation lag numbers
# NOTE(review): this rebinds `lag_train_df`, so re-running the cell feeds the
# already-adjusted frame back into create_lag_train(); re-run the lag-feature
# cell first. The .info() outputs in this notebook show 90437 rows before this
# call and 82054 rows in the frames built afterwards, so the helper evidently
# also drops rows (presumably those after the validation window -- TODO confirm).
# train_idx / valid_idx are consumed further down through .iloc, i.e. as row positions.
lag_train_df, train_idx, valid_idx = create_lag_train(lag_train_df, 
                                                      cat_vars, cont_vars, player_lag_vars, dep_var,
                                                      valid_season, valid_gw, valid_len)

In [42]:
# check features for a player
lag_train_df[lag_train_df['player'] == 'Mohamed_Salah'].tail(10)

Unnamed: 0,index,player,season,position,team,opponent_team,was_home,gw,minutes,total_points_pg_last_all,total_points_pg_last_1,total_points_pg_last_2,total_points_pg_last_3,total_points_pg_last_4,total_points_pg_last_5,total_points_pg_last_10,total_points
76699,76699,Mohamed_Salah,1920,3,Liverpool,Bournemouth,False,16,90,7.996931,,3.970588,3.970588,6.428571,5.366972,4.72028,13
77163,77163,Mohamed_Salah,1920,3,Liverpool,Watford,True,17,90,8.058961,13.0,13.0,9.113924,9.113924,8.852459,5.979021,16
78202,78202,Mohamed_Salah,1920,3,Liverpool,Leicester City,False,19,69,8.156212,16.0,14.5,14.5,11.612903,11.612903,8.024476,3
78784,394,Mohamed_Salah,1920,3,Liverpool,Wolverhampton Wanderers,True,20,90,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,3
79369,979,Mohamed_Salah,1920,3,Liverpool,Sheffield United,True,21,90,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,10
79960,1570,Mohamed_Salah,1920,3,Liverpool,Tottenham Hotspur,False,22,90,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,6
80757,2367,Mohamed_Salah,1920,3,Liverpool,Manchester United,True,23,90,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,7
81236,2846,Mohamed_Salah,1920,3,Liverpool,Wolverhampton Wanderers,False,24,84,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,2
81237,2847,Mohamed_Salah,1920,3,Liverpool,West Ham United,False,24,90,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,14
81870,3480,Mohamed_Salah,1920,3,Liverpool,Southampton,True,25,90,8.116743,3.913043,10.754717,11.566265,11.566265,9.936909,8.493648,16


In [43]:
# separate the model inputs (categorical + continuous + lag columns) from the target
X = lag_train_df[cat_vars + cont_vars + player_lag_vars].copy()
y = lag_train_df[dep_var].copy()

In [44]:
# position holds integer codes but is really a category: turn it into strings
# so DictVectorizer one-hot encodes it instead of passing it through as a number
X['position'] = X['position'].astype(str)

# season strings are mapped to integer codes
enc = LabelEncoder()
X['season'] = enc.fit_transform(X['season'])

# one dict per row, the input format DictVectorizer expects
X_dict = X.to_dict(orient="records")

# dense encoder; indicator columns are named '<column>_<value>'
dv = DictVectorizer(sparse=False, separator='_')
X_encoded = dv.fit_transform(X_dict)

# wrap the encoded array in a frame that carries the generated column names
X_df = pd.DataFrame(data=X_encoded, columns=dv.feature_names_)

In [45]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82054 entries, 0 to 82053
Data columns (total 73 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   gw                                      82054 non-null  float64
 1   minutes                                 82054 non-null  float64
 2   opponent_team_Arsenal                   82054 non-null  float64
 3   opponent_team_Aston Villa               82054 non-null  float64
 4   opponent_team_Bournemouth               82054 non-null  float64
 5   opponent_team_Brighton and Hove Albion  82054 non-null  float64
 6   opponent_team_Burnley                   82054 non-null  float64
 7   opponent_team_Cardiff City              82054 non-null  float64
 8   opponent_team_Chelsea                   82054 non-null  float64
 9   opponent_team_Crystal Palace            82054 non-null  float64
 10  opponent_team_Everton                   82054 non-null  fl

In [46]:
X[player_lag_vars].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82054 entries, 0 to 82053
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   total_points_pg_last_all  68666 non-null  float64
 1   total_points_pg_last_1    37879 non-null  float64
 2   total_points_pg_last_2    44333 non-null  float64
 3   total_points_pg_last_3    47807 non-null  float64
 4   total_points_pg_last_4    50292 non-null  float64
 5   total_points_pg_last_5    52087 non-null  float64
 6   total_points_pg_last_10   57642 non-null  float64
dtypes: float64(7)
memory usage: 4.4 MB


In [47]:
# carve out the training and validation rows; train_idx / valid_idx are row positions
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [48]:
# instantiate the XGBoost regressor with fixed hyperparameters
xg_reg = xgb.XGBRegressor(
    n_estimators=75,
    max_depth=5,
    learning_rate=0.08,
    subsample=0.7,
    gamma=0.05,
)

# train on the training rows, predict the held-out rows and print the RMSE
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)
print("RMSE: %f" % r_mse(preds, y_test['total_points']))

RMSE: 1.791749


In [13]:
# NOTE(review): scratch copy of the body of create_xgb() (defined further down).
# On the recorded run it stopped at the first player_lag_features() call with
# "NameError: name 'player_lags' is not defined": player_lags, team_lag_vars and
# team_lags are only assigned in the cell that calls create_xgb(), which comes later.
# NOTE(review): where player_lag_features() is called earlier in this notebook its
# result is unpacked into two names, so binding it to df_new alone would presumably
# yield a tuple rather than a frame -- confirm against helpers before re-running.
df_new = train_df.copy()

# add lag vars
df_new = player_lag_features(df_new, player_lag_vars.copy(), player_lags)
df_new = team_lag_features(df_new, team_lag_vars, team_lags)

lag_vars = []

# add lag var names
# player features are named '<stat>_pg_last_<lag>'
for player_lag_var in player_lag_vars:
    for player_lag in player_lags:
        feature_name = player_lag_var + '_pg_last_' + str(player_lag)
        lag_vars.append(feature_name)

# team features are named '<stat>_team_pg_last_<lag>', each with an '_opponent' twin
for team_lag_var in team_lag_vars:
    for team_lag in team_lags:
        feature_name = team_lag_var + '_team_pg_last_' + str(team_lag)
        feature_name_opposition = feature_name + '_opponent'
        lag_vars.extend([feature_name, feature_name_opposition])

print(cont_vars)
print(cat_vars)
print(lag_vars)

X, y = df_new[cat_vars + cont_vars + lag_vars].copy(), df_new[dep_var].copy()

# find validation cut points - train_df index labels bounding season '1920',
# gw 20 (first row) to gw 25 (last row); an older note here mentioned the
# 18-19 season / gw 19, which is not what the code selects
valid_start = train_df[(train_df['season'] == '1920') & (train_df['gw'] == 20)].index.min()
#valid_end = train_df[(train_df['season'] == '1920') & (train_df['gw'] == 1)].index.min()
valid_end = train_df[(train_df['season'] == '1920') & (train_df['gw'] == 25)].index.max()

NameError: name 'player_lags' is not defined

In [None]:
preds

In [None]:
def _lag_frame(result):
    """Return the DataFrame from a lag-feature helper's result.

    Where this notebook calls player_lag_features() directly, the result is
    unpacked as ``(frame, names)``; a tuple is therefore reduced to its first
    element, while a bare frame is passed through untouched.
    """
    if isinstance(result, tuple):
        return result[0]
    return result


def _validation_bounds(df, valid_season, valid_gw, valid_len):
    """Return ``(start, end)`` row positions of the validation window.

    ``end`` is exclusive, so ``frame.iloc[start:end]`` covers every row of the
    window. Rows are located in ``df`` itself, not in a global frame.
    """
    last_gw = valid_gw + valid_len - 1
    in_window = ((df['season'] == valid_season)
                 & (df['gw'] >= valid_gw)
                 & (df['gw'] <= last_gw)).values
    rows = np.flatnonzero(in_window)
    if rows.size == 0:
        raise ValueError("no rows found for season %r, gameweeks %s-%s"
                         % (valid_season, valid_gw, last_gw))
    return int(rows[0]), int(rows[-1]) + 1


def create_xgb(df, validation, params, dep_var, cat_vars, cont_vars,
               player_lag_vars, team_lag_vars, player_lags, team_lags,
               valid_season='1920', valid_gw=20, valid_len=6, random_state=None):
    """Build lag features, encode the inputs and fit an XGBoost regressor.

    Parameters
    ----------
    df : DataFrame with at least 'season', 'gw', 'position' and every column
        named in ``cat_vars``, ``cont_vars`` and ``dep_var``. Rows are assumed
        to be in chronological order, because the split is positional.
    validation : 'no-search' fits one model with fixed hyperparameters and
        prints its validation RMSE; any other value runs a randomized search
        over ``params`` with the validation window as the single test fold.
    params : parameter distributions for RandomizedSearchCV (ignored for
        'no-search').
    dep_var : target column name (or list of names).
    cat_vars, cont_vars : feature column names.
    player_lag_vars, team_lag_vars : base statistics to build lag features from.
    player_lags, team_lags : lag windows, e.g. ['all', 1, 2, 3].
    valid_season, valid_gw, valid_len : validation window, ``valid_len``
        gameweeks starting at ``valid_gw`` of ``valid_season``. The defaults
        reproduce the previously hard-coded window (season '1920', gw 20-25).
    random_state : seed for the randomized search's parameter sampling.

    Returns
    -------
    The fitted XGBRegressor ('no-search') or the fitted RandomizedSearchCV.

    Raises
    ------
    ValueError if the validation window matches no rows of ``df``.
    """
    # add lag vars (work on a copy so the caller's frame is left alone)
    df_new = _lag_frame(player_lag_features(df.copy(), player_lag_vars.copy(), player_lags))
    df_new = _lag_frame(team_lag_features(df_new, team_lag_vars, team_lags))

    # names of the generated lag columns: '<stat>_pg_last_<lag>' for players,
    # '<stat>_team_pg_last_<lag>' plus an '_opponent' twin for teams
    lag_vars = [var + '_pg_last_' + str(lag)
                for var in player_lag_vars for lag in player_lags]
    for var in team_lag_vars:
        for lag in team_lags:
            name = var + '_team_pg_last_' + str(lag)
            lag_vars.extend([name, name + '_opponent'])

    print(cont_vars)
    print(cat_vars)
    print(lag_vars)

    X, y = df_new[cat_vars + cont_vars + lag_vars].copy(), df_new[dep_var].copy()

    # Validation cut points as row positions in df_new. Previously these were
    # index labels of the global train_df, and the end label was inclusive
    # while the slices below are exclusive, which dropped the last row.
    valid_start, valid_end = _validation_bounds(df_new, valid_season, valid_gw, valid_len)

    # position is a category code, so make it a string for one-hot encoding;
    # season strings become integer codes
    X['position'] = X['position'].apply(str)
    enc = LabelEncoder()
    X['season'] = enc.fit_transform(X['season'])
    X_dict = X.to_dict("records")

    # dense one-hot encoding; indicator columns are named '<column>_<value>'
    dv = DictVectorizer(sparse=False, separator='_')
    X_encoded = dv.fit_transform(X_dict)
    X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)

    if validation == 'no-search':
        # train on everything before the window, score on the window itself
        X_train, y_train = X_df.iloc[:valid_start], y.iloc[:valid_start]
        X_test, y_test = X_df.iloc[valid_start:valid_end], y.iloc[valid_start:valid_end]

        xg_reg = xgb.XGBRegressor(gamma=0.047479, learning_rate=0.0828905304021426,
                                  max_depth=5, n_estimators=75,
                                  subsample=0.6862963240045978)
        xg_reg.fit(X_train, y_train)
        preds = xg_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print("RMSE: %f" % (rmse))
        return xg_reg

    # Randomized search: hand over all rows up to the end of the window and let
    # PredefinedSplit mark which ones are the validation fold
    # (-1 = always in training, 0 = member of the single test fold).
    X_train, y_train = X_df.iloc[:valid_end], y.iloc[:valid_end]
    test_fold = np.repeat([-1, 0], [valid_start, valid_end - valid_start])
    ps = PredefinedSplit(test_fold)

    gbm = xgb.XGBRegressor(objective="reg:squarederror")

    randomized_mse = RandomizedSearchCV(estimator=gbm,
                                        param_distributions=params,
                                        scoring="neg_mean_squared_error",
                                        n_iter=25,
                                        cv=ps,
                                        random_state=random_state,
                                        verbose=1)
    randomized_mse.fit(X_train, y_train)

    # best_score_ is a negative MSE, hence abs() before the square root
    print("Best parameters found: ", randomized_mse.best_params_)
    print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))
    return randomized_mse
    

In [None]:
# any value other than 'no-search' makes create_xgb run the randomized search
validation = 'random'

# fixed-value alternative kept for reference:
#   gamma 0.05, learning_rate 0.1, max_depth 5, n_estimators 100, subsample 0.7

# search space: uniform(loc, scale) covers [loc, loc + scale],
# randint(low, high) covers the integers low .. high - 1
# ("colsample_bytree": uniform(0.7, 0.3) is currently switched off)
params = dict(
    gamma=uniform(loc=0, scale=0.5),
    learning_rate=uniform(loc=0.003, scale=0.3),  # author's note: default 0.1
    max_depth=randint(low=2, high=6),             # author's note: default 3
    n_estimators=randint(low=25, high=200),       # author's note: default 100
    subsample=uniform(loc=0.6, scale=0.4),
)

# target and feature columns
dep_var = 'total_points'
cat_vars = ['season', 'position', 'team', 'opponent_team', 'was_home']
# 'relative_market_value_team' / 'relative_market_value_opponent_team' are left out for now
cont_vars = ['gw', 'minutes']

# base statistics and windows for the lag features
player_lag_vars = ['total_points']
team_lag_vars = ['total_points']
player_lags = ['all', 1, 2, 3, 5, 10]
team_lags = ['all', 1, 2, 3, 5, 10]

create_xgb(
    df=train_df,
    validation=validation,
    params=params,
    dep_var=dep_var,
    cat_vars=cat_vars,
    cont_vars=cont_vars,
    player_lag_vars=player_lag_vars,
    team_lag_vars=team_lag_vars,
    player_lags=player_lags,
    team_lags=team_lags,
)

In [None]:
train_df.head()

In [None]:
train_df[train_df['player'] == 'Harry_Kane'].head(10)

In [None]:
train_df.info()

In [None]:
# categorical feature columns
cat_vars = ['season', 'position', 'team', 'opponent_team', 'was_home']

# continuous feature columns: base columns and relative market values first,
# then the per-game lag columns for the player and for the team
# (the raw 'minutes_last_*' and 'total_points_last_*' lag columns are not included)
cont_vars = ['gw', 'minutes', 'relative_market_value_team',
             'relative_market_value_opponent_team']
cont_vars += ['total_points_pg_last_%s' % lag
              for lag in ('all', 1, 2, 3, 4, 5, 10)]
cont_vars += ['total_points_team_pg_last_%s' % lag
              for lag in ('all', 1, 2, 3, 4, 5, 10, 20)]

# dependent variable (a plain column name, so train_df[dep_var] is a Series)
dep_var = 'total_points'

In [None]:
X, y = train_df[cat_vars + cont_vars].copy(), train_df[dep_var].copy()

In [None]:
# Find the validation cut points. Both values are used further down as slice
# positions (X_df[a:b], y[a:b]), which relies on train_df having the default
# 0..n-1 index so that labels and positions coincide.
# start: first row of the 18-19 season's second half (gw 19)
valid_start = train_df[(train_df['season'] == '1819') & (train_df['gw'] == 19)].index.min()
# alternative end point: stop where the 19-20 season begins
# valid_end = train_df[(train_df['season'] == '1920') & (train_df['gw'] == 1)].index.min()
# end: one past the last row, because slice ends are exclusive; using
# index.max() itself silently dropped the final row from every slice
valid_end = train_df.index.max() + 1

In [None]:
# position is a category code: store it as text so it gets one-hot encoded later
X['position'] = X['position'].astype(str)

In [None]:
# Encode the season strings as integers. LabelEncoder assigns codes starting
# at 0 in sorted order of the values, whereas the manual mapping kept below for
# reference starts at 1.
#X['season'] = X['season'].replace({'1617':1, '1718':2, '1819':3, '1920':4})
enc = LabelEncoder()
X['season'] = enc.fit_transform(X['season'])

In [None]:
# one dict per row, the input format DictVectorizer expects
X_dict = X.to_dict(orient="records")

# dense encoder: string-valued columns become '<column>_<value>' indicator columns
dv = DictVectorizer(sparse=False, separator='_')

# fit the encoder and transform every row in one step
X_encoded = dv.fit_transform(X_dict)

In [None]:
X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)
X_df.head()

In [None]:
X_df.columns

In [None]:
# rows handed to the randomized search: everything before position valid_end
X_train, y_train = X_df.iloc[:valid_end], y.iloc[:valid_end]

In [None]:
# fold labels for PredefinedSplit: -1 keeps a row in the training set of every
# split, 0 puts it into the single validation fold
test_fold = np.repeat(a=[-1, 0], repeats=[valid_start, valid_end - valid_start])

In [None]:
len(test_fold)

In [None]:
ps = PredefinedSplit(test_fold)

In [None]:
# Exhaustive parameter grid: gbm_param_grid
# NOTE(review): none of the visible cells read this grid; the search below
# uses the `params` distributions instead.
# ('colsample_bytree': [0.1, 0.5, 0.8, 1] is currently switched off)
gbm_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[2, 3, 5],
    learning_rate=[0.1],
)

In [None]:
# Search space for RandomizedSearchCV. scipy's uniform(loc, scale) covers
# [loc, loc + scale]; randint(low, high) covers the integers low .. high - 1.
# ("colsample_bytree": uniform(0.7, 0.3) is currently switched off)
params = dict(
    gamma=uniform(loc=0, scale=0.5),              # 0 to 0.5
    learning_rate=uniform(loc=0.003, scale=0.3),  # 0.003 to 0.303 (author's note: default 0.1)
    max_depth=randint(low=2, high=6),             # 2 to 5 (author's note: default 3)
    n_estimators=randint(low=25, high=200),       # 25 to 199 (author's note: default 100)
    subsample=uniform(loc=0.6, scale=0.4),        # 0.6 to 1.0
)

In [None]:
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(objective="reg:squarederror")

In [None]:
# Perform the randomized search: 25 sampled candidates, scored by negative MSE
# on the single predefined validation fold. random_state pins the sampling
# from the scipy distributions, so a Restart & Run All reproduces the same
# candidates and the same "best" result.
randomized_mse = RandomizedSearchCV(estimator=gbm,
                                    param_distributions=params,
                                    scoring="neg_mean_squared_error",
                                    n_iter=25,
                                    cv=ps,
                                    random_state=42,
                                    verbose=1)

In [None]:
xgb.__version__

In [None]:
# Fit randomized_mse to the data
randomized_mse.fit(X_train, y_train)

In [None]:
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

In [None]:
# training rows come before position valid_start, validation rows run from
# valid_start up to (not including) valid_end
X_train, y_train = X_df.iloc[:valid_start], y.iloc[:valid_start]
X_test, y_test = X_df.iloc[valid_start:valid_end], y.iloc[valid_start:valid_end]

In [None]:
# earlier attempt: xgb.XGBRegressor(n_estimators=75, max_depth=5, learning_rate=0.1)

# regressor with fixed hyperparameters
xg_reg = xgb.XGBRegressor(
    gamma=0.047479,
    learning_rate=0.0828905304021426,
    max_depth=5,
    n_estimators=75,
    subsample=0.6862963240045978,
)
xg_reg.fit(X_train, y_train)

# predict the validation rows
preds = xg_reg.predict(X_test)

# RMSE on the validation rows (MSE is symmetric in its two arguments, so the
# conventional (y_true, y_pred) order yields the same value)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % rmse)

In [None]:
#plt.style.use('ggplot')
xgb.plot_importance(xg_reg, max_num_features=15)
plt.show()

In [None]:
max(preds)

In [None]:
# validation features, actual points and predictions side by side
results = (
    X_test
    .merge(y_test.to_frame(), left_index=True, right_index=True)
    .assign(preds=preds)
)
results