In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import random
import lightgbm
import shap
import itertools
pd.options.mode.chained_assignment = None

os.chdir('/Users/stevengeorge/Documents/Github/fpl-analysis/')

from src.data.gw_features import add_gw_features
from src.features.custom_transformers import TimeSeriesFeatures
from src.models.constants import TIME_SERIES_FEATURES, STATIC_FEATURES

%matplotlib inline
pd.set_option('display.max_columns', None)
DATA_PATH = 'data/processed/'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the dataset from a parquet file; the path is relative to the current working
# directory (changed by os.chdir in the first cell).
# NOTE(review): DATA_PATH is defined in the first cell but is not used here — confirm
# the file really lives in the working directory rather than under DATA_PATH.
fpl_data_all_seasons_with_ts = pd.read_parquet('fpl_data_all_seasons_with_ts.parquet')

In [3]:
# Training set: only the rows whose season_order is 1 or 2.
training = fpl_data_all_seasons_with_ts.loc[
    lambda frame: frame['season_order'].isin([1, 2])
]

In [4]:
# Exclude rows that are both season_order 2 and gameweek 38, i.e. keep a row when it
# is not season_order 2 or not gw 38.
training = training[(training['season_order'] != 2) | (training['gw'] != 38)]

In [5]:
# Mean of total_points over the training rows; a later cell uses it as the fill value
# for missing target values.
mean_total_points = training['total_points'].mean()
mean_total_points

1.3516271434592604

In [6]:
# Target: the total_points value from the next row within each `name` group.
# The last row of every group has no following row, so the shift yields NaN there;
# those gaps are filled with the training mean because the target cannot contain nulls.
# NOTE(review): shift(-1) follows the frame's current row order — assumes each
# player's rows are already in chronological order; confirm upstream.
next_gw_points = training.groupby(['name'])['total_points'].shift(-1)
training['total_points_plus_1_gw'] = next_gw_points.fillna(mean_total_points)

In [7]:
# Remove the current-row points and the name/season/team columns so they are not
# passed to the model as features.
training = training.drop(
    columns=['total_points', 'name', 'season', 'team_name', 'team_name_opponent']
)

In [8]:
# Print the shape before filtering, keep only rows that have a next_match_was_home
# value, cast that column to bool, then display the shape after filtering.
print(training.shape)
has_next_match = training['next_match_was_home'].notnull()
training = training[has_next_match]
training['next_match_was_home'] = training['next_match_was_home'].astype(bool)
training.shape

(45079, 500)


(44850, 500)

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# Instantiate a LightGBM regressor with default hyperparameters and display its repr
# so the default settings are visible.
lgmb_model = lightgbm.LGBMRegressor()
lgmb_model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
# Hyperparameter grid for the search: 2 * 3 * 3 * 2 * 3 = 108 combinations.
params = dict(
    num_leaves=[31, 50],
    max_depth=[3, 7, -1],
    learning_rate=[0.03, 0.1, 0.3],
    n_estimators=[100, 200],
    reg_alpha=[0, 0.03, 0.1],
)

In [12]:
# Exhaustive search over `params` with 3-fold cross-validation, scored by negative
# mean squared error (higher, i.e. closer to zero, is better).
# NOTE(review): an integer `cv` is not a time-aware split; the rows here are per-player
# gameweeks, so folds may mix earlier and later gameweeks — confirm this is intended.
grid_search = GridSearchCV(
    estimator=lightgbm.LGBMRegressor(),
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=4,
)

In [13]:
%%time
grid_search.fit(
    training.drop(['total_points_plus_1_gw', 'ID'], axis=1),
    training['total_points_plus_1_gw']
)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0, score=-4.105, total=   8.4s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.4s remaining:    0.0s


[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0, score=-4.391, total=   8.5s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.0s remaining:    0.0s


[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0, score=-4.570, total=   8.3s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.03 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   25.3s remaining:    0.0s


[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.03, score=-4.105, total=   9.0s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.03 
[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.03, score=-4.391, total=   8.4s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.03 
[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.03, score=-4.570, total=   9.5s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.1 
[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.1, score=-4.105, total=   8.3s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.1 
[CV]  learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_alpha=0.1, score=-4.388, total=   9.1s
[CV] learning_rate=0.03, max_depth=3, n_estimators=100, num_leaves=31, reg_a

[Parallel(n_jobs=1)]: Done 324 out of 324 | elapsed: 88.9min finished


CPU times: user 4h 6min 7s, sys: 11min 34s, total: 4h 17min 41s
Wall time: 1h 29min 27s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.03, 0.1, 0.3],
                         'max_depth': [3, 7, -1], 'n_estimators': [100, 200],
                         'num_le

In [15]:
# Best cross-validated score found by the search. The search was configured with
# scoring='neg_mean_squared_error', so this is a negated MSE (closer to zero is better).
grid_search.best_score_

-4.304288680344856

In [16]:
# Hyperparameter combination from `params` that produced the best score.
grid_search.best_params_

{'learning_rate': 0.03,
 'max_depth': -1,
 'n_estimators': 200,
 'num_leaves': 50,
 'reg_alpha': 0.03}