In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from helpers import *

In [2]:
# project directory, given relative to the current working directory;
# the training CSV below is located under it
path = Path('./')

In [3]:
# load the training set: the first CSV column becomes the index, and season
# is read as text so codes such as '1617' are not parsed as numbers
train_df = pd.read_csv(
    path / 'data/train_v4.csv',
    index_col=0,
    dtype={'season': str},
)

In [4]:
# add lag features for total_points from both helpers (team first, then
# player); each call returns the updated frame together with a list that is
# later used as column names for the lag features
# NOTE(review): the exact meaning of the 'all' / 1..10 windows is defined in
# helpers -- confirm there
lag_train_df, team_lag_vars = team_lag_features(
    train_df,
    ['total_points'],
    ['all', 1, 2, 3, 4, 5, 10],
)
lag_train_df, player_lag_vars = player_lag_features(
    lag_train_df,
    ['total_points'],
    ['all', 1, 2, 3, 4, 5, 10],
)

In [5]:
# mean-value imputer for NaN entries; further down it is fitted on the lag
# columns of X_train, but no transform call is visible in this section
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [6]:
# show the rows whose total_points_pg_last_1 value is NaN or infinite
lag_train_df.loc[~np.isfinite(lag_train_df['total_points_pg_last_1'])]

Unnamed: 0,player,gw,position,minutes,team,opponent_team,relative_market_value_team,relative_market_value_opponent_team,was_home,total_points,...,total_points_last_2,total_points_pg_last_2,total_points_last_3,total_points_pg_last_3,total_points_last_4,total_points_pg_last_4,total_points_last_5,total_points_pg_last_5,total_points_last_10,total_points_pg_last_10
0,Aaron_Cresswell,1,2,0,West Ham United,Chelsea,,,False,0,...,0.0,,0.0,,0.0,,0.0,,0.0,
1,Aaron_Lennon,1,3,15,Everton,Tottenham Hotspur,,,True,1,...,0.0,,0.0,,0.0,,0.0,,0.0,
2,Aaron_Ramsey,1,3,60,Arsenal,Liverpool,,,True,2,...,0.0,,0.0,,0.0,,0.0,,0.0,
3,Abdoulaye_Doucouré,1,3,0,Watford,Southampton,,,False,0,...,0.0,,0.0,,0.0,,0.0,,0.0,
4,Abdul Rahman_Baba,1,2,0,Chelsea,West Ham United,,,True,0,...,0.0,,0.0,,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90431,Heurelho da Silva_Gomes,38,1,0,Watford,Arsenal,0.472917,1.452343,False,0,...,0.0,,0.0,,0.0,,0.0,,0.0,
90432,Tommy_Doyle,38,3,0,Manchester City,Norwich,2.430397,0.327574,True,0,...,0.0,,0.0,,1.0,6.0,1.0,6.0,1.0,6.000000
90433,Joseph_Anang,38,1,0,West Ham United,Aston Villa,0.709989,0.553818,True,0,...,0.0,,0.0,,0.0,,0.0,,0.0,
90435,Japhet_Tanganga,38,2,0,Tottenham Hotspur,Crystal Palace,1.604904,0.430493,False,0,...,0.0,,0.0,,0.0,,0.0,,3.0,1.636364


In [7]:
# validation point (season + gameweek) and validation length
valid_season, valid_gw, valid_len = '1920', 20, 6

# dependent variable, then the continuous and categorical input variables
dep_var = ['total_points']
cont_vars = ['gw', 'minutes']
cat_vars = [
    'season',
    'position',
    'team',
    'opponent_team',
    'was_home',
]

In [8]:
# gw and season are stored as *ordered* categoricals, so the category lists
# below define the ordering explicitly
ordered_gws = list(range(1, 39))
ordered_seasons = ['1617', '1718', '1819', '1920']

# Convert to category dtype and pin the category order in one expression,
# assigning the result back to the frame. The previous
# `.cat.set_categories(..., inplace=True)` form relied on the `inplace`
# argument, which was deprecated in pandas 1.3 and removed in pandas 2.0.
# Values not present in the category list become NaN, as before.
lag_train_df['gw'] = (
    lag_train_df['gw']
    .astype('category')
    .cat.set_categories(ordered_gws, ordered=True)
)
lag_train_df['season'] = (
    lag_train_df['season']
    .astype('category')
    .cat.set_categories(ordered_seasons, ordered=True)
)

In [9]:
# build the dataset with adjusted post-validation lag numbers, plus the
# train/validation selectors that are used with .iloc further down
# NOTE(review): create_lag_train comes from helpers; this rebinds
# lag_train_df, so re-running the cell feeds it its own output
lag_train_df, train_idx, valid_idx = create_lag_train(
    lag_train_df,
    cat_vars, cont_vars,
    player_lag_vars, team_lag_vars, dep_var,
    valid_season, valid_gw, valid_len,
)

In [10]:
# replace missing values in every team/player lag column with zero
lag_train_df[team_lag_vars + player_lag_vars] = (
    lag_train_df[team_lag_vars + player_lag_vars].fillna(0)
)

In [11]:
# features (categorical + continuous + lag columns) and the dependent
# variable, each taken as an independent copy of the lag frame
X = lag_train_df[cat_vars + cont_vars + team_lag_vars + player_lag_vars].copy()
y = lag_train_df[dep_var].copy()

In [12]:
# encoders used in this cell
enc = LabelEncoder()
dv = DictVectorizer(sparse=False, separator='_')

# position is categorical, so turn its values into strings
X['position'] = X['position'].apply(str)

# season is replaced by the integer codes from the label encoder
X['season'] = enc.fit_transform(X['season'])

# one dict per row, vectorised into a dense array
X_dict = X.to_dict(orient="records")
X_encoded = dv.fit_transform(X_dict)

# wrap the encoded array in a frame labelled with the vectoriser's feature
# names; note this frame gets a fresh default index
X_df = pd.DataFrame(X_encoded, columns=dv.feature_names_)

In [13]:
# positional split into training and validation sets
X_train, y_train = X_df.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X_df.iloc[valid_idx], y.iloc[valid_idx]

In [14]:
# TODO (original author): consider imputing NaN instead of setting to zero.
# NOTE(review): the lag columns were already zero-filled before the split and
# only fit() is called here -- no transform is visible in this section, so
# the imputer does not alter X_train or X_test as written.
imp.fit(X_train[team_lag_vars + player_lag_vars])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [15]:
def rf(xs, y, n_estimators=40, max_samples=50_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit a RandomForestRegressor on ``(xs, y)`` and return the fitted model.

    Parameters
    ----------
    xs : features passed unchanged to ``fit``.
    y : targets passed unchanged to ``fit``.
    n_estimators, max_samples, max_features, min_samples_leaf :
        forwarded to the ``RandomForestRegressor`` constructor.
    **kwargs :
        any further constructor arguments, e.g. ``random_state`` for a
        repeatable fit. These were previously accepted but silently
        discarded; they are now forwarded and may also override the
        ``n_jobs=-1`` / ``oob_score=True`` defaults set here.

    Returns
    -------
    The estimator returned by ``fit``.
    """
    # Collect the defaults first so caller-supplied kwargs can override them
    # without triggering a duplicate-keyword error.
    params = dict(
        n_jobs=-1,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        oob_score=True,
    )
    params.update(kwargs)
    return RandomForestRegressor(**params).fit(xs, y)

In [16]:
# fit the forest on the training split; the target frame is flattened to 1-D
m = rf(X_train, y_train.to_numpy().ravel())

In [17]:
# score the validation predictions with the r_mse helper
# (presumably root-mean-squared error -- defined in helpers, confirm there)
r_mse(m.predict(X_test), y_test.to_numpy().ravel())

1.805422