In [75]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from helpers import *

In [101]:
# project root; every data path below is resolved relative to it
path = Path('.')

In [102]:
# load the training data; the first CSV column becomes the index and
# season is kept as a string rather than parsed as a number
train_csv = path / 'data' / 'train_v4.csv'
train_df = pd.read_csv(train_csv, index_col=0, dtype={'season': str})

In [103]:
# add team-level, then player-level, lag features for total_points
# using the lag windows 'all' and 10 (window semantics live in helpers)
# other windows tried previously: 1, 2, 3, 4, 5, 10
lag_train_df, team_lag_vars = team_lag_features(
    train_df, ['total_points'], ['all', 10])
lag_train_df, player_lag_vars = player_lag_features(
    lag_train_df, ['total_points'], ['all', 10])

In [104]:
# missing lag values (team and player) are replaced with 0
lag_cols = team_lag_vars + player_lag_vars
lag_train_df[lag_cols] = lag_train_df[lag_cols].fillna(0)

In [105]:
# validation point (season, gameweek) and validation length
valid_season, valid_gw, valid_len = '1920', 20, 6

# model inputs grouped by type, followed by the prediction target
cat_vars = [
    'season',
    'position',
    'team',
    'opponent_team',
    'was_home',
]
cont_vars = [
    'gw',
    'minutes',
]
dep_var = ['total_points']

In [106]:
# gw and season are treated as ordered categorical variables,
# so the category order is spelled out explicitly
ordered_gws = list(range(1, 39))
ordered_seasons = ['1617', '1718', '1819', '1920']

# Cast straight to an ordered CategoricalDtype and assign the result back.
# This avoids `.cat.set_categories(..., inplace=True)`: the `inplace`
# argument was deprecated in pandas 1.3 and removed in pandas 2.0.
# Values that are not in the listed categories become NaN, and re-running
# the cell leaves the columns unchanged.
gw_dtype = pd.CategoricalDtype(categories=ordered_gws, ordered=True)
season_dtype = pd.CategoricalDtype(categories=ordered_seasons, ordered=True)

lag_train_df['gw'] = lag_train_df['gw'].astype(gw_dtype)
lag_train_df['season'] = lag_train_df['season'].astype(season_dtype)

In [107]:
# build the modelling frame and the train/validation row selectors with
# the create_lag_train helper (per the original note, lag numbers after
# the validation point are adjusted inside the helper)
lag_train_df, train_idx, valid_idx = create_lag_train(
    lag_train_df,
    cat_vars,
    cont_vars,
    player_lag_vars,
    team_lag_vars,
    dep_var,
    valid_season,
    valid_gw,
    valid_len,
)

In [108]:
# separate the model inputs from the dependent variable (both copied)
# NOTE(review): only cat_vars + cont_vars are selected here, so the
# player/team lag columns are not used as features -- confirm intended
feature_cols = cat_vars + cont_vars
X = lag_train_df[feature_cols].copy()
y = lag_train_df[dep_var].copy()

In [109]:
# position is categorical, so store it as text
X['position'] = X['position'].apply(str)

# season is replaced by integer labels
enc = LabelEncoder()
season_labels = enc.fit_transform(X['season'])
X['season'] = season_labels

# vectorise the row dicts into a dense array; feature names are joined with '_'
dv = DictVectorizer(separator='_', sparse=False)
X_dict = X.to_dict(orient="records")
X_encoded = dv.fit_transform(X_dict)

# wrap the encoded array in a DataFrame labelled with the vectoriser's feature names
X_df = pd.DataFrame(data=X_encoded, columns=dv.feature_names_)

In [110]:
# positional split into training and validation rows
X_train, X_test = X_df.iloc[train_idx], X_df.iloc[valid_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[valid_idx]

In [111]:
def rf(xs, y, n_estimators=40, max_samples=50_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit and return a RandomForestRegressor on ``(xs, y)``.

    Parameters
    ----------
    xs
        Feature matrix handed to ``fit``.
    y
        Target values handed to ``fit``.
    n_estimators, max_samples, max_features, min_samples_leaf
        Passed through unchanged to ``RandomForestRegressor``.
    **kwargs
        Any further ``RandomForestRegressor`` keyword arguments, e.g.
        ``random_state`` for a reproducible fit. They may also override the
        ``n_jobs=-1`` and ``oob_score=True`` defaults applied here.

    Returns
    -------
    The fitted estimator (the return value of ``fit``).
    """
    # Forward extra keyword arguments instead of discarding them, so that
    # options such as random_state actually reach the estimator. Caller
    # values win over the two defaults set in this function.
    forest_params = {'n_jobs': -1, 'oob_score': True, **kwargs}
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **forest_params).fit(xs, y)

In [112]:
# fit the forest on the training rows, with the target flattened to 1-D
y_train_flat = y_train.values.ravel()
m = rf(X_train, y_train_flat)

In [116]:
# score the validation predictions with r_mse from helpers
# (presumably root-mean-squared error -- confirm in helpers)
valid_preds = m.predict(X_test)
r_mse(valid_preds, y_test.values.ravel())

1.848427

In [90]:
# display the validation feature matrix
# NOTE(review): the saved output for this cell has an out-of-order execution
# count (In [90]) and shows lag columns that the current cat_vars + cont_vars
# feature selection does not produce -- re-run to refresh it
X_test

Unnamed: 0,gw,minutes,opponent_team_Arsenal,opponent_team_Aston Villa,opponent_team_Bournemouth,opponent_team_Brighton and Hove Albion,opponent_team_Burnley,opponent_team_Cardiff City,opponent_team_Chelsea,opponent_team_Crystal Palace,...,team_West Bromwich Albion,team_West Ham United,team_Wolverhampton Wanderers,total_points_pg_last_10,total_points_pg_last_all,total_points_team_pg_last_10,total_points_team_pg_last_10_opponent,total_points_team_pg_last_all,total_points_team_pg_last_all_opponent,was_home
78390,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.708618,4.237371,44.8,45.8,55.082707,44.526316,1.0
78391,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.111111,5.071273,54.9,40.7,54.196970,40.596491,1.0
78392,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,4.194837,41.5,31.7,49.436090,31.789474,0.0
78393,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.913043,4.779793,37.5,41.9,41.774436,38.452632,0.0
78394,20.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.847826,4.687500,45.8,44.8,44.526316,55.082707,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82049,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.000000,4.320000,41.9,31.7,38.452632,31.789474,1.0
82050,25.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,3.750000,4.447674,56.9,40.9,40.533835,49.721805,1.0
82051,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.621359,2.302017,34.9,34.3,35.242105,38.621212,0.0
82052,25.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,4.600000,3.546372,56.9,40.9,40.533835,49.721805,1.0
