In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the CSV into a DataFrame, using the file's first column as the row index.
path = 'bb_agg.csv'
df = pd.read_csv(filepath_or_buffer=path, index_col=0)

In [3]:
# Columns handled as categorical (one-hot encoded by the pipelines below).
cat_cols = ['yearID', 'lgID']

# Every remaining column is treated as continuous, after removing the
# categorical columns, the target 'W' and the 'teamID' column.
# Index.drop keeps the original column order.
cont_cols = df.columns.drop([*cat_cols, 'W', 'teamID'])

In [4]:
# Create the design matrix and target vector, then split into train and test.

X = df[cat_cols + list(cont_cols)]  # categorical columns first, then continuous
y = df['W']                         # target vector

# Seed the split: without random_state the partition (and therefore every
# score reported below) changes on each Restart & Run All, which makes the
# results impossible to reproduce or compare between models.
# test_size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Ridge regression: assemble preprocessing + model into one pipeline and
# tune the penalty strength with a 5-fold cross-validated grid search.

# Candidate penalties: 20 log-spaced values between 1e-2 and 1e2.
param_grid = dict(ridge__alpha=np.logspace(-2, 2, num=20))

# Categorical branch: one-hot encode (categories unseen during fit are
# ignored rather than raising), then add pairwise interaction terms.
# NOTE(review): include_bias is left at its default in this branch; if that
# default emits a constant column, it acts as a penalised intercept given
# fit_intercept=False below -- confirm this is intended.
pipe_cat = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True)),
])

# Continuous branch: degree-2 polynomial expansion without a bias column,
# followed by standardisation of the expanded features.
pipe_cont = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch; any other column is dropped.
ct = ColumnTransformer(
    transformers=[
        ('cat', pipe_cat, cat_cols),
        ('cont', pipe_cont, cont_cols),
    ],
    remainder='drop',
)

pipe = Pipeline(steps=[
    ('preprocess', ct),
    ('ridge', Ridge(fit_intercept=False)),
])

ridgeCV = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
ridgeCV.fit(X_train, y_train)

# Score of the refit best pipeline on the held-out test set.
print(ridgeCV.score(X_test, y_test))

0.7667087204878169


In [None]:
# Random forest: same preprocessing as the ridge pipeline, with tree depth
# and minimum split size tuned by a 5-fold cross-validated grid search.

param_grid = {'rf__max_depth': [5, 10, 20, None],
              'rf__min_samples_split': [2, 5, 10]}

# The interaction / polynomial features are not strictly needed for a
# random forest, but keeping them was observed to improve performance a bit.

pipe_cat = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore')),
                     ('poly', PolynomialFeatures(degree=2, interaction_only=True))])

pipe_cont = Pipeline([('poly', PolynomialFeatures(include_bias=False)),
                     ('scaler', StandardScaler())])

ct = ColumnTransformer([('cat', pipe_cat, cat_cols),
                        ('cont', pipe_cont, cont_cols)],
                        remainder='drop')

# Seed the forest: an unseeded RandomForestRegressor draws different
# bootstrap samples / feature subsets on every fit, so differences between
# grid points (and the final test score) would partly be seed noise and
# would not reproduce on a re-run.
pipe = Pipeline([('preprocess', ct),
                 ('rf', RandomForestRegressor(random_state=42))])

rfCV = GridSearchCV(pipe, param_grid=param_grid, cv=5)
rfCV.fit(X_train, y_train)

# Score of the refit best pipeline on the held-out test set.
print(rfCV.score(X_test, y_test))

In [None]:
# Tuning the polynomial degree as well:
# leave `degree` unset on the continuous branch's PolynomialFeatures and
# expose it to the grid search through its nested parameter name
# (<pipeline step>__<transformer name>__<inner step>__<parameter>).

param_grid = dict(
    ridge__alpha=np.logspace(-2, 2, num=20),
    preprocess__cont__poly__degree=[1, 2],
)

# Categorical branch: one-hot encoding plus pairwise interaction terms.
pipe_cat = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True)),
])

# Continuous branch: polynomial expansion (degree chosen by the grid search)
# without a bias column, then standardisation.
pipe_cont = Pipeline(steps=[
    ('poly', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch; any other column is dropped.
ct = ColumnTransformer(
    transformers=[
        ('cat', pipe_cat, cat_cols),
        ('cont', pipe_cont, cont_cols),
    ],
    remainder='drop',
)

pipe = Pipeline(steps=[
    ('preprocess', ct),
    ('ridge', Ridge(fit_intercept=False)),
])

ridgeCV = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
ridgeCV.fit(X_train, y_train)

# Score of the refit best pipeline on the held-out test set.
print(ridgeCV.score(X_test, y_test))