In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

In [3]:
# Directory holding the CSV tables; the trailing slash lets file names be
# appended directly.
path = 'baseball/data/core/'

# Read the three tables used below, one DataFrame per CSV file.
df_batting, df_salaries, df_appearances = (
    pd.read_csv(f'{path}{table}.csv')
    for table in ('Batting', 'Salaries', 'Appearances')
)

In [4]:
# Total the appearance counts per player-season. numeric_only=True keeps string
# columns out of the sum: newer pandas would otherwise concatenate them and the
# row-wise idxmax/max below would then compare strings with numbers.
pos_comb = df_appearances.groupby(['playerID', 'yearID']).sum(numeric_only=True)

# Drop these columns so only the remaining G_<position> columns compete below.
# NOTE(review): presumably they are aggregate totals rather than single
# positions - confirm against the data dictionary.
pos_comb = pos_comb.drop(columns=['G_all', 'GS', 'G_batting', 'G_defense', 'G_of'])

# For each player-season keep the column with the largest total:
#   pos - the part of the column name after the underscore (e.g. 'G_ss' -> 'ss')
#   GP  - the value of that largest column
player_pos = pd.DataFrame({
    'pos': pos_comb.idxmax(axis=1).str.split('_').str[1],
    'GP': pos_comb.max(axis=1),
})

# Exclude player-seasons whose largest column is G_p (presumably pitchers).
player_pos = player_pos[player_pos['pos'] != 'p']

In [5]:
# Collapse the batting rows to one row per player-season by summing the numeric
# columns. numeric_only=True makes the result independent of the pandas
# version: without it, older releases silently drop string columns (with a
# FutureWarning in 1.5) while 2.x concatenates them.
batting_stats = df_batting.groupby(['playerID', 'yearID']).sum(numeric_only=True)

In [6]:
# Keep the three batting columns used as features and add years_in_mlb: the
# 0-based position of each season among that player's rows, in index order.
# Building the frame with .assign (instead of assigning a column onto the
# column-subset result) avoids pandas' SettingWithCopyWarning.
batting_stats = batting_stats[['HR', 'H', 'BB']].assign(
    years_in_mlb=lambda stats: stats.sort_index().groupby(['playerID']).cumcount()
)

In [7]:
# Index salaries by (playerID, yearID) so they can be joined on that key.
# The guard makes the cell safe to re-run: once the key columns have moved into
# the index, calling set_index on them again would raise KeyError.
salary_keys = ['playerID', 'yearID']
if list(df_salaries.index.names) != salary_keys:
    df_salaries = df_salaries.set_index(salary_keys)

In [8]:
# Inner joins on the shared index: a row survives only if it has batting stats,
# a salary record and a position entry.
batting_with_salary = batting_stats.join(df_salaries, how='inner')
df = batting_with_salary.join(player_pos, how='inner')

In [9]:
# Target: salary divided by 1,000,000 to keep the regression on a small scale.
y = df['salary'] / 1000000

# Features: every remaining column, with the index levels turned back into
# columns first. The target and the player identifier are removed.
X = df.reset_index().drop(columns=['salary', 'playerID'])

# Fixed seed so the train/test partition, and every score derived from it, is
# the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Columns to one-hot encode; every other column of X_train is treated as
# continuous.
cat_cols = ['yearID', 'years_in_mlb', 'teamID', 'lgID', 'pos']
cont_cols = X_train.columns.difference(cat_cols)

# Ridge penalties to search: 10 log-spaced values from 0.1 to 100.
param_grid = {'regression__alpha': np.logspace(-1, 2, 10)}

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to the
# old one so the cell runs on either side of the rename.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# handle_unknown='ignore' means categories not seen during fit encode as all
# zeros instead of raising.
cat_pipe = Pipeline([('encode', encoder)])

# Continuous features: degree-2 polynomial expansion (no bias column), then
# standardisation.
cont_pipe = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),
                      ('scale', StandardScaler())])

cf = ColumnTransformer([('cat', cat_pipe, cat_cols),
                        ('cont', cont_pipe, cont_cols)])

pipe = Pipeline([('preprocessing', cf),
                 ('regression', Ridge(fit_intercept=True))])

# 5-fold cross-validated grid search over the Ridge alpha values above.
ridgeCV = GridSearchCV(pipe, param_grid = param_grid, cv=5)
ridgeCV.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('encode',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['yearID',
                                                                          'years_in_mlb',
                                                                          'teamID',
                                                                          'lgID',
                                                                          'pos']),
                                                                      