In [32]:
import pandas as pd
from pathlib import Path
import statsmodels.formula.api as sm
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

interim = '../data/interim'
br = pd.read_pickle(Path(interim) / 'batting_records.pkl')
gl = pd.read_pickle(Path(interim) / 'game_logs.pkl')
events = pd.read_pickle(Path(interim) / 'events.pkl')
people = pd.read_pickle(Path(interim) / 'people.pkl')

In [33]:
## combine data
br = br.reset_index()
merged = pd.merge(br, people[['PlayerID', 'birthYear']], left_on = ['BAT_ID'], right_on=['PlayerID'], how='left')
merged = merged.set_index(['BAT_ID', 'year']).sort_values(['BAT_ID', 'year'], ascending=[True, True])

merged['Age'] = merged.index.get_level_values('year') - merged['birthYear']

merged['HPPA'] = merged['HPPA'].astype('float')
merged['H'] = merged['H'].astype('int')
merged['PA'] = merged['PA'].astype('int')
merged['Age'] = merged['Age'].astype('int')

merged = merged[~merged.HPPA.isna()]

merged = merged.sort_values(['BAT_ID', 'year'])
merged['prev_G'] = merged.groupby('BAT_ID')['G'].shift(1)

In [34]:
group = merged.groupby('BAT_ID')
merged['L1_HPPA'] = group['HPPA'].shift(1)
merged['L2_HPPA'] = group['HPPA'].shift(2)
merged['L3_HPPA'] = group['HPPA'].shift(3)

merged['L1_H'] = group['H'].shift(1)
merged['L2_H'] = group['H'].shift(2)
merged['L3_H'] = group['H'].shift(3)

merged['L1_PA'] = group['PA'].shift(1)
merged['L2_PA'] = group['PA'].shift(2)
merged['L3_PA'] = group['PA'].shift(3)

In [35]:
x_vars = [
    'L1_HPPA', 'L2_HPPA', 'L3_HPPA', 'Age',
    'L1_H', 'L2_H', 'L3_H', 'L1_PA', 'L2_PA', 'L3_PA'
]
preprocessor =  ColumnTransformer(
    [('spot', 'passthrough', x_vars)],
    remainder='drop'
)

clf = LassoCV(
    cv=5, random_state=0, max_iter=10000, n_jobs=-1,
)

fitted_model = Pipeline([
    ('select', preprocessor),
    ('impute', IterativeImputer(random_state = 0)),
    ('poly', PolynomialFeatures(2, interaction_only=True)),
    ('scale', StandardScaler()),
    ('clf', clf),
])

fitted_model.fit(merged, merged['HPPA'].astype('float'))
fitted_model.predict



<function sklearn.pipeline.Pipeline.predict(self, X, **predict_params)>

In [36]:
limited = merged[(merged.G >= 50) & (merged.prev_G >= 50)]

In [37]:
results = cross_val_score(fitted_model, limited, limited['HPPA'].astype('float'), cv=10, n_jobs=-1, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.2792489  0.2901825  0.26987954 0.31563515 0.28629009 0.29525236
 0.30450427 0.29853654 0.29970046 0.30453127]
Accuracy: 29.438% (1.271%)
