In [1]:
from pybaseball import batting_stats_range
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [2]:
def build_hitting_dataset(date_ranges, predict_category, min_PA=None):
    '''Build the dataset used for hitting predictions.

    Stats are fetched with batting_stats_range() for every date range,
    transformed with transform_hitting(), and inner-joined on the player
    'Name' column, so only players present in every range are kept.

    Args:
      date_ranges (list of ranges)  List of [start, end] date ranges to
        use for the dataset.  There must be at least two ranges here:
        the first n-1 ranges are used for the input data and the last
        one supplies the predicted value.

      predict_category (str) The hitting category that we will predict.
        The category name must be one of the columns returned by the
        batting_stats_range() API.

      min_PA (int) If set, it is passed to transform_hitting() for every
        range (input and predicted), which keeps only rows with at
        least this many plate appearances.

    Returns:
      DataFrame: The dataset of k columns.  The first column is 'Name',
        followed by the input columns (suffixed with "_P<i>" where i is
        the index of the input range), and the last column is the
        predicted value.

    Raises:
      ValueError: If fewer than two date ranges are supplied.

    Examples:
      df = build_hitting_dataset([['2016-01-01', '2016-12-31'],
                                  ['2017-01-01', '2017-12-31']],
                                 'HR')
    '''
    if len(date_ranges) < 2:
        raise ValueError("Must have at least 2 date ranges")
    input_ranges = date_ranges[:-1]
    predict_range = date_ranges[-1]

    input_data = None
    for i, dr in enumerate(input_ranges):
        data = transform_hitting(batting_stats_range(dr[0], dr[1]), min_PA=min_PA)
        # Each input period is joined side by side, so every column except
        # the join key gets a per-period suffix to avoid name collisions.
        data = data.rename(
            columns=lambda col: col if col == "Name" else "{}_P{}".format(col, i))
        if input_data is None:
            input_data = data
        else:
            input_data = pd.merge(input_data, data, on='Name')

    predict_full_data = transform_hitting(
        batting_stats_range(predict_range[0], predict_range[1]), min_PA=min_PA)
    # Only the join key and the target column are needed from the last range.
    predict_data = predict_full_data[['Name', predict_category]]
    # NOTE(review): joining on 'Name' alone would conflate distinct players
    # who share a name -- confirm whether a stable player id is available.
    return pd.merge(input_data, predict_data, on='Name')

    
def transform_hitting(df, min_PA=None):
    '''Transform column values in a hitting dataset.

    The input DataFrame is not modified; all work is done on a copy.

    Args:
      df (DataFrame) Hitting DataFrame to transform.  It must contain a
        'PA' column, the counting-stat columns listed below, and the
        '#days', 'Lev', 'Tm', 'G' and 'AB' columns.
      min_PA (int)  If set, keep only rows whose plate appearances (PA)
        are greater than or equal to this value.

    Returns:
      DataFrame: The transformed DataFrame, with counting stats expressed
        per plate appearance, rows containing nulls removed, and the
        '#days', 'Lev', 'Tm', 'PA', 'G' and 'AB' columns dropped.
    '''
    counting_stats = ['R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB',
                      'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB', 'CS']
    # Filter first (row-wise division does not affect which rows qualify),
    # then copy so the caller's DataFrame is never overwritten.
    kept = df if min_PA is None else df[df['PA'] >= min_PA]
    kept = kept.copy()
    # Convert the counting stats to ratios of plate appearances
    for counting_stat in counting_stats:
        kept[counting_stat] = kept[counting_stat] / kept['PA']
    # Drop any rows with nulls, then the columns that are not used as features
    return kept.dropna().drop(columns=['#days', 'Lev', 'Tm', 'PA', 'G', 'AB'])
    

In [3]:
# Alternative configuration, kept for reference and NOT used below: each entry
# has three ranges, i.e. two input periods followed by the period to predict.
# It previously shared the name `dt_ranges` and was silently overwritten.
dt_ranges_two_inputs = [
    [ ['2012-07-01', '2015-06-30'], ['2015-07-01', '2015-09-30'], ['2016-04-01', '2016-06-30'] ],
    [ ['2013-04-01', '2015-09-30'], ['2016-04-01', '2016-06-30'], ['2016-07-01', '2016-09-30'] ],
    [ ['2013-07-01', '2016-06-30'], ['2016-07-01', '2016-09-30'], ['2017-04-01', '2017-06-30'] ],
    [ ['2014-04-01', '2016-09-30'], ['2017-04-01', '2017-06-30'], ['2017-07-01', '2017-09-30'] ],
    [ ['2014-07-01', '2017-06-30'], ['2017-07-01', '2017-09-30'], ['2018-04-01', '2018-06-30'] ],
    [ ['2015-04-01', '2017-09-30'], ['2018-04-01', '2018-06-30'], ['2018-07-01', '2018-09-30'] ],
]

# Active configuration: each entry is one input period followed by the
# period whose value is predicted.
dt_ranges = [
    [ ['2010-04-01', '2010-09-30'], ['2011-04-01', '2011-06-30'] ],
    [ ['2010-07-01', '2011-06-30'], ['2011-07-01', '2011-09-30'] ],
    [ ['2011-04-01', '2011-09-30'], ['2012-04-01', '2012-06-30'] ],
    [ ['2011-07-01', '2012-06-30'], ['2012-07-01', '2012-09-30'] ],
    [ ['2012-04-01', '2012-09-30'], ['2013-04-01', '2013-06-30'] ],
    [ ['2012-07-01', '2013-06-30'], ['2013-07-01', '2013-09-30'] ],
    [ ['2013-04-01', '2013-09-30'], ['2014-04-01', '2014-06-30'] ],
    [ ['2013-07-01', '2014-06-30'], ['2014-07-01', '2014-09-30'] ],
    [ ['2014-04-01', '2014-09-30'], ['2015-04-01', '2015-06-30'] ],
    [ ['2014-07-01', '2015-06-30'], ['2015-07-01', '2015-09-30'] ],
    [ ['2015-04-01', '2015-09-30'], ['2016-04-01', '2016-06-30'] ],
    [ ['2015-07-01', '2016-06-30'], ['2016-07-01', '2016-09-30'] ],
    [ ['2016-04-01', '2016-09-30'], ['2017-04-01', '2017-06-30'] ],
    [ ['2016-07-01', '2017-06-30'], ['2017-07-01', '2017-09-30'] ],
    [ ['2017-04-01', '2017-09-30'], ['2018-04-01', '2018-06-30'] ],
    [ ['2017-07-01', '2018-06-30'], ['2018-07-01', '2018-09-30'] ],
]

# Build one dataset per entry and stack them in a single concat, instead of
# growing the frame with repeated concats inside a loop. Row indexes are
# preserved as-is, exactly as with the incremental concat.
df = pd.concat(
    [build_hitting_dataset(dt_range, 'HR', min_PA=200) for dt_range in dt_ranges])

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# The first column is skipped (presumably the 'Name' join key) and the last
# column is the target; everything in between is used as a feature.
train_data, test_data = (split.iloc[:, 1:-1] for split in (train, test))
train_label, test_label = (split.iloc[:, -1] for split in (train, test))
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2442 entries, 152 to 114
Data columns (total 20 columns):
Age_P0    2442 non-null int64
R_P0      2442 non-null float64
H_P0      2442 non-null float64
2B_P0     2442 non-null float64
3B_P0     2442 non-null float64
HR_P0     2442 non-null float64
RBI_P0    2442 non-null float64
BB_P0     2442 non-null float64
IBB_P0    2442 non-null float64
SO_P0     2442 non-null float64
HBP_P0    2442 non-null float64
SH_P0     2442 non-null float64
SF_P0     2442 non-null float64
GDP_P0    2442 non-null float64
SB_P0     2442 non-null float64
CS_P0     2442 non-null float64
BA_P0     2442 non-null float64
OBP_P0    2442 non-null float64
SLG_P0    2442 non-null float64
OPS_P0    2442 non-null float64
dtypes: float64(19), int64(1)
memory usage: 400.6 KB


In [5]:
# Summary statistics of the training target (last column of the training split)
train_label.describe()

count    2442.000000
mean        0.030522
std         0.016985
min         0.000000
25%         0.017498
50%         0.029412
75%         0.041667
max         0.105556
Name: HR, dtype: float64

In [6]:
# LinearRegressionModel
# The `normalize` keyword was removed from scikit-learn's linear models
# (passing it raises TypeError on recent versions). False was the default,
# so omitting it keeps the same behavior.
model = LinearRegression()
model.fit(train_data, train_label)
# R^2 on the training data
model.score(train_data, train_label)

0.46101612360014405

In [7]:
# Lasso regression model
# The `normalize` keyword was removed from scikit-learn's linear models
# (passing it raises TypeError on recent versions). False was the default,
# so omitting it keeps the same behavior.
# NOTE(review): the recorded training score is 0.0; alpha=0.1 is probably far
# too strong for a target this small and zeroes out every coefficient --
# confirm and try a much smaller alpha.
model = linear_model.Lasso(alpha=0.1)
model.fit(train_data, train_label)
# R^2 on the training data
model.score(train_data, train_label)

0.0

In [8]:
# ElasticNet regression model (library-default hyperparameters)
# fit() returns the estimator itself, so construction and fitting can be chained.
model = linear_model.ElasticNet().fit(train_data, train_label)
# R^2 on the training data
model.score(train_data, train_label)

0.0

In [9]:
# Ridge regression (L2-regularized linear model, alpha=0.5)
# fit() returns the estimator itself, so construction and fitting can be chained.
model = linear_model.Ridge(alpha=.5).fit(train_data, train_label)
# R^2 on the training data
model.score(train_data, train_label)

0.4435136815493899

In [10]:
# Degree-2 polynomial regression pipeline.
# The `normalize` keyword was removed from scikit-learn's linear models
# (passing it raises TypeError on recent versions). False was the default,
# so omitting it keeps the same behavior.
# NOTE(review): Normalizer rescales each *row* to unit norm rather than
# standardizing each feature; StandardScaler (already imported) may have been
# intended -- confirm before changing, since it alters the fitted model.
model = make_pipeline(Normalizer(), PolynomialFeatures(2), LinearRegression())
model.fit(train_data, train_label)
# R^2 on the training data
model.score(train_data, train_label)

0.5172637603095558

In [11]:
# R^2 of the current `model` on the held-out test split
model.score(test_data, test_label)

0.31351583830501295

In [15]:
import pickle

# Persist whichever estimator `model` currently refers to.
# NOTE(review): the execution count jumps from 11 to 15, so confirm that
# `model` is still the polynomial pipeline when this cell runs.
# The context manager ensures the file is flushed and closed once the dump
# finishes, instead of leaving the handle open.
with open("../data/hr.model", "wb") as model_file:
    pickle.dump(model, model_file)