In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from yahoo_baseball_assistant import hitting
import pickle
import os

In [4]:
# Date windows handed to hitting.build_dataset, one entry per dataset slice.
# Each entry is a pair of [start, end] date ranges; presumably the first range
# is the period the *_P0 features come from and the second is the period the
# 'HR' label comes from -- TODO confirm against hitting.build_dataset.
# A previous three-range variant of this list was assigned and then immediately
# overwritten by this one, so it never took effect and is not kept here.
dt_ranges = [
  [['2010-04-01', '2010-09-30'], ['2011-04-01', '2011-06-30']],
  [['2010-07-01', '2011-06-30'], ['2011-07-01', '2011-09-30']],
  [['2011-04-01', '2011-09-30'], ['2012-04-01', '2012-06-30']],
  [['2011-07-01', '2012-06-30'], ['2012-07-01', '2012-09-30']],
  [['2012-04-01', '2012-09-30'], ['2013-04-01', '2013-06-30']],
  [['2012-07-01', '2013-06-30'], ['2013-07-01', '2013-09-30']],
  [['2013-04-01', '2013-09-30'], ['2014-04-01', '2014-06-30']],
  [['2013-07-01', '2014-06-30'], ['2014-07-01', '2014-09-30']],
  [['2014-04-01', '2014-09-30'], ['2015-04-01', '2015-06-30']],
  [['2014-07-01', '2015-06-30'], ['2015-07-01', '2015-09-30']],
  [['2015-04-01', '2015-09-30'], ['2016-04-01', '2016-06-30']],
  [['2015-07-01', '2016-06-30'], ['2016-07-01', '2016-09-30']],
  [['2016-04-01', '2016-09-30'], ['2017-04-01', '2017-06-30']],
  [['2016-07-01', '2017-06-30'], ['2017-07-01', '2017-09-30']],
  [['2017-04-01', '2017-09-30'], ['2018-04-01', '2018-06-30']],
  [['2017-07-01', '2018-06-30'], ['2018-07-01', '2018-09-30']],
]

# Build every slice first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on each iteration.
frames = [
  hitting.build_dataset(dt_range, 'HR', min_PA=200)
  for dt_range in dt_ranges
]
df = pd.concat(frames)

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Features are every column except the first and the last; the last column is
# the label. (The first column is skipped -- presumably an identifier such as
# the player name; verify against hitting.build_dataset.)
feature_cols = slice(1, -1)
train_data, train_label = train.iloc[:, feature_cols], train.iloc[:, -1]
test_data, test_label = test.iloc[:, feature_cols], test.iloc[:, -1]

# Show the feature columns, dtypes and non-null counts of the training set.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2442 entries, 152 to 114
Data columns (total 20 columns):
Age_P0    2442 non-null int64
R_P0      2442 non-null float64
H_P0      2442 non-null float64
2B_P0     2442 non-null float64
3B_P0     2442 non-null float64
HR_P0     2442 non-null float64
RBI_P0    2442 non-null float64
BB_P0     2442 non-null float64
IBB_P0    2442 non-null float64
SO_P0     2442 non-null float64
HBP_P0    2442 non-null float64
SH_P0     2442 non-null float64
SF_P0     2442 non-null float64
GDP_P0    2442 non-null float64
SB_P0     2442 non-null float64
CS_P0     2442 non-null float64
BA_P0     2442 non-null float64
OBP_P0    2442 non-null float64
SLG_P0    2442 non-null float64
OPS_P0    2442 non-null float64
dtypes: float64(19), int64(1)
memory usage: 400.6 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training label.
train_label.describe()

count    2442.000000
mean        0.030522
std         0.016985
min         0.000000
25%         0.017498
50%         0.029412
75%         0.041667
max         0.105556
Name: HR, dtype: float64

In [7]:
# Baseline: ordinary least-squares linear regression on the raw features.
# `normalize` is intentionally not passed: False was its default, and the
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2, where
# supplying it raises a TypeError.
model = LinearRegression()
model.fit(train_data, train_label)
# R^2 on the training data itself (not a held-out estimate).
model.score(train_data, train_label)

0.46101612360014405

In [8]:
# Lasso (L1-penalised) regression model.
# `normalize` is intentionally not passed: False was its default, and the
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2, where
# supplying it raises a TypeError.
# NOTE(review): alpha=0.1 is large next to the label scale (std ~0.017 in the
# recorded describe() output); the recorded training R^2 of 0.0 suggests the
# penalty may be shrinking every coefficient to zero -- consider a much
# smaller alpha or scaling the features.
model = linear_model.Lasso(alpha=0.1)
model.fit(train_data, train_label)
# R^2 on the training data itself (not a held-out estimate).
model.score(train_data, train_label)

0.0

In [9]:
# ElasticNet with scikit-learn's default hyper-parameters, fitted on the
# training split (fit() returns the estimator, so it can be chained).
model = linear_model.ElasticNet().fit(train_data, train_label)
# Training-set R^2.
model.score(train_data, train_label)

0.0

In [10]:
# Ridge (L2-penalised) regression with alpha = 0.5, fitted on the training
# split (fit() returns the estimator, so it can be chained).
model = linear_model.Ridge(alpha=0.5).fit(train_data, train_label)
# Training-set R^2.
model.score(train_data, train_label)

0.4435136815493899

In [11]:
# Pipeline: Normalizer -> degree-2 polynomial feature expansion -> ordinary
# least-squares regression.
# `normalize` is intentionally not passed to LinearRegression: False was its
# default, and the parameter was deprecated in scikit-learn 1.0 and removed in
# 1.2, where supplying it raises a TypeError.
# NOTE(review): Normalizer rescales each *row* (sample) to unit norm rather
# than standardising each feature column -- confirm that is intended (a
# column-wise scaler such as StandardScaler may have been meant).
model = make_pipeline(Normalizer(), PolynomialFeatures(2), LinearRegression())
model.fit(train_data, train_label)
# R^2 on the training data itself (not a held-out estimate).
model.score(train_data, train_label)

0.5172637603095558

In [16]:
# R^2 of the current `model` on the held-out test split.
model.score(test_data, test_label)

0.31351583830501295

In [20]:
# Persist the fitted model to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaving an unnamed
# open handle to be cleaned up by the garbage collector.
with open("../data/hr.model", "wb") as model_file:
  pickle.dump(model, model_file)