In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from yahoo_baseball_assistant import hitting, baseball_date
import pickle
import os

In [27]:
# Tunables for the date generator, named so they are easy to find and adjust.
# NOTE(review): the exact meaning of each argument is defined by
# baseball_date.Generator, which is not visible here — confirm before tuning.
NUM_PAIRS = 22
RANGE_DAY_LEN = 90

dt_gen = baseball_date.Generator(num_pairs=NUM_PAIRS, range_day_len=RANGE_DAY_LEN)
dt_ranges = dt_gen.produce()

In [29]:
# Build one dataset per consecutive pair of entries in dt_ranges
# (entry i paired with entry i+1) and stack them row-wise into one frame.
# The pieces are collected first and concatenated once; calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
# 'HR' is passed through to hitting.build_dataset together with min_PA=200
# (presumably a minimum plate-appearance filter — verify in hitting).
frames = [
    hitting.build_dataset([dt1, dt2], 'HR', min_PA=200)
    for dt1, dt2 in zip(dt_ranges, dt_ranges[1:])
]
# pd.concat raises on an empty list, so keep the previous behaviour of
# leaving df as None when dt_ranges yields no pairs.
df = pd.concat(frames) if frames else None

In [39]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Features are every column except the first and the last; the last column
# is the regression target.
# NOTE(review): column 0 is excluded from the features — presumably an
# identifier column; verify against hitting.build_dataset.
train_data, train_label = train.iloc[:, 1:-1], train.iloc[:, -1]
test_data, test_label = test.iloc[:, 1:-1], test.iloc[:, -1]

# Quick sanity check of the feature columns, dtypes and null counts.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2325 entries, 11 to 17
Data columns (total 20 columns):
Age_P0    2325 non-null int64
R_P0      2325 non-null float64
H_P0      2325 non-null float64
2B_P0     2325 non-null float64
3B_P0     2325 non-null float64
HR_P0     2325 non-null float64
RBI_P0    2325 non-null float64
BB_P0     2325 non-null float64
IBB_P0    2325 non-null float64
SO_P0     2325 non-null float64
HBP_P0    2325 non-null float64
SH_P0     2325 non-null float64
SF_P0     2325 non-null float64
GDP_P0    2325 non-null float64
SB_P0     2325 non-null float64
CS_P0     2325 non-null float64
BA_P0     2325 non-null float64
OBP_P0    2325 non-null float64
SLG_P0    2325 non-null float64
OPS_P0    2325 non-null float64
dtypes: float64(19), int64(1)
memory usage: 381.4 KB


In [40]:
# Summary statistics of the training target, to see its scale and spread
# before fitting any model.
train_label.describe()

count    2325.000000
mean        0.030773
std         0.017111
min         0.000000
25%         0.017910
50%         0.029412
75%         0.042135
max         0.107239
Name: HR, dtype: float64

In [41]:
# LinearRegressionModel: ordinary least squares baseline on the raw features.
# The `normalize` keyword is intentionally not passed: False was its default,
# and the keyword was removed in scikit-learn 1.2, where passing it raises
# TypeError.
model = LinearRegression()
model.fit(train_data, train_label)
# R^2 on the data the model was fitted on (not a held-out estimate).
model.score(train_data, train_label)

0.4506009968073818

In [42]:
# Lasso regression model (L1-penalised linear regression).
# The `normalize` keyword is intentionally not passed: False was its default,
# and the keyword was removed in scikit-learn 1.2, where passing it raises
# TypeError.
# NOTE(review): a training R^2 of 0.0 means the fit is no better than
# predicting the mean label; alpha=0.1 may be too strong for the scale of
# this target — TODO confirm and tune alpha.
model = linear_model.Lasso(alpha=0.1)
model.fit(train_data, train_label)
# R^2 on the data the model was fitted on (not a held-out estimate).
model.score(train_data, train_label)

0.0

In [43]:
# ElasticNet regression model, using the library's default hyperparameters.
# fit() returns the estimator itself, so construction and fitting are chained.
model = linear_model.ElasticNet().fit(train_data, train_label)

# Training-set R^2 for comparison with the other linear models.
model.score(train_data, train_label)

0.0

In [44]:
# Ridge regression (L2-penalised linear regression) with alpha = 0.5.
# fit() returns the estimator itself, so construction and fitting are chained.
model = linear_model.Ridge(alpha=0.5).fit(train_data, train_label)

# Training-set R^2 for comparison with the other linear models.
model.score(train_data, train_label)

0.4280655948106677

In [45]:
# Degree-2 polynomial regression: normalise, expand to all degree-2
# polynomial terms, then fit ordinary least squares.
# The `normalize` keyword is intentionally not passed to LinearRegression:
# False was its default, and the keyword was removed in scikit-learn 1.2,
# where passing it raises TypeError.
# NOTE(review): Normalizer rescales each sample (row) to unit norm, not each
# feature column — confirm that is intended rather than per-feature scaling
# such as StandardScaler.
model = make_pipeline(Normalizer(), PolynomialFeatures(2), LinearRegression())
model.fit(train_data, train_label)
# R^2 on the data the pipeline was fitted on (not a held-out estimate).
model.score(train_data, train_label)

0.5027193996016462

In [46]:
# R^2 of the current `model` on the held-out test split.
model.score(test_data, test_label)

0.3783969249924911

In [47]:
# Persist whatever is currently bound to `model` (the polynomial pipeline
# when the cells are run top to bottom). The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open("../data/hr.model", "wb") as model_file:
    pickle.dump(model, model_file)