# Modeling Exploration

### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor

from sklearn import metrics
np.random.seed(42)

In [2]:
df = pd.read_csv('../data/game_logs.csv')

In [3]:
df.head()

Unnamed: 0,Date,Home,Opp,R,H,HR,BA,OPS,Team,abbrev_name,...,K/BB,HR/9,WHIP,xFIP,HardHit%_P,wOBA,wRC+,OBP+,Barrel%,HardHit%
0,Apr 7,False,SDP,4,3,1,0.115,0.554,ARI,Y.Darvish,...,5.22,1.03,1.01,3.55,0.381,0.305,92,97,0.071,0.36
1,Apr 8,False,SDP,0,2,0,0.091,0.364,ARI,S.Manaea,...,2.77,1.44,1.33,3.93,0.411,0.305,92,97,0.071,0.36
2,Apr 9,False,SDP,2,5,1,0.116,0.426,ARI,J.Musgrove,...,4.43,0.92,1.02,3.38,0.331,0.305,92,97,0.071,0.36
3,Apr 10,False,SDP,5,5,2,0.129,0.516,ARI,N.Crismatt,...,3.06,0.34,1.17,3.44,0.36,0.305,92,97,0.071,0.36
4,Apr 12,False,HOU,1,4,1,0.13,0.518,ARI,L.Garcia,...,3.51,1.37,1.13,3.69,0.348,0.305,92,97,0.071,0.36


In [4]:
df.shape

(3647, 21)

### Setting up dataframes and train/test split

In [5]:
df['Home'] = df['Home'].astype(int)

In [6]:
X = df.drop(columns = ['Date', 'R', 'Team', 'abbrev_name', 'Opp'])
y = df['R']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [7]:
X.tail()

Unnamed: 0,Home,H,HR,BA,OPS,ERA,K/BB,HR/9,WHIP,xFIP,HardHit%_P,wOBA,wRC+,OBP+,Barrel%,HardHit%
3642,1,9,3,0.248,0.69,4.2,3.7,1.36,1.16,3.31,0.47,0.304,92,100,0.062,0.351
3643,1,9,1,0.248,0.69,3.63,2.47,0.62,1.41,3.55,0.319,0.304,92,100,0.062,0.351
3644,0,10,1,0.248,0.69,3.47,3.1,1.16,1.14,4.21,0.361,0.304,92,100,0.062,0.351
3645,0,10,2,0.248,0.691,3.4,5.22,1.03,1.01,3.55,0.381,0.304,92,100,0.062,0.351
3646,0,7,0,0.248,0.689,3.66,2.92,0.69,1.27,3.33,0.375,0.304,92,100,0.062,0.351


### Pipeline with standard scalar and different regression models to find best type of model

In [8]:
# Pipelines function to run models and output scores and mse
def pipe_model(model):
    pipe = Pipeline([
        ('ss', StandardScaler()),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    print(f'{model}')
    print(f'Training r2 score: {pipe.score(X_train, y_train)}')
    print(f'Testing r2 score: {pipe.score(X_test, y_test)}')
    print(f'Testing MSE: {metrics.mean_squared_error(y_test, pipe.predict(X_test))}')
    return print('='*40)

In [9]:
pipe_model(LinearRegression())
pipe_model(KNeighborsRegressor())
pipe_model(DecisionTreeRegressor())
pipe_model(BaggingRegressor())
pipe_model(RandomForestRegressor())
pipe_model(AdaBoostRegressor())

LinearRegression()
Training r2 score: 0.6903796282979524
Testing r2 score: 0.7108435878942181
Testing MSE: 2.949801181415709
KNeighborsRegressor()
Training r2 score: 0.6965464477237789
Testing r2 score: 0.5900417821848458
Testing MSE: 4.182149122807018
DecisionTreeRegressor()
Training r2 score: 1.0
Testing r2 score: 0.36680968056876195
Testing MSE: 6.459429824561403
BaggingRegressor()
Training r2 score: 0.938760308834778
Testing r2 score: 0.629578826336872
Testing MSE: 3.7788157894736845
RandomForestRegressor()
Training r2 score: 0.9528687209521266
Testing r2 score: 0.6819087870815124
Testing MSE: 3.2449767543859647
AdaBoostRegressor()
Training r2 score: 0.6208484786048866
Testing r2 score: 0.6224605791977883
Testing MSE: 3.851431899445324


Based on the above modeling, I will use Linear regression

### Baseline model MSE

In [10]:
metrics.mean_squared_error(y_test, pd.Series(y.mean(), index=range(len(y_test))))

10.21514700119001

## Lasso, Ridge, Elastic Net
----

Because I am using Linear Regression as my model of choice, it may prove fruitful to use lasso, ridge, or elastic net(Both) to help regularize.

In [11]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

alphas = np.logspace(0, 5, 100)
# Cross-validate over our list of ridge alphas.
ridge_cv = RidgeCV(alphas = alphas, cv = 5)
# Fit model using best ridge alpha!
ridge_cv.fit(X_train_sc, y_train)
print(ridge_cv.alpha_)
print(ridge_cv.best_score_)
print(ridge_cv.score(X_train_sc, y_train))
print(ridge_cv.score(X_test_sc, y_test))

14.508287784959395
0.6860978581932317
0.6903423498661614
0.7110994240632711


In [12]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3, 0, 100)
# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas = l_alphas)
# Fit model using best ridge alpha!
lasso_cv.fit(X_train_sc, y_train)
print(lasso_cv.alpha_)
print(lasso_cv.score(X_train_sc, y_train))
print(lasso_cv.score(X_test_sc, y_test))

0.0032745491628777285
0.6902375350275929
0.7113523666720671


In [13]:
# Set up a list of alphas to check.
enet_alphas = np.linspace(0.5, 1.0, 100)

# Set up our l1 ratio.
enet_ratio = 0.5

# Instantiate model.
enet_model = ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_ratio, cv=5)

# Fit model using optimal alpha.
enet_model = enet_model.fit(X_train_sc, y_train)

# Evaluate model.
print(enet_model.score(X_train_sc, y_train))
print(enet_model.score(X_test_sc, y_test))

0.6336335383482338
0.6528289880988085


Linear regression is still performing the best, so we will move forward with that.

### Linear Regression Model
----

In [14]:
pipe = Pipeline([
        ('ss', StandardScaler()),
        ('lr', LinearRegression())
    ])
pipe.fit(X_train, y_train)
print(f'Training r2 score: {pipe.score(X_train, y_train)}')
print(f'Testing r2 score: {pipe.score(X_test, y_test)}')
print(f'Testing MSE: {metrics.mean_squared_error(y_test, pipe.predict(X_test))}')

Training r2 score: 0.6903796282979524
Testing r2 score: 0.7108435878942181
Testing MSE: 2.949801181415709


In [15]:
with open('../models/lr.pkl', 'wb') as f:
    pickle.dump(pipe, f)