<a href="https://colab.research.google.com/github/sawyerhunt12/MLB-Runs-Scored/blob/main/MLB_Runs_Scored_Ridge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/gdrive so files stored in Drive can be read from this runtime
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Import ridge regression specific packages
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the dataset from the mounted Google Drive.
# The absolute path under the mount point (/content/gdrive) is used so the read
# does not depend on the notebook's current working directory.
data = pd.read_csv('/content/gdrive/My Drive/MLB_stolen_bases.csv')

Perform ridge regression without hyperparameter tuning (fixed alpha = 1).

In [None]:
# Ridge regression inputs

# Columns used as features (independent variables)
feature_columns = [
    "r_total_stolen_base",
    "on_base_percent",
    "batting_avg",
    "on_base_plus_slg",
    "hit",
    "walk",
    "slg_percent",
]
X = data[feature_columns].values

# Column used as the dependent (target) variable
y = data["r_run"]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize features: learn the scaling statistics from the training split only,
# then apply that same fitted scaler to both the training and the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Untuned ridge model: the regularization strength is fixed at 1
alpha = 1
ridge_model = Ridge(alpha=alpha)

In [None]:
# Fit the untuned ridge model on the standardized training features
ridge_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the target for the standardized held-out test features
y_pred_ridge = ridge_model.predict(X=X_test_scaled)

In [None]:
# Report the fitted intercept, then each coefficient next to its display label.
# Labels are listed in the same order as the feature columns used to build X.
print("Ridge Intercept:", ridge_model.intercept_)
coef_labels = [
    "Stolen Bases",
    "OBP",
    "Batting Avg",
    "OPS",
    "Hits",
    "Walks",
    "Slugging Percentage",
]
for position, label in enumerate(coef_labels):
    print(label + ":", ridge_model.coef_[position])

Ridge Intercept: 79.86792452830188
Stolen Bases: 4.41420409741412
OBP: 4.26501807026336
Batting Avg: -9.605800501974574
OPS: 4.497081881411953
Hits: 10.714363925688628
Walks: 1.562316769537352
Slugging Percentage: 3.9620817644071122


In [None]:
# Score the untuned ridge model on the held-out test set with mean squared error (MSE)
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
print("Ridge MSE:", mse_ridge)

Ridge MSE: 57.68734094119603


In [None]:
# Apply predictions to the full dataframe
features = data[["r_total_stolen_base", "on_base_percent", "batting_avg", "on_base_plus_slg", "hit", "walk", "slg_percent"]].values

# Scale with the scaler that was already fitted on the training split.
# Calling fit_transform here would refit the scaler on every row, so the inputs
# would be scaled differently from the data ridge_model was trained on, and the
# shared scaler object would be silently changed for any later cell that uses it.
features = scaler.transform(features)

# Make predictions using the trained (untuned) model.
# Note: the dataframe contains the rows used for training as well as the test
# rows, so this table is not a purely out-of-sample comparison.
predictions = ridge_model.predict(features)

# Add predictions as a new column to dataframe
data["predicted_runs"] = predictions

# Print actual runs and predicted runs
print(data[["last_name, first_name", "r_run", "predicted_runs"]])

    last_name, first_name  r_run  predicted_runs
0          Grisham, Trent     67       68.381778
1      Candelario, Jeimer     77       78.736241
2           Hoerner, Nico     98       92.476380
3         Carroll, Corbin    116      107.725600
4      Santander, Anthony     81       85.196172
..                    ...    ...             ...
128     Yoshida, Masataka     71       73.287685
129         Outman, James     86       79.814683
130            Bohm, Alec     74       74.056335
131     Wade Jr., LaMonte     64       69.342964
132       Varsho, Daulton     65       67.409811

[133 rows x 3 columns]


Tune the ridge regression's alpha using a grid search with 5-fold cross-validation.

In [None]:
# Create a ridge regression model with default settings; it is handed to GridSearchCV below as the estimator to tune
ridge_model_cv = Ridge()

In [None]:
# Candidate alpha values: 13 points spaced evenly on a log scale from 1e-6 to 1e6
alphas = np.logspace(start=-6, stop=6, num=13)

In [None]:
# Hyperparameter grid for tuning: only alpha is searched
param_grid = dict(alpha=alphas)

In [None]:
# Search the alpha grid with 5-fold cross-validation, scored by negative mean squared error
ridge_cv = GridSearchCV(
    estimator=ridge_model_cv,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_cv.fit(X_train_scaled, y_train)

In [None]:
# Show the alpha value selected by the grid search
best_alpha = ridge_cv.best_params_['alpha']
print("Best alpha:", best_alpha)

Best alpha: 0.1


In [None]:
# Predict the target for the standardized test features via the fitted grid search object
y_pred = ridge_cv.predict(X=X_test_scaled)

In [None]:
# Score the tuned model's test-set predictions with mean squared error
mse_cv = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse_cv)

Mean Squared Error: 59.73859921575589


In [None]:
# Get the best tuned ridge model from the grid search (its best_estimator_ attribute)
best_ridge_model = ridge_cv.best_estimator_

In [None]:
# Report the tuned model's intercept, then each coefficient next to its display label.
# Labels are listed in the same order as the feature columns used to build X.
print("Ridge Intercept:", best_ridge_model.intercept_)
tuned_coef_labels = [
    "Stolen Bases",
    "OBP",
    "Batting Avg",
    "OPS",
    "Hits",
    "Walks",
    "Slugging Percentage",
]
for position, label in enumerate(tuned_coef_labels):
    print(label + ":", best_ridge_model.coef_[position])

Ridge Intercept: 79.86792452830187
Stolen Bases: 4.321205287241922
OBP: 6.232329778186576
Batting Avg: -12.040170262501974
OPS: 5.01276510048239
Hits: 11.81313468740142
Walks: -0.1364436398176866
Slugging Percentage: 3.6707907418686725


In [None]:
# Apply predictions to the full dataframe
features = data[["r_total_stolen_base", "on_base_percent", "batting_avg", "on_base_plus_slg", "hit", "walk", "slg_percent"]].values

# Scale with the scaler that was already fitted on the training split.
# Calling fit_transform here would refit the scaler on every row, so the inputs
# would be scaled differently from the data best_ridge_model was trained on, and
# the shared scaler object would be silently changed.
features = scaler.transform(features)

# Make predictions using the tuned model.
# Note: the dataframe contains the rows used for training as well as the test
# rows, so this table is not a purely out-of-sample comparison.
predictions = best_ridge_model.predict(features)

# Add predictions as a new column to dataframe (replaces any existing "predicted_runs" column)
data["predicted_runs"] = predictions

# Print actual runs and predicted runs
runs_pred = data[["last_name, first_name", "r_run", "predicted_runs"]]
print(runs_pred)

    last_name, first_name  r_run  predicted_runs
0          Grisham, Trent     67       68.920481
1      Candelario, Jeimer     77       79.372222
2           Hoerner, Nico     98       93.013142
3         Carroll, Corbin    116      108.018060
4      Santander, Anthony     81       85.513644
..                    ...    ...             ...
128     Yoshida, Masataka     71       73.272217
129         Outman, James     86       80.316903
130            Bohm, Alec     74       73.947871
131     Wade Jr., LaMonte     64       69.483154
132       Varsho, Daulton     65       67.232256

[133 rows x 3 columns]


In [None]:
# Show the first 50 rows of actual versus predicted runs scored
first_fifty = runs_pred.head(50)
print(first_fifty)

   last_name, first_name  r_run  predicted_runs
0         Grisham, Trent     67       68.920481
1     Candelario, Jeimer     77       79.372222
2          Hoerner, Nico     98       93.013142
3        Carroll, Corbin    116      108.018060
4     Santander, Anthony     81       85.513644
5            Smith, Will     80       71.970958
6        Swanson, Dansby     81       79.715007
7       Arozarena, Randy     95       91.330047
8           McNeil, Jeff     75       74.927609
9      Walker, Christian     86       91.278626
10          Arraez, Luis     71       83.444660
11           Olson, Matt    127      115.476842
12         Rosario, Amed     70       59.981740
13          Hays, Austin     76       68.985353
14      Rodriguez, Julio    102      106.296679
15         McMahon, Ryan     80       77.838281
16        Ohtani, Shohei    102      113.691904
17         Betts, Mookie    126      114.351924
18     Cronenworth, Jake     54       59.847596
19     Rooker Jr., Brent     61       73