<a href="https://colab.research.google.com/github/sawyerhunt12/MLB-Runs-Scored/blob/main/MLB_Runs_Scored_Lasso.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the dataset stored there can be read
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read in data from Google Drive.
# The absolute path under the Drive mount point is used so that the load does
# not depend on the kernel's current working directory being /content.
data = pd.read_csv('/content/gdrive/My Drive/MLB_stolen_bases.csv')

In [None]:
# Dimensions of the loaded dataset as (rows, columns)
data.shape

(133, 23)

In [None]:
# Quartiles and the 90th percentile of runs scored
runs = data["r_run"]
twenty_five_runs = runs.quantile(0.25)
seventy_five_runs = runs.quantile(0.75)
ninety_runs = runs.quantile(0.90)

# Interquartile range of runs scored
runs_iqr = seventy_five_runs - twenty_five_runs

# Outlier limits sit 1.5 IQRs beyond the upper and lower quartiles
iqr_margin = 1.5 * runs_iqr
upper_outlier_runs = seventy_five_runs + iqr_margin
lower_outlier_runs = twenty_five_runs - iqr_margin

# Report each statistic on its own line
summary_rows = (
    ("75th Percentile:", seventy_five_runs),
    ("25th Percentile:", twenty_five_runs),
    ("90th Percentile:", ninety_runs),
    ("Interquartile Range:", runs_iqr),
    ("Upper Outlier Limit:", upper_outlier_runs),
    ("Lower Outlier Limit:", lower_outlier_runs),
)
for stat_label, stat_value in summary_rows:
    print(stat_label, stat_value)

75th Percentile: 90.0
25th Percentile: 66.0
90th Percentile: 102.0
Interquartile Range: 24.0
Upper Outlier Limit: 126.0
Lower Outlier Limit: 30.0


In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score

Fit a Lasso Regression Model

In [None]:
# Predictors: all relevant offensive stats in the dataset.
# The column order here fixes the order of the fitted coefficients.
X = data[
    [
        "r_total_stolen_base",
        "on_base_percent",
        "batting_avg",
        "on_base_plus_slg",
        "hit",
        "walk",
        "slg_percent",
    ]
].values

# Target: runs scored
y = data["r_run"].values

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Choose the regularization strength (alpha) with 5-fold cross-validation
# on the training set, trying each candidate value in the grid below.
# NOTE(review): the recorded result (0.001) is the smallest candidate in the
# grid, so the best alpha may lie below it -- consider extending the grid.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]

lasso_cv = LassoCV(alphas=alphas, cv=5).fit(X_train, y_train)

# alpha_ holds the value selected by cross-validation
optimal_alpha = lasso_cv.alpha_
print(optimal_alpha)

0.001


In [None]:
# Refit a plain Lasso on the training data at the cross-validated alpha
final_model = Lasso(alpha=optimal_alpha).fit(X_train, y_train)

# Predict runs scored for the held-out test rows
y_pred = final_model.predict(X_test)

# score() returns the coefficient of determination (R^2) on the test set
test_score = final_model.score(X_test, y_test)
print(test_score)

0.8720197381494038


In [None]:
# Print the fitted intercept, then one coefficient per feature.
# Labels are listed in the same order as the columns used to build X,
# so position i in coef_ belongs to label i.
print("Intercept:", final_model.intercept_)

coefficient_labels = (
    "Stolen Bases:",
    "OBP:",
    "Batting Avg:",
    "OPS:",
    "Hits:",
    "Walks:",
    "Slugging Percentage:",
)
for position, label in enumerate(coefficient_labels):
    print(label, final_model.coef_[position])

Intercept: -25.300694494424036
Stolen Bases: 0.3862184747414358
OBP: 122.18052400503669
Batting Avg: -457.2620764742318
OPS: 131.80473568584634
Hits: 0.529112311566787
Walks: 0.028528748779291995
Slugging Percentage: -0.0


In [None]:
# Mean squared error between actual and predicted runs on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 58.57844429730567


In [None]:
# Build the feature matrix for every row of the dataset, using the same
# columns (in the same order) that the model was trained on.
features = data[
    [
        "r_total_stolen_base",
        "on_base_percent",
        "batting_avg",
        "on_base_plus_slg",
        "hit",
        "walk",
        "slg_percent",
    ]
].values

# Predict runs for every row and store the result as a new dataframe column.
# Note: this covers the training rows as well as the test rows, so the
# comparison below is not a purely out-of-sample view.
predictions = final_model.predict(features)
data["predicted_runs"] = predictions

# Actual vs. predicted runs per player
runs_pred = data[["last_name, first_name", "r_run", "predicted_runs"]]
print(runs_pred)

    last_name, first_name  r_run  predicted_runs
0          Grisham, Trent     67       67.702416
1      Candelario, Jeimer     77       79.144637
2           Hoerner, Nico     98       94.254209
3         Carroll, Corbin    116      110.684492
4      Santander, Anthony     81       85.865241
..                    ...    ...             ...
128     Yoshida, Masataka     71       73.122824
129         Outman, James     86       80.166705
130            Bohm, Alec     74       73.758611
131     Wade Jr., LaMonte     64       68.482267
132       Varsho, Daulton     65       66.599807

[133 rows x 3 columns]


In [None]:
# Show actual vs. predicted runs scored for the first 50 records
print(runs_pred.head(n=50))

   last_name, first_name  r_run  predicted_runs
0         Grisham, Trent     67       67.702416
1     Candelario, Jeimer     77       79.144637
2          Hoerner, Nico     98       94.254209
3        Carroll, Corbin    116      110.684492
4     Santander, Anthony     81       85.865241
5            Smith, Will     80       71.243642
6        Swanson, Dansby     81       79.641657
7       Arozarena, Randy     95       91.877215
8           McNeil, Jeff     75       74.212378
9      Walker, Christian     86       92.193767
10          Arraez, Luis     71       84.028119
11           Olson, Matt    127      118.064990
12         Rosario, Amed     70       59.248734
13          Hays, Austin     76       68.696992
14      Rodriguez, Julio    102      108.325774
15         McMahon, Ryan     80       77.790487
16        Ohtani, Shohei    102      116.750303
17         Betts, Mookie    126      117.117698
18     Cronenworth, Jake     54       58.238307
19     Rooker Jr., Brent     61       73