# LA Dodgers Standings, 1958-present
> This notebook downloads the team's current standings table from [Baseball Reference](https://www.baseball-reference.com/teams/LAD/2024-schedule-scores.shtml) and combines it with historic records for later analysis and visualization.

---

#### Import Python tools and Jupyter config

In [3]:
import us
import json
import requests
import pandas as pd
import jupyter_black
import altair_stiles as altstiles
from IPython.display import Image, display
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
jupyter_black.load()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None
alt.data_transformers.disable_max_rows()
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

---

## Fetch

#### Read game-by-game standings from the historical archive

In [60]:
df = pd.read_parquet("https://stilesdata.com/dodgers/data/standings/archive/dodgers_standings_1958_2023.parquet").query('year<"2024"')

In [61]:
df[['wins', 'losses']] = df['record'].str.split('-', expand=True).astype(int)

In [62]:
df['win_pct'] = (df['wins'] / df['gm']).round(2)

In [68]:
df['game_day'] = pd.to_datetime(df['game_date']).dt.day_name()

In [69]:
df.head()

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,time_minutes,day_night,attendance,year,wins,losses,win_pct,game_day
0,1,1958-04-15,away,SFG,L,0,8,0-1,5,-1.0,2:29:00,149,D,23448,1958,0,1,0.0,Tuesday
1,2,1958-04-16,away,SFG,W,13,1,1-1,4,-0.5,3:03:00,183,N,22735,1958,1,1,0.5,Wednesday
2,3,1958-04-17,away,SFG,L,4,7,1-2,6,-1.5,2:50:00,170,D,12520,1958,1,2,0.33,Thursday
3,4,1958-04-18,home,SFG,W,6,5,2-2,3,-1.5,3:00:00,180,D,78672,1958,2,2,0.5,Friday
4,5,1958-04-19,home,SFG,L,4,11,2-3,5,-2.5,2:37:00,157,D,41303,1958,2,3,0.4,Saturday


In [70]:
df.to_parquet('../data/standings/archive/dodgers_standings_1958_2023.parquet', index=False)
df.to_json('../data/standings/archive/dodgers_standings_1958_2023.json', indent=4, orient='records')
df.to_csv('../data/standings/archive/dodgers_standings_1958_2023.csv', index=False)

In [None]:
# Calculate final wins in each season
final_wins = df.groupby('year')['wins'].max()

# Merge the final wins back to the main dataframe
df = df.merge(final_wins.rename('final_wins'), on='year')

In [43]:
df.head(20)

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,time_minutes,day_night,attendance,year,wins,losses,win_pct
0,1,1958-04-15,away,SFG,L,0,8,0-1,5,-1.0,2:29:00,149,D,23448,1958,0,1,0.0
1,2,1958-04-16,away,SFG,W,13,1,1-1,4,-0.5,3:03:00,183,N,22735,1958,1,1,0.5
2,3,1958-04-17,away,SFG,L,4,7,1-2,6,-1.5,2:50:00,170,D,12520,1958,1,2,0.33
3,4,1958-04-18,home,SFG,W,6,5,2-2,3,-1.5,3:00:00,180,D,78672,1958,2,2,0.5
4,5,1958-04-19,home,SFG,L,4,11,2-3,5,-2.5,2:37:00,157,D,41303,1958,2,3,0.4
5,6,1958-04-20,home,SFG,L,2,12,2-4,7,-2.5,2:30:00,150,D,47234,1958,2,4,0.33
6,7,1958-04-22,home,CHC,W,4,2,3-4,5,-1.5,2:38:00,158,N,39459,1958,3,4,0.43
7,8,1958-04-23,home,CHC,L,6,7,3-5,6,-2.5,2:59:00,179,N,24368,1958,3,5,0.38
8,9,1958-04-24,home,CHC,L,2,15,3-6,7,-3.5,2:43:00,163,D,10194,1958,3,6,0.33
9,10,1958-04-25,home,STL,W,5,3,4-6,6,-3.0,2:38:00,158,N,59635,1958,4,6,0.4


In [25]:
# Calculate final wins in each season
final_wins = df.groupby('year')['wins'].max()

# Merge the final wins back to the main dataframe
df = df.merge(final_wins.rename('final_wins'), on='year')

In [None]:
# Merge the final wins back to the main dataframe
df = df.merge(final_wins.rename('final_wins'), on='year')

In [27]:
df.head()

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,time_minutes,day_night,attendance,year,wins,losses,win_pct,final_wins
0,20,2024-04-16,home,WSN,W,6,2,12-8,1,1.0,2:26:00,146,N,52718,2024,12,8,0.6,12
1,19,2024-04-15,home,WSN,L,4,6,11-8,1,1.0,2:36:00,156,N,42677,2024,11,8,0.58,12
2,18,2024-04-14,home,SDP,L,3,6,11-7,1,2.0,2:58:00,178,D,49432,2024,11,7,0.61,12
3,17,2024-04-13,home,SDP,W,5,2,11-6,1,3.0,2:36:00,156,N,44582,2024,11,6,0.65,12
4,16,2024-04-12,home,SDP,L,7,8,10-6,1,2.0,3:14:00,194,N,49606,2024,10,6,0.62,12


In [37]:
df.head(100)

Unnamed: 0,gm,game_date,home_away,opp,result,r,ra,record,rank,gb,time,time_minutes,day_night,attendance,year,wins,losses,win_pct,final_wins
0,20,2024-04-16,home,WSN,W,6,2,12-8,1,1.0,2:26:00,146,N,52718,2024,12,8,0.60,12
1,19,2024-04-15,home,WSN,L,4,6,11-8,1,1.0,2:36:00,156,N,42677,2024,11,8,0.58,12
2,18,2024-04-14,home,SDP,L,3,6,11-7,1,2.0,2:58:00,178,D,49432,2024,11,7,0.61,12
3,17,2024-04-13,home,SDP,W,5,2,11-6,1,3.0,2:36:00,156,N,44582,2024,11,6,0.65,12
4,16,2024-04-12,home,SDP,L,7,8,10-6,1,2.0,3:14:00,194,N,49606,2024,10,6,0.62,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,87,2023-07-06,home,PIT,W,5,2,49-38,2,-0.5,2:13:00,133,N,42036,2023,49,38,0.56,100
96,86,2023-07-05,home,PIT,W,6,4,48-38,2,-1.5,2:44:00,164,N,45403,2023,48,38,0.56,100
97,85,2023-07-04,home,PIT,L,7,9,47-38,2,-2.5,3:14:00,194,N,51487,2023,47,38,0.55,100
98,84,2023-07-03,home,PIT,W,5,2,47-37,2,-2.5,2:33:00,153,N,49652,2023,47,37,0.56,100


In [28]:
# Features and labels
X = df[['win_pct', 'gm']]
y = df['final_wins']

In [29]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 109.02
