# MLB miscellaneous records

#### Load Python tools and Jupyter config

In [44]:
import json
from io import StringIO
from random import randint
from time import sleep

import altair as alt
import geopandas as gpd
import jupyter_black
import pandas as pd
import requests
from tqdm.notebook import tqdm

In [45]:
# Activate jupyter_black for this notebook session.
jupyter_black.load()

# Raise the pandas display limits so wide and long frames are not truncated
# when a cell renders them.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [46]:
# Run date as an ISO "YYYY-MM-DD" string and the four-digit year as a string.
run_timestamp = pd.Timestamp.today()
today = run_timestamp.strftime("%Y-%m-%d")
current_year = run_timestamp.strftime("%Y")

---

## Read data

#### Headers

In [52]:
# Browser-like user-agent header for HTTP requests. The value is written as
# adjacent string literals purely for line length; Python joins them into the
# single user-agent string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

In [56]:
# Seasons to fetch: 1950 through the current calendar year, inclusive.
# `current_year` is a string, hence the int() conversion.
first_season, last_season = 1950, int(current_year)
years = range(first_season, last_season + 1)

In [64]:
# Download the first HTML table from each season's "misc" page and collect one
# DataFrame per season, tagged with a `season` column.
#
# The page is fetched with `requests` so the browser-like `headers` defined
# above are actually sent, a timeout bounds each request, and a non-2xx
# response raises immediately instead of being parsed as if it were data.
dfs = []

for year in tqdm(years):
    url = f"https://www.baseball-reference.com/leagues/majors/{year}-misc.shtml"
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # Wrap the markup in StringIO: recent pandas deprecates passing literal
    # HTML strings straight to read_html.
    src_df = pd.read_html(StringIO(response.text))[0].assign(season=year)
    dfs.append(src_df)
    # Random 1-5 second pause between requests to avoid hammering the site.
    sleep(randint(1, 5))

  0%|          | 0/75 [00:00<?, ?it/s]

In [65]:
# Stack the per-season frames into one table with a fresh 0..n-1 row index.
df = pd.concat(dfs, ignore_index=True)

In [66]:
# Positional rename: one clean name per column of `df`, in the frame's current
# column order. pandas raises ValueError if the count does not match.
# `season` is the column added during scraping; the three names after it are
# presumably the replay-challenge columns that only some seasons carry — TODO
# confirm against the scraped frame.
# NOTE(review): "sucessfull" is misspelled; it is kept as-is here because
# renaming it would change the column name seen by everything downstream.
clean_column_names = [
    # attendance
    "team",
    "attendance",
    "attend_per_game",
    # roster ages and park factors
    "batter_age",
    "pitcher_age",
    "bat_park",
    "pitch_park",
    # honours
    "hall_famers",
    "all_stars",
    "all_time_all_stars",
    # payroll, game time, managers
    "est_payroll",
    "time",
    "managers",
    # added by the scrape loop
    "season",
    # challenge columns
    "challenges",
    "sucessfull",
    "success_rate",
]
df.columns = clean_column_names

In [67]:
# Convert payroll text such as "$1,234,567" into numbers.
# - regex=False treats "$" and "," as literal characters. Under the regex
#   default of older pandas releases "$" is an end-of-string anchor, so the
#   dollar sign would never be removed.
# - astype(str) keeps the cell re-runnable: once the column is numeric the
#   bare .str accessor would raise.
# - pd.to_numeric(errors="coerce") yields a numeric column and turns missing
#   or unparseable entries into NaN, replacing the old `.fillna(pd.NA)`, which
#   left the values as strings.
df["est_payroll"] = pd.to_numeric(
    df["est_payroll"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
)

In [76]:
# Spot-check: show every row whose team name contains "Dodgers".
df.loc[df["team"].str.contains("Dodgers")]

Unnamed: 0,team,attendance,attend_per_game,batter_age,pitcher_age,bat_park,pitch_park,hall_famers,all_stars,all_time_all_stars,est_payroll,time,managers,season,challenges,sucessfull,success_rate
1,Brooklyn Dodgers,1185896.0,15204.0,27.6,26.6,103,99,5,7,14,146000.0,2:28,Shotton,1950,,,
17,Brooklyn Dodgers,1282628.0,16444.0,28.5,27.2,102,99,6,7,16,358100.0,2:30,Dressen,1951,,,
33,Brooklyn Dodgers,1088704.0,13609.0,29.2,27.1,102,98,6,7,16,186750.0,2:35,Dressen,1952,,,
49,Brooklyn Dodgers,1163419.0,14916.0,29.0,27.3,103,100,6,6,14,237000.0,2:33,Dressen,1953,,,
66,Brooklyn Dodgers,1020531.0,13254.0,29.6,27.7,104,101,7,6,17,216500.0,2:37,Alston,1954,,,
82,Brooklyn Dodgers,1033589.0,13423.0,29.8,26.2,104,101,7,4,17,231000.0,2:34,Alston,1955,,,
98,Brooklyn Dodgers,1213562.0,15761.0,31.0,28.2,109,106,8,4,20,243500.0,2:32,Alston,1956,,,
114,Brooklyn Dodgers,1028258.0,13354.0,30.1,26.7,109,106,6,3,20,,2:34,Alston,1957,,,
136,Los Angeles Dodgers,1845556.0,23968.0,29.0,25.2,104,105,5,2,21,344300.0,2:34,Alston,1958,,,
152,Los Angeles Dodgers,2071045.0,26552.0,28.2,25.9,107,107,4,6,19,409800.0,2:41,Alston,1959,,,


In [85]:
# Count how many records carry each `job` value in the JSON read from `source`.
# NOTE(review): `source` is not assigned in any cell visible in this notebook,
# so on a fresh kernel this cell would raise NameError unless it is defined
# elsewhere — confirm.
# NOTE(review): the `job` data looks unrelated to the MLB table built above;
# presumably a leftover from another analysis — TODO confirm, then either
# define `source` in an earlier cell or remove this cell.
pd.read_json(source).job.value_counts()

job
Accountant / Auditor               30
Produce Grader                     30
Pharmacist                         30
Photo Developer                    30
Photographer                       30
Physician / Surgeon                30
Piano / Organ Tuner                30
Pilot                              30
Plasterer                          30
Plumber                            30
Policeman / Detective              30
Porter                             30
Postmaster                         30
Power Station Operator             30
Pressman                           30
Professional - Misc                30
Paperhanger                        30
Professor                          30
Professor - Agriculture            30
Professor - Biology                30
Professor - Chemistry              30
Professor - Economics              30
Professor - Engineering            30
Professor - Geology                30
Professor - Mathematics            30
Professor - Medical                30
Professo

---

## Exports

#### JSON

In [17]:
# Export disabled: the call below is commented out, so this cell writes nothing.
# NOTE(review): "NAME" looks like a placeholder for the output file stem —
# confirm and replace it before re-enabling the export.
# df.to_json(
#     f"data/processed/NAME.json",
#     indent=4,
#     orient="records",
# )

#### CSV

In [18]:
# Export disabled: the call below is commented out, so this cell writes nothing.
# NOTE(review): "NAME" looks like a placeholder for the output file stem —
# confirm and replace it before re-enabling the export.
# df.to_csv(
#     f"data/processed/NAME.csv", index=False
# )