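# MLB players by season, 1969–present
> This notebook collects every player the MLB Stats API lists for each season from 1969 through the current year, flattens the nested team/position/handedness fields, and builds a de-duplicated table of unique players. Data source: [MLB Stats API players endpoint](https://statsapi.mlb.com/api/v1/sports/1/players).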

---

#### Import Python tools and Jupyter config

In [24]:
import us
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
import geopandas as gpd
from tqdm.notebook import tqdm
from IPython.display import Image

In [25]:
# Load the jupyter_black extension for this notebook session
jupyter_black.load()

# Raise pandas display limits: up to 100 columns and 100 rows, and never
# truncate the contents of a wide cell
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)

In [26]:
# Take one timestamp so the date string and the year always agree
now = pd.Timestamp("today")

# ISO date string (YYYY-MM-DD) and the calendar year as a plain int
today = now.strftime("%Y-%m-%d")
current_year = int(now.year)

---

## Fetch

#### Collect all player information, by season

In [81]:
# Define the base URL for the MLB API without specific fields to get all available data.
# The `{}` placeholder is filled with the season year via `base_url.format(year)`
# in the fetch loop below.
base_url = "https://statsapi.mlb.com/api/v1/sports/1/players?season={}"

In [82]:
# Initialize a list to store one DataFrame per season
dfs = []

# Reuse a single HTTP session so the sequential per-season requests can share
# a pooled connection instead of reconnecting every time
with requests.Session() as session:
    # Loop through each year from division play to the current year
    for year in tqdm(range(1969, current_year + 1)):
        url = base_url.format(year)

        try:
            # The timeout keeps one stalled request from hanging the whole notebook
            response = session.get(url, timeout=30)
        except requests.RequestException as error:
            # Network-level failure: report it and keep the seasons fetched so far
            print(f"Failed to fetch data for {year}: {error}")
            continue

        if response.status_code == 200:
            # A payload without a "people" key is treated as an empty season
            season_data = response.json().get("people", [])
            src = pd.DataFrame(season_data)
            # Tag every row with the season it was requested for
            src["season"] = year
            dfs.append(src)
        else:
            print(f"Failed to fetch data for {year}")

  0%|          | 0/56 [00:00<?, ?it/s]

In [83]:
# Stack the per-season frames row-wise into one table, discarding each
# frame's own index in favour of a fresh 0..n-1 index
df = pd.concat(objs=dfs, axis=0, ignore_index=True)

---

## Process

#### Flatten name/description from nested columns

In [84]:
# (nested source column, key to pull out of it, flattened output column)
nested_fields = [
    ("currentTeam", "name", "teamName"),
    ("primaryPosition", "name", "positionName"),
    ("batSide", "description", "batDescription"),
    ("pitchHand", "description", "pitchDescription"),
]

# For each nested column, copy one inner value into a flat column;
# cells that are null in the source stay None in the output
for source_col, nested_key, flat_col in nested_fields:
    df[flat_col] = df[source_col].apply(
        lambda cell, key=nested_key: cell.get(key) if pd.notnull(cell) else None
    )

#### Drop the nested columns and other unused fields

In [106]:
# Columns removed before analysis. Each label appears exactly once
# (the original list named "namePrefix" twice).
columns_to_drop = [
    # Nested columns already flattened into teamName / positionName /
    # batDescription / pitchDescription
    "currentTeam",
    "primaryPosition",
    "batSide",
    "pitchHand",
    # Link, jersey number and flag fields
    "link",
    "primaryNumber",
    "gender",
    "isPlayer",
    "isVerified",
    # Name variants and name parts
    "nickName",
    "pronunciation",
    "namePrefix",
    "nameMatrilineal",
    "nameTitle",
    "nameSuffix",
    "useLastName",
    "middleName",
    "boxscoreName",
    "nameFirstLast",
    "nameSlug",
    "firstLastName",
    "lastFirstName",
    "lastInitName",
    "initLastName",
    "fullFMLName",
    "fullLFMName",
    # Death location fields
    "deathCity",
    "deathStateProvince",
    "deathCountry",
]

# Raises KeyError if any listed column is absent, which surfaces schema changes
# early. The copy keeps players_df independent of df for later in-place edits.
players_df = df.drop(columns=columns_to_drop).copy()

#### Create a decade column from the season

In [107]:
# Label each row with its decade: the first three characters of the season
# rendered as text, followed by "0s" (e.g. 1987 -> "1980s")
season_text = players_df["season"].astype(str)
players_df["season_decade"] = season_text.str.slice(stop=3) + "0s"

#### Make position names consistent

In [108]:
# Define a mapping of old position names to new ones
position_mapping = {
    "Outfield": "Outfielder",  # Change 'Outfield' to 'Outfielder'
    "Unknown": "Other/Unknown",
    "Pinch Hitter": "Other/Unknown",
    "Pinch Runner": "Other/Unknown",
    "Two-Way Player": "Other/Unknown",
    "Infield": "Other/Unknown",
}

# Apply the mapping to the 'positionName' column
players_df["positionName"] = players_df["positionName"].replace(position_mapping)

In [127]:
# Remove the per-season fields, then keep one row per player id
# (drop_duplicates keeps the first row it encounters for each id)
per_player = players_df.drop(columns=["season", "teamName"])
unique_players = per_player.drop_duplicates(subset="id")

In [128]:
# Row count of the de-duplicated frame, i.e. one row per player `id`
len(unique_players)

11238

In [133]:
# Count unique players per birth country, most common first.
# NOTE(review): the output shows the same country under several labels
# (e.g. "Venezuela"/"VEN", "Dominican Republic"/"DOM",
# "South Korea"/"Republic of Korea") — normalise before aggregating.
unique_players["birthCountry"].value_counts()

birthCountry
USA                    8812
Dominican Republic      894
Venezuela               469
Puerto Rico             267
Cuba                    148
Mexico                  135
Canada                  125
Japan                    79
Panama                   58
Australia                34
Colombia                 31
Taiwan                   17
Curacao                  17
South Korea              16
Nicaragua                15
Republic of Korea        12
Germany                  11
West Germany              9
Netherlands               7
Aruba                     6
VEN                       6
U.S. Virgin Islands       6
England                   6
Bahamas                   6
Brazil                    5
DOM                       4
Jamaica                   4
Panama Canal Zone         3
France                    3
Spain                     3
Honduras                  2
Saudi Arabia              2
South Africa              2
United Kingdom            2
Lithuania                 1
Italy  

---

## Aggregate

#### Groupby state, etc.

---

## Charts

#### Save the chart

In [None]:
# Write the chart to disk as a PNG, then display the saved file inline.
# NOTE(review): `chart` is not defined in any cell visible in this notebook —
# confirm it is built in the Charts section before this cell runs. Saving
# presumably also requires the `visuals/` directory to exist — TODO confirm.
chart.save("visuals/chart.png")
Image(filename="visuals/chart.png")

#### Make sure the chart is visible on GitHub

In [None]:
# Display the PNG saved above as a static image.
# NOTE(review): this repeats the final line of the previous cell — confirm both are needed.
Image(filename="visuals/chart.png")

---

## Metadata

#### Data provenance, column descriptions, etc.

---

## JSON navigation

#### Find the path to keys in highly nested JSON

In [None]:
def _find_key_segments(data, target_key):
    """Depth-first search; return the path segments leading to target_key, or None."""
    if isinstance(data, dict):
        for key, value in data.items():
            if key == target_key:
                return [("key", key)]
            tail = _find_key_segments(value, target_key)
            if tail is not None:
                return [("key", key)] + tail
    elif isinstance(data, list):
        for index, item in enumerate(data):
            tail = _find_key_segments(item, target_key)
            if tail is not None:
                return [("index", index)] + tail
    return None


def find_key_path(data, target_key, path=''):
    """
    Recursively searches for a key in a nested JSON object and prints the path to it.

    The search is depth-first in iteration order: within a dict, each key is
    compared to ``target_key`` and, if it does not match, its value is searched
    before the next sibling key is examined. Only dicts and lists are descended
    into.

    Args:
        data: The nested structure (dicts, lists and scalar values) to search.
        target_key: The dictionary key to look for (compared with ``==``).
        path: Optional prefix prepended to the returned path.

    Returns:
        The path of the first match as a string, with dict keys joined by "/"
        and list positions written as "[i]" (e.g. "a/b[0]/c"), or None when the
        key is not present. The result may be an empty string when the match is
        a root-level key that is itself "", so callers must test
        ``is not None`` rather than truthiness.
    """
    segments = _find_key_segments(data, target_key)
    if segments is None:
        return None

    found_path = path
    for position, (kind, value) in enumerate(segments):
        if kind == "index":
            found_path += f"[{value}]"
        elif position == 0 and not path:
            # A leading key with no prefix starts the path without a separator.
            # Deciding this by position (not by whether the path built so far is
            # falsy) keeps root keys such as "" or 0 from being dropped.
            found_path = str(value)
        else:
            found_path += f"/{value}"

    print(f"Key '{target_key}' found at path: {found_path}")
    return found_path

In [None]:
# If you have json called "json_data"
# NOTE(review): `json_data` is not defined in any cell visible in this notebook —
# load the JSON to inspect before running this cell.
target_key = 'KEY NAME HERE'
path_to_key = find_key_path(json_data, target_key)

# Compare against None explicitly: a found path can be an empty string,
# which a plain truthiness test would misreport as "not found"
if path_to_key is not None:
    print(f"The path to '{target_key}' is: {path_to_key}")
else:
    print(f"Key '{target_key}' not found in the JSON data.")

---

## Exports

#### XyXy subset in CSV format to `processed`

#### JSON, GeoJSON, etc., to `processed`