In [1]:
# In late Aug 2025 I used this notebook and the 'first_sqllite' notebook to
# learn the first steps of ETL for this MLB project.
import requests

# MLB Stats API endpoint queried for the team list.
url = "https://statsapi.mlb.com/api/v1/teams"

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on a non-2xx reply instead of parsing an error payload as team data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parsed JSON body; later cells read the list stored under data["teams"].
data = response.json()


In [2]:
# Quick inspection of the API reply before doing any real work.

# HTTP status of the request (200 means it succeeded).
print(f"Status Code: {response.status_code}")

# Top-level keys of the parsed JSON payload.
print(f"Top-level keys: {data.keys()}")

# Size of the team list.
print(f"Number of teams returned: {len(data['teams'])}")

# Field names available on the first team record.
print(f"First team keys: {data['teams'][0].keys()}")


Status Code: 200
Top-level keys: dict_keys(['copyright', 'teams'])
Number of teams returned: 790
First team keys: dict_keys(['allStarStatus', 'id', 'name', 'link', 'season', 'venue', 'teamCode', 'fileCode', 'abbreviation', 'teamName', 'locationName', 'firstYearOfPlay', 'league', 'sport', 'shortName', 'parentOrgName', 'parentOrgId', 'franchiseName', 'clubName', 'active'])


In [3]:
import pandas as pd


In [4]:
# Flatten the nested team records into one table; nested fields become
# dotted column names such as "league.name".
df = pd.json_normalize(data=data["teams"])

# Alphabetical list of every column produced by the flattening.
sorted(df.columns.tolist())



['abbreviation',
 'active',
 'allStarStatus',
 'clubName',
 'division.id',
 'division.link',
 'division.name',
 'fileCode',
 'firstYearOfPlay',
 'franchiseName',
 'id',
 'league.id',
 'league.link',
 'league.name',
 'link',
 'locationName',
 'name',
 'parentOrgId',
 'parentOrgName',
 'season',
 'shortName',
 'sport.id',
 'sport.link',
 'sport.name',
 'springLeague.abbreviation',
 'springLeague.id',
 'springLeague.link',
 'springLeague.name',
 'springVenue.id',
 'springVenue.link',
 'teamCode',
 'teamName',
 'venue.id',
 'venue.link',
 'venue.name']

In [5]:
# Build one boolean mask per condition, then keep rows satisfying both:
# the team is flagged active and its sport is Major League Baseball.
is_active = df["active"].eq(True)
is_mlb = df["sport.name"].eq("Major League Baseball")

df_active_mlb = df.loc[is_active & is_mlb]
df_active_mlb.head()


Unnamed: 0,allStarStatus,id,name,link,season,teamCode,fileCode,abbreviation,teamName,locationName,...,sport.name,division.id,division.name,division.link,springLeague.id,springLeague.name,springLeague.link,springLeague.abbreviation,springVenue.id,springVenue.link
20,N,108,Los Angeles Angels,/api/v1/teams/108,2025,ana,ana,LAA,Angels,Anaheim,...,Major League Baseball,200.0,American League West,/api/v1/divisions/200,114.0,Cactus League,/api/v1/league/114,CL,2500.0,/api/v1/venues/2500
21,N,109,Arizona Diamondbacks,/api/v1/teams/109,2025,ari,ari,AZ,D-backs,Phoenix,...,Major League Baseball,203.0,National League West,/api/v1/divisions/203,114.0,Cactus League,/api/v1/league/114,CL,4249.0,/api/v1/venues/4249
22,N,110,Baltimore Orioles,/api/v1/teams/110,2025,bal,bal,BAL,Orioles,Baltimore,...,Major League Baseball,201.0,American League East,/api/v1/divisions/201,115.0,Grapefruit League,/api/v1/league/115,GL,2508.0,/api/v1/venues/2508
23,N,111,Boston Red Sox,/api/v1/teams/111,2025,bos,bos,BOS,Red Sox,Boston,...,Major League Baseball,201.0,American League East,/api/v1/divisions/201,115.0,Grapefruit League,/api/v1/league/115,GL,4309.0,/api/v1/venues/4309
24,N,112,Chicago Cubs,/api/v1/teams/112,2025,chn,chc,CHC,Cubs,Chicago,...,Major League Baseball,205.0,National League Central,/api/v1/divisions/205,114.0,Cactus League,/api/v1/league/114,CL,4629.0,/api/v1/venues/4629


In [6]:
# Summarise the flattened frame: column count, the first 25 column names in
# alphabetical order, and the first row rendered as a plain dict.
print(f"Number of columns: {len(df.columns)}")
print(f"First 25 column names: {sorted(df.columns)[:25]}")

print("\nSample row data:")
first_row = df.iloc[0].to_dict()
print(first_row)


Number of columns: 35
First 25 column names: ['abbreviation', 'active', 'allStarStatus', 'clubName', 'division.id', 'division.link', 'division.name', 'fileCode', 'firstYearOfPlay', 'franchiseName', 'id', 'league.id', 'league.link', 'league.name', 'link', 'locationName', 'name', 'parentOrgId', 'parentOrgName', 'season', 'shortName', 'sport.id', 'sport.link', 'sport.name', 'springLeague.abbreviation']

Sample row data:
{'allStarStatus': 'N', 'id': 4104, 'name': 'Coastal Carolina Chanticleers', 'link': '/api/v1/teams/4104', 'season': 2025, 'teamCode': 'ccu', 'fileCode': 't4104', 'abbreviation': 'CCC', 'teamName': 'Chanticleers', 'locationName': 'United States', 'firstYearOfPlay': '2011', 'shortName': 'Coastal Carolina', 'parentOrgName': 'Office of the Commissioner', 'parentOrgId': 11.0, 'franchiseName': 'Coastal Carolina', 'clubName': 'Chanticleers', 'active': True, 'venue.id': 401, 'venue.name': 'TBD', 'venue.link': '/api/v1/venues/401', 'league.id': 107.0, 'league.name': 'College Baseba

In [7]:
# Sanity check: `data` must still hold the parsed API payload from the fetch
# cell, with the team records stored as a list under "teams".
assert "teams" in data and isinstance(data["teams"], list), "Missing/invalid `data`"

# Rebuild df from the raw payload (flattening at most two levels of nesting)
# so this cell does not depend on whatever an earlier cell left in `df`.
df = pd.json_normalize(data["teams"], max_level=2)

# Validate the frame that is actually used from here on, rather than a
# throwaway copy normalized with different settings.
assert "id" in df.columns, "Rebuilt `df` is missing the `id` column"


In [12]:
# Narrow df to Major League Baseball rows only.
df = df.loc[df["sport.name"].eq("Major League Baseball")]

# Mapping of source column name -> name to use in the tidy table.
# Only the columns listed here are carried forward.
keep_map = dict(
    [
        ("id", "team_id"),
        ("name", "team_name"),
        ("abbreviation", "abbrev"),
        ("teamCode", "team_code"),
        ("franchiseName", "franchise"),
        ("clubName", "club_name"),
        ("league.name", "league"),
        ("division.name", "division"),
        ("locationName", "location"),
        ("season", "season"),
        ("firstYearOfPlay", "first_year_of_play"),
        ("venue.name", "venue_name"),
        ("active", "active"),
    ]
)

In [15]:
# Restrict the rename map to source columns that are present in df, so a
# column missing from the payload is skipped instead of raising.
existing = {src: dst for src, dst in keep_map.items() if src in df.columns}

# Select those columns (in keep_map order) and apply the new names.
tidy = df[[*existing]].rename(columns=existing)

# Lift pandas' display limits so the table below is shown in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

tidy


Unnamed: 0,team_id,team_name,abbrev,team_code,franchise,club_name,league,division,location,season,first_year_of_play,venue_name,active
20,108,Los Angeles Angels,LAA,ana,Los Angeles,Angels,American League,American League West,Anaheim,2025,1961,Angel Stadium,True
21,109,Arizona Diamondbacks,AZ,ari,Arizona,Diamondbacks,National League,National League West,Phoenix,2025,1996,Chase Field,True
22,110,Baltimore Orioles,BAL,bal,Baltimore,Orioles,American League,American League East,Baltimore,2025,1901,Oriole Park at Camden Yards,True
23,111,Boston Red Sox,BOS,bos,Boston,Red Sox,American League,American League East,Boston,2025,1901,Fenway Park,True
24,112,Chicago Cubs,CHC,chn,Chicago,Cubs,National League,National League Central,Chicago,2025,1874,Wrigley Field,True
25,113,Cincinnati Reds,CIN,cin,Cincinnati,Reds,National League,National League Central,Cincinnati,2025,1882,Great American Ball Park,True
26,114,Cleveland Guardians,CLE,cle,Cleveland,Guardians,American League,American League Central,Cleveland,2025,1901,Progressive Field,True
27,115,Colorado Rockies,COL,col,Colorado,Rockies,National League,National League West,Denver,2025,1992,Coors Field,True
28,116,Detroit Tigers,DET,det,Detroit,Tigers,American League,American League Central,Detroit,2025,1901,Comerica Park,True
29,117,Houston Astros,HOU,hou,Houston,Astros,American League,American League West,Houston,2025,1962,Daikin Park,True


In [16]:
# Give the tidy table explicit dtypes: nullable integers for the ID and year
# columns (unparseable values become <NA>) and nullable boolean for the flag.

# team_id is converted unconditionally.
tidy["team_id"] = pd.to_numeric(tidy["team_id"], errors="coerce").astype("Int64")

# The year columns are converted only when they made it into the table.
for year_col in ("season", "first_year_of_play"):
    if year_col in tidy.columns:
        tidy[year_col] = pd.to_numeric(tidy[year_col], errors="coerce").astype("Int64")

if "active" in tidy:
    tidy["active"] = tidy["active"].astype("boolean")

tidy.dtypes


team_id                 Int64
team_name              object
abbrev                 object
team_code              object
franchise              object
club_name              object
league                 object
division               object
location               object
season                  Int64
first_year_of_play      Int64
venue_name             object
active                boolean
dtype: object

In [1]:
# Row count of the tidy table. `tidy` is created by an earlier cell, so this
# raises NameError when run on a fresh kernel before that cell has executed.
tidy.shape[0]

NameError: name 'tidy' is not defined