In [32]:
import os
from datetime import timedelta

import pandas as pd
from sqlalchemy import create_engine, text
import requests

In [4]:
# The connection string is read from the environment so that database
# credentials are never stored in (or committed with) the notebook.
# Expected form: postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
baseball_url = os.environ.get("BASEBALL_DB_URL")
if not baseball_url:
    raise RuntimeError(
        "BASEBALL_DB_URL is not set. Export it before starting the kernel, e.g. "
        "postgresql+psycopg2://<user>:<password>@localhost:5432/baseball"
    )

engine = create_engine(baseball_url)

# Smoke-test the connection up front so later cells fail early and clearly.
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT version();"))
        print("✅ Connected to PostgreSQL!")
        print(f"PostgreSQL version: {result.fetchone()[0]}")
except Exception as e:
    print("Failed to connect to PostgreSQL:")
    print(e)
    # Re-raise rather than calling exit(): inside Jupyter, exit() asks the
    # kernel to shut down, which discards all state instead of simply
    # stopping "Run All" at this cell with a traceback.
    raise

✅ Connected to PostgreSQL!
PostgreSQL version: PostgreSQL 15.14 (Debian 15.14-1.pgdg13+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 14.2.0-19) 14.2.0, 64-bit


In [37]:
# Pull the distinct ids from the bronze tables. Player ids are cast to str,
# team ids are kept exactly as the database returns them.
players_query = "select distinct id from bronze.players;"
teams_query = "select distinct id from bronze.teams;"

players = pd.read_sql_query(players_query, engine)["id"].astype(str).tolist()
teams = pd.read_sql_query(teams_query, engine)["id"].tolist()

In [51]:
# Fixed 50-id window of the player list, used as a small sample input below.
players_test = players[slice(2000, 2050)]

In [83]:
def get_roster_entries(players, teams):
    """Fetch roster entries for a batch of players from the MLB Stats API.

    Issues one GET to ``https://statsapi.mlb.com/api/v1/people`` with
    ``hydrate=rosterEntries`` and flattens every roster entry of every
    returned person into one row. A row is kept only if

    * its team id is in ``teams``, or
    * its parent organisation id is in ``teams`` and its status code is
      ``"RA"``.

    Parameters
    ----------
    players : iterable
        Player ids. Each id is converted with ``str`` and the ids are
        comma-joined into the ``personIds`` query parameter.
    teams : iterable
        Team ids (hashable) used to filter the roster entries.

    Returns
    -------
    pandas.DataFrame
        Columns ``person_id, team_id, start_date, status_date, end_date,
        is_active, status_code, status_desc, parent_org_id``, de-duplicated
        on ``(person_id, team_id, start_date)`` keeping the last occurrence.
        An empty frame with the same columns is returned when ``players`` is
        empty, the HTTP status is an error, or the body is not valid JSON
        (a ``[WARNING]`` line is printed in the latter two cases).

    Raises
    ------
    Whatever ``requests.get`` raises for transport failures (timeouts,
    connection errors); these are intentionally not swallowed.
    """
    columns = [
        "person_id", "team_id", "start_date", "status_date", "end_date", "is_active",
        "status_code", "status_desc", "parent_org_id"]

    # str() each id so both int and str ids are accepted.
    player_ids = ",".join(str(p) for p in players)
    if not player_ids:
        # Nothing to look up: skip the request instead of sending an empty filter.
        return pd.DataFrame(columns=columns)

    params = {"hydrate": "rosterEntries", "personIds": player_ids}
    url = "https://statsapi.mlb.com/api/v1/people"
    response = requests.get(url, params=params, timeout=60)
    if not response.ok:
        print(f"[WARNING] Request failed with HTTP status {response.status_code}")
        return pd.DataFrame(columns=columns)

    # Try to extract JSON
    try:
        data = response.json()
    except ValueError as exc:
        print(f"[WARNING] Failed to parse data: {exc}")
        return pd.DataFrame(columns=columns)

    # Build the lookup once; list membership would be re-scanned for every entry.
    team_ids = set(teams)

    all_rows = []
    for person in (data.get("people") or []):
        pid = person.get("id")
        for entry in (person.get("rosterEntries") or []):
            # Entries may lack "team"/"status"; fall back to {} so the nested
            # lookups yield None instead of raising AttributeError.
            team = entry.get("team") or {}
            status = entry.get("status") or {}
            row = {
                "person_id": pid,
                # No `or None` here: an explicit False must stay False.
                "is_active": entry.get("isActive"),
                "team_id": team.get("id"),
                "start_date": entry.get("startDate"),
                "status_date": entry.get("statusDate") or None,
                "end_date": entry.get("endDate") or None,
                "status_code": status.get("code"),
                "status_desc": status.get("description"),
                "parent_org_id": team.get("parentOrgId") or None,
            }
            on_tracked_team = row["team_id"] in team_ids
            rehab_for_tracked_org = (
                row["parent_org_id"] in team_ids and row["status_code"] == "RA"
            )
            if on_tracked_team or rehab_for_tracked_org:
                all_rows.append(row)

    df = pd.DataFrame(all_rows, columns=columns)
    if not df.empty:
        # de-dupe on the natural stint key
        df = df.drop_duplicates(subset=["person_id", "team_id", "start_date"], keep="last")
    return df

In [84]:
# Run the roster lookup on the sample of player ids, filtered to the known teams.
test_df = get_roster_entries(players=players_test, teams=teams)

In [85]:
# Show only the entries flagged active. The explicit comparison with True is
# kept because the column can also hold None, which cannot be used as a mask.
test_df.loc[test_df["is_active"].eq(True)]

Unnamed: 0,person_id,team_id,start_date,status_date,end_date,is_active,status_code,status_desc,parent_org_id
0,621383,143,2024-07-30,2025-04-16,,True,A,Active,
31,621438,552,2025-09-19,2025-09-19,,True,RA,Rehab Assignment,121.0
32,621438,121,2023-12-20,2025-08-30,,True,D10,Injured 10-Day,
36,621439,142,2015-06-14,2025-09-16,,True,A,Active,
61,621493,108,2018-08-14,2025-04-16,,True,A,Active,
111,621566,144,2022-03-14,2025-04-16,,True,A,Active,
157,622098,121,2025-02-12,2025-02-13,,True,D60,Injured 60-Day,
192,622250,140,2021-02-16,2025-02-17,,True,D60,Injured 60-Day,
198,622253,141,2025-09-01,2025-09-04,,True,RM,Reassigned to Minors,
