BUS


In [1]:
import requests
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# JSON endpoint on data.ny.gov that every query in this notebook is sent to.
URL = "https://data.ny.gov/resource/kv7t-n8in.json"
# Optional application token; while it is None (or empty) no token header is sent.
APP_TOKEN = None

# One shared session so all requests reuse the same headers and retry policy.
session = requests.Session()
if APP_TOKEN:
    session.headers["X-App-Token"] = APP_TOKEN

# Transport-level retry policy, restricted to GET requests.
retry = Retry(
    total=5,
    connect=5,
    read=5,
    status=5,
    backoff_factor=1.2,
    allowed_methods=["GET"],
    # Response codes that trigger an automatic retry.
    status_forcelist=[429, 500, 502, 503, 504],
    # Return the last response instead of raising once status retries are used up;
    # callers are expected to inspect the status themselves.
    raise_on_status=False,
)

# Each URL scheme gets its own adapter instance carrying the same policy.
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))


# Accumulates the rows of every page returned by the API.
all_data = []
# Row offset of the page currently being requested.
offset = 0

# Number of rows requested per page (the LIMIT of the query).
batch_size = 20_000

# Cap on consecutive failed attempts for a single page. Without it, a persistent
# timeout or 429/5xx answer would keep the loop retrying forever.
max_attempts = 5
attempts = 0

while True:
    # Daily ridership per bus route for calendar year 2020.
    # ORDER BY lists both grouping keys so the ordering is total: with "day" alone,
    # rows sharing a day have no defined order and LIMIT/OFFSET paging could
    # return the same row twice or skip rows between pages.
    params = {
        "$query": f"""
            SELECT date_trunc_ymd(transit_timestamp) AS day,
                   bus_route,
                   SUM(ridership) AS total_ridership
            WHERE transit_timestamp BETWEEN '2020-01-01T00:00:00' AND '2020-12-31T23:59:59'
            GROUP BY day, bus_route
            HAVING SUM(ridership) > 0
            ORDER BY day, bus_route
            LIMIT {batch_size} OFFSET {offset}
        """
    }

    try:
        # timeout=(connect seconds, read seconds)
        r = session.get(URL, params=params, timeout=(10, 120))
        r.raise_for_status()
        data = r.json()
    except (requests.ReadTimeout, requests.ConnectTimeout):
        attempts += 1
        if attempts >= max_attempts:
            # Give up on this page and surface the timeout to the caller.
            raise
        print("Timeout ao ler; tentando novamente com backoff curto...")
        time.sleep(2.0)
        continue
    except requests.HTTPError as e:
        status = getattr(e.response, "status_code", None)
        if status in (429, 500, 502, 503, 504):
            # Transient server-side condition: retry the same offset, but only
            # a bounded number of times.
            attempts += 1
            if attempts >= max_attempts:
                raise
            print(f"HTTP {status}; aguardando e repetindo...")
            time.sleep(2.0)
            continue
        # Any other HTTP error is not retried: show the response body and re-raise.
        print("Erro SoQL:", getattr(e.response, "text", ""))
        raise

    # The page was fetched successfully; the next page starts with a clean count.
    attempts = 0

    if not data:
        # An empty page means the previous one was the last.
        break

    all_data.extend(data)
    print(f"Lidos {len(data):,} registros (offset {offset:,})")

    # A short page means there is nothing left to fetch.
    if len(data) < batch_size:
        break

    offset += batch_size


# Build the result table from the downloaded records.
df = pd.DataFrame(all_data)
if not df.empty:
    # Convert the API's values into typed columns; anything unparseable
    # is coerced to a missing value instead of raising.
    df = df.assign(
        day=pd.to_datetime(df["day"], errors="coerce").dt.date,
        total_ridership=pd.to_numeric(df["total_ridership"], errors="coerce"),
    )

# Report how many rows were downloaded and show the first few.
print("\nTotal de linhas baixadas: {:,}".format(len(df)))
print(df.head(5))

Lidos 20,000 registros (offset 0)
Lidos 20,000 registros (offset 20,000)
Lidos 20,000 registros (offset 40,000)
Lidos 20,000 registros (offset 60,000)
Lidos 5,522 registros (offset 80,000)

Total de linhas baixadas: 85,522
          day bus_route  total_ridership
0  2020-01-01        B1             6252
1  2020-01-01      B100              751
2  2020-01-01      B103             3934
3  2020-01-01       B11             4123
4  2020-01-01       B12             3641


In [None]:
# Write the aggregated ridership table to a CSV file in the working directory,
# leaving the DataFrame index out of the file.
# NOTE(review): the confirmation message is misspelled ("guarado" -> "guardado");
# it is kept unchanged here because this rewrite is behavior-preserving.
df.to_csv(path_or_buf="mta_bus_ridership_20.csv", index=False)
print("Archivo guarado")

Archivo guarado
