<a href="https://colab.research.google.com/github/samarasimhareddymatla-sys/air-pollution-prediction/blob/main/Real_time_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests
# --- OpenAQ connection settings -------------------------------------------
# NOTE(review): a live API key is committed in this notebook. Rotate it and
# supply the replacement through the OPENAQ_API_KEY environment variable; the
# literal below is kept only as a fallback so existing runs keep working.
API_KEY = (
    os.environ.get("OPENAQ_API_KEY")
    or "389e4da96750d7e5d9f9ccf207543afacce5fac8475a10777b7c406269fd0633"
)
BASE = "https://api.openaq.org/v3"
# Header dict sent with every request made through api_get().
HEADERS = {"X-API-Key": API_KEY}

# --- Search area ------------------------------------------------------------
# Reference point (latitude, longitude) used as the centre of the search.
HYD_LAT, HYD_LON = 17.3850, 78.4867
# Search radius around the reference point, in metres.
RADIUS_M = 25000
# Only readings newer than this many hours are requested.
LATEST_WINDOW_HOURS = 24
def api_get(path, params=None, timeout=20, retries=2, backoff=1.0):
    """GET a JSON document from the API rooted at ``BASE``.

    Transient failures (HTTP 429 and 5xx) are retried a bounded number of
    times with exponential backoff before giving up, so a momentary server
    error does not immediately cost the caller its data.

    Args:
        path: Endpoint path appended to ``BASE`` (e.g. ``"/locations"``).
        params: Optional mapping of query parameters.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        retries: Number of extra attempts after a transient failure.
            Values below zero are treated as zero.
        backoff: Seconds to sleep before the first retry; doubled after
            each further attempt.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the last response has a status other than 200.
            Exceptions raised by ``requests`` itself propagate unchanged.
    """
    url = f"{BASE}{path}"
    last_attempt = max(retries, 0)
    for attempt in range(last_attempt + 1):
        r = requests.get(url, headers=HEADERS, params=params or {}, timeout=timeout)
        if r.status_code == 200:
            return r.json()
        # Client errors other than rate limiting will not fix themselves.
        retryable = r.status_code == 429 or 500 <= r.status_code < 600
        if not retryable or attempt == last_attempt:
            break
        time.sleep(backoff * 2 ** attempt)
    raise RuntimeError(f"HTTP {r.status_code} for {url} -> {r.text}")

def iso_utc(dt: datetime) -> str:
    """Render *dt* as a UTC timestamp of the form ``YYYY-MM-DDTHH:MM:SSZ``.

    The value is first converted to UTC with ``astimezone``; sub-second
    precision is dropped by the format.
    """
    as_utc = dt.astimezone(timezone.utc)
    return f"{as_utc:%Y-%m-%dT%H:%M:%SZ}"
# Ask the API for monitoring locations within RADIUS_M metres of the
# reference point. Only the first page (up to 100 locations) is requested.
loc_params = {
    "coordinates": f"{HYD_LAT:.4f},{HYD_LON:.4f}",
    "radius": RADIUS_M,
    "limit": 100,
    "page": 1,
}
loc_data = api_get("/locations", params=loc_params)
locations = loc_data.get("results", [])

if not locations:
    # An empty result means the search area is too restrictive, so the hint
    # points towards widening it rather than shrinking it.
    # NOTE(review): OpenAQ v3 appears to cap `radius` at 25000 m - confirm
    # against the API reference before raising RADIUS_M.
    raise SystemExit("No stations found near Hyderabad. Try a different point or a larger radius.")

print(f"Found {len(locations)} stations within {RADIUS_M/1000:.0f} km of Hyderabad")
# Readings older than this UTC instant are excluded via `datetime_min` below.
cutoff = datetime.now(timezone.utc) - timedelta(hours=LATEST_WINDOW_HOURS)
cutoff_iso = iso_utc(cutoff)

# One flat record per latest reading; turned into the output table later.
rows = []

for loc in locations:
    loc_id = loc["id"]
    loc_name = loc.get("name") or ""
    loc_city = loc.get("locality") or ""
    loc_tz = loc.get("timezone") or "Asia/Kolkata"

    # Sensor metadata (parameter name, units) is best-effort: a failure here
    # must not prevent the station's readings from being collected.
    try:
        sensors = api_get(f"/locations/{loc_id}/sensors").get("results") or []
    except Exception as e:
        print(f"Skipping sensors for {loc_name} (ID {loc_id}): {e}")
        # Fall back to the sensor list carried by the location record itself
        # so rows for this station do not lose their parameter and units.
        # NOTE(review): assumes the /locations payload embeds `sensors` with
        # the same id/name/parameter shape - confirm against the API schema.
        sensors = loc.get("sensors") or []

    # Index sensor metadata by sensor id for the join with readings below.
    sensor_meta = {}
    for s in sensors:
        parameter = s.get("parameter") or {}
        sensor_meta[s.get("id")] = {
            "parameter": parameter.get("name"),
            "units": parameter.get("units"),
            "sensor_name": s.get("name"),
        }

    try:
        latest = api_get(
            f"/locations/{loc_id}/latest", params={"datetime_min": cutoff_iso}
        ).get("results") or []
    except Exception as e:
        print(f"Skipping latest for {loc_name} (ID {loc_id}): {e}")
        latest = []

    # A station with no recent readings simply contributes no rows.
    for rec in latest:
        sid = rec.get("sensorsId")
        meta = sensor_meta.get(sid, {})
        dt = rec.get("datetime") or {}
        coords = rec.get("coordinates") or {}

        rows.append({
            "city": loc_city,
            "station_name": loc_name,
            "station_id": loc_id,
            "sensor_id": sid,
            "sensor_name": meta.get("sensor_name"),
            "parameter": meta.get("parameter"),
            "value": rec.get("value"),
            "units": meta.get("units"),
            "datetime_utc": dt.get("utc"),
            "datetime_local": dt.get("local"),
            "latitude": coords.get("latitude"),
            "longitude": coords.get("longitude"),
            "timezone": loc_tz,
        })
# Persist and summarise the collected readings, if there are any.
if rows:
    # Build the table already ordered by station, then parameter, then time.
    df = pd.DataFrame(rows).sort_values(["station_name", "parameter", "datetime_utc"])
    df.to_csv("hyderabad_air_quality_latest.csv", index=False)

    # Distinct parameter names, ignoring rows whose parameter is missing.
    seen_parameters = sorted(df["parameter"].dropna().unique())

    print("Real-time dataset saved -> hyderabad_air_quality_latest.csv")
    print("Rows:", len(df))
    print("Parameters:", seen_parameters)
    print("\nSample:")
    # The very large head() limit effectively prints the whole table.
    preview = df.head(10000000)
    print(preview.to_string(index=False))
else:
    print("No recent readings in the last", LATEST_WINDOW_HOURS, "hours.")



Found 14 stations within 25 km of Hyderabad
Skipping sensors for Central University, Hyderabad - TSPCB (ID 5623): HTTP 500 for https://api.openaq.org/v3/locations/5623/sensors -> Internal Server Error
Skipping sensors for Sanathnagar, Hyderabad - TSPCB (ID 5647): HTTP 500 for https://api.openaq.org/v3/locations/5647/sensors -> Internal Server Error
Real-time dataset saved -> hyderabad_air_quality_latest.csv
Rows: 81
Parameters: ['co', 'no', 'no2', 'o3', 'pm10', 'pm25', 'relativehumidity', 'so2', 'temperature']

Sample:
 city                                 station_name  station_id  sensor_id        sensor_name        parameter   value units         datetime_utc            datetime_local  latitude  longitude     timezone
       Bollaram Industrial Area, Hyderabad - TSPCB        5599   12235395             co ppb               co   0.490   ppb 2025-09-13T04:30:00Z 2025-09-13T10:00:00+05:30 17.540891  78.358528 Asia/Kolkata
       Bollaram Industrial Area, Hyderabad - TSPCB        5599   