In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Bronze Ingestion - Traffic Congestion (30 min)
# MAGIC Pulls near-real-time traffic congestion metrics for configured Indian metro cities.


In [None]:
# COMMAND ----------
import json
import time
from datetime import datetime, timezone

import requests
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    LongType,
    TimestampType,
)


In [None]:
# COMMAND ----------
# Widgets for Databricks Job parameters
# traffic_api_url example: https://api.tomtom.com/traffic/services/4/flowSegmentData/absolute/10/json
# city_points_json example: {"mumbai":"19.0760,72.8777","delhi":"28.6139,77.2090"}
# NOTE(review): traffic_api_key arrives as a plain-text widget value; consider
# sourcing it from a secret scope so it does not show up in job parameters.

dbutils.widgets.text("traffic_api_url", "")
dbutils.widgets.text("traffic_api_key", "")
dbutils.widgets.text("city_points_json", '{"mumbai":"19.0760,72.8777","delhi":"28.6139,77.2090","bengaluru":"12.9716,77.5946"}')
dbutils.widgets.text("bronze_catalog", "main")
dbutils.widgets.text("bronze_schema", "wattrac_bronze")
dbutils.widgets.text("traffic_table", "traffic_congestion_raw")

TRAFFIC_API_URL = dbutils.widgets.get("traffic_api_url")
TRAFFIC_API_KEY = dbutils.widgets.get("traffic_api_key")
BRONZE_CATALOG = dbutils.widgets.get("bronze_catalog")
BRONZE_SCHEMA = dbutils.widgets.get("bronze_schema")
TRAFFIC_TABLE = dbutils.widgets.get("traffic_table")
# Fully qualified three-level name (catalog.schema.table) of the bronze table.
TARGET_TABLE = f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{TRAFFIC_TABLE}"

# Fail fast, naming the offending parameter, instead of letting a bad value
# surface later as an opaque HTTP or Spark error.
if not TRAFFIC_API_URL.strip():
    raise ValueError("Job parameter 'traffic_api_url' is required but is empty")

try:
    CITY_POINTS = json.loads(dbutils.widgets.get("city_points_json"))
except json.JSONDecodeError as exc:
    raise ValueError(
        f"Job parameter 'city_points_json' is not valid JSON: {exc}"
    ) from exc

# The ingestion loop iterates CITY_POINTS.items() and stores each point in a
# string column, so anything other than a {city: "lat,lon"} object is unusable.
if not isinstance(CITY_POINTS, dict) or not all(
    isinstance(point_value, str) for point_value in CITY_POINTS.values()
):
    raise ValueError(
        "Job parameter 'city_points_json' must be a JSON object mapping "
        "city name to a 'lat,lon' string"
    )


In [None]:
# COMMAND ----------
def call_with_retry(url: str, params: dict, max_retries: int = 3, sleep_seconds: int = 5):
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    Transport errors (timeouts, connection resets, ...) and HTTP responses that
    are worth retrying (408, 429 and any 5xx) are retried with a linearly
    growing pause of ``sleep_seconds * attempt``. Other HTTP error responses
    (for example an invalid API key) are raised immediately because repeating
    the same request cannot succeed.

    Args:
        url: Endpoint to call.
        params: Query-string parameters sent with the request.
        max_retries: Total number of attempts; must be at least 1.
        sleep_seconds: Base pause between attempts, multiplied by the attempt number.

    Returns:
        The response body parsed by ``resp.json()``.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        requests.RequestException: If the last attempt fails at the transport
            level, or (via ``raise_for_status``) the response is an HTTP error
            that is not retryable or is still failing on the last attempt.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and None is returned silently.
        raise ValueError("max_retries must be at least 1")

    retryable_statuses = {408, 429}  # plus every 5xx, checked below

    for attempt in range(1, max_retries + 1):
        is_last_attempt = attempt == max_retries
        try:
            resp = requests.get(url, params=params, timeout=45)
        except requests.RequestException:
            # Network-level failure: retry unless the attempts are exhausted.
            if is_last_attempt:
                raise
        else:
            if resp.ok:
                return resp.json()
            is_retryable = (
                resp.status_code in retryable_statuses or resp.status_code >= 500
            )
            if is_last_attempt or not is_retryable:
                resp.raise_for_status()
        time.sleep(sleep_seconds * attempt)


In [None]:
# COMMAND ----------
# Column layout of the bronze traffic rows. Only the identifying columns
# (city, point) and the ingestion/audit columns are declared NOT NULL; every
# metric is nullable.
bronze_schema = (
    StructType()
    .add("city", StringType(), False)
    .add("point", StringType(), False)
    .add("current_speed_kmph", DoubleType(), True)
    .add("free_flow_speed_kmph", DoubleType(), True)
    .add("current_travel_time_sec", LongType(), True)
    .add("free_flow_travel_time_sec", LongType(), True)
    .add("confidence", DoubleType(), True)
    .add("road_closure", StringType(), True)
    .add("source_event_ts", TimestampType(), True)
    .add("ingestion_ts", TimestampType(), False)
    .add("ingestion_date", StringType(), False)
    .add("raw_payload", StringType(), False)
)

def _coerce_or_none(value, cast):
    """Return ``cast(value)``, or None when ``value`` is missing (None)."""
    return None if value is None else cast(value)


records = []
# One timestamp per run so every row of the batch shares ingestion_ts / ingestion_date.
run_ts = datetime.now(timezone.utc)
for city, point in CITY_POINTS.items():
    payload = call_with_retry(
        TRAFFIC_API_URL,
        {
            "key": TRAFFIC_API_KEY,
            "point": point,
            "unit": "KMPH",
        },
    )
    # Tolerate a missing *or null* "flowSegmentData": metrics become nulls while
    # the untouched payload is still kept in raw_payload.
    flow = payload.get("flowSegmentData") or {}
    records.append({
        "city": city,
        "point": point,
        # JSON numbers decode to int or float depending on how they are written
        # (41 vs 41.0). createDataFrame with an explicit schema verifies Python
        # types by default and can reject an int for a DoubleType column, so
        # coerce each metric to the type its column declares.
        "current_speed_kmph": _coerce_or_none(flow.get("currentSpeed"), float),
        "free_flow_speed_kmph": _coerce_or_none(flow.get("freeFlowSpeed"), float),
        "current_travel_time_sec": _coerce_or_none(flow.get("currentTravelTime"), int),
        "free_flow_travel_time_sec": _coerce_or_none(flow.get("freeFlowTravelTime"), int),
        "confidence": _coerce_or_none(flow.get("confidence"), float),
        # Keep a missing value as null rather than the literal string "None".
        "road_closure": _coerce_or_none(flow.get("roadClosure"), str),
        # NOTE(review): no event time is read from the payload, so the run
        # timestamp stands in for it; confirm whether the API exposes one.
        "source_event_ts": run_ts,
        "ingestion_ts": run_ts,
        "ingestion_date": run_ts.strftime("%Y-%m-%d"),
        "raw_payload": json.dumps(payload),
    })

df = spark.createDataFrame(records, schema=bronze_schema)


In [None]:
# COMMAND ----------
# Make sure the target schema exists before writing to it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {BRONZE_CATALOG}.{BRONZE_SCHEMA}")

# Append this run's rows to the Delta table, partitioned by ingestion date.
df.write.saveAsTable(
    TARGET_TABLE,
    format="delta",
    mode="append",
    partitionBy="ingestion_date",
)

# Show the batch that was just written, then report its size.
display(df)
print(f"Inserted {df.count()} rows into {TARGET_TABLE}")
