# 🌍 Air Quality Intelligence Pipeline
## Setup & Configuration Notebook

This notebook:
- Creates the project database and Delta table directories
- Defines all shared configuration variables
- Validates the environment is ready for ingestion

**Run this notebook FIRST before any other notebook.**

In [0]:
# ============================================================
# PROJECT CONFIGURATION
# Using Managed Tables — no explicit DBFS paths needed
# ============================================================

DATABASE_NAME = "air_quality_db"

BRONZE_TABLE     = f"{DATABASE_NAME}.bronze_raw_measurements"
SILVER_TABLE     = f"{DATABASE_NAME}.silver_clean_measurements"
GOLD_TABLE_CITY  = f"{DATABASE_NAME}.gold_city_rankings"
GOLD_TABLE_TREND = f"{DATABASE_NAME}.gold_pollutant_trends"
GOLD_TABLE_AQI   = f"{DATABASE_NAME}.gold_aqi_summary"

# ✅ Updated to OpenAQ v3
API_BASE_URL = "https://api.openaq.org/v3"
OPENAQ_API_KEY = "6702146761c7bbc4554416a63c3856c80bd69d9faae0770cfe573214c0d6ea69"   # ← paste your key here

TARGET_CITIES = [
    "Delhi", "Mumbai", "Beijing", "Shanghai", "Lahore",
    "Dhaka", "Karachi", "Lima", "Jakarta", "Bangkok",
    "London", "Paris", "New York", "Los Angeles", "Tokyo",
    "Seoul", "Mexico City", "Cairo", "Lagos", "Nairobi"
]

TARGET_POLLUTANTS = ["pm25", "pm10", "no2", "o3", "co", "so2"]

AQI_CATEGORIES = {
    "Good":                  (0.0,   12.0),
    "Moderate":              (12.1,  35.4),
    "Unhealthy (Sensitive)": (35.5,  55.4),
    "Unhealthy":             (55.5,  150.4),
    "Very Unhealthy":        (150.5, 250.4),
    "Hazardous":             (250.5, 9999.0)
}

print("✅ Configuration variables loaded successfully.")
print(f"   Database   : {DATABASE_NAME}")
print(f"   API Version: v3")
print(f"   API Key    : {'SET ✅' if OPENAQ_API_KEY != 'PASTE_YOUR_API_KEY_HERE' else 'NOT SET ❌ — paste your key above'}")
print(f"   Cities     : {len(TARGET_CITIES)} cities configured")
print(f"   Pollutants : {TARGET_POLLUTANTS}")

✅ Configuration variables loaded successfully.
   Database   : air_quality_db
   API Version: v3
   API Key    : SET ✅
   Cities     : 20 cities configured
   Pollutants : ['pm25', 'pm10', 'no2', 'o3', 'co', 'so2']


In [0]:
# ============================================================
# CREATE DATABASE
# ============================================================

spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")
spark.sql(f"USE {DATABASE_NAME}")

# Confirm creation — compatible with all Databricks runtime versions
db_df = spark.sql("SHOW DATABASES")

# Dynamically detect the correct column name
col_name = db_df.columns[0]  # Works regardless of runtime version
databases = [row[col_name] for row in db_df.collect()]

if DATABASE_NAME in databases:
    print(f"✅ Database '{DATABASE_NAME}' is ready.")
    print(f"   (Detected column name: '{col_name}')")
else:
    raise Exception(f"❌ Database '{DATABASE_NAME}' was NOT created. Check permissions.")

✅ Database 'air_quality_db' is ready.
   (Detected column name: 'databaseName')


In [0]:
# ============================================================
# CREATE MANAGED DELTA TABLES (empty scaffolds)
# Databricks manages all storage locations automatically
# ============================================================

spark.sql(f"USE {DATABASE_NAME}")

# Bronze — raw API response
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
        city            STRING,
        country         STRING,
        pollutant       STRING,
        value           DOUBLE,
        unit            STRING,
        location_name   STRING,
        latitude        DOUBLE,
        longitude       DOUBLE,
        measured_at     TIMESTAMP,
        ingested_at     TIMESTAMP,
        source_url      STRING,
        raw_json        STRING
    )
    USING DELTA
""")
print(f"✅ Bronze table ready : {BRONZE_TABLE}")

# Silver — cleaned and validated
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {SILVER_TABLE} (
        city            STRING,
        country         STRING,
        pollutant       STRING,
        value           DOUBLE,
        unit            STRING,
        location_name   STRING,
        latitude        DOUBLE,
        longitude       DOUBLE,
        measured_at     TIMESTAMP,
        ingested_at     TIMESTAMP,
        aqi_category    STRING,
        is_valid        BOOLEAN
    )
    USING DELTA
""")
print(f"✅ Silver table ready : {SILVER_TABLE}")

# Gold — city rankings
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {GOLD_TABLE_CITY} (
        city            STRING,
        country         STRING,
        pollutant       STRING,
        avg_value       DOUBLE,
        max_value       DOUBLE,
        min_value       DOUBLE,
        reading_count   LONG,
        dominant_aqi    STRING,
        last_updated    TIMESTAMP
    )
    USING DELTA
""")
print(f"✅ Gold table ready   : {GOLD_TABLE_CITY}")

# Gold — pollutant trends
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {GOLD_TABLE_TREND} (
        city            STRING,
        pollutant       STRING,
        reading_date    DATE,
        avg_value       DOUBLE,
        reading_count   LONG
    )
    USING DELTA
""")
print(f"✅ Gold table ready   : {GOLD_TABLE_TREND}")

# Gold — AQI summary
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {GOLD_TABLE_AQI} (
        city            STRING,
        aqi_category    STRING,
        reading_count   LONG,
        percentage      DOUBLE,
        last_updated    TIMESTAMP
    )
    USING DELTA
""")
print(f"✅ Gold table ready   : {GOLD_TABLE_AQI}")

print("\n📦 All 5 managed Delta tables scaffolded successfully.")

✅ Bronze table ready : air_quality_db.bronze_raw_measurements
✅ Silver table ready : air_quality_db.silver_clean_measurements
✅ Gold table ready   : air_quality_db.gold_city_rankings
✅ Gold table ready   : air_quality_db.gold_pollutant_trends
✅ Gold table ready   : air_quality_db.gold_aqi_summary

📦 All 5 managed Delta tables scaffolded successfully.


In [0]:
# ============================================================
# VALIDATE OPENAQ v3 API IS REACHABLE
# ============================================================

import requests

def test_api_connection():
    """Make a lightweight test call to OpenAQ v3 API."""
    
    # v3 requires API key in the header
    headers = {
        "Accept": "application/json",
        "X-API-Key": OPENAQ_API_KEY
    }

    test_url = f"{API_BASE_URL}/parameters?limit=5"
    
    try:
        response = requests.get(test_url, headers=headers, timeout=15)
        
        if response.status_code == 200:
            data = response.json()
            param_names = [p["name"] for p in data.get("results", [])]
            print(f"✅ OpenAQ v3 API is reachable.")
            print(f"   Status Code   : {response.status_code}")
            print(f"   Sample Params : {param_names}")
            return True

        elif response.status_code == 401:
            print("❌ Unauthorized — Your API key is invalid or not yet active.")
            print("   Check your key at: https://explore.openaq.org")
            return False

        elif response.status_code == 410:
            print("❌ Still hitting v2 endpoint — check API_BASE_URL in Cell 2.")
            return False

        else:
            print(f"⚠️  Unexpected status: {response.status_code}")
            print(f"   Response: {response.text[:300]}")
            return False

    except requests.exceptions.ConnectionError:
        print("❌ Connection Error — Restart the cluster and try again.")
        return False
    except requests.exceptions.Timeout:
        print("❌ Timeout — API did not respond within 15 seconds.")
        return False

api_ok = test_api_connection()

✅ OpenAQ v3 API is reachable.
   Status Code   : 200
   Sample Params : ['pm10', 'pm25', 'o3', 'co', 'no2']


In [0]:
# ============================================================
# VALIDATE SPARK & DELTA LAKE
# ============================================================

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Check Spark version
print(f"✅ Spark Version : {spark.version}")

# Explicit schema for test row
schema = StructType([
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("pollutant", StringType(), True),
    StructField("value", DoubleType(), True),
    StructField("unit", StringType(), True),
    StructField("location_name", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("measured_at", TimestampType(), True),
    StructField("ingested_at", TimestampType(), True),
    StructField("source_url", StringType(), True),
    StructField("raw_json", StringType(), True)
])

# Validate Delta works by writing a row to bronze and reading it back
test_row = spark.createDataFrame([
    Row(
        city="TestCity", country="TC", pollutant="pm25",
        value=0.0, unit="µg/m³", location_name="test",
        latitude=0.0, longitude=0.0,
        measured_at=None, ingested_at=None,
        source_url="test", raw_json="{}"
    )
], schema=schema)

test_row.write.format("delta").mode("append").saveAsTable(BRONZE_TABLE)

count = spark.sql(f"SELECT COUNT(*) as cnt FROM {BRONZE_TABLE}").collect()[0]["cnt"]
assert count >= 1, "Delta write/read test failed"

# Clean up the test row
spark.sql(f"DELETE FROM {BRONZE_TABLE} WHERE city = 'TestCity'")

print(f"✅ Delta Lake    : Write/Read/Delete test passed")
print(f"✅ Managed Tables: Accessible and operational")

✅ Spark Version : 4.1.0
✅ Delta Lake    : Write/Read/Delete test passed
✅ Managed Tables: Accessible and operational


In [0]:
# ============================================================
# SETUP SUMMARY
# ============================================================

print("=" * 55)
print("  🌍 AIR QUALITY PIPELINE — SETUP COMPLETE")
print("=" * 55)
print(f"  ✅ Database     : {DATABASE_NAME}")
print(f"  ✅ Bronze Path  : {BRONZE_PATH}")
print(f"  ✅ Silver Path  : {SILVER_PATH}")
print(f"  ✅ Gold Paths   : 3 tables configured")
print(f"  ✅ API          : {'Reachable' if api_ok else 'UNREACHABLE - check network'}")
print(f"  ✅ Delta Lake   : Operational")
print("=" * 55)
print("\n▶️  Next Step: Open and run  01_bronze_ingestion.py")

  🌍 AIR QUALITY PIPELINE — SETUP COMPLETE
  ✅ Database     : air_quality_db
  ✅ Bronze Path  : dbfs:/user/air_quality_pipeline/bronze/raw_measurements
  ✅ Silver Path  : dbfs:/user/air_quality_pipeline/silver/clean_measurements
  ✅ Gold Paths   : 3 tables configured
  ✅ API          : Reachable
  ✅ Delta Lake   : Operational

▶️  Next Step: Open and run  01_bronze_ingestion.py
