In [None]:
#  Install required libraries for API data collection

!pip install requests pandas tqdm




In [None]:
#Store OpenAQ API Key securely in Colab

import os

# Paste your OpenAQ API key between the quotes
os.environ["OPENAQ_API_KEY"] = "ENTER_YOUR_APIKEY"

# Simple verification (DO NOT print the key)
if "OPENAQ_API_KEY" in os.environ:
    print("OpenAQ API key loaded successfully.")
else:
    print("API key not found. Check the setup.")


OpenAQ API key loaded successfully.


In [None]:
# Define cities, pollutants, and base API configuration

# Major cities of India (expandable later if needed)
INDIAN_CITIES = [
    "Delhi",
    "Mumbai",
    "Bengaluru",
    "Chennai",
    "Kolkata",
    "Hyderabad",
    "Pune",
    "Ahmedabad",
    "Jaipur",
    "Lucknow"
]

# Pollutants required by the project
POLLUTANTS = ["pm25", "pm10", "no2", "so2", "co", "o3"]

# OpenAQ v3 base configuration
BASE_URL = "https://api.openaq.org/v3"
COUNTRY_CODE = "IN"

print("Cities configured:", len(INDIAN_CITIES))
print("Pollutants configured:", POLLUTANTS)
print("Base URL:", BASE_URL)


Cities configured: 10
Pollutants configured: ['pm25', 'pm10', 'no2', 'so2', 'co', 'o3']
Base URL: https://api.openaq.org/v3


In [None]:
# STEP 5A (FINAL): Fetch Indian locations using geographic bounding box

import requests
import os

API_KEY = os.environ.get("OPENAQ_API_KEY")

headers = {
    "X-API-Key": API_KEY
}

locations_url = f"{BASE_URL}/locations"

# India bounding box: [min_lon, min_lat, max_lon, max_lat]
params = {
    "bbox": "68,6,97,37",
    "limit": 10
}

response = requests.get(locations_url, headers=headers, params=params)

print("HTTP Status Code:", response.status_code)

if response.status_code == 200:
    data = response.json()
    print("Locations fetched:", len(data["results"]))
    print("Sample VERIFIED Indian location:")
    print(data["results"][0])
else:
    print("Failed to fetch locations")
    print(response.text)


HTTP Status Code: 200
Locations fetched: 10
Sample VERIFIED Indian location:
{'id': 12, 'name': 'SPARTAN - IIT Kanpur', 'locality': None, 'timezone': 'Asia/Kolkata', 'country': {'id': 9, 'code': 'IN', 'name': 'India'}, 'owner': {'id': 4, 'name': 'Unknown Governmental Organization'}, 'provider': {'id': 226, 'name': 'Spartan'}, 'isMobile': False, 'isMonitor': True, 'instruments': [{'id': 2, 'name': 'Government Monitor'}], 'sensors': [{'id': 23, 'name': 'pm25 µg/m³', 'parameter': {'id': 2, 'name': 'pm25', 'units': 'µg/m³', 'displayName': 'PM2.5'}}], 'coordinates': {'latitude': 26.519, 'longitude': 80.233}, 'licenses': None, 'bounds': [80.233, 26.519, 80.233, 26.519], 'distance': None, 'datetimeFirst': None, 'datetimeLast': None}


In [None]:
# STEP 5B (FINAL): Discover sensors via locations and fetch measurements

import requests
import os

API_KEY = os.environ.get("OPENAQ_API_KEY")

headers = {
    "X-API-Key": API_KEY
}

# 1️⃣ Fetch Indian locations
locations_url = f"{BASE_URL}/locations"

params = {
    "bbox": "68,6,97,37",   # India bounding box
    "limit": 20
}

response = requests.get(locations_url, headers=headers, params=params)

print("Locations status:", response.status_code)

if response.status_code != 200:
    print("Failed to fetch locations")
    raise SystemExit

locations = response.json()["results"]

print("Locations retrieved:", len(locations))

# 2️⃣ Find first sensor inside locations
sensor_id = None

for loc in locations:
    sensors = loc.get("sensors", [])
    if sensors:
        sensor_id = sensors[0]["id"]
        print("Using sensor ID:", sensor_id)
        break

if sensor_id is None:
    print("No sensors found in locations")
    raise SystemExit

# 3️⃣ Fetch measurements from that sensor
measurements_url = f"{BASE_URL}/sensors/{sensor_id}/measurements"

params = {
    "limit": 10,
    "sort": "desc"
}

response = requests.get(measurements_url, headers=headers, params=params)

print("Measurements status:", response.status_code)

if response.status_code == 200:
    results = response.json()["results"]
    print("Measurements fetched:", len(results))

    if len(results) > 0:
        print("Sample measurement:")
        print(results[0])
    else:
        print("Sensor exists but has no recent data")
else:
    print("Failed to fetch measurements")
    print(response.text)


Locations status: 200
Locations retrieved: 20
Using sensor ID: 23
Measurements status: 200
Measurements fetched: 0
Sensor exists but has no recent data


In [None]:
# STEP 6: Define final dataset schema for INDIA_openaq_pollution.csv

FINAL_COLUMNS = [
    "city",
    "location_id",
    "location_name",
    "sensor_id",
    "pollutant",
    "value",
    "unit",
    "latitude",
    "longitude",
    "timestamp_utc",
    "country",
    "source"
]

print("Final dataset schema defined.")
print("Number of columns:", len(FINAL_COLUMNS))
print(FINAL_COLUMNS)


Final dataset schema defined.
Number of columns: 12
['city', 'location_id', 'location_name', 'sensor_id', 'pollutant', 'value', 'unit', 'latitude', 'longitude', 'timestamp_utc', 'country', 'source']


In [None]:
# STEP 7: Controlled data collection loop (India only, live API)

import requests
import os
from tqdm import tqdm

API_KEY = os.environ.get("OPENAQ_API_KEY")

headers = {
    "X-API-Key": API_KEY
}

# Storage for collected rows (in-memory only)
collected_rows = []

# 1️⃣ Fetch Indian locations (limited for safety)
locations_url = f"{BASE_URL}/locations"

location_params = {
    "bbox": "68,6,97,37",   # India bounding box
    "limit": 50            # controlled limit (can be increased later)
}

loc_response = requests.get(locations_url, headers=headers, params=location_params)
loc_response.raise_for_status()

locations = loc_response.json()["results"]

print("Total Indian locations fetched:", len(locations))

# 2️⃣ Loop through locations → sensors → measurements
for loc in tqdm(locations, desc="Processing locations"):
    location_id = loc.get("id")
    location_name = loc.get("name")
    coords = loc.get("coordinates", {})
    latitude = coords.get("latitude")
    longitude = coords.get("longitude")

    sensors = loc.get("sensors", [])

    for sensor in sensors:
        sensor_id = sensor.get("id")
        pollutant = sensor.get("parameter", {}).get("name")

        # Only required pollutants
        if pollutant not in POLLUTANTS:
            continue

        measurements_url = f"{BASE_URL}/sensors/{sensor_id}/measurements"

        meas_params = {
            "limit": 5,
            "sort": "desc"
        }

        meas_response = requests.get(measurements_url, headers=headers, params=meas_params)

        if meas_response.status_code != 200:
            continue

        results = meas_response.json().get("results", [])

        # Skip sensors with no data
        if not results:
            continue

        for r in results:
            collected_rows.append({
                "city": None,  # city mapping comes later if needed
                "location_id": location_id,
                "location_name": location_name,
                "sensor_id": sensor_id,
                "pollutant": r.get("parameter"),
                "value": r.get("value"),
                "unit": r.get("unit"),
                "latitude": latitude,
                "longitude": longitude,
                "timestamp_utc": r.get("date", {}).get("utc"),
                "country": "IN",
                "source": "OpenAQ v3"
            })

print("Total records collected (in memory):", len(collected_rows))


Total Indian locations fetched: 50


Processing locations: 100%|██████████| 50/50 [01:33<00:00,  1.86s/it]

Total records collected (in memory): 696





In [None]:
# STEP 8: Convert collected data to DataFrame and save CSV

import pandas as pd

# Create DataFrame using predefined schema
df = pd.DataFrame(collected_rows, columns=FINAL_COLUMNS)

# Save to CSV
csv_filename = "INDIA_openaq_pollution.csv"
df.to_csv(csv_filename, index=False)

print("CSV file saved:", csv_filename)
print("Total rows:", df.shape[0])
print("Total columns:", df.shape[1])

# Preview first 5 rows
df.head()


CSV file saved: INDIA_openaq_pollution.csv
Total rows: 696
Total columns: 12


Unnamed: 0,city,location_id,location_name,sensor_id,pollutant,value,unit,latitude,longitude,timestamp_utc,country,source
0,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",11.3,,28.744,77.12,,IN,OpenAQ v3
1,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",8.6,,28.744,77.12,,IN,OpenAQ v3
2,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",8.6,,28.744,77.12,,IN,OpenAQ v3
3,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",9.3,,28.744,77.12,,IN,OpenAQ v3
4,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",10.6,,28.744,77.12,,IN,OpenAQ v3


In [None]:
#  Load dataset and inspect structure

import pandas as pd

# Load the CSV generated in Milestone-1
df = pd.read_csv("INDIA_openaq_pollution.csv")

# Basic inspection
print("Dataset shape (rows, columns):", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

print("\nData types:")
print(df.dtypes)

print("\nFirst 5 rows:")
df.head()


Dataset shape (rows, columns): (696, 12)

Column names:
['city', 'location_id', 'location_name', 'sensor_id', 'pollutant', 'value', 'unit', 'latitude', 'longitude', 'timestamp_utc', 'country', 'source']

Data types:
city             float64
location_id        int64
location_name     object
sensor_id          int64
pollutant         object
value            float64
unit             float64
latitude         float64
longitude        float64
timestamp_utc    float64
country           object
source            object
dtype: object

First 5 rows:


Unnamed: 0,city,location_id,location_name,sensor_id,pollutant,value,unit,latitude,longitude,timestamp_utc,country,source
0,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",11.3,,28.744,77.12,,IN,OpenAQ v3
1,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",8.6,,28.744,77.12,,IN,OpenAQ v3
2,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",8.6,,28.744,77.12,,IN,OpenAQ v3
3,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",9.3,,28.744,77.12,,IN,OpenAQ v3
4,,13,"Delhi Technological University, Delhi - CPCB",13866,"{'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...",10.6,,28.744,77.12,,IN,OpenAQ v3


In [None]:
#  Parse and flatten the pollutant column

import ast

# Safely extract pollutant name and unit from the dict-like string
def parse_pollutant(p):
    try:
        p_dict = ast.literal_eval(p)
        return p_dict.get("name"), p_dict.get("units")
    except Exception:
        return None, None

df[["pollutant_name", "pollutant_unit"]] = df["pollutant"].apply(
    lambda x: pd.Series(parse_pollutant(x))
)

# Verify results
print("New columns added:")
print(df[["pollutant", "pollutant_name", "pollutant_unit"]].head())


New columns added:
                                           pollutant pollutant_name  \
0  {'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...            no2   
1  {'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...            no2   
2  {'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...            no2   
3  {'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...            no2   
4  {'id': 5, 'name': 'no2', 'units': 'µg/m³', 'di...            no2   

  pollutant_unit  
0          µg/m³  
1          µg/m³  
2          µg/m³  
3          µg/m³  
4          µg/m³  


In [None]:
#  Fix data types (NO imputation yet)

# 1️⃣ Fix timestamp: convert to datetime (UTC)
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], errors="coerce", utc=True)

# 2️⃣ Fix unit: replace raw 'unit' column using parsed pollutant_unit
df["unit"] = df["pollutant_unit"]

# 3️⃣ Fix city column type (keep NaN, just correct dtype)
df["city"] = df["city"].astype("object")

# Verify data types after correction
print("Updated data types:")
print(df.dtypes)

print("\nSample rows after type fix:")
df[["pollutant_name", "value", "unit", "timestamp_utc", "city"]].head()


Updated data types:
city                           object
location_id                     int64
location_name                  object
sensor_id                       int64
pollutant                      object
value                         float64
unit                           object
latitude                      float64
longitude                     float64
timestamp_utc     datetime64[ns, UTC]
country                        object
source                         object
pollutant_name                 object
pollutant_unit                 object
dtype: object

Sample rows after type fix:


Unnamed: 0,pollutant_name,value,unit,timestamp_utc,city
0,no2,11.3,µg/m³,NaT,
1,no2,8.6,µg/m³,NaT,
2,no2,8.6,µg/m³,NaT,
3,no2,9.3,µg/m³,NaT,
4,no2,10.6,µg/m³,NaT,


In [None]:
#  Handle missing values safely

# Count missing values before
print("Missing values BEFORE:")
print(df.isnull().sum())

# Drop rows where pollution value is missing
initial_rows = df.shape[0]
df = df.dropna(subset=["value"])
final_rows = df.shape[0]

print("\nRows before dropping missing values:", initial_rows)
print("Rows after dropping missing values:", final_rows)
print("Rows removed:", initial_rows - final_rows)

# Verify remaining missing values
print("\nMissing values AFTER:")
print(df.isnull().sum())


Missing values BEFORE:
city              696
location_id         0
location_name       0
sensor_id           0
pollutant           0
value               0
unit                0
latitude            0
longitude           0
timestamp_utc     696
country             0
source              0
pollutant_name      0
pollutant_unit      0
dtype: int64

Rows before dropping missing values: 696
Rows after dropping missing values: 696
Rows removed: 0

Missing values AFTER:
city              696
location_id         0
location_name       0
sensor_id           0
pollutant           0
value               0
unit                0
latitude            0
longitude           0
timestamp_utc     696
country             0
source              0
pollutant_name      0
pollutant_unit      0
dtype: int64


In [None]:
#  Remove duplicates and perform sanity checks

# 1️⃣ Remove exact duplicate rows
rows_before = df.shape[0]
df = df.drop_duplicates()
rows_after = df.shape[0]

print("Rows before duplicate removal:", rows_before)
print("Rows after duplicate removal:", rows_after)
print("Duplicates removed:", rows_before - rows_after)

# 2️⃣ Sanity check: value ranges by pollutant
print("\nSanity check (min / max by pollutant):")

sanity = (
    df.groupby("pollutant_name")["value"]
    .agg(["min", "max", "count"])
    .reset_index()
)

print(sanity)


Rows before duplicate removal: 696
Rows after duplicate removal: 583
Duplicates removed: 113

Sanity check (min / max by pollutant):
  pollutant_name   min       max  count
0             co  0.09  38970.00    109
1            no2  0.00    166.76    115
2             o3  0.00    176.71    109
3           pm10  0.00    582.00     50
4           pm25  0.00   9999.90     90
5            so2  0.00     77.40    110


In [None]:
# Temporal feature extraction (safe)

# Create temporal features only where timestamp exists
df["hour"] = df["timestamp_utc"].dt.hour
df["day"] = df["timestamp_utc"].dt.day
df["month"] = df["timestamp_utc"].dt.month
df["day_of_week"] = df["timestamp_utc"].dt.dayofweek

# Verify temporal features
print("Temporal feature sample:")
df[["timestamp_utc", "hour", "day", "month", "day_of_week"]].head()

print("\nMissing values in temporal features:")
print(df[["hour", "day", "month", "day_of_week"]].isnull().sum())


Temporal feature sample:

Missing values in temporal features:
hour           583
day            583
month          583
day_of_week    583
dtype: int64


In [None]:
# Geographic sanity checks

# Check latitude and longitude ranges
lat_outliers = df[
    (df["latitude"] < 6) | (df["latitude"] > 37)
]

lon_outliers = df[
    (df["longitude"] < 68) | (df["longitude"] > 97)
]

print("Latitude out-of-range points:", lat_outliers.shape[0])
print("Longitude out-of-range points:", lon_outliers.shape[0])

# Summary statistics for coordinates
print("\nLatitude summary:")
print(df["latitude"].describe())

print("\nLongitude summary:")
print(df["longitude"].describe())


Latitude out-of-range points: 0
Longitude out-of-range points: 0

Latitude summary:
count    583.000000
mean      24.235619
std        5.666411
min       12.938906
25%       17.659919
50%       26.845786
75%       28.634100
max       28.876028
Name: latitude, dtype: float64

Longitude summary:
count    583.000000
mean      78.512666
std        2.811583
min       72.998600
25%       77.131023
50%       77.316032
75%       80.191517
max       85.336206
Name: longitude, dtype: float64


In [None]:
#  Prepare final cleaned dataset and save

# Select final columns for Milestone-2 output
final_columns_m2 = [
    "location_id",
    "location_name",
    "sensor_id",
    "pollutant_name",
    "value",
    "unit",
    "latitude",
    "longitude",
    "timestamp_utc",
    "hour",
    "day",
    "month",
    "day_of_week",
    "country",
    "source"
]

df_m2 = df[final_columns_m2].copy()

# Save Milestone-2 output
m2_filename = "INDIA_openaq_pollution_cleaned_M2.csv"
df_m2.to_csv(m2_filename, index=False)

print("Milestone-2 CSV saved:", m2_filename)
print("Final shape:", df_m2.shape)

# Preview final dataset
df_m2.head()


Milestone-2 CSV saved: INDIA_openaq_pollution_cleaned_M2.csv
Final shape: (583, 15)


Unnamed: 0,location_id,location_name,sensor_id,pollutant_name,value,unit,latitude,longitude,timestamp_utc,hour,day,month,day_of_week,country,source
0,13,"Delhi Technological University, Delhi - CPCB",13866,no2,11.3,µg/m³,28.744,77.12,NaT,,,,,IN,OpenAQ v3
1,13,"Delhi Technological University, Delhi - CPCB",13866,no2,8.6,µg/m³,28.744,77.12,NaT,,,,,IN,OpenAQ v3
3,13,"Delhi Technological University, Delhi - CPCB",13866,no2,9.3,µg/m³,28.744,77.12,NaT,,,,,IN,OpenAQ v3
4,13,"Delhi Technological University, Delhi - CPCB",13866,no2,10.6,µg/m³,28.744,77.12,NaT,,,,,IN,OpenAQ v3
5,13,"Delhi Technological University, Delhi - CPCB",13864,pm25,300.0,µg/m³,28.744,77.12,NaT,,,,,IN,OpenAQ v3


In [None]:
# Load cleaned dataset and define labeling objective

import pandas as pd

# Load Milestone-2 output
df = pd.read_csv("INDIA_openaq_pollution_cleaned_M2.csv")

print("Dataset loaded successfully.")
print("Shape:", df.shape)

print("\nColumns:")
print(df.columns.tolist())

# Define pollution source labels (fixed)
SOURCE_LABELS = [
    "Vehicular",
    "Industrial",
    "Agricultural",
    "Burning",
    "Natural",
    "Unknown"
]

print("\nDefined pollution source classes:")
print(SOURCE_LABELS)

# Preview data
df.head()


Dataset loaded successfully.
Shape: (583, 15)

Columns:
['location_id', 'location_name', 'sensor_id', 'pollutant_name', 'value', 'unit', 'latitude', 'longitude', 'timestamp_utc', 'hour', 'day', 'month', 'day_of_week', 'country', 'source']

Defined pollution source classes:
['Vehicular', 'Industrial', 'Agricultural', 'Burning', 'Natural', 'Unknown']


Unnamed: 0,location_id,location_name,sensor_id,pollutant_name,value,unit,latitude,longitude,timestamp_utc,hour,day,month,day_of_week,country,source
0,13,"Delhi Technological University, Delhi - CPCB",13866,no2,11.3,µg/m³,28.744,77.12,,,,,,IN,OpenAQ v3
1,13,"Delhi Technological University, Delhi - CPCB",13866,no2,8.6,µg/m³,28.744,77.12,,,,,,IN,OpenAQ v3
2,13,"Delhi Technological University, Delhi - CPCB",13866,no2,9.3,µg/m³,28.744,77.12,,,,,,IN,OpenAQ v3
3,13,"Delhi Technological University, Delhi - CPCB",13866,no2,10.6,µg/m³,28.744,77.12,,,,,,IN,OpenAQ v3
4,13,"Delhi Technological University, Delhi - CPCB",13864,pm25,300.0,µg/m³,28.744,77.12,,,,,,IN,OpenAQ v3


In [None]:
#  Rule-based pollution source labeling

def assign_source_label(row):
    pollutant = row["pollutant_name"]
    value = row["value"]

    # Safety check
    if pd.isna(pollutant) or pd.isna(value):
        return "Unknown"

    # Vehicular emissions
    if pollutant in ["no2", "co"] and value >= 40:
        return "Vehicular"

    # Industrial emissions
    if pollutant == "so2" and value >= 20:
        return "Industrial"

    # Burning (extreme PM2.5)
    if pollutant == "pm25" and value >= 250:
        return "Burning"

    # Agricultural (high PM)
    if pollutant in ["pm25", "pm10"] and value >= 100:
        return "Agricultural"

    # Natural background
    if pollutant in ["pm25", "pm10"] and value < 100:
        return "Natural"

    # Fallback
    return "Unknown"


# Apply labeling
df["source_label"] = df.apply(assign_source_label, axis=1)

# Verify results
print("Source label distribution:")
print(df["source_label"].value_counts())

df[["pollutant_name", "value", "source_label"]].head(10)


Source label distribution:
source_label
Unknown         260
Vehicular       150
Natural          67
Agricultural     61
Industrial       33
Burning          12
Name: count, dtype: int64


Unnamed: 0,pollutant_name,value,source_label
0,no2,11.3,Unknown
1,no2,8.6,Unknown
2,no2,9.3,Unknown
3,no2,10.6,Unknown
4,pm25,300.0,Burning
5,pm25,93.0,Natural
6,co,5900.0,Vehicular
7,co,7800.0,Vehicular
8,co,8700.0,Vehicular
9,co,8400.0,Vehicular


In [None]:
#  Save final labeled dataset

# Select final columns for Milestone-3
final_columns_m3 = [
    "location_id",
    "location_name",
    "sensor_id",
    "pollutant_name",
    "value",
    "unit",
    "latitude",
    "longitude",
    "timestamp_utc",
    "hour",
    "day",
    "month",
    "day_of_week",
    "source_label",
    "country",
    "source"
]

df_m3 = df[final_columns_m3].copy()

# Save CSV
m3_filename = "INDIA_openaq_pollution_labeled_M3.csv"
df_m3.to_csv(m3_filename, index=False)

print("Milestone-3 CSV saved:", m3_filename)
print("Final shape:", df_m3.shape)

# Preview
df_m3.head()


Milestone-3 CSV saved: INDIA_openaq_pollution_labeled_M3.csv
Final shape: (583, 16)


Unnamed: 0,location_id,location_name,sensor_id,pollutant_name,value,unit,latitude,longitude,timestamp_utc,hour,day,month,day_of_week,source_label,country,source
0,13,"Delhi Technological University, Delhi - CPCB",13866,no2,11.3,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3
1,13,"Delhi Technological University, Delhi - CPCB",13866,no2,8.6,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3
2,13,"Delhi Technological University, Delhi - CPCB",13866,no2,9.3,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3
3,13,"Delhi Technological University, Delhi - CPCB",13866,no2,10.6,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3
4,13,"Delhi Technological University, Delhi - CPCB",13864,pm25,300.0,µg/m³,28.744,77.12,,,,,,Burning,IN,OpenAQ v3


In [None]:
#  Install geospatial libraries

!pip install osmnx geopandas shapely pyproj rtree networkx


Collecting osmnx
  Downloading osmnx-2.0.7-py3-none-any.whl.metadata (4.9 kB)
Downloading osmnx-2.0.7-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: osmnx
Successfully installed osmnx-2.0.7


In [None]:
#  Load labeled dataset and create GeoDataFrame

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Load Milestone-3 labeled dataset
df = pd.read_csv("INDIA_openaq_pollution_labeled_M3.csv")

print("Dataset loaded.")
print("Shape:", df.shape)

# Convert lat/long to geometry
geometry = [Point(xy) for xy in zip(df["longitude"], df["latitude"])]

gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

print("GeoDataFrame created.")
print("CRS:", gdf.crs)

# Preview
gdf.head()


Dataset loaded.
Shape: (583, 16)
GeoDataFrame created.
CRS: EPSG:4326


Unnamed: 0,location_id,location_name,sensor_id,pollutant_name,value,unit,latitude,longitude,timestamp_utc,hour,day,month,day_of_week,source_label,country,source,geometry
0,13,"Delhi Technological University, Delhi - CPCB",13866,no2,11.3,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3,POINT (77.12 28.744)
1,13,"Delhi Technological University, Delhi - CPCB",13866,no2,8.6,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3,POINT (77.12 28.744)
2,13,"Delhi Technological University, Delhi - CPCB",13866,no2,9.3,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3,POINT (77.12 28.744)
3,13,"Delhi Technological University, Delhi - CPCB",13866,no2,10.6,µg/m³,28.744,77.12,,,,,,Unknown,IN,OpenAQ v3,POINT (77.12 28.744)
4,13,"Delhi Technological University, Delhi - CPCB",13864,pm25,300.0,µg/m³,28.744,77.12,,,,,,Burning,IN,OpenAQ v3,POINT (77.12 28.744)


In [None]:
# Fetch local road networks around points

import osmnx as ox
import geopandas as gpd

ox.settings.log_console = False
ox.settings.use_cache = True

print("Fetching local road networks around pollution points...")

# Take unique locations to reduce API calls
unique_points = gdf[["latitude", "longitude"]].drop_duplicates().head(20)

road_geometries = []

for idx, row in unique_points.iterrows():
    try:
        G = ox.graph_from_point(
            (row["latitude"], row["longitude"]),
            dist=3000,  # 3 km radius
            network_type="drive"
        )
        edges = ox.graph_to_gdfs(G, nodes=False, edges=True)
        road_geometries.append(edges)
    except Exception as e:
        continue

# Combine all road segments
roads_gdf = gpd.GeoDataFrame(
    pd.concat(road_geometries, ignore_index=True),
    crs="EPSG:4326"
)

print("Local road data fetched.")
print("Total road segments:", roads_gdf.shape[0])


Fetching local road networks around pollution points...
Local road data fetched.
Total road segments: 286564


In [None]:
#Compute distance to nearest road (meters)

# Reproject both datasets to metric CRS
gdf_proj = gdf.to_crs(epsg=3857)
roads_proj = roads_gdf.to_crs(epsg=3857)

print("CRS after reprojection:")
print("Pollution points:", gdf_proj.crs)
print("Roads:", roads_proj.crs)

# Compute distance to nearest road
print("Computing distance to nearest road...")

gdf_proj["dist_to_road_m"] = gdf_proj.geometry.apply(
    lambda geom: roads_proj.distance(geom).min()
)

print("Distance computation completed.")

# Preview results
gdf_proj[["latitude", "longitude", "dist_to_road_m"]].head()


CRS after reprojection:
Pollution points: EPSG:3857
Roads: EPSG:3857
Computing distance to nearest road...
Distance computation completed.


Unnamed: 0,latitude,longitude,dist_to_road_m
0,28.744,77.12,44.595218
1,28.744,77.12,44.595218
2,28.744,77.12,44.595218
3,28.744,77.12,44.595218
4,28.744,77.12,44.595218


In [None]:
#Fetch industrial areas & compute distance to industry

import osmnx as ox
import geopandas as gpd
import pandas as pd

print("Fetching industrial landuse polygons...")

industrial_polys = []

# Reuse same unique points (limit API calls)
for idx, row in unique_points.iterrows():
    try:
        tags = {"landuse": "industrial"}
        gdf_ind = ox.geometries_from_point(
            (row["latitude"], row["longitude"]),
            tags=tags,
            dist=5000  # 5 km radius
        )
        if not gdf_ind.empty:
            industrial_polys.append(gdf_ind)
    except Exception:
        continue

# Combine industrial polygons
if industrial_polys:
    industry_gdf = gpd.GeoDataFrame(
        pd.concat(industrial_polys, ignore_index=True),
        crs="EPSG:4326"
    )
    print("Industrial areas fetched:", industry_gdf.shape[0])
else:
    industry_gdf = gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    print("No industrial areas found.")

# Reproject to metric CRS
industry_proj = industry_gdf.to_crs(epsg=3857)

# Compute distance to nearest industrial area
print("Computing distance to nearest industrial area...")

if not industry_proj.empty:
    gdf_proj["dist_to_industry_m"] = gdf_proj.geometry.apply(
        lambda geom: industry_proj.distance(geom).min()
    )
else:
    gdf_proj["dist_to_industry_m"] = None

print("Industry distance computation completed.")

# Preview
gdf_proj[["dist_to_industry_m"]].head()


Fetching industrial landuse polygons...
No industrial areas found.
Computing distance to nearest industrial area...
Industry distance computation completed.


Unnamed: 0,dist_to_industry_m
0,
1,
2,
3,
4,


In [None]:
#Fetch farmland & compute distance to farmland

print("Fetching farmland / agricultural landuse polygons...")

farmland_polys = []

for idx, row in unique_points.iterrows():
    try:
        tags = {"landuse": ["farmland", "agricultural"]}
        gdf_farm = ox.geometries_from_point(
            (row["latitude"], row["longitude"]),
            tags=tags,
            dist=5000  # 5 km radius
        )
        if not gdf_farm.empty:
            farmland_polys.append(gdf_farm)
    except Exception:
        continue

# Combine farmland polygons
if farmland_polys:
    farmland_gdf = gpd.GeoDataFrame(
        pd.concat(farmland_polys, ignore_index=True),
        crs="EPSG:4326"
    )
    print("Farmland areas fetched:", farmland_gdf.shape[0])
else:
    farmland_gdf = gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
    print("No farmland areas found.")

# Reproject to metric CRS
farmland_proj = farmland_gdf.to_crs(epsg=3857)

# Compute distance to nearest farmland
print("Computing distance to nearest farmland...")

if not farmland_proj.empty:
    gdf_proj["dist_to_farmland_m"] = gdf_proj.geometry.apply(
        lambda geom: farmland_proj.distance(geom).min()
    )
else:
    gdf_proj["dist_to_farmland_m"] = None

print("Farmland distance computation completed.")

# Preview
gdf_proj[["dist_to_farmland_m"]].head()


Fetching farmland / agricultural landuse polygons...
No farmland areas found.
Computing distance to nearest farmland...
Farmland distance computation completed.


Unnamed: 0,dist_to_farmland_m
0,
1,
2,
3,
4,


In [None]:
#Context-aware source labeling using proximity + pollutants

def assign_source_label_context(row):
    p = row["pollutant_name"]
    v = row["value"]
    d_road = row["dist_to_road_m"]
    d_ind = row["dist_to_industry_m"]
    d_farm = row["dist_to_farmland_m"]

    # Safety
    if pd.isna(p) or pd.isna(v):
        return "Unknown"

    # Vehicular
    if d_road is not None and d_road <= 200 and p in ["no2", "co"] and v >= 20:
        return "Vehicular"

    # Industrial
    if d_ind is not None and d_ind <= 1000 and p == "so2" and v >= 10:
        return "Industrial"

    # Agricultural
    if d_farm is not None and d_farm <= 2000 and p in ["pm25", "pm10"] and v >= 80:
        return "Agricultural"

    # Burning
    if p == "pm25" and v >= 250:
        return "Burning"

    # Natural
    if p in ["pm25", "pm10"] and v < 80:
        return "Natural"

    return "Unknown"


# Apply upgraded labeling
gdf_proj["source_label"] = gdf_proj.apply(assign_source_label_context, axis=1)

# Verify distribution
print("Updated source label distribution:")
print(gdf_proj["source_label"].value_counts())

gdf_proj[["pollutant_name", "value", "dist_to_road_m", "source_label"]].head(10)


Updated source label distribution:
source_label
Unknown      368
Vehicular    151
Natural       52
Burning       12
Name: count, dtype: int64


Unnamed: 0,pollutant_name,value,dist_to_road_m,source_label
0,no2,11.3,44.595218,Unknown
1,no2,8.6,44.595218,Unknown
2,no2,9.3,44.595218,Unknown
3,no2,10.6,44.595218,Unknown
4,pm25,300.0,44.595218,Burning
5,pm25,93.0,44.595218,Unknown
6,co,5900.0,6.514188,Vehicular
7,co,7800.0,6.514188,Vehicular
8,co,8700.0,6.514188,Vehicular
9,co,8400.0,6.514188,Vehicular


In [None]:
#Prepare data for ML model training

from sklearn.model_selection import train_test_split

# Select features for ML
feature_cols = [
    "value",
    "dist_to_road_m",
    "dist_to_industry_m",
    "dist_to_farmland_m"
]

# Replace None with large distance (no proximity)
X = gdf_proj[feature_cols].fillna(10000)

# Target variable
y = gdf_proj["source_label"]

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])
print("\nTarget distribution in training set:")
print(y_train.value_counts())


Training samples: 466
Testing samples: 117

Target distribution in training set:
source_label
Unknown      294
Vehicular    121
Natural       41
Burning       10
Name: count, dtype: int64


  X = gdf_proj[feature_cols].fillna(10000)


In [None]:
#Train Random Forest model

from sklearn.ensemble import RandomForestClassifier

# Initialize model
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)

# Train model
rf_model.fit(X_train, y_train)

print("Random Forest model trained successfully.")


Random Forest model trained successfully.


In [None]:
# Evaluate model performance

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

# Predict on test set
y_pred = rf_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8888888888888888

Classification Report:
              precision    recall  f1-score   support

     Burning       0.00      0.00      0.00         2
     Natural       0.67      0.73      0.70        11
     Unknown       0.93      0.95      0.94        74
   Vehicular       0.87      0.87      0.87        30

    accuracy                           0.89       117
   macro avg       0.62      0.63      0.63       117
weighted avg       0.88      0.89      0.88       117

Confusion Matrix:
[[ 0  0  1  1]
 [ 0  8  1  2]
 [ 0  3 70  1]
 [ 0  1  3 26]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#Save trained model

import joblib

model_filename = "pollution_source_rf_model.pkl"

joblib.dump(rf_model, model_filename)

print("Model saved as:", model_filename)


Model saved as: pollution_source_rf_model.pkl


In [None]:
#Folium map with heatmap and source markers

!pip install folium




In [None]:
import folium
from folium.plugins import HeatMap

# Center map at mean location
center_lat = gdf_proj["latitude"].mean()
center_lon = gdf_proj["longitude"].mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=6)

# Heatmap (pollution intensity)
heat_data = [
    [row["latitude"], row["longitude"], row["value"]]
    for _, row in gdf_proj.iterrows()
]

HeatMap(heat_data, radius=12).add_to(m)

# Color mapping for sources
color_map = {
    "Vehicular": "red",
    "Industrial": "purple",
    "Agricultural": "green",
    "Burning": "orange",
    "Natural": "blue",
    "Unknown": "gray"
}

# Add markers
for _, row in gdf_proj.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=4,
        color=color_map.get(row["source_label"], "black"),
        fill=True,
        fill_opacity=0.7,
        popup=f"{row['pollutant_name']} | {row['source_label']}"
    ).add_to(m)

# Save map
map_filename = "pollution_source_map.html"
m.save(map_filename)

print("Map saved as:", map_filename)


Map saved as: pollution_source_map.html


In [None]:
#Create minimal Streamlit dashboard

%%writefile app.py
import streamlit as st
import pandas as pd

st.set_page_config(page_title="EnviroScan Dashboard", layout="wide")

st.title("EnviroScan: AI-Powered Pollution Source Identifier")

# Load data
@st.cache_data
def load_data():
    return pd.read_csv("INDIA_openaq_pollution_labeled_M3.csv")

df = load_data()


# Sidebar filter
st.sidebar.header("Filter Options")
source = st.sidebar.selectbox(
    "Select Pollution Source",
    sorted(df["source_label"].unique())
)

filtered_df = df[df["source_label"] == source]

# Display metrics
st.metric("Number of Records", filtered_df.shape[0])

# Show table
st.subheader("Filtered Pollution Data")
st.dataframe(filtered_df.head(20))


# Simple alert logic
if source in ["Vehicular", "Burning"]:
    st.warning("⚠️ High-risk pollution source selected!")

st.success("Dashboard loaded successfully.")


Writing app.py


In [None]:
print("Streamlit dashboard file created: app.py")


Streamlit dashboard file created: app.py


In [None]:
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.52.2-py3-none-any.whl.metadata (9.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.5.0 streamlit-1.52.2


In [None]:
!pip install cloudflared


Collecting cloudflared
  Downloading cloudflared-1.0.0.2.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setuptools_scm (from cloudflared)
  Downloading setuptools_scm-9.2.2-py3-none-any.whl.metadata (7.7 kB)
Downloading setuptools_scm-9.2.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: cloudflared
  Building wheel for cloudflared (setup.py) ... [?25l[?25hdone
  Created wheel for cloudflared: filename=cloudflared-1.0.0.2-py3-none-any.whl size=2983 sha256=9a9717078c73b561a94935cd70dab1d427bc7db19174bab612b50df872435110
  Stored in directory: /root/.cache/pip/wheels/5b/ec/09/c3bcd3470be046ec77a9c0cb9d8bb6ceed49c831460878ab0a
Successfully built cloudflared
Installing collected packages: setuptools_scm, cloudflared
Successfully installed cloudflared-1.0.0.2 setuptools_scm-9.2.2


In [None]:
!sudo apt-get update
!sudo apt-get install -y cloudflared


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 3,917 B in 2s (2,459 B/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package cloudflared


In [None]:
!streamlit run app.py



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.194.190.174:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/click/core.py", line 824, in invoke
    return callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/streamlit/web/cli.py", line 251, in main_run
    _main_run(path_str, args, flag_options=kwargs)
  File "/usr/local/lib/python3.12/dist-packages/streamlit/web/cli.py", line 308, in _main_run
[34m  Stopping...[0m
object address  : 0x7f8d2a128700
object refcount : 3
object type     : 0xa10960
object type name: RuntimeError
obj

In [None]:
!cloudflared tunnel --url http://localhost:8501
