In [7]:
import requests
import pandas as pd
import time

# --- CONFIG ---
API_KEY = "9d8ad33c3bfde1431f1748da6878dc8c082b017fab31638caa50190fac77ce8a"
JERSEY_CITY_LAT, JERSEY_CITY_LON =13.0843,80.2705
TARGET_PARAMETERS = ['co', 'no2', 'so2', 'o3', 'pm25', 'pm10']
START_DATE = "2022-01-01"
END_DATE = "2022-12-31"


In [8]:
def get_jersey_city_locations(radius=25000):
    base_url = "https://api.openaq.org/v3/locations"
    params = {
        "coordinates": f"{JERSEY_CITY_LAT},{JERSEY_CITY_LON}",
        "radius": radius,
        "limit": 500
    }
    headers = {"X-API-Key": API_KEY}

    try:
        response = requests.get(base_url, params=params, headers=headers)
        if response.status_code != 200:
            print(f"❌ Failed: {response.status_code}")
            return pd.DataFrame()

        data = response.json().get('results', [])
        if not data:
            print("No locations found.")
            return pd.DataFrame()

        df = pd.json_normalize(data)
        df.rename(columns={
            'id': 'location_id',
            'name': 'location_name',
            'coordinates.latitude': 'latitude',
            'coordinates.longitude': 'longitude',
        }, inplace=True)

        print(f" Found {len(df)} monitoring stations.")
        return df

    except Exception as e:
        print(f" Error: {e}")
        return pd.DataFrame()


jersey_city_locations = get_jersey_city_locations()
display(jersey_city_locations.head())


 Found 11 monitoring stations.


Unnamed: 0,location_id,location_name,locality,timezone,isMobile,isMonitor,instruments,sensors,licenses,bounds,...,owner.id,owner.name,provider.id,provider.name,latitude,longitude,datetimeFirst.utc,datetimeFirst.local,datetimeLast.utc,datetimeLast.local
0,378,Alandur Bus Depot,,Asia/Kolkata,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 661, 'name': 'co µg/m³', 'parameter': ...",,"[80.19151667, 12.99711389, 80.19151667, 12.997...",...,4,Unknown Governmental Organization,168,CPCB,12.997114,80.191517,2016-03-22T00:45:00Z,2016-03-22T06:15:00+05:30,2018-02-22T03:45:00Z,2018-02-22T09:15:00+05:30
1,2461,US Diplomatic Post: Chennai,,Asia/Kolkata,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 4725, 'name': 'pm25 µg/m³', 'parameter...",,"[80.251932, 13.052371, 80.251932, 13.052371]",...,4,Unknown Governmental Organization,245,StateAir Chennai,13.052371,80.251932,2016-01-30T00:30:00Z,2016-01-30T06:00:00+05:30,2016-11-09T16:30:00Z,2016-11-09T22:00:00+05:30
2,2549,IIT,,Asia/Kolkata,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 5141, 'name': 'co µg/m³', 'parameter':...",,"[80.23744722, 12.992513890000001, 80.23744722,...",...,4,Unknown Governmental Organization,168,CPCB,12.992514,80.237447,2016-03-22T00:45:00Z,2016-03-22T06:15:00+05:30,2018-02-22T03:45:00Z,2018-02-22T09:15:00+05:30
3,2586,"Manali, Chennai - CPCB",,Asia/Kolkata,False,True,"[{'id': 2, 'name': 'Government Monitor'}, {'id...","[{'id': 5339, 'name': 'co µg/m³', 'parameter':...",,"[80.26285, 13.164544, 80.26285, 13.164544]",...,8517,Central Pollution Control Board,168,CPCB,13.164544,80.26285,2016-03-21T10:00:00Z,2016-03-21T15:30:00+05:30,2025-11-07T10:00:00Z,2025-11-07T15:30:00+05:30
4,5655,"Velachery Res. Area, Chennai - CPCB",,Asia/Kolkata,False,True,"[{'id': 2, 'name': 'Government Monitor'}, {'id...","[{'id': 12235526, 'name': 'co ppb', 'parameter...",,"[80.2398125, 13.0052189, 80.2398125, 13.0052189]",...,8517,Central Pollution Control Board,168,CPCB,13.005219,80.239812,2018-03-09T05:30:00Z,2018-03-09T11:00:00+05:30,2025-11-07T10:15:00Z,2025-11-07T15:45:00+05:30


In [9]:
def get_sensor_data(sensor_id, start_date, end_date):
    print(f" Fetching data for sensor {sensor_id} from {start_date} to {end_date}...")

    base_url = f"https://api.openaq.org/v3/sensors/{sensor_id}/measurements"
    headers = {"X-API-Key": API_KEY}
    params = {
        "date_from": start_date,
        "date_to": end_date,
        "limit": 1000
    }

    all_results = []
    page = 1

    try:
        while True:
            params["page"] = page
            r = requests.get(base_url, headers=headers, params=params)
            if r.status_code != 200:
                print(f"❌ Error {r.status_code}: {r.text}")
                break

            data = r.json()
            results = data.get("results", [])
            if not results:
                break

            all_results.extend(results)
            if page >= data.get("meta", {}).get("pages", 1):
                break

            page += 1
            time.sleep(0.5)

    except Exception as e:
        print(f" Exception: {e}")

    if not all_results:
        print(f" No data found for sensor {sensor_id}")
        return pd.DataFrame()

    # Flatten relevant fields manually
    extracted = []
    for rec in all_results:
        extracted.append({
            "datetimeUtc": rec.get("period", {}).get("datetimeFrom", {}).get("utc"),
            "datetimeLocal": rec.get("period", {}).get("datetimeFrom", {}).get("local"),
            "parameter": rec.get("parameter", {}).get("name"),
            "value": rec.get("value"),
            "unit": rec.get("parameter", {}).get("units")
        })

    df = pd.DataFrame(extracted)

    # Convert datetime fields safely
    if "datetimeUtc" in df.columns:
        df["datetimeUtc"] = pd.to_datetime(df["datetimeUtc"], errors="coerce")
    if "datetimeLocal" in df.columns:
        df["datetimeLocal"] = pd.to_datetime(df["datetimeLocal"], errors="coerce")

    # Drop empty rows
    df = df.dropna(subset=["value"], how="all")

    print(f" Retrieved {len(df)} records for sensor {sensor_id}")
    return df[["datetimeUtc", "datetimeLocal", "parameter", "value", "unit"]]



In [10]:
def collect_targeted_sensor_data(locations_df, start_date, end_date, target_params):
    all_sensor_data = {}
    target_sensors = []

    for _, loc in locations_df.iterrows():
        sensors = loc.get('sensors', [])
        if not sensors:
            continue

        for s in sensors:
            param = s.get('parameter', {}).get('name', '')
            if param in target_params:
                target_sensors.append({
                    'id': s['id'],
                    'location_id': loc.get('location_id'),
                    'location_name': loc.get('location_name'),
                    'parameter': param
                })

    print(f" Found {len(target_sensors)} sensors for target pollutants.")

    for s in target_sensors:
        data = get_sensor_data(s['id'], start_date, end_date)
        if not data.empty:
            data['location_id'] = s['location_id']
            data['location_name'] = s['location_name']
            data['parameter'] = s['parameter']
            all_sensor_data[f"{s['location_id']}_{s['parameter']}"] = data

    return all_sensor_data


sensor_data_dict = collect_targeted_sensor_data(jersey_city_locations, START_DATE , END_DATE, TARGET_PARAMETERS)


 Found 86 sensors for target pollutants.
 Fetching data for sensor 661 from 2022-01-01 to 2022-12-31...
 Retrieved 103 records for sensor 661
 Fetching data for sensor 5374 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 5374
 Fetching data for sensor 5352 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 5352
 Fetching data for sensor 14304 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 14304
 Fetching data for sensor 14305 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 14305
 Fetching data for sensor 4725 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 4725
 Fetching data for sensor 5141 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 5141
 Fetching data for sensor 5142 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 5142
 Fetching data for sensor 5144 from 2022-01-01 to 2022-12-31...
 Retrieved 1000 records for sensor 5144
 Fetching data for sen

In [14]:
# --- CLEANING AND ORGANIZING DATA FOR ANALYSIS ---

import pandas as pd, json

# Combine all collected sensor data
if 'sensor_data_dict' in locals() and sensor_data_dict:
    print("\nMerging all sensor data into one DataFrame...")
    all_data = pd.concat(sensor_data_dict.values(), ignore_index=True)
else:
    print(" No sensor data found to clean.")
    all_data = pd.DataFrame()

# Proceed only if data exists
if 'all_data' in locals() and not all_data.empty:
    print("\nCleaning and organizing data for analysis...")
    print(f"Initial dataset shape: {all_data.shape}")
    cleaned_data = all_data.copy()

    print("\nAvailable columns:")
    print(cleaned_data.columns.tolist())

    # --- Extract datetime from nested JSON field ---
    if 'period_datetimeFrom' in cleaned_data.columns:
        print("\nExamining period_datetimeFrom column...")
        for i in range(min(3, len(cleaned_data))):
            value = cleaned_data['period_datetimeFrom'].iloc[i]
            print(f"Row {i}: {value} (type: {type(value).__name__})")

        print("\nExtracting datetime directly from period_datetimeFrom...")

        def extract_utc_simple(dt_value):
            try:
                if isinstance(dt_value, dict) and 'utc' in dt_value:
                    return dt_value['utc']
            except Exception:
                pass
            return None

        try:
            cleaned_data['extracted_utc'] = cleaned_data['period_datetimeFrom'].apply(extract_utc_simple)
            cleaned_data['datetimeUtc'] = pd.to_datetime(cleaned_data['extracted_utc'], errors='coerce')
            success_count = cleaned_data['datetimeUtc'].notnull().sum()
            print(f" Successfully extracted {success_count} datetimes "
                  f"({success_count/len(cleaned_data)*100:.2f}% of rows)")
        except Exception as e:
            print(f" Error during extraction: {str(e)}")
    else:
        print(" period_datetimeFrom column not found!")

    # --- Date Components ---
    if 'datetimeUtc' in cleaned_data.columns and cleaned_data['datetimeUtc'].notnull().any():
        print("\nExtracting date components from datetimeUtc...")
        cleaned_data['year'] = cleaned_data['datetimeUtc'].dt.year
        cleaned_data['month'] = cleaned_data['datetimeUtc'].dt.month
        cleaned_data['day'] = cleaned_data['datetimeUtc'].dt.day
        cleaned_data['hour'] = cleaned_data['datetimeUtc'].dt.hour
        cleaned_data['dayofweek'] = cleaned_data['datetimeUtc'].dt.dayofweek
        non_null_years = cleaned_data['year'].notnull().sum()
        print(f"Extracted date components for {non_null_years} rows "
              f"({non_null_years/len(cleaned_data)*100:.2f}% of dataset)")

    # --- Extract latitude and longitude (robust) ---
    print("\n Extracting sensor coordinates (robust method)...")

    def extract_lat_lon(coord):
        lat, lon = None, None
        try:
            if isinstance(coord, str):
                coord = json.loads(coord)
            if isinstance(coord, dict):
                lat = coord.get('latitude') or coord.get('lat')
                lon = coord.get('longitude') or coord.get('lon')
        except Exception:
            pass
        return pd.Series({'latitude': lat, 'longitude': lon})

    if 'coordinates' in cleaned_data.columns:
        print("Found 'coordinates' column — extracting...")
        coords_df = cleaned_data['coordinates'].apply(extract_lat_lon)
        cleaned_data[['latitude', 'longitude']] = coords_df
    elif {'latitude', 'longitude'}.issubset(cleaned_data.columns):
        print("Using existing latitude and longitude columns.")
        cleaned_data['latitude'] = pd.to_numeric(cleaned_data['latitude'], errors='coerce')
        cleaned_data['longitude'] = pd.to_numeric(cleaned_data['longitude'], errors='coerce')
    else:
        print(" No coordinate columns found — creating empty latitude/longitude columns.")
        cleaned_data['latitude'] = None
        cleaned_data['longitude'] = None

    # --- Inject missing coordinates from jersey_city_locations ---
    if 'jersey_city_locations' in locals() and not jersey_city_locations.empty:
        print("\n Updating coordinates from sensor location table...")

        if set(['location_id', 'latitude', 'longitude']).issubset(jersey_city_locations.columns):
            before_missing = cleaned_data['latitude'].isna().sum()
            cleaned_data = cleaned_data.merge(
                jersey_city_locations[['location_id', 'latitude', 'longitude']],
                on='location_id',
                how='left',
                suffixes=('', '_loc')
            )
            # Fill missing lat/lon using merged columns
            cleaned_data['latitude'] = cleaned_data['latitude'].fillna(cleaned_data['latitude_loc'])
            cleaned_data['longitude'] = cleaned_data['longitude'].fillna(cleaned_data['longitude_loc'])
            cleaned_data.drop(['latitude_loc', 'longitude_loc'], axis=1, inplace=True)
            after_missing = cleaned_data['latitude'].isna().sum()
            filled = before_missing - after_missing
            print(f" Filled {filled} missing coordinates from jersey_city_locations.")
        else:
            print(" jersey_city_locations missing lat/lon columns — skipping merge.")
    else:
        print(" jersey_city_locations not found — skipping coordinate merge.")

    filled_coords = cleaned_data[['latitude', 'longitude']].notnull().all(axis=1).sum()
    print(f" Valid coordinates for {filled_coords}/{len(cleaned_data)} rows.")

    # --- Convert values to numeric ---
    if 'value' in cleaned_data.columns:
        print("\nConverting value column to numeric...")
        cleaned_data['value'] = pd.to_numeric(cleaned_data['value'], errors='coerce')

    # --- Fill in missing units ---
    if 'unit' in cleaned_data.columns:
        print("\nFilling in missing units...")
        unit_map = {
            'pm25': 'μg/m³',
            'pm10': 'μg/m³',
            'no2': 'ppm',
            'so2': 'ppm',
            'co': 'ppm',
            'o3': 'ppm'
        }
        mask = cleaned_data['unit'].isnull()
        if mask.any():
            cleaned_data.loc[mask, 'unit'] = cleaned_data.loc[mask, 'parameter'].map(unit_map)
            print(f"Filled {mask.sum()} missing unit values")

    # --- Outlier Detection (IQR method) ---
    print("\nDetecting outliers...")
    cleaned_data['is_outlier'] = False
    for param in cleaned_data['parameter'].dropna().unique():
        param_data = cleaned_data[cleaned_data['parameter'] == param]
        if len(param_data) >= 4:
            q1 = param_data['value'].quantile(0.25)
            q3 = param_data['value'].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outlier_mask = (cleaned_data['parameter'] == param) & (
                (cleaned_data['value'] < lower_bound) | (cleaned_data['value'] > upper_bound)
            )
            cleaned_data.loc[outlier_mask, 'is_outlier'] = True

    outlier_count = cleaned_data['is_outlier'].sum()
    print(f"Identified {outlier_count} outliers "
          f"({outlier_count/len(cleaned_data)*100:.2f}% of the data)")

    # --- Final Cleaned Dataset ---
    essential_columns = [
        'parameter', 'value', 'unit', 'datetimeUtc',
        'location_id', 'location_name', 'latitude', 'longitude',
        'year', 'month', 'day', 'hour', 'dayofweek', 'is_outlier'
    ]
    final_columns = [col for col in essential_columns if col in cleaned_data.columns]
    final_data = cleaned_data[final_columns].copy()

    # Add weekday names
    day_names = {
        0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
        4: 'Friday', 5: 'Saturday', 6: 'Sunday'
    }
    if 'dayofweek' in final_data.columns:
        final_data['day_name'] = final_data['dayofweek'].map(day_names)

    # --- Summary ---
    print("\nFinal cleaned dataset summary:")
    print(f"Shape: {final_data.shape}")
    if 'datetimeUtc' in final_data.columns and not final_data['datetimeUtc'].isnull().all():
        print(f"Date range: {final_data['datetimeUtc'].min()} to {final_data['datetimeUtc'].max()}")
    if 'parameter' in final_data.columns:
        print(f"Parameters: {', '.join(final_data['parameter'].dropna().unique())}")
    if 'location_name' in final_data.columns:
        print(f"Locations: {final_data['location_name'].nunique()} unique")

    all_data = final_data

    print("\nSample of final cleaned data:")
    display(all_data.head())
    print("\nColumns in the final dataset:")
    print(all_data.columns.tolist())
    print("\n Data is ready for visualization and analysis!")

else:
    print("\n No data available for cleaning and preprocessing.")






Merging all sensor data into one DataFrame...

Cleaning and organizing data for analysis...
Initial dataset shape: (51103, 7)

Available columns:
['datetimeUtc', 'datetimeLocal', 'parameter', 'value', 'unit', 'location_id', 'location_name']
 period_datetimeFrom column not found!

Extracting date components from datetimeUtc...
Extracted date components for 51103 rows (100.00% of dataset)

 Extracting sensor coordinates (robust method)...
 No coordinate columns found — creating empty latitude/longitude columns.

 Updating coordinates from sensor location table...
 Filled 51103 missing coordinates from jersey_city_locations.
 Valid coordinates for 51103/51103 rows.

Converting value column to numeric...

Filling in missing units...

Detecting outliers...
Identified 3145 outliers (6.15% of the data)

Final cleaned dataset summary:
Shape: (51103, 15)
Date range: 2016-01-29 23:30:00+00:00 to 2025-03-21 08:15:00+00:00
Parameters: co, no2, o3, pm25, so2, pm10
Locations: 11 unique

Sample of f

  cleaned_data['latitude'] = cleaned_data['latitude'].fillna(cleaned_data['latitude_loc'])
  cleaned_data['longitude'] = cleaned_data['longitude'].fillna(cleaned_data['longitude_loc'])


Unnamed: 0,parameter,value,unit,datetimeUtc,location_id,location_name,latitude,longitude,year,month,day,hour,dayofweek,is_outlier,day_name
0,co,4250.0,µg/m³,2016-03-31 08:30:00+00:00,378,Alandur Bus Depot,12.997114,80.191517,2016,3,31,8,3,True,Thursday
1,co,32360.0,µg/m³,2016-04-01 16:15:00+00:00,378,Alandur Bus Depot,12.997114,80.191517,2016,4,1,16,4,True,Friday
2,co,5100.0,µg/m³,2016-04-01 17:15:00+00:00,378,Alandur Bus Depot,12.997114,80.191517,2016,4,1,17,4,True,Friday
3,co,38970.0,µg/m³,2016-04-01 20:30:00+00:00,378,Alandur Bus Depot,12.997114,80.191517,2016,4,1,20,4,True,Friday
4,co,36330.0,µg/m³,2016-04-01 20:45:00+00:00,378,Alandur Bus Depot,12.997114,80.191517,2016,4,1,20,4,True,Friday



Columns in the final dataset:
['parameter', 'value', 'unit', 'datetimeUtc', 'location_id', 'location_name', 'latitude', 'longitude', 'year', 'month', 'day', 'hour', 'dayofweek', 'is_outlier', 'day_name']

 Data is ready for visualization and analysis!


In [16]:
import os
save_dir = "/content/processed_data"
os.makedirs(save_dir, exist_ok=True)

save_path = os.path.join(save_dir, "all_data.csv")

if "all_data" in locals() and not all_data.empty:
    all_data.to_csv(save_path, index=False)
    print(f" Combined preprocessed data saved successfully!")
    print(f" File location: {save_path}")
    print(f" Shape: {all_data.shape}")
else:
    print(" 'all_data' DataFrame not found or empty — please check previous cells.")


 Combined preprocessed data saved successfully!
 File location: /content/processed_data/all_data.csv
 Shape: (51103, 15)


In [None]:
from google.colab import files
files.download("/content/processed_data/all_data.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>