In [33]:
# Cell 1: Setup and Configuration
# OpenAQ API Key
import os
from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file if present
# Read the key from the environment (populated from .env by load_dotenv above);
# it is never hard-coded in the notebook.
OPENAQ_API_KEY = os.getenv("OPENAQ_API_KEY")

# Force complete reimport of openaq module: drop every cached `openaq*` entry
# from sys.modules so the next `import openaq` loads the package afresh.
import sys
modules_to_remove = [key for key in sys.modules if key.startswith('openaq')]
for mod in modules_to_remove:
    del sys.modules[mod]

print("OpenAQ modules cleaned from cache")
# Report only whether a key is present. Printing any part of the key would
# store it in the saved notebook output, and slicing the value raises
# TypeError when the variable is unset (os.getenv returns None).
if OPENAQ_API_KEY:
    print("API Key configured: yes (value hidden)")
else:
    print("WARNING: OPENAQ_API_KEY is not set - add it to the environment or a .env file")

OpenAQ modules cleaned from cache
API Key configured: [redacted]...


In [None]:
# Cell 2: Define India States and Major Districts with Coordinates

# Query points for the collection run: 28 Indian states followed by 5 union
# territories, each mapped to exactly five places.
# Format: {state_name: [(district_name, latitude, longitude), ...]}
# Notes:
#   * Some entries are cities or localities rather than administrative
#     districts (e.g. the Chandigarh sectors).
#   * District names are NOT unique across states ("Udaipur" appears under both
#     Rajasthan and Tripura), so always pair a district with its state when
#     counting or grouping.

INDIA_STATES_DISTRICTS = {
    "Andhra Pradesh": [
        ("Visakhapatnam", 17.6868, 83.2185),
        ("Vijayawada", 16.5062, 80.6480),
        ("Guntur", 16.3067, 80.4365),
        ("Tirupati", 13.6288, 79.4192),
        ("Nellore", 14.4426, 79.9865),
    ],
    "Arunachal Pradesh": [
        ("Itanagar", 27.0844, 93.6053),
        ("Naharlagun", 27.1044, 93.6945),
        ("Pasighat", 28.0670, 95.3269),
        ("Tawang", 27.5861, 91.8594),
        ("Ziro", 27.5450, 93.8260),
    ],
    "Assam": [
        ("Guwahati", 26.1445, 91.7362),
        ("Silchar", 24.8333, 92.7789),
        ("Dibrugarh", 27.4728, 94.9120),
        ("Jorhat", 26.7509, 94.2037),
        ("Tezpur", 26.6528, 92.7926),
    ],
    "Bihar": [
        ("Patna", 25.5941, 85.1376),
        ("Gaya", 24.7914, 85.0002),
        ("Muzaffarpur", 26.1209, 85.3647),
        ("Bhagalpur", 25.2425, 86.9842),
        ("Darbhanga", 26.1542, 85.8918),
    ],
    "Chhattisgarh": [
        ("Raipur", 21.2514, 81.6296),
        ("Bhilai", 21.2094, 81.3509),
        ("Bilaspur", 22.0796, 82.1391),
        ("Korba", 22.3595, 82.7501),
        ("Durg", 21.1904, 81.2849),
    ],
    "Goa": [
        ("Panaji", 15.4909, 73.8278),
        ("Margao", 15.2832, 73.9862),
        ("Vasco da Gama", 15.3959, 73.8154),
        ("Mapusa", 15.5916, 73.8127),
        ("Ponda", 15.4034, 74.0152),
    ],
    "Gujarat": [
        ("Ahmedabad", 23.0225, 72.5714),
        ("Surat", 21.1702, 72.8311),
        ("Vadodara", 22.3072, 73.1812),
        ("Rajkot", 22.3039, 70.8022),
        ("Gandhinagar", 23.2156, 72.6369),
    ],
    "Haryana": [
        ("Gurugram", 28.4595, 77.0266),
        ("Faridabad", 28.4089, 77.3178),
        ("Panipat", 29.3909, 76.9635),
        ("Ambala", 30.3782, 76.7767),
        ("Hisar", 29.1492, 75.7217),
    ],
    "Himachal Pradesh": [
        ("Shimla", 31.1048, 77.1734),
        ("Dharamshala", 32.2190, 76.3234),
        ("Manali", 32.2396, 77.1887),
        ("Kullu", 31.9592, 77.1089),
        ("Solan", 30.9045, 77.0967),
    ],
    "Jharkhand": [
        ("Ranchi", 23.3441, 85.3096),
        ("Jamshedpur", 22.8046, 86.2029),
        ("Dhanbad", 23.7957, 86.4304),
        ("Bokaro", 23.6693, 86.1511),
        ("Hazaribagh", 23.9925, 85.3637),
    ],
    "Karnataka": [
        ("Bengaluru", 12.9716, 77.5946),
        ("Mysuru", 12.2958, 76.6394),
        ("Hubli", 15.3647, 75.1240),
        ("Mangaluru", 12.9141, 74.8560),
        ("Belgaum", 15.8497, 74.4977),
    ],
    "Kerala": [
        ("Thiruvananthapuram", 8.5241, 76.9366),
        ("Kochi", 9.9312, 76.2673),
        ("Kozhikode", 11.2588, 75.7804),
        ("Thrissur", 10.5276, 76.2144),
        ("Kannur", 11.8745, 75.3704),
    ],
    "Madhya Pradesh": [
        ("Bhopal", 23.2599, 77.4126),
        ("Indore", 22.7196, 75.8577),
        ("Jabalpur", 23.1815, 79.9864),
        ("Gwalior", 26.2183, 78.1828),
        ("Ujjain", 23.1765, 75.7885),
    ],
    "Maharashtra": [
        ("Mumbai", 19.0760, 72.8777),
        ("Pune", 18.5204, 73.8567),
        ("Nagpur", 21.1458, 79.0882),
        ("Nashik", 19.9975, 73.7898),
        ("Aurangabad", 19.8762, 75.3433),
    ],
    "Manipur": [
        ("Imphal", 24.8170, 93.9368),
        ("Thoubal", 24.6342, 94.0132),
        ("Bishnupur", 24.6270, 93.7610),
        ("Churachandpur", 24.3333, 93.6833),
        ("Ukhrul", 25.0492, 94.3616),
    ],
    "Meghalaya": [
        ("Shillong", 25.5788, 91.8933),
        ("Tura", 25.5144, 90.2003),
        ("Jowai", 25.4529, 92.2035),
        ("Nongpoh", 25.9042, 91.8806),
        ("Williamnagar", 25.4939, 90.6178),
    ],
    "Mizoram": [
        ("Aizawl", 23.7271, 92.7176),
        ("Lunglei", 22.8839, 92.7322),
        ("Champhai", 23.4567, 93.3281),
        ("Serchhip", 23.3067, 92.8506),
        ("Kolasib", 24.2239, 92.6789),
    ],
    "Nagaland": [
        ("Kohima", 25.6751, 94.1086),
        ("Dimapur", 25.9064, 93.7273),
        ("Mokokchung", 26.3167, 94.5167),
        ("Tuensang", 26.2667, 94.8333),
        ("Wokha", 26.1000, 94.2667),
    ],
    "Odisha": [
        ("Bhubaneswar", 20.2961, 85.8245),
        ("Cuttack", 20.4625, 85.8830),
        ("Rourkela", 22.2604, 84.8536),
        ("Puri", 19.8135, 85.8312),
        ("Sambalpur", 21.4669, 83.9756),
    ],
    "Punjab": [
        ("Ludhiana", 30.9010, 75.8573),
        ("Amritsar", 31.6340, 74.8723),
        ("Jalandhar", 31.3260, 75.5762),
        ("Patiala", 30.3398, 76.3869),
        ("Bathinda", 30.2110, 74.9455),
    ],
    "Rajasthan": [
        ("Jaipur", 26.9124, 75.7873),
        ("Jodhpur", 26.2389, 73.0243),
        ("Udaipur", 24.5854, 73.7125),  # name also used under "Tripura"
        ("Kota", 25.2138, 75.8648),
        ("Ajmer", 26.4499, 74.6399),
    ],
    "Sikkim": [
        ("Gangtok", 27.3389, 88.6065),
        ("Namchi", 27.1667, 88.3500),
        ("Gyalshing", 27.2833, 88.2500),
        ("Mangan", 27.5167, 88.5333),
        ("Rangpo", 27.1833, 88.5167),
    ],
    "Tamil Nadu": [
        ("Chennai", 13.0827, 80.2707),
        ("Coimbatore", 11.0168, 76.9558),
        ("Madurai", 9.9252, 78.1198),
        ("Tiruchirappalli", 10.7905, 78.7047),
        ("Salem", 11.6643, 78.1460),
    ],
    "Telangana": [
        ("Hyderabad", 17.3850, 78.4867),
        ("Warangal", 17.9784, 79.5941),
        ("Nizamabad", 18.6725, 78.0940),
        ("Karimnagar", 18.4386, 79.1288),
        ("Khammam", 17.2473, 80.1514),
    ],
    "Tripura": [
        ("Agartala", 23.8315, 91.2868),
        ("Dharmanagar", 24.3667, 92.1667),
        ("Udaipur", 23.5333, 91.4833),  # name also used under "Rajasthan"
        ("Kailashahar", 24.3333, 92.0167),
        ("Belonia", 23.2500, 91.4500),
    ],
    "Uttar Pradesh": [
        ("Lucknow", 26.8467, 80.9462),
        ("Kanpur", 26.4499, 80.3319),
        ("Varanasi", 25.3176, 82.9739),
        ("Agra", 27.1767, 78.0081),
        ("Noida", 28.5355, 77.3910),
    ],
    "Uttarakhand": [
        ("Dehradun", 30.3165, 78.0322),
        ("Haridwar", 29.9457, 78.1642),
        ("Rishikesh", 30.0869, 78.2676),
        ("Nainital", 29.3919, 79.4542),
        ("Haldwani", 29.2183, 79.5130),
    ],
    "West Bengal": [
        ("Kolkata", 22.5726, 88.3639),
        ("Howrah", 22.5958, 88.2636),
        ("Durgapur", 23.5204, 87.3119),
        ("Siliguri", 26.7271, 88.6393),
        ("Asansol", 23.6889, 86.9661),
    ],
    # Union Territories
    "Delhi": [
        ("New Delhi", 28.6139, 77.2090),
        ("Dwarka", 28.5921, 77.0460),
        ("Rohini", 28.7495, 77.0565),
        ("Shahdara", 28.6731, 77.2868),
        ("Najafgarh", 28.6092, 76.9798),
    ],
    "Chandigarh": [
        ("Chandigarh Sector 17", 30.7412, 76.7684),
        ("Chandigarh Sector 22", 30.7333, 76.7794),
        ("Chandigarh Sector 35", 30.7233, 76.7580),
        ("Chandigarh Sector 43", 30.7106, 76.7451),
        ("Manimajra", 30.7333, 76.8333),
    ],
    "Puducherry": [
        ("Puducherry", 11.9416, 79.8083),
        ("Karaikal", 10.9254, 79.8380),
        ("Mahe", 11.7036, 75.5360),
        ("Yanam", 16.7333, 82.2167),
        ("Ozhukarai", 11.9500, 79.7667),
    ],
    "Jammu and Kashmir": [
        ("Srinagar", 34.0837, 74.7973),
        ("Jammu", 32.7266, 74.8570),
        ("Anantnag", 33.7311, 75.1487),
        ("Baramulla", 34.2095, 74.3436),
        ("Udhampur", 32.9160, 75.1322),
    ],
    "Ladakh": [
        ("Leh", 34.1526, 77.5771),
        ("Kargil", 34.5539, 76.1349),
        ("Diskit", 34.5500, 77.5500),
        ("Nyoma", 33.2000, 78.6500),
        ("Zanskar", 33.7500, 76.8500),
    ],
}

# Report how many regions and how many individual query points are configured.
print(f"Configured {len(INDIA_STATES_DISTRICTS)} states/UTs")
total_districts = 0
for _places in INDIA_STATES_DISTRICTS.values():
    total_districts += len(_places)
print(f"Total districts to query: {total_districts}")

In [None]:
# Cell 3: Import Libraries and Define Helper Functions

import datetime
import logging
import os
import time

import pandas as pd
from openaq import OpenAQ

# Radius, in metres, of the circle searched around each query point (25 km).
RADIUS_METERS = 25000

# Collection window: the 30 days that end at the moment this cell runs (UTC).
date_to = datetime.datetime.now(tz=datetime.timezone.utc)
date_from = date_to - datetime.timedelta(days=30)

_window_start = date_from.strftime('%Y-%m-%d')
_window_end = date_to.strftime('%Y-%m-%d')
print(f"Date range: {_window_start} to {_window_end}")

_radius_km = RADIUS_METERS / 1000
print(f"Search radius: {_radius_km} km")

# Parameters to collect. Each output category maps to the spellings that are
# accepted for it; matching is case-insensitive (see is_wanted_parameter).
WANTED_KEYWORDS = {
    "pm25": ["pm25", "pm2.5", "pm2_5"],
    "pm10": ["pm10"],
    "no2": ["no2", "nitrogen dioxide"],
    "co": ["co", "carbon monoxide"],
    "so2": ["so2", "sulfur dioxide", "sulphur dioxide"],
    "o3": ["o3", "ozone"],
    "temperature": ["temperature", "temp", "at"],
    "humidity": ["humidity", "rh", "relativehumidity", "relative humidity"],
    "wind_speed": ["windspeed", "wind_speed", "ws", "wind speed"],
    "wind_direction": ["winddirection", "wind_direction", "wd", "wind direction"]
}

# Global set to track already-used location IDs (prevents duplicate data)
used_location_ids = set()

def is_wanted_parameter(param_name):
    """Map a raw parameter name to one of the WANTED_KEYWORDS categories.

    Matching is case-insensitive and ignores surrounding whitespace. A keyword
    matches when it equals the name, or when the name starts with the keyword
    and the next character is not a letter or digit (e.g. "co ppm", "pm25_raw").
    A bare prefix test is deliberately NOT used: it would classify "co2" as
    "co", and any name beginning with "at" as temperature.

    Args:
        param_name: Parameter name as reported for a sensor; may be empty/None.

    Returns:
        The matching category key (e.g. "pm25"), or None when the name is
        empty or matches no keyword.
    """
    if not param_name:
        return None
    param_lower = param_name.lower().strip()
    for category, keywords in WANTED_KEYWORDS.items():
        for kw in keywords:
            if param_lower == kw:
                return category
            # Here the name is strictly longer than kw whenever startswith()
            # holds, so indexing at len(kw) is always in range.
            if param_lower.startswith(kw) and not param_lower[len(kw)].isalnum():
                return category
    return None

def get_sensor_param_info(sensor):
    """Extract (name, display name, units) of the parameter a sensor measures.

    ``sensor.parameter`` may be a dict or an object with attributes; both are
    handled. Missing or None fields are returned as empty strings, so the
    result is always a tuple of three strings. For dict payloads the display
    name falls back to the parameter name when it is absent or empty.

    If no parameter name can be found, the first whitespace-separated token of
    ``sensor.name`` is used as the name and the full sensor name as the display
    name. A blank or whitespace-only sensor name leaves both empty instead of
    raising IndexError.

    Args:
        sensor: Sensor record; ``parameter`` and ``name`` attributes are optional.

    Returns:
        Tuple ``(param_name, param_display, param_units)``.
    """
    param_name = ""
    param_display = ""
    param_units = ""

    if hasattr(sensor, 'parameter'):
        param = sensor.parameter
        if isinstance(param, dict):
            # `or ""` turns explicit None values into empty strings, matching
            # what the object branch below returns for missing fields.
            param_name = param.get('name') or ""
            param_display = param.get('display_name') or param_name
            param_units = param.get('units') or ""
        else:
            param_name = getattr(param, 'name', None) or ""
            param_display = getattr(param, 'display_name', None) or ""
            param_units = getattr(param, 'units', None) or ""

    # If still empty, try the sensor name
    if not param_name:
        sensor_name = getattr(sensor, 'name', None)
        tokens = sensor_name.split() if sensor_name else []
        if tokens:  # empty for whitespace-only names
            param_name = tokens[0]
            param_display = sensor_name

    return param_name, param_display, param_units

# Logger for fetch problems. With no handlers configured, Python's last-resort
# handler still writes WARNING records to stderr, so failures stay visible.
_fetch_log = logging.getLogger("india_aq.fetch")

# Page size requested from the measurements endpoint.
_MEASUREMENTS_PAGE_LIMIT = 1000


def _period_start_strings(measurement):
    """Return (utc, local) start of the measurement period as strings, or None each."""
    period = getattr(measurement, 'period', None)
    if not period:
        return None, None
    dt_from = getattr(period, 'datetime_from', None)
    datetime_utc = str(dt_from.utc) if hasattr(dt_from, 'utc') else None
    datetime_local = str(dt_from.local) if hasattr(dt_from, 'local') else None
    return datetime_utc, datetime_local


def _iter_sensor_measurements(client, sensor_id, date_from, date_to, label):
    """Yield every measurement of one sensor in the window, one page at a time."""
    page = 1
    while True:
        try:
            results = client.measurements.list(
                sensors_id=sensor_id,
                datetime_from=date_from,
                datetime_to=date_to,
                page=page,
                limit=_MEASUREMENTS_PAGE_LIMIT,
            ).results
        except Exception as exc:
            # Best effort: keep the pages already yielded, but say that this
            # sensor's data is incomplete instead of stopping silently.
            _fetch_log.warning(
                "%s: measurements for sensor %s stopped at page %d (%s: %s)",
                label, sensor_id, page, type(exc).__name__, exc,
            )
            return

        if not results:
            return

        yield from results

        time.sleep(0.05)  # Rate limiting
        if len(results) < _MEASUREMENTS_PAGE_LIMIT:
            # A short page is the last one; skip the extra empty-page request.
            return
        page += 1


def fetch_district_data(client, state_name, district_name, latitude, longitude, date_from, date_to):
    """
    Fetch air quality data for a specific district.
    Uses only ONE unique location that hasn't been used before.

    The first location within RADIUS_METERS of the coordinates whose id is not
    yet in ``used_location_ids`` is selected and recorded there (before any of
    its data is fetched). Measurements are then collected for each of its
    sensors whose parameter is accepted by ``is_wanted_parameter``.

    Args:
        client: OpenAQ client used for the locations/sensors/measurements calls.
        state_name: State label copied into every record.
        district_name: District label copied into every record.
        latitude, longitude: Centre of the search circle.
        date_from, date_to: Measurement window passed to the API.

    Returns:
        Tuple ``(records, num_locations, location_name, status)`` where
        ``records`` is a list of dicts (one per measurement) and ``status`` is:
          * "no_stations"  - no location inside the radius
          * "all_used"     - every nearby location was taken by another district
          * "sensor_error" - the sensor list of the chosen location failed
          * "success"      - completed; ``records`` may be empty, or partial if
                             a measurements request failed (a warning is logged)
          * "error"        - any other failure (a warning is logged)

    No exception is raised for API failures; they are logged and reported
    through ``status``.
    """
    district_data = []
    label = f"{district_name}, {state_name}"

    try:
        # Fetch locations around the district
        locations = client.locations.list(
            coordinates=(latitude, longitude),
            radius=RADIUS_METERS,
            limit=100
        ).results

        if not locations:
            return district_data, 0, None, "no_stations"

        # Find the first location that hasn't been used yet
        selected_loc = next(
            (loc for loc in locations if loc.id not in used_location_ids),
            None,
        )

        # If all locations were already used, skip this district
        if selected_loc is None:
            return district_data, len(locations), None, "all_used"

        # Mark this location as used (the set is mutated in place, so no
        # `global` statement is needed).
        used_location_ids.add(selected_loc.id)

        loc_id = selected_loc.id
        loc_name = selected_loc.name
        loc_lat = selected_loc.coordinates.latitude
        loc_lon = selected_loc.coordinates.longitude

        # Get sensors for this location
        try:
            sensors = client.locations.sensors(loc_id).results
        except Exception as exc:
            _fetch_log.warning(
                "%s: could not list sensors of location %s (%s: %s)",
                label, loc_id, type(exc).__name__, exc,
            )
            return district_data, len(locations), loc_name, "sensor_error"

        # For each sensor, fetch measurements if it's a wanted parameter
        for sensor in sensors:
            sensor_id = sensor.id
            param_name, param_display, param_units = get_sensor_param_info(sensor)

            matched_category = is_wanted_parameter(param_name)
            if not matched_category:
                continue

            for m in _iter_sensor_measurements(client, sensor_id, date_from, date_to, label):
                datetime_utc, datetime_local = _period_start_strings(m)
                district_data.append({
                    "state": state_name,
                    "district": district_name,
                    "location_id": loc_id,
                    "location_name": loc_name,
                    "sensor_id": sensor_id,
                    "parameter": matched_category,
                    "parameter_original": param_name,
                    "parameter_display": param_display,
                    "value": getattr(m, 'value', None),
                    "unit": param_units,
                    "datetime_utc": datetime_utc,
                    "datetime_local": datetime_local,
                    "latitude": loc_lat,
                    "longitude": loc_lon,
                })

        return district_data, len(locations), loc_name, "success"

    except Exception as exc:
        _fetch_log.warning(
            "%s: fetch failed (%s: %s)", label, type(exc).__name__, exc,
        )
        return district_data, 0, None, "error"

print("Helper functions defined successfully!")
print("Note: Each location will only be used ONCE across all districts.")

In [None]:
# Cell 4: Main Data Collection Loop

# Reset the used locations tracker (important if re-running)
used_location_ids.clear()

# Initialize the OpenAQ client
client = OpenAQ(api_key=OPENAQ_API_KEY)

# Store all collected data
all_india_data = []

# Statistics tracking. Fetch failures get their own counter so that API or
# network problems are not reported as "no monitoring stations".
stats = {
    "states_processed": 0,
    "districts_with_data": 0,
    "districts_without_stations": 0,
    "districts_skipped_duplicate": 0,
    "districts_no_params": 0,
    "districts_with_errors": 0,
    "total_records": 0,
    "unique_locations_used": 0
}

print("="*70)
print("Starting India-wide Air Quality Data Collection")
print("(Using ONLY 1 UNIQUE location per district - no duplicates)")
print("="*70)

# try/finally guarantees the client is closed even when the run is interrupted
# (e.g. Kernel -> Interrupt) or a district raises unexpectedly.
try:
    for state_name, districts in INDIA_STATES_DISTRICTS.items():
        print(f"\n{'='*60}")
        print(f"Processing: {state_name}")
        print(f"{'='*60}")

        state_records = 0

        for district_name, lat, lon in districts:
            print(f"  üìç {district_name} ({lat:.4f}, {lon:.4f})...", end=" ")

            district_data, num_locations, location_used, status = fetch_district_data(
                client, state_name, district_name, lat, lon, date_from, date_to
            )

            if status == "success" and district_data:
                all_india_data.extend(district_data)
                stats["districts_with_data"] += 1
                stats["unique_locations_used"] += 1
                state_records += len(district_data)
                print(f"‚úì {len(district_data)} records from '{location_used}'")
            elif status == "success":
                stats["districts_no_params"] += 1
                print("‚ö†Ô∏è Station found but no matching parameters")
            elif status == "all_used":
                stats["districts_skipped_duplicate"] += 1
                print(f"‚è≠Ô∏è Skipped (all {num_locations} nearby stations already used)")
            elif status == "no_stations":
                stats["districts_without_stations"] += 1
                print("‚úó No monitoring stations found")
            else:
                # "error" / "sensor_error": the request failed, which says
                # nothing about whether stations exist here.
                stats["districts_with_errors"] += 1
                print(f"‚úó Error fetching data (status: {status})")

            # Small delay between districts
            time.sleep(0.1)

        stats["states_processed"] += 1
        stats["total_records"] += state_records
        print(f"  üìä State total: {state_records} records")
finally:
    # Close the client
    client.close()

print("\n" + "="*70)
print("Data Collection Complete!")
print("="*70)
print(f"States/UTs processed: {stats['states_processed']}")
print(f"Districts with data: {stats['districts_with_data']}")
print(f"Districts skipped (duplicate stations): {stats['districts_skipped_duplicate']}")
print(f"Districts without stations: {stats['districts_without_stations']}")
print(f"Districts with no matching params: {stats['districts_no_params']}")
print(f"Districts with fetch errors: {stats['districts_with_errors']}")
print(f"Unique monitoring locations used: {stats['unique_locations_used']}")
print(f"Total records collected: {stats['total_records']}")

In [None]:
# Cell 5: Build DataFrame and Save Data

# Create DataFrame: one row per measurement collected in the previous cell.
df = pd.DataFrame(all_india_data)

print(f"Total records in DataFrame: {len(df)}")

if len(df) > 0:
    # Display data preview
    print("\n" + "="*60)
    print("Data Preview:")
    print("="*60)
    print(df.head(10))

    # Summary statistics
    print("\n" + "="*60)
    print("Summary by State:")
    print("="*60)
    state_summary = df.groupby('state').agg({
        'district': 'nunique',
        'location_id': 'nunique',
        'value': 'count'
    }).rename(columns={
        'district': 'districts',
        'location_id': 'locations',
        'value': 'records'
    })
    print(state_summary)

    print("\n" + "="*60)
    print("Parameter Distribution:")
    print("="*60)
    print(df['parameter'].value_counts())

    print("\n" + "="*60)
    print("Data Summary by Parameter:")
    print("="*60)
    param_summary = df.groupby('parameter')['value'].agg(['count', 'mean', 'min', 'max'])
    print(param_summary)

    # Create output directory
    output_dir = "data/india_states"
    os.makedirs(output_dir, exist_ok=True)

    # Save complete dataset
    output_file = f"{output_dir}/india_all_states_aq_last30days.csv"
    df.to_csv(output_file, index=False)
    print(f"\n‚úÖ Saved complete dataset to: {output_file}")

    # Save state-wise files
    print("\nSaving state-wise files...")
    for state in df['state'].unique():
        state_df = df[df['state'] == state]
        # Build the file stem from whole words and drop only the standalone
        # word "and" ("Jammu and Kashmir" -> "jammu_kashmir"). Removing the
        # substring "and" would corrupt names such as "Andhra Pradesh"
        # ("hra_pradesh"), "Chandigarh" ("chigarh") or "Uttarakhand".
        state_filename = "_".join(
            word for word in state.lower().split() if word != "and"
        )
        state_file = f"{output_dir}/{state_filename}_aq_last30days.csv"
        state_df.to_csv(state_file, index=False)
        print(f"   ‚úÖ {state}: {len(state_df)} records -> {state_file}")

    print(f"\nüéâ All data saved to '{output_dir}' directory!")

else:
    print("\n‚ùå No data collected. This could mean:")
    print("   - No monitoring stations exist in the searched areas")
    print("   - No data available for the last 30 days")
    print("   - API rate limits or connectivity issues")

In [None]:
# Cell 6: Generate Summary Report

# Print a human-readable summary of the collected data held in `df`.
if len(df) > 0:
    print("="*70)
    print("INDIA AIR QUALITY DATA COLLECTION - FINAL REPORT")
    print("="*70)
    print(f"\nüìÖ Data Period: {date_from.strftime('%Y-%m-%d')} to {date_to.strftime('%Y-%m-%d')}")
    print(f"üìä Total Records: {len(df):,}")
    print(f"üó∫Ô∏è  States/UTs with data: {df['state'].nunique()}")
    # District names repeat across states ("Udaipur" exists in Rajasthan and
    # Tripura), so count distinct (state, district) pairs, not bare names.
    districts_with_data = len(df[['state', 'district']].drop_duplicates())
    print(f"üìç Districts with data: {districts_with_data}")
    print(f"üè≠ Monitoring Locations: {df['location_id'].nunique()}")

    print("\n" + "-"*50)
    print("Parameters Collected:")
    print("-"*50)
    for param in df['parameter'].unique():
        param_data = df[df['parameter'] == param]
        print(f"  ‚Ä¢ {param.upper()}: {len(param_data):,} measurements")

    print("\n" + "-"*50)
    print("Top 10 States by Data Volume:")
    print("-"*50)
    top_states = df.groupby('state').size().sort_values(ascending=False).head(10)
    for i, (state, count) in enumerate(top_states.items(), 1):
        print(f"  {i:2}. {state}: {count:,} records")

    print("\n" + "-"*50)
    print("Output Files:")
    print("-"*50)
    print("  üìÅ Main file: data/india_states/india_all_states_aq_last30days.csv")
    print("  üìÅ State files: data/india_states/[state_name]_aq_last30days.csv")

    print("\n" + "="*70)
    print("‚úÖ Data collection completed successfully!")
    print("="*70)
else:
    print("No data to generate report.")

# Data Transformation

Transform the collected data by pivoting parameter values into separate feature columns.
Each unique parameter (pm25, pm10, no2, co, so2, o3, temperature, humidity, wind_speed, wind_direction) becomes its own column.

In [28]:
# Cell 7: Load the Complete Combined Dataset (ALL RECORDS)

import pandas as pd
import os

# Path to the main combined data file
data_dir = "data/india_states"
main_file = f"{data_dir}/india_all_states_aq_last30days.csv"

print("="*70)
print("LOADING COMPLETE INDIA AIR QUALITY DATASET")
print("="*70)

# Load the main combined file which contains ALL records. When the file is
# missing, `combined_df` is an empty DataFrame so later cells can test its length.
if os.path.exists(main_file):
    combined_df = pd.read_csv(main_file)
    print(f"\n‚úÖ Loaded: {main_file}")
    print("\nüìä Dataset Statistics:")
    print(f"   Total records: {len(combined_df):,}")
    print(f"   States/UTs: {combined_df['state'].nunique()}")
    # District names repeat across states ("Udaipur" exists in Rajasthan and
    # Tripura), so count distinct (state, district) pairs, not bare names.
    district_count = len(combined_df[['state', 'district']].drop_duplicates())
    print(f"   Districts: {district_count}")
    print(f"   Unique Locations: {combined_df['location_id'].nunique()}")
    print(f"   Parameters: {combined_df['parameter'].unique().tolist()}")

    print("\n" + "-"*50)
    print("Records per State:")
    print("-"*50)
    state_counts = combined_df.groupby('state').size().sort_values(ascending=False)
    for state, count in state_counts.items():
        print(f"   {state}: {count:,} records")

    print(f"\n   TOTAL: {state_counts.sum():,} records")

    print("\n" + "-"*50)
    print("Sample Data (before transformation):")
    print("-"*50)
    print(combined_df.head())
else:
    print(f"‚ùå File not found: {main_file}")
    combined_df = pd.DataFrame()

LOADING COMPLETE INDIA AIR QUALITY DATASET

‚úÖ Loaded: data/india_states/india_all_states_aq_last30days.csv

üìä Dataset Statistics:
   Total records: 978,897
   States/UTs: 19
   Districts: 49
   Unique Locations: 49
   Parameters: ['pm25', 'o3', 'co', 'pm10', 'humidity', 'temperature', 'so2', 'no2', 'wind_speed', 'wind_direction']

--------------------------------------------------
Records per State:
--------------------------------------------------
   Punjab: 108,482 records
   Chhattisgarh: 97,180 records
   Madhya Pradesh: 89,237 records
   Kerala: 77,884 records
   Gujarat: 73,395 records
   Andhra Pradesh: 68,901 records
   Odisha: 68,370 records
   Rajasthan: 67,508 records
   Haryana: 60,982 records
   Maharashtra: 56,719 records
   Bihar: 42,569 records
   Meghalaya: 41,239 records
   Assam: 25,638 records
   Nagaland: 24,560 records
   Arunachal Pradesh: 20,240 records
   Karnataka: 18,152 records
   Mizoram: 13,764 records
   Sikkim: 13,065 records
   Jharkhand: 11,012 r

In [29]:
# Cell 8: Transform Data - Pivot Parameters into Feature Columns (ALL RECORDS)

# Reshape the long table (one row per measurement) into a wide table with one
# row per location/timestamp and one column per parameter.
if len(combined_df) == 0:
    print("‚ùå No data to transform!")
else:
    heavy_rule = "=" * 70
    light_rule = "-" * 50

    print(heavy_rule)
    print("TRANSFORMING DATA: PIVOTING PARAMETERS INTO FEATURE COLUMNS")
    print(heavy_rule)
    print(f"\nInput records: {len(combined_df):,}")

    # Columns that identify one output row (the grouping key of the pivot).
    index_cols = [
        'state',
        'district',
        'location_id',
        'location_name',
        'datetime_utc',
        'datetime_local',
        'latitude',
        'longitude',
    ]

    # Keep only the key columns that are actually present in the data.
    available_cols = [name for name in index_cols if name in combined_df.columns]
    print(f"Available index columns: {available_cols}")

    # Parameters become columns and values become the cells. 'first' keeps the
    # first value when several records share the same key and parameter.
    pivoted = combined_df.pivot_table(
        index=available_cols,
        columns='parameter',
        values='value',
        aggfunc='first',
    )
    transformed_df = pivoted.reset_index()

    # Drop the leftover "parameter" label on the column axis.
    transformed_df.columns.name = None

    # Every column that is not part of the key is a parameter column.
    param_columns = [name for name in transformed_df.columns if name not in available_cols]

    print(f"\n‚úÖ Transformation complete!")
    print(f"\nüìä Transformation Results:")
    print(f"   Original records: {len(combined_df):,}")
    print(f"   Transformed rows: {len(transformed_df):,}")
    print(f"   Total columns: {len(transformed_df.columns)}")
    print(f"\nüìã Index columns: {available_cols}")
    print(f"üìã Parameter feature columns: {param_columns}")

    # Compare non-null counts per parameter before and after the pivot.
    print("\n" + light_rule)
    print("Data Preservation Check:")
    print(light_rule)
    for param in param_columns:
        is_param = combined_df['parameter'] == param
        original_count = combined_df.loc[is_param, 'value'].notna().sum()
        transformed_count = transformed_df[param].notna().sum()
        print(f"   {param}: Original={original_count:,}, Transformed={transformed_count:,}")

    print("\n" + light_rule)
    print("Transformed Data Preview:")
    print(light_rule)
    print(transformed_df.head(10))

TRANSFORMING DATA: PIVOTING PARAMETERS INTO FEATURE COLUMNS

Input records: 978,897
Available index columns: ['state', 'district', 'location_id', 'location_name', 'datetime_utc', 'datetime_local', 'latitude', 'longitude']

‚úÖ Transformation complete!

üìä Transformation Results:
   Original records: 978,897
   Transformed rows: 109,501
   Total columns: 18

üìã Index columns: ['state', 'district', 'location_id', 'location_name', 'datetime_utc', 'datetime_local', 'latitude', 'longitude']
üìã Parameter feature columns: ['co', 'humidity', 'no2', 'o3', 'pm10', 'pm25', 'so2', 'temperature', 'wind_direction', 'wind_speed']

--------------------------------------------------
Data Preservation Check:
--------------------------------------------------
   co: Original=100,818, Transformed=100,818
   humidity: Original=93,526, Transformed=93,526
   no2: Original=105,891, Transformed=105,891
   o3: Original=98,644, Transformed=98,644

‚úÖ Transformation complete!

üìä Transformation Results:


In [30]:
# Cell 9: Data Quality Check and Statistics

# Report missing values, basic statistics and per-state coverage of the
# pivoted table produced by the transformation cell.
if 'transformed_df' not in dir() or len(transformed_df) == 0:
    print("No transformed data available!")
else:
    heavy_rule = "=" * 70
    light_rule = "-" * 50

    print(heavy_rule)
    print("DATA QUALITY REPORT")
    print(heavy_rule)

    # Missing values analysis
    print("\nüìä Missing Values per Parameter Column:")
    print(light_rule)
    param_columns = ['pm25', 'pm10', 'no2', 'co', 'so2', 'o3', 'temperature', 'humidity', 'wind_speed', 'wind_direction']
    existing_params = [name for name in param_columns if name in transformed_df.columns]

    total = len(transformed_df)
    for col in existing_params:
        missing = transformed_df[col].isna().sum()
        pct = (missing / total) * 100
        present = total - missing
        print(f"  {col:20s}: {present:,} values ({100-pct:.1f}% coverage), {missing:,} missing ({pct:.1f}%)")

    # Statistics for each parameter
    print("\n" + heavy_rule)
    print("PARAMETER STATISTICS:")
    print(heavy_rule)

    # existing_params is already restricted to columns present in the frame.
    stats_data = [
        {
            'Parameter': name,
            'Count': transformed_df[name].notna().sum(),
            'Mean': transformed_df[name].mean(),
            'Std': transformed_df[name].std(),
            'Min': transformed_df[name].min(),
            'Max': transformed_df[name].max(),
        }
        for name in existing_params
    ]

    stats_df = pd.DataFrame(stats_data)
    print(stats_df.to_string(index=False))

    # Coverage by state
    print("\n" + heavy_rule)
    print("DATA COVERAGE BY STATE:")
    print(heavy_rule)
    per_state = transformed_df.groupby('state').agg(
        districts=('district', 'nunique'),
        locations=('location_id', 'nunique'),
        records=('datetime_utc', 'count'),
    )
    state_coverage = per_state.sort_values('records', ascending=False)

    print(state_coverage.head(15))

DATA QUALITY REPORT

üìä Missing Values per Parameter Column:
--------------------------------------------------
  pm25                : 105,683 values (96.5% coverage), 3,818 missing (3.5%)
  pm10                : 101,126 values (92.4% coverage), 8,375 missing (7.6%)
  no2                 : 105,891 values (96.7% coverage), 3,610 missing (3.3%)
  co                  : 100,818 values (92.1% coverage), 8,683 missing (7.9%)
  so2                 : 106,879 values (97.6% coverage), 2,622 missing (2.4%)
  o3                  : 98,644 values (90.1% coverage), 10,857 missing (9.9%)
  temperature         : 85,780 values (78.3% coverage), 23,721 missing (21.7%)
  humidity            : 93,526 values (85.4% coverage), 15,975 missing (14.6%)
  wind_speed          : 89,086 values (81.4% coverage), 20,415 missing (18.6%)
  wind_direction      : 91,464 values (83.5% coverage), 18,037 missing (16.5%)

PARAMETER STATISTICS:
     Parameter  Count         Mean          Std      Min          Max
         

In [31]:
# Cell 10: Save Final Transformed Dataset (COMPLETE)

def state_file_slug(state):
    """Turn a state/UT name into a lowercase, underscore-joined file stem.

    The standalone word "and" is dropped ("Jammu and Kashmir" ->
    "jammu_kashmir"). Only whole words are removed, so names that merely
    contain the letters "and" ("Andhra Pradesh", "Chandigarh", "Uttarakhand")
    are kept intact; removing the substring would turn them into
    "hra_pradesh", "chigarh" and "uttarakh".

    Args:
        state: Display name of the state or union territory.

    Returns:
        The file-name stem, e.g. "andhra_pradesh".
    """
    return "_".join(word for word in state.lower().split() if word != "and")


# Write the pivoted table to disk: one complete CSV plus one CSV per state.
if 'transformed_df' in dir() and len(transformed_df) > 0:
    # Create output directory for processed data
    processed_dir = "data/processed"
    os.makedirs(processed_dir, exist_ok=True)

    # Define column order for the final output
    index_cols = ['state', 'district', 'location_id', 'location_name',
                  'datetime_utc', 'datetime_local', 'latitude', 'longitude']

    # Parameter columns in desired order
    param_order = ['pm25', 'pm10', 'no2', 'co', 'so2', 'o3',
                   'temperature', 'humidity', 'wind_speed', 'wind_direction']

    # Get existing parameter columns in order
    existing_params = [col for col in param_order if col in transformed_df.columns]

    # Get available index columns
    available_index = [col for col in index_cols if col in transformed_df.columns]

    # Final column order (both lists are already limited to existing columns)
    final_columns = available_index + existing_params

    # Reorder columns
    final_df = transformed_df[final_columns].copy()

    # Sort by state, district, location, and datetime
    sort_cols = [col for col in ['state', 'district', 'location_name', 'datetime_utc'] if col in final_df.columns]
    final_df = final_df.sort_values(sort_cols)

    # Save the COMPLETE transformed dataset
    output_file = f"{processed_dir}/india_aq_transformed_last30days.csv"
    final_df.to_csv(output_file, index=False)

    print("="*70)
    print("FINAL TRANSFORMED DATASET SAVED SUCCESSFULLY!")
    print("="*70)
    print(f"\nüìÅ Output file: {output_file}")
    print(f"üìä Total rows: {len(final_df):,}")
    print(f"üìã Total columns: {len(final_df.columns)}")

    print("\n" + "-"*50)
    print("Final Column Structure:")
    print("-"*50)
    for i, col in enumerate(final_df.columns, 1):
        dtype = final_df[col].dtype
        non_null = final_df[col].notna().sum()
        print(f"  {i:2}. {col:20s} ({dtype}) - {non_null:,} values")

    print("\n" + "-"*50)
    print("Records per State in Final Dataset:")
    print("-"*50)
    state_counts = final_df.groupby('state').size().sort_values(ascending=False)
    total = 0
    for state, count in state_counts.items():
        print(f"   {state}: {count:,} rows")
        total += count
    print(f"\n   TOTAL ROWS: {total:,}")

    print("\n" + "-"*50)
    print("Final Data Preview:")
    print("-"*50)
    print(final_df.head(10))

    # Also save state-wise transformed files
    print("\n" + "-"*50)
    print("Saving state-wise transformed files...")
    print("-"*50)

    for state in final_df['state'].unique():
        state_df = final_df[final_df['state'] == state]
        state_filename = state_file_slug(state)
        state_file = f"{processed_dir}/{state_filename}_aq_transformed.csv"
        state_df.to_csv(state_file, index=False)
        print(f"  ‚úÖ {state}: {len(state_df):,} rows -> {state_file}")

    print(f"\nüéâ All transformed data saved to '{processed_dir}' directory!")
    print(f"üéâ TOTAL RECORDS PRESERVED: {len(final_df):,}")
else:
    print("‚ùå No transformed data to save!")

FINAL TRANSFORMED DATASET SAVED SUCCESSFULLY!

üìÅ Output file: data/processed/india_aq_transformed_last30days.csv
üìä Total rows: 109,501
üìã Total columns: 18

--------------------------------------------------
Final Column Structure:
--------------------------------------------------
   1. state                (object) - 109,501 values
   2. district             (object) - 109,501 values
   3. location_id          (int64) - 109,501 values
   4. location_name        (object) - 109,501 values
   5. datetime_utc         (object) - 109,501 values
   6. datetime_local       (object) - 109,501 values
   7. latitude             (float64) - 109,501 values
   8. longitude            (float64) - 109,501 values
   9. pm25                 (float64) - 105,683 values
  10. pm10                 (float64) - 101,126 values
  11. no2                  (float64) - 105,891 values
  12. co                   (float64) - 100,818 values
  13. so2                  (float64) - 106,879 values
  14. o3      