In [1]:
!pip install osmnx geopandas pandas openpyxl shapely

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openp



In [None]:
import pandas as pd
import numpy as np
import osmnx as ox
import geopandas as gpd
from shapely.geometry import Point, box
import warnings
import time
from typing import Dict, List, Tuple
import sys

warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

BATCH_SIZE = 5
SEARCH_RADIUS = 1000  # meters = 1KM
OUTPUT_FILE = "Air_Quality_weather_with_OSM_Features.xlsx"

INPUT_FILE = r"data/processed/india_aq_transformed_last30days.csv"

# ============================================================================
# FEATURE EXTRACTION FUNCTIONS
# ============================================================================

def extract_roads(lat: float, lon: float, radius: int) -> Dict:
    """Extract road features from OSM"""
    try:
        roads = ox.features_from_point(
            (lat, lon),
            tags={'highway': True},
            dist=radius
        )
        
        if len(roads) > 0:
            road_names = [roads.index.get_level_values(0).unique()[i] 
                         for i in range(min(5, len(roads.index.get_level_values(0).unique())))]
            return {
                'count': len(roads),
                'examples': ', '.join(str(r)[:40] for r in road_names),
                'status': 'success'
            }
        return {'count': 0, 'examples': '', 'status': 'success'}
    except Exception as e:
        return {'count': 0, 'examples': f"Error: {str(e)[:30]}", 'status': 'failed'}

def extract_industrial_zones(lat: float, lon: float, radius: int) -> Dict:
    """Extract industrial facilities from OSM"""
    try:
        tags = {'industrial': True, 'landuse': 'industrial'}
        industrial = ox.features_from_point((lat, lon), tags=tags, dist=radius)
        
        if len(industrial) > 0:
            ind_names = industrial.index.get_level_values(0).unique()[:5]
            return {
                'count': len(industrial),
                'examples': ', '.join(str(i)[:35] for i in ind_names),
                'status': 'success'
            }
        return {'count': 0, 'examples': '', 'status': 'success'}
    except Exception as e:
        return {'count': 0, 'examples': f"Error: {str(e)[:30]}", 'status': 'failed'}

def extract_dump_sites(lat: float, lon: float, radius: int) -> Dict:
    """Extract waste management and dump sites from OSM"""
    try:
        tags = {'waste': True}
        dumps = ox.features_from_point((lat, lon), tags=tags, dist=radius)
        
        if len(dumps) > 0:
            dump_names = dumps.index.get_level_values(0).unique()[:5]
            return {
                'count': len(dumps),
                'examples': ', '.join(str(d)[:35] for d in dump_names),
                'status': 'success'
            }
        return {'count': 0, 'examples': '', 'status': 'success'}
    except Exception as e:
        return {'count': 0, 'examples': f"Error: {str(e)[:30]}", 'status': 'failed'}

def extract_agricultural_fields(lat: float, lon: float, radius: int) -> Dict:
    """Extract agricultural areas from OSM"""
    try:
        tags = {'landuse': ['farmland', 'farm', 'agricultural', 'grass', 'meadow']}
        agriculture = ox.features_from_point((lat, lon), tags=tags, dist=radius)
        
        if len(agriculture) > 0:
            agr_names = agriculture.index.get_level_values(0).unique()[:5]
            return {
                'count': len(agriculture),
                'examples': ', '.join(str(a)[:35] for a in agr_names),
                'status': 'success'
            }
        return {'count': 0, 'examples': '', 'status': 'success'}
    except Exception as e:
        return {'count': 0, 'examples': f"Error: {str(e)[:30]}", 'status': 'failed'}

def extract_all_features(lat: float, lon: float, radius: int) -> Dict:
    """Extract all features for a location"""
    features = {
        'roads': extract_roads(lat, lon, radius),
        'industrial': extract_industrial_zones(lat, lon, radius),
        'dumps': extract_dump_sites(lat, lon, radius),
        'agriculture': extract_agricultural_fields(lat, lon, radius)
    }
    return features

# ============================================================================
# MAIN PROCESSING
# ============================================================================

def main():
    print(f"\n{'='*80}")
    print("OSM FEATURE EXTRACTION FOR AIR QUALITY ANALYSIS")
    print(f"{'='*80}\n")
    
    # Load dataset
    print(f"Loading dataset from {INPUT_FILE}...")
    df = pd.read_excel(INPUT_FILE, sheet_name=0)
    print(f"âœ“ Loaded {len(df)} records\n")
    
    # Get unique locations
    unique_locations = df[['latitude', 'longitude', 'location_name', 'state',
                           'district']].drop_duplicates(
                           subset=['latitude', 'longitude']).reset_index(drop=True)
    
    print(f"Unique locations to process: {len(unique_locations)}")
    print(f"Processing parameters:")
    print(f"  - Batch size: {BATCH_SIZE}")
    print(f"  - Search radius: {SEARCH_RADIUS}m")
    print(f"\n{'='*80}\n")
    
    # Initialize results
    features_data = []
    total_batches = (len(unique_locations) - 1) // BATCH_SIZE + 1
    
    # Process in batches
    for batch_num in range(0, len(unique_locations), BATCH_SIZE):
        batch = unique_locations.iloc[batch_num:batch_num + BATCH_SIZE]
        batch_idx = (batch_num // BATCH_SIZE) + 1
        
        print(f"Batch {batch_idx}/{total_batches}:")
        print(f"Processing locations {batch_num + 1} to {min(batch_num + BATCH_SIZE, len(unique_locations))}\n")
        
        for idx, row in batch.iterrows():
            lat = row['latitude']
            lon = row['longitude']
            location_name = row['location_name']
            city = row['district']
            state = row['state']
            
            try:
                # Extract features
                print(f"  Querying {city}, {state} ({lat:.4f}, {lon:.4f})...", end='', flush=True)
                features = extract_all_features(lat, lon, SEARCH_RADIUS)
                
                # Compile record
                record = {
                    'Location': location_name,
                    'City': city,
                    'State': state,
                    'Latitude': lat,
                    'Longitude': lon,
                    'Roads_count': features['roads']['count'],
                    'Roads_examples': features['roads']['examples'],
                    'Industrial_zones_count': features['industrial']['count'],
                    'Industrial_examples': features['industrial']['examples'],
                    'Dump_sites_count': features['dumps']['count'],
                    'Dump_examples': features['dumps']['examples'],
                    'Agricultural_fields_count': features['agriculture']['count'],
                    'Agricultural_examples': features['agriculture']['examples'],
                    'Query_status': 'Success'
                }
                
                features_data.append(record)
                
                print(f" âœ“ Roads:{features['roads']['count']} | "
                      f"Industrial:{features['industrial']['count']} | "
                      f"Dumps:{features['dumps']['count']} | "
                      f"Agriculture:{features['agriculture']['count']}")
                
            except Exception as e:
                print(f" âœ— Error: {str(e)[:50]}")
                record = {
                    'Location': location_name,
                    'City': city,
                    'State': state,
                    'Latitude': lat,
                    'Longitude': lon,
                    'Roads_count': 0,
                    'Roads_examples': '',
                    'Industrial_zones_count': 0,
                    'Industrial_examples': '',
                    'Dump_sites_count': 0,
                    'Dump_examples': '',
                    'Agricultural_fields_count': 0,
                    'Agricultural_examples': '',
                    'Query_status': f'Failed: {str(e)[:40]}'
                }
                features_data.append(record)
            
            # Small delay between queries to avoid rate limiting
            time.sleep(0.5)
        
        print(f"\nBatch completed. Waiting before next batch...\n")
        time.sleep(2)
    
    # Create features dataframe
    features_df = pd.DataFrame(features_data)
    
    # Print summary statistics
    print(f"\n{'='*80}")
    print("EXTRACTION SUMMARY")
    print(f"{'='*80}\n")
    
    print(f"Total locations processed: {len(features_df)}")
    successful = (features_df['Query_status'] == 'Success').sum()
    print(f"Successful queries: {successful}/{len(features_df)}")
    
    print(f"\n{'Feature Statistics':^80}")
    print(f"{'-'*80}")
    print(f"{'Feature':<20} {'Total':<12} {'Mean':<12} {'Max':<12}")
    print(f"{'-'*80}")
    print(f"{'Roads':<20} {features_df['Roads_count'].sum():<12.0f} "
          f"{features_df['Roads_count'].mean():<12.1f} {features_df['Roads_count'].max():<12.0f}")
    print(f"{'Industrial':<20} {features_df['Industrial_zones_count'].sum():<12.0f} "
          f"{features_df['Industrial_zones_count'].mean():<12.1f} "
          f"{features_df['Industrial_zones_count'].max():<12.0f}")
    print(f"{'Dump Sites':<20} {features_df['Dump_sites_count'].sum():<12.0f} "
          f"{features_df['Dump_sites_count'].mean():<12.1f} {features_df['Dump_sites_count'].max():<12.0f}")
    print(f"{'Agriculture':<20} {features_df['Agricultural_fields_count'].sum():<12.0f} "
          f"{features_df['Agricultural_fields_count'].mean():<12.1f} "
          f"{features_df['Agricultural_fields_count'].max():<12.0f}")
    print(f"{'-'*80}\n")
    
    # Merge with original data
    print("Merging extracted features with original air quality data...")
    
    # Merge on latitude and longitude
    merged_df = df.merge(
        features_df[['Latitude', 'Longitude', 'Roads_count', 'Industrial_zones_count',
                     'Dump_sites_count', 'Agricultural_fields_count', 'Query_status']],
        on=['Latitude', 'Longitude'],
        how='left'
    )
    
    # Fill NaN values for unmatched records (should be minimal)
    merged_df['Roads_count'].fillna(0, inplace=True)
    merged_df['Industrial_zones_count'].fillna(0, inplace=True)
    merged_df['Dump_sites_count'].fillna(0, inplace=True)
    merged_df['Agricultural_fields_count'].fillna(0, inplace=True)
    merged_df['Query_status'].fillna('Not processed', inplace=True)
    
    # Create additional derived features
    merged_df['Urban_density_score'] = (merged_df['Roads_count'] / merged_df['Roads_count'].max()).round(2)
    merged_df['Industrial_presence'] = (merged_df['Industrial_zones_count'] > 0).astype(int)
    merged_df['Pollution_source_risk'] = (
        (merged_df['Industrial_zones_count'] * 0.4 + 
         merged_df['Dump_sites_count'] * 0.3 + 
         merged_df['Roads_count'] * 0.3) / 100
    ).round(2)
    merged_df['Green_area_ratio'] = (
        merged_df['Agricultural_fields_count'] / 
        (merged_df['Roads_count'] + merged_df['Agricultural_fields_count'])
    ).round(2)
    
    # Save to Excel
    print(f"\nSaving merged data to {OUTPUT_FILE}...")
    with pd.ExcelWriter(OUTPUT_FILE, engine='openpyxl') as writer:
        merged_df.to_excel(writer, sheet_name='All Data', index=False)
        features_df.to_excel(writer, sheet_name='OSM Features', index=False)
        
        # Summary statistics sheet
        summary_stats = pd.DataFrame({
            'Metric': ['Total Records', 'Unique Locations', 'Successful Queries',
                      'Mean Roads', 'Mean Industrial', 'Mean Dumps', 'Mean Agriculture',
                      'Max Roads', 'Max Industrial', 'Max Dumps', 'Max Agriculture'],
            'Value': [len(merged_df), len(features_df), successful,
                     f"{features_df['Roads_count'].mean():.1f}",
                     f"{features_df['Industrial_zones_count'].mean():.1f}",
                     f"{features_df['Dump_sites_count'].mean():.1f}",
                     f"{features_df['Agricultural_fields_count'].mean():.1f}",
                     features_df['Roads_count'].max(),
                     features_df['Industrial_zones_count'].max(),
                     features_df['Dump_sites_count'].max(),
                     features_df['Agricultural_fields_count'].max()]
        })
        summary_stats.to_excel(writer, sheet_name='Summary', index=False)
    
    print(f"âœ“ Successfully saved to {OUTPUT_FILE}\n")
    
    # Display sample results
    print(f"{'Sample Results (First 15 locations):':^80}")
    print(merged_df[['City / town / village', 'State / Union Territory', 'Roads_count',
                     'Industrial_zones_count', 'Dump_sites_count', 'Agricultural_fields_count',
                     'Pollution_source_risk', 'Green_area_ratio']].head(15))
    
    print(f"\n{'='*80}")
    print("âœ“ Processing Complete!")
    print(f"{'='*80}\n")
    
    return merged_df, features_df

if __name__ == "__main__":
    merged_df, features_df = main()

In [3]:
import pandas as pd
import numpy as np
import osmnx as ox
import warnings
import time

warnings.filterwarnings("ignore")

# ======================================================================
# CONFIG
# ======================================================================
BATCH_SIZE = 5
SEARCH_RADIUS = 1000  # meters
OUTPUT_FILE = "Air_Quality_weather_with_OSM_Features.csv"
INPUT_FILE = r"data/processed/india_aq_transformed_last30days.csv"

# ======================================================================
# CORRECT OSM TAG SETS
# ======================================================================

ROAD_TAGS = {"highway": True}

INDUSTRIAL_TAGS = {
    "landuse": ["industrial", "commercial"],
    "building": ["industrial", "commercial"]
}

DUMP_TAGS = {
    "landuse": ["landfill", "waste"],
    "amenity": ["waste_disposal", "recycling"]
}

AGRICULTURE_TAGS = {
    "landuse": ["farmland", "meadow", "orchard", "vineyard", "farm"]
}

# ======================================================================
# FEATURE EXTRACTION
# ======================================================================

def extract_feature(lat, lon, radius, tags):
    """Extract OSM features for a given coordinate"""
    try:
        gdf = ox.features_from_point((lat, lon), tags=tags, dist=radius)

        if gdf is None or len(gdf) == 0:
            return 0

        return len(gdf)

    except Exception:
        return 0


# ======================================================================
# MAIN FUNCTION
# ======================================================================

def main():

    print("\nLoading CSV dataset...")
    df = pd.read_csv(INPUT_FILE)
    print(f"âœ“ Loaded {len(df)} rows")

    required_cols = ["latitude", "longitude", "location_name", "state", "district"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Dataset missing required column â†’ {col}")

    # Extract unique locations
    unique_locations = df[['latitude', 'longitude', 'location_name', 'state', 'district']].drop_duplicates()
    print(f"Unique locations to process = {len(unique_locations)}")

    features_list = []
    total_batches = (len(unique_locations) - 1) // BATCH_SIZE + 1

    for b in range(0, len(unique_locations), BATCH_SIZE):

        batch = unique_locations.iloc[b : b + BATCH_SIZE]
        batch_num = b // BATCH_SIZE + 1

        print(f"\nðŸ“¦ Processing batch {batch_num}/{total_batches}")

        for _, row in batch.iterrows():

            lat = row["latitude"]
            lon = row["longitude"]
            name = row["location_name"]

            print(f" â†’ Extracting for {name} ({lat:.4f}, {lon:.4f})...", end=" ")

            # Extract all features
            roads = extract_feature(lat, lon, SEARCH_RADIUS, ROAD_TAGS)
            industrial = extract_feature(lat, lon, SEARCH_RADIUS, INDUSTRIAL_TAGS)
            dumps = extract_feature(lat, lon, SEARCH_RADIUS, DUMP_TAGS)
            agriculture = extract_feature(lat, lon, SEARCH_RADIUS, AGRICULTURE_TAGS)

            features_list.append({
                "latitude": lat,
                "longitude": lon,
                "location_name": row["location_name"],
                "state": row["state"],
                "district": row["district"],
                "Roads_count": roads,
                "Industrial_zones_count": industrial,
                "Dump_sites_count": dumps,
                "Agricultural_fields_count": agriculture
            })

            print("âœ“ Done")

    # Convert to DataFrame
    features_df = pd.DataFrame(features_list)

    print("\nMerging with original dataset...")
    merged_df = df.merge(features_df, on=["latitude", "longitude"], how="left")

    # Save to CSV
    print(f"Saving results â†’ {OUTPUT_FILE}")
    merged_df.to_csv(OUTPUT_FILE, index=False)

    print("\nâœ“ Completed successfully!\n")
    return merged_df, features_df


# RUN
if __name__ == "__main__":
    merged_df, features_df = main()



Loading CSV dataset...
âœ“ Loaded 77994 rows
Unique locations to process = 36

ðŸ“¦ Processing batch 1/8
 â†’ Extracting for Tirumala, Tirupati - APPCB (13.6700, 79.3500)... âœ“ Done
 â†’ Extracting for Secretariat, Amaravati - APPCB (16.5151, 80.5182)... âœ“ Done
 â†’ Extracting for GVM Corporation, Visakhapatnam - APPCB (17.7227, 83.3082)... âœ“ Done
 â†’ Extracting for Naharlagun, Naharlagun - APSPCB (27.1034, 93.6796)... âœ“ Done
 â†’ Extracting for Tarapur, Silchar - PCBA (24.8283, 92.7952)... âœ“ Done

ðŸ“¦ Processing batch 2/8
 â†’ Extracting for Mayaganj, Bhagalpur - BSPCB (25.2652, 87.0129)... âœ“ Done
 â†’ Extracting for IGSC Planetarium Complex, Patna - BSPCB (25.5941, 85.1376)... âœ“ Done
 â†’ Extracting for Civic Center, Bhilai - Bhilai Steel Plant (21.1856, 81.3432)... âœ“ Done
 â†’ Extracting for Mangala, Bilaspur - CECB (22.0881, 82.1374)... âœ“ Done
 â†’ Extracting for Hathkhoj, Bhilai - CECB (21.2242, 81.4083)... âœ“ Done

ðŸ“¦ Processing batch 3/8
 â†’ Extracting fo

In [4]:
df=pd.read_csv("Air_Quality_weather_with_OSM_Features.csv")
df.head()

Unnamed: 0,state_x,district_x,location_id,location_name_x,datetime_utc,datetime_local,latitude,longitude,pm25,pm10,...,humidity,wind_speed,wind_direction,location_name_y,state_y,district_y,Roads_count,Industrial_zones_count,Dump_sites_count,Agricultural_fields_count
0,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T15:00:00Z,2025-11-11T20:30:00+05:30,13.67,79.35,81.0,115.0,...,87.0,0.5,355.0,"Tirumala, Tirupati - APPCB",Andhra Pradesh,Tirupati,155,4,0,0
1,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T15:15:00Z,2025-11-11T20:45:00+05:30,13.67,79.35,81.0,115.0,...,87.0,0.4,355.0,"Tirumala, Tirupati - APPCB",Andhra Pradesh,Tirupati,155,4,0,0
2,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T15:45:00Z,2025-11-11T21:15:00+05:30,13.67,79.35,81.0,115.0,...,87.0,0.3,355.0,"Tirumala, Tirupati - APPCB",Andhra Pradesh,Tirupati,155,4,0,0
3,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T16:15:00Z,2025-11-11T21:45:00+05:30,13.67,79.35,90.0,114.0,...,84.0,0.3,355.0,"Tirumala, Tirupati - APPCB",Andhra Pradesh,Tirupati,155,4,0,0
4,Andhra Pradesh,Tirupati,5649,"Tirumala, Tirupati - APPCB",2025-11-11T16:30:00Z,2025-11-11T22:00:00+05:30,13.67,79.35,90.0,114.0,...,85.0,0.4,355.0,"Tirumala, Tirupati - APPCB",Andhra Pradesh,Tirupati,155,4,0,0


In [5]:
df.tail()

Unnamed: 0,state_x,district_x,location_id,location_name_x,datetime_utc,datetime_local,latitude,longitude,pm25,pm10,...,humidity,wind_speed,wind_direction,location_name_y,state_y,district_y,Roads_count,Industrial_zones_count,Dump_sites_count,Agricultural_fields_count
77989,Tamil Nadu,Tiruchirappalli,3409356,"Bharathidasan University, Palkalaiperur - TNPCB",2025-12-05T05:15:00Z,2025-12-05T10:45:00+05:30,10.681158,78.741746,,,...,,,,"Bharathidasan University, Palkalaiperur - TNPCB",Tamil Nadu,Tiruchirappalli,141,0,0,0
77990,Tamil Nadu,Tiruchirappalli,3409356,"Bharathidasan University, Palkalaiperur - TNPCB",2025-12-05T06:15:00Z,2025-12-05T11:45:00+05:30,10.681158,78.741746,13.33,28.33,...,,,,"Bharathidasan University, Palkalaiperur - TNPCB",Tamil Nadu,Tiruchirappalli,141,0,0,0
77991,Tamil Nadu,Tiruchirappalli,3409356,"Bharathidasan University, Palkalaiperur - TNPCB",2025-12-05T07:30:00Z,2025-12-05T13:00:00+05:30,10.681158,78.741746,,,...,,,,"Bharathidasan University, Palkalaiperur - TNPCB",Tamil Nadu,Tiruchirappalli,141,0,0,0
77992,Tamil Nadu,Tiruchirappalli,3409356,"Bharathidasan University, Palkalaiperur - TNPCB",2025-12-11T10:15:00Z,2025-12-11T15:45:00+05:30,10.681158,78.741746,43.0,89.0,...,,,,"Bharathidasan University, Palkalaiperur - TNPCB",Tamil Nadu,Tiruchirappalli,141,0,0,0
77993,Tamil Nadu,Tiruchirappalli,3409356,"Bharathidasan University, Palkalaiperur - TNPCB",2025-12-11T12:15:00Z,2025-12-11T17:45:00+05:30,10.681158,78.741746,44.0,91.0,...,,,,"Bharathidasan University, Palkalaiperur - TNPCB",Tamil Nadu,Tiruchirappalli,141,0,0,0
