In [1]:
# Cell 1: Install Required Libraries

# Install osmnx if not available
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])

# Install required packages
packages = ['osmnx', 'geopandas', 'shapely']
for pkg in packages:
    try:
        __import__(pkg)
        print(f"‚úì {pkg} already installed")
    except ImportError:
        print(f"Installing {pkg}...")
        install_package(pkg)
        print(f"‚úì {pkg} installed")

print("\n‚úÖ All packages ready!")

‚úì osmnx already installed
‚úì geopandas already installed
‚úì shapely already installed

‚úÖ All packages ready!


In [2]:
# Cell 2: Import Libraries and Load Location Data

import osmnx as ox
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
from shapely.ops import nearest_points
import warnings
import time
import os

warnings.filterwarnings('ignore')

# Configure OSMnx
ox.settings.log_console = False
ox.settings.use_cache = True
ox.settings.timeout = 180

print("Libraries imported successfully!")
print(f"OSMnx version: {ox.__version__}")

# Load the air quality dataset to get unique locations
aq_file = "data/processed/india_aq_transformed_last30days.csv"

if os.path.exists(aq_file):
    aq_df = pd.read_csv(aq_file)
    
    # Get unique locations
    locations_df = aq_df[['state', 'district', 'location_id', 'location_name', 'latitude', 'longitude']].drop_duplicates()
    locations_df = locations_df.reset_index(drop=True)
    
    print(f"\nüìä Air Quality Dataset loaded: {len(aq_df):,} records")
    print(f"üìç Unique monitoring locations: {len(locations_df)}")
    print(f"\n" + "="*60)
    print("Locations to extract physical features for:")
    print("="*60)
    print(locations_df[['state', 'district', 'location_name', 'latitude', 'longitude']].to_string())
else:
    print(f"‚ùå File not found: {aq_file}")
    locations_df = pd.DataFrame()

Libraries imported successfully!
OSMnx version: 2.0.7

üìä Air Quality Dataset loaded: 109,501 records
üìç Unique monitoring locations: 49

Locations to extract physical features for:
                state            district                                    location_name   latitude  longitude
0      Andhra Pradesh            Tirupati                       Tirumala, Tirupati - APPCB  13.670000  79.350000
1      Andhra Pradesh          Vijayawada                   Secretariat, Amaravati - APPCB  16.515083  80.518167
2      Andhra Pradesh       Visakhapatnam           GVM Corporation, Visakhapatnam - APPCB  17.722682  83.308197
3   Arunachal Pradesh            Itanagar                  Naharlagun, Naharlagun - APSPCB  27.103358  93.679645
4               Assam             Silchar                          Tarapur, Silchar - PCBA  24.828270  92.795250
5               Bihar           Bhagalpur                      Mayaganj, Bhagalpur - BSPCB  25.265194  87.012947
6               Bihar  

In [3]:
# Cell 3: Define Physical Feature Extraction Functions

# Search radius in meters for feature extraction
SEARCH_RADIUS = 2000  # 2 km radius

# Define OSM tags for each feature category based on project requirements
OSM_TAGS = {
    # Roads - for vehicular pollution source
    'roads': {
        'highway': ['motorway', 'trunk', 'primary', 'secondary', 'tertiary', 
                    'motorway_link', 'trunk_link', 'primary_link', 'secondary_link']
    },
    
    # Industrial zones - for industrial pollution source
    'industrial': {
        'landuse': ['industrial'],
        'building': ['industrial', 'factory', 'warehouse'],
        'man_made': ['works', 'chimney']
    },
    
    # Dump sites/Landfills - for waste-related pollution
    'dump_sites': {
        'landuse': ['landfill'],
        'amenity': ['waste_disposal', 'waste_transfer_station', 'recycling'],
        'man_made': ['wastewater_plant']
    },
    
    # Agricultural fields - for agricultural burning pollution
    'agricultural': {
        'landuse': ['farmland', 'farm', 'farmyard', 'orchard', 'vineyard', 
                    'plant_nursery', 'greenhouse_horticulture']
    },
    
    # Additional: Power plants (can contribute to pollution)
    'power_plants': {
        'power': ['plant', 'generator'],
        'man_made': ['chimney']
    },
    
    # Additional: Residential areas (for context)
    'residential': {
        'landuse': ['residential']
    },
    
    # Additional: Commercial areas
    'commercial': {
        'landuse': ['commercial', 'retail']
    }
}

def get_features_from_osm(lat, lon, tags, radius=SEARCH_RADIUS):
    """
    Extract features from OpenStreetMap for a given location.
    Returns GeoDataFrame of features or empty GeoDataFrame if none found.
    """
    try:
        # Create a point and buffer for the search area
        gdf = ox.features_from_point((lat, lon), tags=tags, dist=radius)
        return gdf
    except Exception as e:
        # Return empty GeoDataFrame if no features found
        return gpd.GeoDataFrame()

def calculate_min_distance(point, features_gdf):
    """
    Calculate minimum distance from a point to nearest feature.
    Returns distance in meters.
    """
    if features_gdf.empty:
        return np.nan
    
    try:
        # Project to UTM for accurate distance calculation
        features_gdf_proj = features_gdf.to_crs(epsg=32643)  # UTM zone 43N for India
        point_proj = gpd.GeoSeries([point], crs='EPSG:4326').to_crs(epsg=32643).iloc[0]
        
        # Calculate distances
        distances = features_gdf_proj.geometry.distance(point_proj)
        return distances.min()
    except Exception as e:
        return np.nan

def count_features_in_radius(features_gdf, radius=SEARCH_RADIUS):
    """
    Count number of features within the search radius.
    """
    if features_gdf.empty:
        return 0
    return len(features_gdf)

def calculate_total_road_length(roads_gdf):
    """
    Calculate total length of roads in meters.
    """
    if roads_gdf.empty:
        return 0
    
    try:
        # Project to UTM for accurate length calculation
        roads_proj = roads_gdf.to_crs(epsg=32643)
        total_length = roads_proj.geometry.length.sum()
        return total_length
    except:
        return 0

def calculate_feature_area(features_gdf):
    """
    Calculate total area of polygon features in square meters.
    """
    if features_gdf.empty:
        return 0
    
    try:
        # Project to UTM for accurate area calculation
        features_proj = features_gdf.to_crs(epsg=32643)
        # Filter only polygons
        polygons = features_proj[features_proj.geometry.type.isin(['Polygon', 'MultiPolygon'])]
        if polygons.empty:
            return 0
        return polygons.geometry.area.sum()
    except:
        return 0

print("‚úÖ Feature extraction functions defined!")
print(f"\nSearch radius: {SEARCH_RADIUS} meters ({SEARCH_RADIUS/1000} km)")
print(f"\nFeature categories to extract:")
for category in OSM_TAGS.keys():
    print(f"  ‚Ä¢ {category}")

‚úÖ Feature extraction functions defined!

Search radius: 2000 meters (2.0 km)

Feature categories to extract:
  ‚Ä¢ roads
  ‚Ä¢ industrial
  ‚Ä¢ dump_sites
  ‚Ä¢ agricultural
  ‚Ä¢ power_plants
  ‚Ä¢ residential
  ‚Ä¢ commercial


In [4]:
# Cell 4: Test Extraction for a Single Location

if len(locations_df) > 0:
    # Select first location for testing
    test_loc = locations_df.iloc[0]
    test_lat = test_loc['latitude']
    test_lon = test_loc['longitude']
    
    print("="*70)
    print("TESTING PHYSICAL FEATURE EXTRACTION FOR SINGLE LOCATION")
    print("="*70)
    print(f"\nüìç Location: {test_loc['location_name']}")
    print(f"   State: {test_loc['state']}")
    print(f"   District: {test_loc['district']}")
    print(f"   Coordinates: ({test_lat}, {test_lon})")
    print(f"   Search radius: {SEARCH_RADIUS}m")
    
    # Create point geometry
    test_point = Point(test_lon, test_lat)
    
    # Extract each feature type
    test_results = {}
    
    print("\n" + "-"*50)
    print("Extracting features...")
    print("-"*50)
    
    for category, tags in OSM_TAGS.items():
        print(f"  Extracting {category}...", end=" ")
        
        features = get_features_from_osm(test_lat, test_lon, tags, SEARCH_RADIUS)
        count = count_features_in_radius(features)
        min_dist = calculate_min_distance(test_point, features)
        
        if category == 'roads':
            total_length = calculate_total_road_length(features)
            test_results[f'{category}_count'] = count
            test_results[f'{category}_min_distance_m'] = min_dist
            test_results[f'{category}_total_length_m'] = total_length
            print(f"‚úì {count} features, min_dist={min_dist:.0f}m, length={total_length:.0f}m" if not np.isnan(min_dist) else f"‚úì {count} features")
        else:
            total_area = calculate_feature_area(features)
            test_results[f'{category}_count'] = count
            test_results[f'{category}_min_distance_m'] = min_dist
            test_results[f'{category}_area_sqm'] = total_area
            print(f"‚úì {count} features, min_dist={min_dist:.0f}m, area={total_area:.0f}sqm" if not np.isnan(min_dist) else f"‚úì {count} features")
        
        time.sleep(0.5)  # Rate limiting
    
    print("\n" + "-"*50)
    print("Test Results Summary:")
    print("-"*50)
    for key, value in test_results.items():
        if isinstance(value, float) and not np.isnan(value):
            print(f"  {key}: {value:.2f}")
        else:
            print(f"  {key}: {value}")
    
    print("\n‚úÖ Single location test completed successfully!")
else:
    print("‚ùå No locations available for testing")

TESTING PHYSICAL FEATURE EXTRACTION FOR SINGLE LOCATION

üìç Location: Tirumala, Tirupati - APPCB
   State: Andhra Pradesh
   District: Tirupati
   Coordinates: (13.67, 79.35)
   Search radius: 2000m

--------------------------------------------------
Extracting features...
--------------------------------------------------
  Extracting roads... ‚úì 67 features, min_dist=64m, length=52878m
  Extracting industrial... ‚úì 2 features, min_dist=397m, area=7470sqm
  Extracting dump_sites... ‚úì 3 features, min_dist=976m, area=47901sqm
  Extracting agricultural... ‚úì 0 features
  Extracting power_plants... ‚úì 2 features, min_dist=353m, area=0sqm
  Extracting residential... ‚úì 15 features, min_dist=214m, area=218283sqm
  Extracting commercial... ‚úì 10 features, min_dist=446m, area=1533015sqm

--------------------------------------------------
Test Results Summary:
--------------------------------------------------
  roads_count: 67
  roads_min_distance_m: 63.78
  roads_total_length_m: 52

In [5]:
# Cell 5: Define Main Feature Extraction Function for All Locations

def extract_physical_features_for_location(row):
    """
    Extract all physical features for a single location.
    Returns a dictionary with all extracted features.
    """
    lat = row['latitude']
    lon = row['longitude']
    point = Point(lon, lat)
    
    # Initialize result with location identifiers
    result = {
        'state': row['state'],
        'district': row['district'],
        'location_id': row['location_id'],
        'location_name': row['location_name'],
        'latitude': lat,
        'longitude': lon
    }
    
    # Extract features for each category
    for category, tags in OSM_TAGS.items():
        try:
            features = get_features_from_osm(lat, lon, tags, SEARCH_RADIUS)
            count = count_features_in_radius(features)
            min_dist = calculate_min_distance(point, features)
            
            result[f'{category}_count'] = count
            result[f'{category}_distance_m'] = round(min_dist, 2) if not np.isnan(min_dist) else np.nan
            
            if category == 'roads':
                result[f'{category}_total_length_m'] = round(calculate_total_road_length(features), 2)
            else:
                result[f'{category}_area_sqm'] = round(calculate_feature_area(features), 2)
                
        except Exception as e:
            result[f'{category}_count'] = 0
            result[f'{category}_distance_m'] = np.nan
            if category == 'roads':
                result[f'{category}_total_length_m'] = 0
            else:
                result[f'{category}_area_sqm'] = 0
        
        time.sleep(0.3)  # Rate limiting for OSM API
    
    return result

print("‚úÖ Main extraction function defined!")
print("\nFeatures to be extracted per location:")
print("-"*50)
sample_features = ['location_id', 'location_name', 'latitude', 'longitude']
for category in OSM_TAGS.keys():
    sample_features.append(f'{category}_count')
    sample_features.append(f'{category}_distance_m')
    if category == 'roads':
        sample_features.append(f'{category}_total_length_m')
    else:
        sample_features.append(f'{category}_area_sqm')

for feat in sample_features:
    print(f"  ‚Ä¢ {feat}")

‚úÖ Main extraction function defined!

Features to be extracted per location:
--------------------------------------------------
  ‚Ä¢ location_id
  ‚Ä¢ location_name
  ‚Ä¢ latitude
  ‚Ä¢ longitude
  ‚Ä¢ roads_count
  ‚Ä¢ roads_distance_m
  ‚Ä¢ roads_total_length_m
  ‚Ä¢ industrial_count
  ‚Ä¢ industrial_distance_m
  ‚Ä¢ industrial_area_sqm
  ‚Ä¢ dump_sites_count
  ‚Ä¢ dump_sites_distance_m
  ‚Ä¢ dump_sites_area_sqm
  ‚Ä¢ agricultural_count
  ‚Ä¢ agricultural_distance_m
  ‚Ä¢ agricultural_area_sqm
  ‚Ä¢ power_plants_count
  ‚Ä¢ power_plants_distance_m
  ‚Ä¢ power_plants_area_sqm
  ‚Ä¢ residential_count
  ‚Ä¢ residential_distance_m
  ‚Ä¢ residential_area_sqm
  ‚Ä¢ commercial_count
  ‚Ä¢ commercial_distance_m
  ‚Ä¢ commercial_area_sqm


In [6]:
# Cell 6: Extract Physical Features for ALL Locations

if len(locations_df) > 0:
    print("="*70)
    print("EXTRACTING PHYSICAL FEATURES FOR ALL LOCATIONS")
    print("="*70)
    print(f"\nTotal locations to process: {len(locations_df)}")
    print(f"Search radius: {SEARCH_RADIUS} meters")
    print(f"Estimated time: ~{len(locations_df) * 3} seconds\n")
    
    # Store all results
    all_physical_features = []
    
    # Process each location
    for idx, row in locations_df.iterrows():
        print(f"[{idx+1}/{len(locations_df)}] Processing: {row['location_name'][:40]}...", end=" ")
        
        try:
            features = extract_physical_features_for_location(row)
            all_physical_features.append(features)
            
            # Show summary
            roads = features.get('roads_count', 0)
            industrial = features.get('industrial_count', 0)
            agricultural = features.get('agricultural_count', 0)
            print(f"‚úì Roads:{roads}, Industrial:{industrial}, Agricultural:{agricultural}")
            
        except Exception as e:
            print(f"‚úó Error: {str(e)[:30]}")
            # Add row with NaN values
            error_result = {
                'state': row['state'],
                'district': row['district'],
                'location_id': row['location_id'],
                'location_name': row['location_name'],
                'latitude': row['latitude'],
                'longitude': row['longitude']
            }
            all_physical_features.append(error_result)
    
    # Convert to DataFrame
    physical_features_df = pd.DataFrame(all_physical_features)
    
    print("\n" + "="*70)
    print("EXTRACTION COMPLETE!")
    print("="*70)
    print(f"\nüìä Total locations processed: {len(physical_features_df)}")
    print(f"üìã Total features extracted: {len(physical_features_df.columns)}")
    
    print("\n" + "-"*50)
    print("Physical Features DataFrame Preview:")
    print("-"*50)
    print(physical_features_df.head(10))
else:
    print("‚ùå No locations to process")
    physical_features_df = pd.DataFrame()

EXTRACTING PHYSICAL FEATURES FOR ALL LOCATIONS

Total locations to process: 49
Search radius: 2000 meters
Estimated time: ~147 seconds

[1/49] Processing: Tirumala, Tirupati - APPCB... ‚úì Roads:67, Industrial:2, Agricultural:0
[2/49] Processing: Secretariat, Amaravati - APPCB... ‚úì Roads:52, Industrial:0, Agricultural:4
[3/49] Processing: GVM Corporation, Visakhapatnam - APPCB... ‚úì Roads:393, Industrial:10, Agricultural:0
[4/49] Processing: Naharlagun, Naharlagun - APSPCB... ‚úì Roads:37, Industrial:0, Agricultural:0
[5/49] Processing: Tarapur, Silchar - PCBA... ‚úì Roads:74, Industrial:0, Agricultural:0
[6/49] Processing: Mayaganj, Bhagalpur - BSPCB... ‚úì Roads:19, Industrial:1, Agricultural:2
[7/49] Processing: IGSC Planetarium Complex, Patna - BSPCB... ‚úì Roads:357, Industrial:3, Agricultural:0
[8/49] Processing: Civic Center, Bhilai - Bhilai Steel Plan... ‚úì Roads:137, Industrial:2, Agricultural:0
[9/49] Processing: Mangala, Bilaspur - CECB... ‚úì Roads:117, Industrial:1, Ag

In [7]:
# Cell 7: Data Quality Check and Summary Statistics

if len(physical_features_df) > 0:
    print("="*70)
    print("PHYSICAL FEATURES DATA QUALITY REPORT")
    print("="*70)
    
    print("\nüìã Column Information:")
    print("-"*50)
    for col in physical_features_df.columns:
        non_null = physical_features_df[col].notna().sum()
        dtype = physical_features_df[col].dtype
        print(f"  {col:35s}: {non_null:3d}/{len(physical_features_df)} values ({dtype})")
    
    # Feature statistics
    print("\n" + "-"*50)
    print("Feature Statistics (Distance in meters):")
    print("-"*50)
    
    distance_cols = [col for col in physical_features_df.columns if '_distance_m' in col]
    for col in distance_cols:
        values = physical_features_df[col].dropna()
        if len(values) > 0:
            print(f"\n  {col}:")
            print(f"    Min: {values.min():.2f}m")
            print(f"    Max: {values.max():.2f}m")
            print(f"    Mean: {values.mean():.2f}m")
            print(f"    Median: {values.median():.2f}m")
    
    # Count statistics
    print("\n" + "-"*50)
    print("Feature Count Statistics:")
    print("-"*50)
    
    count_cols = [col for col in physical_features_df.columns if '_count' in col]
    for col in count_cols:
        values = physical_features_df[col].dropna()
        if len(values) > 0:
            print(f"  {col:30s}: Total={values.sum():.0f}, Avg={values.mean():.1f}, Max={values.max():.0f}")
    
    # Locations with features
    print("\n" + "-"*50)
    print("Locations with Features Found:")
    print("-"*50)
    
    for col in count_cols:
        has_feature = (physical_features_df[col] > 0).sum()
        pct = (has_feature / len(physical_features_df)) * 100
        print(f"  {col.replace('_count', ''):20s}: {has_feature}/{len(physical_features_df)} locations ({pct:.1f}%)")
else:
    print("‚ùå No physical features data available")

PHYSICAL FEATURES DATA QUALITY REPORT

üìã Column Information:
--------------------------------------------------
  state                              :  49/49 values (object)
  district                           :  49/49 values (object)
  location_id                        :  49/49 values (int64)
  location_name                      :  49/49 values (object)
  latitude                           :  49/49 values (float64)
  longitude                          :  49/49 values (float64)
  roads_count                        :  49/49 values (int64)
  roads_distance_m                   :  49/49 values (float64)
  roads_total_length_m               :  49/49 values (float64)
  industrial_count                   :  49/49 values (int64)
  industrial_distance_m              :  36/49 values (float64)
  industrial_area_sqm                :  49/49 values (float64)
  dump_sites_count                   :  49/49 values (int64)
  dump_sites_distance_m              :  18/49 values (float64)
  dump_sites_a

In [8]:
# Cell 8: Save Physical Features Dataset

if len(physical_features_df) > 0:
    # Create output directory
    output_dir = "data/processed"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define column order for output
    id_cols = ['state', 'district', 'location_id', 'location_name', 'latitude', 'longitude']
    feature_cols = [col for col in physical_features_df.columns if col not in id_cols]
    
    # Reorder columns
    final_cols = id_cols + sorted(feature_cols)
    physical_features_df = physical_features_df[[col for col in final_cols if col in physical_features_df.columns]]
    
    # Sort by state and district
    physical_features_df = physical_features_df.sort_values(['state', 'district', 'location_name'])
    
    # Save to CSV
    output_file = f"{output_dir}/india_physical_features.csv"
    physical_features_df.to_csv(output_file, index=False)
    
    print("="*70)
    print("PHYSICAL FEATURES DATASET SAVED!")
    print("="*70)
    print(f"\nüìÅ Output file: {output_file}")
    print(f"üìä Total locations: {len(physical_features_df)}")
    print(f"üìã Total columns: {len(physical_features_df.columns)}")
    
    print("\n" + "-"*50)
    print("Final Column Structure:")
    print("-"*50)
    for i, col in enumerate(physical_features_df.columns, 1):
        print(f"  {i:2d}. {col}")
    
    print("\n" + "-"*50)
    print("Final Dataset Preview:")
    print("-"*50)
    print(physical_features_df.head())
else:
    print("‚ùå No data to save!")

PHYSICAL FEATURES DATASET SAVED!

üìÅ Output file: data/processed/india_physical_features.csv
üìä Total locations: 49
üìã Total columns: 27

--------------------------------------------------
Final Column Structure:
--------------------------------------------------
   1. state
   2. district
   3. location_id
   4. location_name
   5. latitude
   6. longitude
   7. agricultural_area_sqm
   8. agricultural_count
   9. agricultural_distance_m
  10. commercial_area_sqm
  11. commercial_count
  12. commercial_distance_m
  13. dump_sites_area_sqm
  14. dump_sites_count
  15. dump_sites_distance_m
  16. industrial_area_sqm
  17. industrial_count
  18. industrial_distance_m
  19. power_plants_area_sqm
  20. power_plants_count
  21. power_plants_distance_m
  22. residential_area_sqm
  23. residential_count
  24. residential_distance_m
  25. roads_count
  26. roads_distance_m
  27. roads_total_length_m

--------------------------------------------------
Final Dataset Preview:
--------------

In [9]:
# Cell 9: Verify Data Compatibility with Air Quality Dataset

print("="*70)
print("DATA COMPATIBILITY VERIFICATION")
print("="*70)

# Load both datasets
aq_file = "data/processed/india_aq_transformed_last30days.csv"
pf_file = "data/processed/india_physical_features.csv"

if os.path.exists(aq_file) and os.path.exists(pf_file):
    aq_df = pd.read_csv(aq_file)
    pf_df = pd.read_csv(pf_file)
    
    print(f"\nüìä Air Quality Dataset: {len(aq_df):,} records")
    print(f"üìä Physical Features Dataset: {len(pf_df)} locations")
    
    # Check location_id matching
    aq_locations = set(aq_df['location_id'].unique())
    pf_locations = set(pf_df['location_id'].unique())
    
    matching = aq_locations.intersection(pf_locations)
    missing_in_pf = aq_locations - pf_locations
    extra_in_pf = pf_locations - aq_locations
    
    print("\n" + "-"*50)
    print("Location ID Matching:")
    print("-"*50)
    print(f"  ‚úì Matching location IDs: {len(matching)}")
    print(f"  ‚ö† In AQ but missing in Physical Features: {len(missing_in_pf)}")
    print(f"  ‚ö† In Physical Features but not in AQ: {len(extra_in_pf)}")
    
    if len(missing_in_pf) == 0 and len(extra_in_pf) == 0:
        print("\n‚úÖ PERFECT MATCH! All locations are compatible.")
    else:
        if len(missing_in_pf) > 0:
            print(f"\n‚ö† Missing location IDs: {missing_in_pf}")
    
    # Common columns for merging
    common_cols = ['state', 'district', 'location_id', 'location_name', 'latitude', 'longitude']
    aq_merge_cols = [col for col in common_cols if col in aq_df.columns]
    pf_merge_cols = [col for col in common_cols if col in pf_df.columns]
    
    print("\n" + "-"*50)
    print("Merge Columns Available:")
    print("-"*50)
    print(f"  AQ Dataset: {aq_merge_cols}")
    print(f"  Physical Features: {pf_merge_cols}")
    
    # Show merge example
    print("\n" + "-"*50)
    print("Sample Merge Preview (first 3 records):")
    print("-"*50)
    
    # Get unique AQ locations and merge with physical features
    aq_unique = aq_df[common_cols].drop_duplicates()
    merged_sample = pd.merge(aq_unique.head(3), pf_df, on='location_id', how='left', suffixes=('', '_pf'))
    print(merged_sample[['location_id', 'location_name', 'roads_count', 'industrial_count', 'agricultural_count']].to_string())
    
    print("\n" + "="*70)
    print("‚úÖ Data is ready for merging in the next phase!")
    print("="*70)
else:
    print("\n‚ùå One or more files not found!")
    if not os.path.exists(aq_file):
        print(f"   Missing: {aq_file}")
    if not os.path.exists(pf_file):
        print(f"   Missing: {pf_file}")

DATA COMPATIBILITY VERIFICATION

üìä Air Quality Dataset: 109,501 records
üìä Physical Features Dataset: 49 locations

--------------------------------------------------
Location ID Matching:
--------------------------------------------------
  ‚úì Matching location IDs: 49
  ‚ö† In AQ but missing in Physical Features: 0
  ‚ö† In Physical Features but not in AQ: 0

‚úÖ PERFECT MATCH! All locations are compatible.

--------------------------------------------------
Merge Columns Available:
--------------------------------------------------
  AQ Dataset: ['state', 'district', 'location_id', 'location_name', 'latitude', 'longitude']
  Physical Features: ['state', 'district', 'location_id', 'location_name', 'latitude', 'longitude']

--------------------------------------------------
Sample Merge Preview (first 3 records):
--------------------------------------------------
   location_id                           location_name  roads_count  industrial_count  agricultural_count
0         5

## Summary

This notebook extracts physical features from OpenStreetMap for each monitoring location:

### Features Extracted:
1. **Roads**: Count, minimum distance, total length (for vehicular pollution)
2. **Industrial**: Count, minimum distance, total area (for industrial pollution)
3. **Dump Sites**: Count, minimum distance, total area (for waste-related pollution)
4. **Agricultural**: Count, minimum distance, total area (for agricultural burning)
5. **Power Plants**: Count, minimum distance, total area
6. **Residential**: Count, minimum distance, total area
7. **Commercial**: Count, minimum distance, total area

### Output File:
- `data/processed/india_physical_features.csv`

### Merge Key:
- `location_id` - Primary key for merging with air quality data
- Also compatible with: `state`, `district`, `location_name`, `latitude`, `longitude`

### Next Steps:
1. Combine with air quality dataset
2. Calculate proximity features for model training
3. Apply source labeling rules based on features