In [2]:
import warnings
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
from shapely.geometry import Point, LineString
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Kolkata canal network extraction (notebook cell / flat script).
#
# Steps:
#   1. Download waterway features for a fixed bounding box from OpenStreetMap.
#   2. Keep LineString geometries and flag "major" waterways
#      (width > 5 m, OR named, OR tagged canal/river).
#   3. Measure segment lengths in kilometres in a projected (UTM) CRS.
#   4. Write two GeoJSON files, a two-panel PNG map and a one-row summary CSV.
#
# Module-level names (canals_gdf, major_canals, minor_drains, named_canals,
# summary, ...) are kept so that later cells can keep using them.
# ---------------------------------------------------------------------------
print("=" * 60)
print("KOLKATA CANAL NETWORK EXTRACTION")
print("=" * 60)

# Define Kolkata Metropolitan Area bounding box
# Covers KMC + eastern drainage areas to Vidyadhari River
north, south, east, west = 22.75, 22.40, 88.55, 88.15

print("\nüìç Extraction Area:")
print(f"   Bounding box: {south}¬∞N to {north}¬∞N, {west}¬∞E to {east}¬∞E")
print("   Covers: KMC + Eastern drainage basins + Salt Lake + Howrah")

# Extract waterways from OpenStreetMap
print("\nüîç Querying OpenStreetMap for waterways...")
print("   (This may take 2-3 minutes...)")

try:
    # Get all waterway features
    tags = {
        'waterway': ['canal', 'drain', 'ditch', 'stream', 'river']
    }

    # OSMnx 1.x takes bbox as (north, south, east, west); OSMnx 2.x changed it
    # to (left, bottom, right, top). Pick the order matching the installed
    # version so the same box is queried either way.
    osmnx_major_version = int(ox.__version__.split('.')[0])
    if osmnx_major_version >= 2:
        query_bbox = (west, south, east, north)
    else:
        query_bbox = (north, south, east, west)

    canals_gdf = ox.features_from_bbox(
        bbox=query_bbox,
        tags=tags
    )

    print(f"‚úì Successfully retrieved {len(canals_gdf)} waterway features")

    # Filter to LineString geometries only (remove points/polygons)
    original_count = len(canals_gdf)
    canals_gdf = canals_gdf[canals_gdf.geometry.geom_type == 'LineString'].copy()

    print(f"‚úì Filtered to {len(canals_gdf)} linear waterways (removed {original_count - len(canals_gdf)} points/polygons)")

    # Nothing below is meaningful (and CRS estimation fails) on an empty set.
    if canals_gdf.empty:
        raise ValueError("No linear waterway features found in the requested bounding box")

    # Clean up the data
    print("\nüîß Processing waterway data...")

    # Ensure proper column types. OSM tags are free text, so anything that is
    # not a plain number becomes NaN. A missing column is created as NaN (not
    # None) so that it stays numeric.
    if 'width' in canals_gdf.columns:
        canals_gdf['width'] = pd.to_numeric(canals_gdf['width'], errors='coerce')
    else:
        canals_gdf['width'] = float('nan')

    # The 'name' column only exists when at least one feature carries the tag.
    if 'name' not in canals_gdf.columns:
        canals_gdf['name'] = None

    # Categorize by type
    print("\nüìä Breakdown by waterway type:")
    type_counts = canals_gdf['waterway'].value_counts()
    for wtype, count in type_counts.items():
        print(f"   {wtype}: {count}")

    # Identify major canals
    # Criteria: width > 5m OR has a name OR waterway type is 'canal' or 'river'
    canals_gdf['is_major'] = (
        (canals_gdf['width'].fillna(0) > 5) |
        (canals_gdf['name'].notna()) |
        (canals_gdf['waterway'].isin(['canal', 'river']))
    )

    # Segment lengths in km. The data is in geographic coordinates (degrees),
    # so lengths must be measured in a projected CRS with metre units; the
    # UTM zone is estimated from the data extent.
    metric_crs = canals_gdf.estimate_utm_crs()
    canals_gdf['length_km'] = canals_gdf.geometry.to_crs(metric_crs).length / 1000

    # The copies inherit 'is_major' and 'length_km' from canals_gdf.
    major_canals = canals_gdf[canals_gdf['is_major']].copy()
    minor_drains = canals_gdf[~canals_gdf['is_major']].copy()

    print(f"\n‚úì Identified {len(major_canals)} major canals/rivers")
    print(f"‚úì Identified {len(minor_drains)} minor drains/ditches")

    # Segment count per canal name, most frequent first. Always defined
    # (possibly empty) because the map labelling below reads it as well.
    named_canals = major_canals['name'].dropna().value_counts()

    # Show major canal names
    if not named_canals.empty:
        print("\nüìù Major canal/river names found:")
        for name, count in named_canals.head(20).items():
            print(f"   ‚Ä¢ {name} ({count} segments)")

    print("\nüìè Total Drainage Network Length:")
    print(f"   Major canals: {major_canals['length_km'].sum():.1f} km")
    print(f"   Minor drains: {minor_drains['length_km'].sum():.1f} km")
    print(f"   Total: {canals_gdf['length_km'].sum():.1f} km")

    # Save files
    print("\nüíæ Saving files...")

    # Keep essential columns for smaller file size
    save_columns = ['waterway', 'name', 'width', 'is_major', 'length_km', 'geometry']
    canals_to_save = canals_gdf[[col for col in save_columns if col in canals_gdf.columns]]
    major_to_save = major_canals[[col for col in save_columns if col in major_canals.columns]]

    canals_to_save.to_file('kolkata_all_waterways.geojson', driver='GeoJSON')
    major_to_save.to_file('kolkata_major_canals.geojson', driver='GeoJSON')

    print(f"‚úì Saved: kolkata_all_waterways.geojson ({len(canals_to_save)} features)")
    print(f"‚úì Saved: kolkata_major_canals.geojson ({len(major_to_save)} features)")

    # Create visualization
    print("\nüó∫Ô∏è  Creating visualization...")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    # Left: All waterways
    minor_drains.plot(ax=ax1, color='lightblue', linewidth=0.5, alpha=0.5, label=f'Minor drains ({len(minor_drains)})')
    major_canals.plot(ax=ax1, color='blue', linewidth=2, label=f'Major canals ({len(major_canals)})')
    ax1.set_title('Complete Drainage Network', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Longitude')
    ax1.set_ylabel('Latitude')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Right: Major canals only with names
    major_canals.plot(ax=ax2, color='darkblue', linewidth=2.5)
    ax2.set_title('Major Canals & Rivers', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Longitude')
    ax2.set_ylabel('Latitude')
    ax2.grid(True, alpha=0.3)

    # Add text labels for major named canals (top 10 by segment count)
    for name in named_canals.head(10).index:
        canal_segments = major_canals[major_canals['name'] == name]
        if canal_segments.empty:
            continue
        # Place the label at the centroid of the longest segment
        longest_seg = canal_segments.loc[canal_segments['length_km'].idxmax()]
        centroid = longest_seg.geometry.centroid
        ax2.annotate(name, xy=(centroid.x, centroid.y),
                     fontsize=8, ha='center',
                     bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7))

    plt.tight_layout()
    plt.savefig('kolkata_canal_network.png', dpi=300, bbox_inches='tight')
    print("‚úì Saved: kolkata_canal_network.png")

    # Create summary statistics
    summary = {
        'total_waterways': len(canals_gdf),
        'major_canals': len(major_canals),
        'minor_drains': len(minor_drains),
        'total_length_km': canals_gdf['length_km'].sum(),
        'major_length_km': major_canals['length_km'].sum(),
        'named_canals': len(named_canals),  # number of distinct names
        'bbox': f"{south},{west},{north},{east}"
    }

    summary_df = pd.DataFrame([summary])
    # Create the target directory first so a fresh checkout does not fail
    # here after the long download.
    summary_path = Path('../data/canals/canal_extraction_summary.csv')
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    summary_df.to_csv(summary_path, index=False)
    print("‚úì Saved: canal_extraction_summary.csv")

    print("\n" + "=" * 60)
    print("EXTRACTION COMPLETE!")
    print("=" * 60)
    print("\nüìÇ Files created:")
    print("   1. kolkata_all_waterways.geojson - Complete drainage network")
    print("   2. kolkata_major_canals.geojson - Major canals only (for visualization)")
    print("   3. kolkata_canal_network.png - Map visualization")
    print("   4. canal_extraction_summary.csv - Summary statistics")

    print("\nüéØ Next steps:")
    print("   1. Load kolkata_all_waterways.geojson in QGIS to verify")
    print("   2. Calculate canal features per ward (next script)")
    print("   3. Integrate with SWMManywhere drainage network")

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    # NOTE(review): this handler also catches processing/plotting/saving
    # errors, for which the network hints below may not apply.
    print(f"\n‚ùå Error during extraction: {e}")
    print("\nTroubleshooting:")
    print("   1. Check internet connection")
    print("   2. Try reducing bounding box size")
    print("   3. OSM servers might be overloaded - try again in a few minutes")
    print("\nAlternative: Use Overpass Turbo (https://overpass-turbo.eu/)")

CANAL FEATURES PER WARD CALCULATION

üìÇ Working directory: /Users/romitbasak/Projects/KolkataFloodMapping/notebooks
üìÇ Canal data directory: /Users/romitbasak/Projects/KolkataFloodMapping/notebooks/../data/canals

üìÇ Loading data...
‚úì Loaded 141 wards from ../data/raw
‚úì Loaded 98 canals

‚öôÔ∏è  Calculating canal features per ward...
   Processing ward 1/141...
   Processing ward 21/141...
   Processing ward 41/141...
   Processing ward 61/141...
   Processing ward 81/141...
   Processing ward 101/141...
   Processing ward 121/141...
   Processing ward 141/141...

‚úì Calculated features for all 141 wards

üìä Canal Coverage Summary:
   Wards with canals: 44 (31.2%)
   Wards with major canals: 22
   Wards with named canals: 30
   Wards with river segments: 17

üìè Canal Length Statistics:
   Total canal length: 0.0 km
   Mean per ward: 0.00 km
   Max in single ward: 0.00 km

üó∫Ô∏è  Canal Density Statistics:
   Mean density: 25193.25 km/km¬≤
   Max density: 223151.16 km/km

ValueError: You are trying to merge on object and int64 columns for key 'WARD'. If you wish to proceed you should use pd.concat