In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

print("=" * 60)
print("CANAL FEATURES PER WARD CALCULATION")
print("=" * 60)

# Define paths
DATA_DIR = Path('../data')
CANAL_DIR = DATA_DIR / 'canals'
WARDS_DIR = DATA_DIR / 'raw'  # Adjust if your wards are elsewhere

# Create output directory if needed
CANAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"\nüìÇ Working directory: {Path.cwd()}")
print(f"üìÇ Canal data directory: {CANAL_DIR.absolute()}")

# Load data
print("\nüìÇ Loading data...")
canals = gpd.read_file(CANAL_DIR / 'kolkata_all_waterways.geojson')

# Load wards - adjust filename as needed
try:
    wards = gpd.read_file(WARDS_DIR / 'kolkata_wards_clean.gpkg')
    print(f"‚úì Loaded {len(wards)} wards from {WARDS_DIR}")
except FileNotFoundError:
    # Try alternate locations
    try:
        wards = gpd.read_file(DATA_DIR / 'kmc_141_wards.geojson')
        print(f"‚úì Loaded {len(wards)} wards from {DATA_DIR}")
    except FileNotFoundError:
        print("‚ùå Ward file not found. Please specify correct path:")
        print("   Expected locations:")
        print(f"   - {WARDS_DIR / 'kmc_141_wards.geojson'}")
        print(f"   - {DATA_DIR / 'kmc_141_wards.geojson'}")
        raise

print(f"‚úì Loaded {len(canals)} canals")

# Ensure same CRS
if canals.crs != wards.crs:
    print(f"\nüîß Reprojecting canals to match wards CRS...")
    canals = canals.to_crs(wards.crs)

# Calculate canal features per ward
print(f"\n‚öôÔ∏è  Calculating canal features per ward...")

ward_canal_features = []

for idx, ward in wards.iterrows():
    if idx % 20 == 0:
        print(f"   Processing ward {idx+1}/{len(wards)}...")
    
    ward_geom = ward.geometry
    ward_area_km2 = ward_geom.area / 1e6  # Convert to km¬≤ (approximate)
    
    # Find canals that intersect this ward
    intersecting_canals = canals[canals.intersects(ward_geom)]
    
    # Calculate features
    features = {
        'ward_id': ward.get('ward_no', ward.get('WARD_NO', ward.get('id', idx))),
        'ward_name': ward.get('ward_name', ward.get('WARD_NAME', f'Ward_{idx}')),
        
        # Canal count features
        'canal_count': len(intersecting_canals),
        'canal_count_major': len(intersecting_canals[intersecting_canals['waterway'] == 'canal']),
        'canal_count_drain': len(intersecting_canals[intersecting_canals['waterway'] == 'drain']),
        'canal_count_river': len(intersecting_canals[intersecting_canals['waterway'] == 'river']),
        
        # Canal length features (clip to ward boundary for accurate length)
        'canal_length_km': 0,
        'canal_length_major_km': 0,
        'canal_length_drain_km': 0,
        
        # Canal density (km of canal per km¬≤ of ward)
        'canal_density': 0,
        
        # Boolean indicators
        'has_canal': False,
        'has_major_canal': False,
        'has_named_canal': False,
        'has_river': False,
        
        # Named canals in this ward
        'named_canals': [],
        
        # Distance to nearest canal (for wards without canals)
        'distance_to_canal_m': 0,
        
        # Drainage outlet type inference
        'primary_drainage_type': 'unknown'
    }
    
    if len(intersecting_canals) > 0:
        # Calculate actual lengths (clip to ward boundary)
        total_length = 0
        major_length = 0
        drain_length = 0
        named_canals = []
        
        for _, canal in intersecting_canals.iterrows():
            # Clip canal to ward boundary
            clipped = canal.geometry.intersection(ward_geom)
            if not clipped.is_empty:
                length_km = clipped.length / 1000  # Approximate degrees to km
                total_length += length_km
                
                # Categorize by type
                if canal['waterway'] == 'canal':
                    major_length += length_km
                elif canal['waterway'] == 'drain':
                    drain_length += length_km
                
                # Track named canals
                if pd.notna(canal.get('name')):
                    named_canals.append(canal['name'])
        
        features['canal_length_km'] = total_length
        features['canal_length_major_km'] = major_length
        features['canal_length_drain_km'] = drain_length
        features['canal_density'] = total_length / ward_area_km2 if ward_area_km2 > 0 else 0
        
        features['has_canal'] = True
        features['has_major_canal'] = major_length > 0
        features['has_river'] = len(intersecting_canals[intersecting_canals['waterway'] == 'river']) > 0
        features['has_named_canal'] = len(named_canals) > 0
        features['named_canals'] = ', '.join(set(named_canals)) if named_canals else ''
        
        # Infer drainage type
        if features['has_river']:
            features['primary_drainage_type'] = 'river'
        elif major_length > drain_length:
            features['primary_drainage_type'] = 'canal'
        elif drain_length > 0:
            features['primary_drainage_type'] = 'drain'
        else:
            features['primary_drainage_type'] = 'mixed'
        
        features['distance_to_canal_m'] = 0  # Ward has canals
    else:
        # No canals in ward - calculate distance to nearest
        ward_centroid = ward_geom.centroid
        if len(canals) > 0:
            distances = canals.geometry.distance(ward_centroid)
            features['distance_to_canal_m'] = distances.min() * 111000  # degrees to meters approx
            features['primary_drainage_type'] = 'pipe'  # Likely piped drainage
    
    ward_canal_features.append(features)

# Create DataFrame
canal_features_df = pd.DataFrame(ward_canal_features)

print(f"\n‚úì Calculated features for all {len(canal_features_df)} wards")

# Summary statistics
print(f"\nüìä Canal Coverage Summary:")
print(f"   Wards with canals: {canal_features_df['has_canal'].sum()} ({canal_features_df['has_canal'].sum()/len(canal_features_df)*100:.1f}%)")
print(f"   Wards with major canals: {canal_features_df['has_major_canal'].sum()}")
print(f"   Wards with named canals: {canal_features_df['has_named_canal'].sum()}")
print(f"   Wards with river segments: {canal_features_df['has_river'].sum()}")

print(f"\nüìè Canal Length Statistics:")
print(f"   Total canal length: {canal_features_df['canal_length_km'].sum():.1f} km")
print(f"   Mean per ward: {canal_features_df['canal_length_km'].mean():.2f} km")
print(f"   Max in single ward: {canal_features_df['canal_length_km'].max():.2f} km")

print(f"\nüó∫Ô∏è  Canal Density Statistics:")
print(f"   Mean density: {canal_features_df['canal_density'].mean():.2f} km/km¬≤")
print(f"   Max density: {canal_features_df['canal_density'].max():.2f} km/km¬≤")

print(f"\nüèóÔ∏è  Primary Drainage Type Distribution:")
drainage_types = canal_features_df['primary_drainage_type'].value_counts()
for dtype, count in drainage_types.items():
    print(f"   {dtype}: {count} wards ({count/len(canal_features_df)*100:.1f}%)")

# Top wards by canal length
print(f"\nüèÜ Top 10 Wards by Canal Length:")
top_wards = canal_features_df.nlargest(10, 'canal_length_km')[
    ['ward_name', 'canal_length_km', 'canal_density', 'named_canals']
]
for idx, row in top_wards.iterrows():
    canals_display = row['named_canals'][:50] + '...' if len(str(row['named_canals'])) > 50 else row['named_canals']
    print(f"   {row['ward_name']}: {row['canal_length_km']:.2f} km (density: {row['canal_density']:.2f}) - {canals_display}")

# Save results
print(f"\nüíæ Saving results...")
canal_features_df.to_csv(CANAL_DIR / 'ward_canal_features.csv', index=False)
print(f"‚úì Saved: {CANAL_DIR / 'ward_canal_features.csv'}")

# Merge with ward shapefile for visualization
# Find the ward ID column dynamically
ward_id_cols = [col for col in wards.columns if 'ward' in col.lower() or 'id' in col.lower()]
if ward_id_cols:
    merge_col = ward_id_cols[0]
else:
    merge_col = wards.columns[0]

wards_with_canals = wards.merge(
    canal_features_df[['ward_id', 'canal_length_km', 'canal_density', 'primary_drainage_type', 'has_major_canal']], 
    left_on=merge_col,
    right_on='ward_id',
    how='left'
)
wards_with_canals.to_file(CANAL_DIR / 'kmc_wards_with_canal_features.geojson', driver='GeoJSON')
print(f"‚úì Saved: {CANAL_DIR / 'kmc_wards_with_canal_features.geojson'}")

# Create visualization
print(f"\nüó∫Ô∏è  Creating visualization...")

fig, axes = plt.subplots(2, 2, figsize=(18, 16))

# 1. Canal length per ward
ax1 = axes[0, 0]
wards_with_canals.plot(column='canal_length_km', cmap='Blues', legend=True,
                       ax=ax1, edgecolor='black', linewidth=0.5, missing_kwds={'color': 'lightgray'})
canals.plot(ax=ax1, color='darkblue', linewidth=1, alpha=0.5)
ax1.set_title('Canal Length per Ward (km)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Longitude')
ax1.set_ylabel('Latitude')

# 2. Canal density
ax2 = axes[0, 1]
wards_with_canals.plot(column='canal_density', cmap='YlGnBu', legend=True,
                       ax=ax2, edgecolor='black', linewidth=0.5, missing_kwds={'color': 'lightgray'})
ax2.set_title('Canal Density (km/km¬≤)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Longitude')
ax2.set_ylabel('Latitude')

# 3. Primary drainage type
ax3 = axes[1, 0]
drainage_colors = {'canal': 'blue', 'drain': 'cyan', 'pipe': 'gray', 
                  'river': 'darkblue', 'mixed': 'purple', 'unknown': 'lightgray'}
wards_with_canals['color'] = wards_with_canals['primary_drainage_type'].map(drainage_colors).fillna('lightgray')
wards_with_canals.plot(color=wards_with_canals['color'], ax=ax3, 
                       edgecolor='black', linewidth=0.5)
ax3.set_title('Primary Drainage Type', fontsize=14, fontweight='bold')
ax3.set_xlabel('Longitude')
ax3.set_ylabel('Latitude')

# Create legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=color, label=dtype) 
                  for dtype, color in drainage_colors.items() 
                  if dtype in wards_with_canals['primary_drainage_type'].values]
ax3.legend(handles=legend_elements, loc='upper left')

# 4. Has major canal (binary)
ax4 = axes[1, 1]
wards_with_canals.plot(column='has_major_canal', cmap='RdYlGn', legend=True,
                       ax=ax4, edgecolor='black', linewidth=0.5, missing_kwds={'color': 'lightgray'})
canals[canals['waterway'] == 'canal'].plot(ax=ax4, color='darkgreen', linewidth=2)
ax4.set_title('Wards with Major Canals', fontsize=14, fontweight='bold')
ax4.set_xlabel('Longitude')
ax4.set_ylabel('Latitude')

plt.tight_layout()
plt.savefig(CANAL_DIR / 'ward_canal_analysis.png', dpi=300, bbox_inches='tight')
print(f"‚úì Saved: {CANAL_DIR / 'ward_canal_analysis.png'}")

print("\n" + "=" * 60)
print("CANAL FEATURE EXTRACTION COMPLETE!")
print("=" * 60)

print(f"\nüìÇ All files saved to: {CANAL_DIR.absolute()}")
print("   1. ward_canal_features.csv - Canal metrics per ward")
print("   2. kmc_wards_with_canal_features.geojson - Ward shapefile with canal data")
print("   3. ward_canal_analysis.png - Visualization")

print("\nüéØ Features created (ready for model):")
print("   ‚Ä¢ canal_count, canal_length_km, canal_density")
print("   ‚Ä¢ has_canal, has_major_canal, has_named_canal")
print("   ‚Ä¢ primary_drainage_type (canal/drain/pipe/river)")
print("   ‚Ä¢ distance_to_canal_m (for wards without canals)")

print("\n‚úÖ Next: Integrate with SWMManywhere pipe features!")

CANAL FEATURES PER WARD CALCULATION

üìÇ Working directory: /Users/romitbasak/Projects/KolkataFloodMapping/notebooks
üìÇ Canal data directory: /Users/romitbasak/Projects/KolkataFloodMapping/notebooks/../data/canals

üìÇ Loading data...
‚úì Loaded 141 wards from ../data/raw
‚úì Loaded 98 canals

‚öôÔ∏è  Calculating canal features per ward...
   Processing ward 1/141...
   Processing ward 21/141...
   Processing ward 41/141...



  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distanc

   Processing ward 61/141...
   Processing ward 81/141...
   Processing ward 101/141...
   Processing ward 121/141...
   Processing ward 141/141...

‚úì Calculated features for all 141 wards

üìä Canal Coverage Summary:
   Wards with canals: 44 (31.2%)
   Wards with major canals: 22
   Wards with named canals: 30
   Wards with river segments: 17

üìè Canal Length Statistics:
   Total canal length: 0.0 km
   Mean per ward: 0.00 km
   Max in single ward: 0.00 km

üó∫Ô∏è  Canal Density Statistics:
   Mean density: 25193.25 km/km¬≤
   Max density: 223151.16 km/km¬≤

üèóÔ∏è  Primary Drainage Type Distribution:
   pipe: 97 wards (68.8%)
   canal: 21 wards (14.9%)
   river: 17 wards (12.1%)
   drain: 5 wards (3.5%)
   mixed: 1 wards (0.7%)

üèÜ Top 10 Wards by Canal Length:
   Ward_17: 0.00 km (density: 141930.92) - south purbachal canal, Lead Canal, Panchanangram C...
   Ward_119: 0.00 km (density: 183721.00) - south purbachal canal, Lead Canal
   Ward_40: 0.00 km (density: 127038.19) -


  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distances = canals.geometry.distance(ward_centroid)

  distanc

ValueError: You are trying to merge on object and int64 columns for key 'WARD'. If you wish to proceed you should use pd.concat