# Données Géospatiales

## Traitement Avancé des Zones

In [1]:
# notebooks/05_geospatial_processing.py
import geopandas as gpd
from shapely import wkt
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, StringType

def process_geospatial_data():
    """Build the enhanced taxi-zone table and the geo-enriched trips table.

    Steps:
      1. Read the taxi-zone shapefile, derive each zone's centroid
         (longitude/latitude in degrees) and area (square miles), and write
         the result to ``local.silver.zones_enhanced``.
      2. Join ``local.silver.trips_complete`` to the zones twice (pickup and
         drop-off zone) and compute the great-circle distance between the
         two zone centroids, plus its relative deviation from the recorded
         ``trip_distance``.
      3. Write the result to ``local.silver.trips_geospatial``, partitioned
         by ``year`` and ``month``.

    Relies on a ``spark`` session being available in the enclosing
    (notebook/module) scope.

    Returns:
        The (lazy) Spark DataFrame of trips with the geospatial columns
        ``pickup_lat``, ``pickup_lon``, ``pickup_area``, ``dropoff_lat``,
        ``dropoff_lon``, ``dropoff_area``, ``real_distance_miles`` and
        ``distance_accuracy``.
    """
    # Imported locally so the function does not depend on an ``F`` alias
    # having been defined by some other cell/module.
    from pyspark.sql import functions as F

    sq_meters_to_sq_miles = 3.861e-7
    equal_area_epsg = 6933  # global equal-area CRS with metre units
    wgs84_epsg = 4326       # geographic CRS: longitude/latitude in degrees

    # Read the shapefile with GeoPandas
    gdf = gpd.read_file("/tmp/taxi_zones/taxi_zones.shp")

    if gdf.crs is not None:
        # Areas are only meaningful in a metric, equal-area projection, and
        # the Haversine formula below needs degrees. Normalise both instead
        # of trusting whatever CRS the shapefile happens to use.
        projected = gdf.geometry.to_crs(epsg=equal_area_epsg)
        centroids = projected.centroid.to_crs(epsg=wgs84_epsg)
        area_sq_m = projected.area
    else:
        # No CRS metadata: reprojection is impossible, so fall back to the
        # raw coordinates (assumes lon/lat degrees and metric areas).
        centroids = gdf.geometry.centroid
        area_sq_m = gdf.geometry.area

    gdf['centroid_lon'] = centroids.x
    gdf['centroid_lat'] = centroids.y
    gdf['area_sq_miles'] = area_sq_m * sq_meters_to_sq_miles

    # Convert the non-geometry columns to a Spark DataFrame
    zones_enhanced = spark.createDataFrame(gdf[['LocationID', 'zone', 'borough',
                                                'centroid_lon', 'centroid_lat',
                                                'area_sq_miles']])

    # Persist the zones table
    zones_enhanced.writeTo("local.silver.zones_enhanced").createOrReplace()

    print("✅ Table silver.zones_enhanced créée")

    # UDF computing the Haversine (great-circle) distance
    @udf(returnType=DoubleType())
    def haversine_distance(lat1, lon1, lat2, lon2):
        """Return the distance in miles between two GPS points (degrees).

        Returns None when any coordinate is null.
        """
        from math import radians, sin, cos, sqrt, atan2

        # A missing coordinate yields a null distance instead of failing
        # the whole Spark task with a TypeError.
        if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
            return None

        earth_radius_miles = 3958.8

        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # Floating-point rounding can push `a` marginally above 1 for
        # near-antipodal points, which would make sqrt(1 - a) raise.
        if a > 1.0:
            a = 1.0
        c = 2 * atan2(sqrt(a), sqrt(1 - a))

        return earth_radius_miles * c

    # Join trips with the zone centroids to compute centroid distances
    trips_df = spark.table("local.silver.trips_complete")
    zones_df = spark.table("local.silver.zones_enhanced")

    # Pickup-zone join
    trips_with_geo = trips_df.alias("t") \
        .join(zones_df.alias("pz"),
              F.col("t.PULocationID") == F.col("pz.LocationID")) \
        .select(
            "t.*",
            F.col("pz.centroid_lat").alias("pickup_lat"),
            F.col("pz.centroid_lon").alias("pickup_lon"),
            F.col("pz.area_sq_miles").alias("pickup_area")
        )

    # Drop-off-zone join
    trips_with_geo = trips_with_geo.alias("t") \
        .join(zones_df.alias("dz"),
              F.col("t.DOLocationID") == F.col("dz.LocationID")) \
        .select(
            "t.*",
            F.col("dz.centroid_lat").alias("dropoff_lat"),
            F.col("dz.centroid_lon").alias("dropoff_lon"),
            F.col("dz.area_sq_miles").alias("dropoff_area")
        )

    # Centroid-to-centroid distance and its relative deviation from the
    # recorded trip distance (null when trip_distance is not positive:
    # `when` without `otherwise` yields null for unmatched rows).
    trips_with_geo = trips_with_geo.withColumn(
        "real_distance_miles",
        haversine_distance(
            F.col("pickup_lat"), F.col("pickup_lon"),
            F.col("dropoff_lat"), F.col("dropoff_lon")
        )
    ).withColumn(
        "distance_accuracy",
        F.when(F.col("trip_distance") > 0,
               F.abs(F.col("real_distance_miles") - F.col("trip_distance")) /
               F.col("trip_distance"))
    )

    # Persist the enriched trips table
    trips_with_geo.writeTo("local.silver.trips_geospatial") \
                  .partitionedBy(F.col("year"), F.col("month")) \
                  .createOrReplace()

    # Count the table that was just written rather than the lazy DataFrame:
    # the latter would re-run both joins and the Python UDF only for logging.
    row_count = spark.table("local.silver.trips_geospatial").count()
    print(f"✅ Table silver.trips_geospatial créée avec {row_count} lignes")

    return trips_with_geo

ModuleNotFoundError: No module named 'geopandas'