## Feature Engineering pour ML

In [None]:
# notebooks/06_feature_engineering.py
def _binary_flag(condition):
    """Return a column that is 1 where ``condition`` is true, else 0.

    A null condition is not a match, so it also produces 0.
    """
    return F.when(condition, 1).otherwise(0)


def create_ml_features():
    """Build and persist the feature table used for Machine Learning.

    Reads ``local.silver.trips_geospatial``, adds temporal, geographic,
    weather, ratio and windowed "traffic" features, then writes:

    * ``local.gold.ml_trip_features`` (partitioned by ``year``, ``month``)
    * ``local.gold.ml_trip_features_sample`` (a 10% sample, seed 42)

    Relies on the notebook-level names ``spark`` (the active session) and
    ``F`` (presumably ``pyspark.sql.functions`` -- confirm against the
    notebook's import cell).

    Returns:
        The lazily evaluated feature DataFrame that was written to
        ``local.gold.ml_trip_features``. Running further actions on it
        re-executes the whole pipeline; read the gold table instead when
        only the data is needed.
    """
    from pyspark.sql.window import Window

    features_table = "local.gold.ml_trip_features"
    sample_table = "local.gold.ml_trip_features_sample"

    # Load the source data.
    trips_df = spark.table("local.silver.trips_geospatial")

    hour = F.col("hour_of_day")

    # Temporal features.
    # NOTE(review): pickup_week_of_year and pickup_month are computed here
    # but are not part of the final select below -- confirm whether they
    # should be exported or dropped.
    ml_features = trips_df.withColumn(
        "pickup_day_of_week", F.dayofweek("tpep_pickup_datetime")
    ).withColumn(
        "pickup_week_of_year", F.weekofyear("tpep_pickup_datetime")
    ).withColumn(
        "pickup_month", F.month("tpep_pickup_datetime")
    ).withColumn(
        "pickup_is_morning", _binary_flag((hour >= 6) & (hour <= 10))
    ).withColumn(
        "pickup_is_evening", _binary_flag((hour >= 16) & (hour <= 20))
    ).withColumn(
        # Night wraps around midnight, hence OR rather than AND.
        "pickup_is_night", _binary_flag((hour >= 22) | (hour <= 4))
    )

    # Geographic features.
    ml_features = ml_features.withColumn(
        "is_same_borough",
        _binary_flag(F.col("pickup_borough") == F.col("dropoff_borough"))
    ).withColumn(
        "is_airport_trip",
        _binary_flag(F.col("pickup_zone").contains("Airport") |
                     F.col("dropoff_zone").contains("Airport"))
    ).withColumn(
        "is_manhattan_trip",
        _binary_flag((F.col("pickup_borough") == "Manhattan") &
                     (F.col("dropoff_borough") == "Manhattan"))
    )

    # Weather features.
    # The 32 / 86 thresholds were annotated as 0 degC / 30 degC in the
    # original code, i.e. ``temp`` is assumed to be in Fahrenheit -- TODO
    # confirm against the weather source.
    ml_features = ml_features.withColumn(
        "is_rainy", _binary_flag(F.col("precip") > 0.1)
    ).withColumn(
        "is_snowy", _binary_flag(F.col("snow") > 0.1)
    ).withColumn(
        "is_cold", _binary_flag(F.col("temp") < 32)
    ).withColumn(
        "is_hot", _binary_flag(F.col("temp") > 86)
    )

    # Derived ratio features; null when the denominator is not positive so
    # that no division by zero reaches the output.
    # NOTE(review): these ratios are built from total_amount and
    # trip_duration_minutes, which are selected below as target variables --
    # confirm they are excluded from the model inputs.
    ml_features = ml_features.withColumn(
        "fare_per_mile",
        F.when(F.col("trip_distance") > 0,
               F.col("total_amount") / F.col("trip_distance"))
        .otherwise(None)
    ).withColumn(
        "fare_per_minute",
        F.when(F.col("trip_duration_minutes") > 0,
               F.col("total_amount") / F.col("trip_duration_minutes"))
        .otherwise(None)
    ).withColumn(
        "speed_mph",
        # Minutes are converted to hours to obtain miles per hour.
        F.when(F.col("trip_duration_minutes") > 0,
               F.col("trip_distance") / (F.col("trip_duration_minutes") / 60))
        .otherwise(None)
    )

    # Traffic features: aggregates over the 100 rows preceding the current
    # one (current row excluded) within the same pickup zone, ordered by
    # pickup time.
    window_spec = Window.partitionBy("pickup_zone") \
                        .orderBy("tpep_pickup_datetime") \
                        .rowsBetween(-100, -1)

    ml_features = ml_features.withColumn(
        "recent_trips_in_zone",
        F.count("*").over(window_spec)
    ).withColumn(
        "avg_recent_duration",
        F.avg("trip_duration_minutes").over(window_spec)
    ).withColumn(
        "avg_recent_fare",
        F.avg("total_amount").over(window_spec)
    )

    # Final column selection.
    ml_dataset = ml_features.select(
        # Target variables
        "trip_duration_minutes",
        "total_amount",

        # Temporal features
        "hour_of_day",
        "pickup_day_of_week",
        "pickup_is_morning",
        "pickup_is_evening",
        "pickup_is_night",
        "is_weekend",

        # Geographic features
        "pickup_borough",
        "dropoff_borough",
        "pickup_zone",
        "dropoff_zone",
        "is_same_borough",
        "is_airport_trip",
        "is_manhattan_trip",
        "trip_distance",
        "real_distance_miles",

        # Weather features
        "temp",
        "precip",
        "snow",
        "is_rainy",
        "is_snowy",
        "is_cold",
        "is_hot",

        # Derived features
        "passenger_count",
        "fare_per_mile",
        "fare_per_minute",
        "speed_mph",

        # Traffic features
        "recent_trips_in_zone",
        "avg_recent_duration",
        "avg_recent_fare",

        # Metadata
        "tpep_pickup_datetime",
        "VendorID",
        "year",
        "month"
    )

    # Persist the full feature table.
    ml_dataset.writeTo(features_table) \
              .partitionedBy("year", "month") \
              .createOrReplace()

    # Count and sample from the table that was just written instead of from
    # ``ml_dataset``: every action on the lazy DataFrame would re-run the
    # whole pipeline (including the window shuffle), and the sample is then
    # guaranteed to be a subset of the stored rows.
    persisted = spark.table(features_table)

    print(f"✅ Table gold.ml_trip_features créée avec {persisted.count()} lignes")

    # Reduced dataset for development: 10% of the rows, fixed seed.
    ml_sample = persisted.sample(fraction=0.1, seed=42)

    ml_sample.writeTo(sample_table) \
             .createOrReplace()

    print("✅ Table gold.ml_trip_features_sample créée")

    return ml_dataset