 # Machine Learning en Production

## Problème 1: Prédiction du Prix de la Course

In [1]:
# notebooks/07_ml_price_prediction.py
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, 
    StandardScaler, Imputer
)
from pyspark.ml.regression import (
    RandomForestRegressor, GBTRegressor, LinearRegression
)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import matplotlib.pyplot as plt
import pandas as pd

def _assembled_feature_names(schema, vector_col, expected_size):
    """Return one display name per slot of an assembled feature vector.

    Args:
        schema: Schema of a DataFrame that contains ``vector_col``.
        vector_col: Name of the vector column whose ML attribute metadata
            (the ``ml_attr`` entry of the field metadata) is read.
        expected_size: Number of slots the caller needs names for.

    Returns:
        The attribute names ordered by slot index, or generic
        ``feature_<i>`` names when the metadata is missing or does not
        describe exactly ``expected_size`` slots.
    """
    ml_attr = schema[vector_col].metadata.get("ml_attr", {})
    slots = sorted(
        (attr["idx"], attr["name"])
        for group in ml_attr.get("attrs", {}).values()
        for attr in group
        if "idx" in attr and "name" in attr
    )
    names = [name for _, name in slots]
    if len(names) != expected_size:
        # Never hand pandas two columns of different length.
        return [f"feature_{i}" for i in range(expected_size)]
    return names


def train_price_prediction_model():
    """Train a random-forest regressor that predicts ``total_amount``.

    Reads ``local.gold.ml_trip_features_sample`` through the session-level
    ``spark`` object (expected to exist in the notebook/global scope), keeps
    rows whose ``total_amount`` is strictly between 1 and 200, fits the
    preprocessing + random-forest pipeline on an 80/20 split, prints RMSE,
    MAE and R2 on the held-out part, saves the fitted pipeline and writes
    the test predictions to ``local.gold.ml_price_predictions``.

    Returns:
        A tuple ``(model, predictions, feature_importances)`` where
        ``model`` is the fitted pipeline, ``predictions`` is the scored test
        DataFrame and ``feature_importances`` is a pandas DataFrame with
        columns ``feature`` and ``importance``, sorted by decreasing
        importance, with one row per slot of the assembled feature vector.
    """
    # The file never imports the functions module under the alias ``F``.
    from pyspark.sql import functions as F

    label_col = "total_amount"

    # Load the data, dropping missing and out-of-range prices.
    ml_df = spark.table("local.gold.ml_trip_features_sample").filter(
        F.col(label_col).isNotNull()
        & (F.col(label_col) > 1)
        & (F.col(label_col) < 200)
    )

    print(f"Dataset ML charg√©: {ml_df.count()} lignes")

    # Train/test split
    train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)

    numeric_cols = ["trip_distance", "real_distance_miles", "temp",
                    "passenger_count", "recent_trips_in_zone"]
    categorical_cols = ["pickup_borough", "dropoff_borough",
                        "hour_of_day", "pickup_day_of_week"]
    flag_cols = ["is_weekend", "is_airport_trip", "is_manhattan_trip"]

    imputed_cols = [f"{col}_imputed" for col in numeric_cols]
    index_cols = [f"{col}_index" for col in categorical_cols]
    vec_cols = [f"{col}_vec" for col in categorical_cols]

    # 1. Median imputation of missing numeric values
    imputer = Imputer(
        inputCols=numeric_cols,
        outputCols=imputed_cols,
        strategy="median",
    )

    # 2. Index categorical columns; categories unseen at fit time are kept
    #    in an extra bucket instead of failing at transform time.
    indexers = [
        StringIndexer(inputCol=col, outputCol=indexed, handleInvalid="keep")
        for col, indexed in zip(categorical_cols, index_cols)
    ]

    # 3. One-hot encoding
    encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

    # 4. Feature assembly
    feature_cols = imputed_cols + vec_cols + flag_cols
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    # 5. Standardisation
    scaler = StandardScaler(
        inputCol="features",
        outputCol="scaled_features",
        withStd=True,
        withMean=True,
    )

    # 6. Random-forest model
    rf = RandomForestRegressor(
        featuresCol="scaled_features",
        labelCol=label_col,
        numTrees=100,
        maxDepth=10,
        seed=42,
    )

    pipeline = Pipeline(
        stages=[imputer, *indexers, encoder, assembler, scaler, rf]
    )

    # Training
    print("‚è≥ Entra√Ænement du mod√®le Random Forest...")
    model = pipeline.fit(train_df)

    # Predictions on the held-out split
    predictions = model.transform(test_df)

    # Evaluation: same label/prediction columns, one evaluator per metric.
    rmse, mae, r2 = (
        RegressionEvaluator(
            labelCol=label_col,
            predictionCol="prediction",
            metricName=metric,
        ).evaluate(predictions)
        for metric in ("rmse", "mae", "r2")
    )

    print(f"‚úÖ Mod√®le entra√Æn√©!")
    print(f"üìä Performance du mod√®le:")
    print(f"   RMSE: ${rmse:.2f}")
    print(f"   MAE: ${mae:.2f}")
    print(f"   R¬≤: {r2:.3f}")

    # Feature importance. The importance vector has one entry per slot of
    # the assembled vector (each one-hot column expands to several slots),
    # so it cannot be paired with the shorter `feature_cols` list. Slot
    # names are taken from the metadata of the "features" column; scaling
    # is element-wise, so slot order is the same in "scaled_features".
    rf_model = model.stages[-1]
    importances = rf_model.featureImportances.toArray()
    feature_importances = pd.DataFrame({
        'feature': _assembled_feature_names(
            predictions.schema, "features", len(importances)
        ),
        'importance': importances,
    }).sort_values('importance', ascending=False)

    print("\nüîù Top 10 des features les plus importantes:")
    print(feature_importances.head(10))

    # Persist the fitted pipeline
    model_path = "/home/iceberg/models/price_prediction_rf"
    model.write().overwrite().save(model_path)
    print(f"üíæ Mod√®le sauvegard√©: {model_path}")

    # Store the test-set predictions for later analysis
    predictions.select("total_amount", "prediction", "trip_distance") \
               .writeTo("local.gold.ml_price_predictions") \
               .createOrReplace()

    return model, predictions, feature_importances

## Problème 2: Prédiction de la Durée

In [2]:
def train_duration_prediction_model():
    """Train a random-forest regressor that predicts ``trip_duration_minutes``.

    Reads ``local.gold.ml_trip_features_sample`` through the session-level
    ``spark`` object (expected to exist in the notebook/global scope), keeps
    rows whose duration is strictly between 1 and 120 minutes, fits the
    pipeline on 80% of them and prints the RMSE measured on the other 20%.

    The preprocessing mirrors the price-prediction pipeline of this file
    (same feature columns, same stages); only the label differs.
    NOTE(review): the feature columns are assumed to be the right inputs for
    duration as well -- confirm against the feature table.

    Returns:
        The fitted pipeline model.
    """
    # The file never imports the functions module under the alias ``F``.
    from pyspark.sql import functions as F

    label_col = "trip_duration_minutes"

    # Load the data, dropping missing and out-of-range durations.
    ml_df = spark.table("local.gold.ml_trip_features_sample").filter(
        F.col(label_col).isNotNull()
        & (F.col(label_col) > 1)
        & (F.col(label_col) < 120)
    )

    train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)

    numeric_cols = ["trip_distance", "real_distance_miles", "temp",
                    "passenger_count", "recent_trips_in_zone"]
    categorical_cols = ["pickup_borough", "dropoff_borough",
                        "hour_of_day", "pickup_day_of_week"]
    flag_cols = ["is_weekend", "is_airport_trip", "is_manhattan_trip"]

    imputed_cols = [f"{col}_imputed" for col in numeric_cols]
    index_cols = [f"{col}_index" for col in categorical_cols]
    vec_cols = [f"{col}_vec" for col in categorical_cols]

    # Median imputation of missing numeric values
    imputer = Imputer(
        inputCols=numeric_cols,
        outputCols=imputed_cols,
        strategy="median",
    )

    # Index then one-hot encode the categorical columns; categories unseen
    # at fit time are kept in an extra bucket.
    indexers = [
        StringIndexer(inputCol=col, outputCol=indexed, handleInvalid="keep")
        for col, indexed in zip(categorical_cols, index_cols)
    ]
    encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

    assembler = VectorAssembler(
        inputCols=imputed_cols + vec_cols + flag_cols,
        outputCol="features",
    )
    scaler = StandardScaler(
        inputCol="features",
        outputCol="scaled_features",
        withStd=True,
        withMean=True,
    )

    rf = RandomForestRegressor(
        featuresCol="scaled_features",
        labelCol=label_col,
        numTrees=100,
        maxDepth=10,
        seed=42,
    )

    pipeline = Pipeline(
        stages=[imputer, *indexers, encoder, assembler, scaler, rf]
    )
    # Previously this name was returned without ever being assigned.
    duration_model = pipeline.fit(train_df)

    # Score the held-out split so the train/test split is actually used.
    rmse = RegressionEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName="rmse",
    ).evaluate(duration_model.transform(test_df))

    print("‚úÖ Mod√®le de pr√©diction de dur√©e entra√Æn√©!")
    print(f"   RMSE: {rmse:.2f} min")

    return duration_model

## Problème 3: Classification des Courses Premium

In [4]:
def train_premium_classification():
    """Train a binary classifier separating premium from standard trips.

    Reads ``local.gold.ml_trip_features_sample`` through the session-level
    ``spark`` object (expected to exist in the notebook/global scope). A trip
    is labelled premium (``is_premium`` = 1) when its ``total_amount`` is
    above the approximate 75th percentile of that column (relative error
    0.01). A random-forest classifier is fitted on 80% of the rows and its
    area under the ROC curve is printed for the remaining 20%.

    The preprocessing mirrors the price-prediction pipeline of this file;
    ``total_amount`` itself is not used as a feature.
    NOTE(review): the feature columns are assumed to be appropriate for this
    task as well -- confirm against the feature table.

    Returns:
        The fitted pipeline model.
    """
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    # The file never imports the functions module under the alias ``F``.
    from pyspark.sql import functions as F

    label_col = "is_premium"

    ml_df = spark.table("local.gold.ml_trip_features_sample")

    # Definition: premium when above the 75th percentile
    percentile_75 = ml_df.approxQuantile("total_amount", [0.75], 0.01)[0]

    # Rows with a null total_amount do not satisfy the comparison and
    # therefore fall into otherwise(): they are labelled 0.
    ml_df = ml_df.withColumn(
        label_col,
        F.when(F.col("total_amount") > percentile_75, 1).otherwise(0)
    )

    train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)

    numeric_cols = ["trip_distance", "real_distance_miles", "temp",
                    "passenger_count", "recent_trips_in_zone"]
    categorical_cols = ["pickup_borough", "dropoff_borough",
                        "hour_of_day", "pickup_day_of_week"]
    flag_cols = ["is_weekend", "is_airport_trip", "is_manhattan_trip"]

    imputed_cols = [f"{col}_imputed" for col in numeric_cols]
    index_cols = [f"{col}_index" for col in categorical_cols]
    vec_cols = [f"{col}_vec" for col in categorical_cols]

    # Median imputation of missing numeric values
    imputer = Imputer(
        inputCols=numeric_cols,
        outputCols=imputed_cols,
        strategy="median",
    )

    # Index then one-hot encode the categorical columns; categories unseen
    # at fit time are kept in an extra bucket.
    indexers = [
        StringIndexer(inputCol=col, outputCol=indexed, handleInvalid="keep")
        for col, indexed in zip(categorical_cols, index_cols)
    ]
    encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

    assembler = VectorAssembler(
        inputCols=imputed_cols + vec_cols + flag_cols,
        outputCol="features",
    )
    scaler = StandardScaler(
        inputCol="features",
        outputCol="scaled_features",
        withStd=True,
        withMean=True,
    )

    classifier = RandomForestClassifier(
        featuresCol="scaled_features",
        labelCol=label_col,
        numTrees=100,
        maxDepth=10,
        seed=42,
    )

    pipeline = Pipeline(
        stages=[imputer, *indexers, encoder, assembler, scaler, classifier]
    )
    # Previously this name was returned without ever being assigned.
    classification_model = pipeline.fit(train_df)

    # Area under the ROC curve on the held-out split.
    auc = BinaryClassificationEvaluator(
        labelCol=label_col,
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC",
    ).evaluate(classification_model.transform(test_df))

    print(f"‚úÖ Mod√®le de classification premium entra√Æn√©!")
    print(f"   Seuil premium: ${percentile_75:.2f}")
    print(f"   AUC: {auc:.3f}")

    return classification_model