In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

In [2]:
# Spark initialization: obtain a SparkSession through the builder's getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [5]:
# If they do not exist yet, run this first: creates the iceberg.nyc_taxi
# database (no-op when it is already there, thanks to IF NOT EXISTS)
spark.sql("CREATE DATABASE IF NOT EXISTS iceberg.nyc_taxi")

DataFrame[]

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

# Spark initialization (repeated so this cell can run on its own);
# getOrCreate() hands back a session through the same builder call as above
spark = SparkSession.builder.getOrCreate()

def create_silver_tables():
    """Create the cleaned and enriched silver-layer tables.

    Runs one ``CREATE TABLE IF NOT EXISTS`` statement per silver table
    (trips, zones, weather) through the module-level ``spark`` session,
    in that order.
    """

    # Silver trips table (cleaned) - WITHOUT an identity column
    trips_ddl = """
    CREATE TABLE IF NOT EXISTS iceberg.nyc_taxi.silver_trips (
        vendor_id INT,
        pickup_datetime TIMESTAMP,
        dropoff_datetime TIMESTAMP,
        passenger_count INT,
        trip_distance DOUBLE,
        rate_code_id INT,
        store_and_fwd_flag STRING,
        pickup_location_id INT,
        dropoff_location_id INT,
        payment_type INT,
        fare_amount DOUBLE,
        extra DOUBLE,
        mta_tax DOUBLE,
        tip_amount DOUBLE,
        tolls_amount DOUBLE,
        improvement_surcharge DOUBLE,
        total_amount DOUBLE,
        congestion_surcharge DOUBLE,
        airport_fee DOUBLE,
        
        -- Colonnes calcul√©es
        trip_duration_seconds DOUBLE,
        trip_speed_mph DOUBLE,
        is_valid_trip BOOLEAN,
        is_long_trip BOOLEAN,
        is_airport_trip BOOLEAN,
        pickup_hour INT,
        pickup_day_of_week INT,
        pickup_month INT,
        pickup_date DATE,
        
        -- M√©tadonn√©es
        bronze_source STRING,
        processed_at TIMESTAMP,
        data_quality_score DOUBLE
    )
    USING iceberg
    PARTITIONED BY (pickup_date)
    TBLPROPERTIES (
        'format-version'='2',
        'write.parquet.compression-codec'='zstd'
    )
    """

    # Enriched zones table
    zones_ddl = """
    CREATE TABLE IF NOT EXISTS iceberg.nyc_taxi.silver_zones (
        location_id INT,
        borough STRING,
        zone STRING,
        service_zone STRING,
        zone_type STRING,
        is_airport BOOLEAN,
        is_manhattan BOOLEAN,
        borough_code INT
    )
    USING iceberg
    TBLPROPERTIES ('format-version'='2')
    """

    # Silver weather table
    weather_ddl = """
    CREATE TABLE IF NOT EXISTS iceberg.nyc_taxi.silver_weather (
        observation_date DATE,
        temperature_avg DOUBLE,
        temperature_min DOUBLE,
        temperature_max DOUBLE,
        precipitation_mm DOUBLE,
        snowfall_mm DOUBLE,
        snow_depth_mm DOUBLE,
        wind_speed_mps DOUBLE,
        weather_conditions STRING,
        is_rainy_day BOOLEAN,
        is_snowy_day BOOLEAN,
        is_windy_day BOOLEAN,
        season STRING,
        processed_at TIMESTAMP
    )
    USING iceberg
    PARTITIONED BY (observation_date)
    TBLPROPERTIES ('format-version'='2')
    """

    # Execute the statements one after the other, trips first
    for statement in (trips_ddl, zones_ddl, weather_ddl):
        spark.sql(statement)

def _build_sample_bronze_trips():
    """Build a one-row, bronze-shaped DataFrame used when the bronze table cannot be read."""
    from datetime import datetime
    from pyspark.sql.types import (
        StructType, StructField, IntegerType, DoubleType, StringType, TimestampType
    )

    schema = StructType([
        StructField("VendorID", IntegerType(), True),
        StructField("tpep_pickup_datetime", TimestampType(), True),
        StructField("tpep_dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", DoubleType(), True),
        StructField("trip_distance", DoubleType(), True),
        StructField("RatecodeID", DoubleType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("PULocationID", IntegerType(), True),
        StructField("DOLocationID", IntegerType(), True),
        StructField("payment_type", IntegerType(), True),
        StructField("fare_amount", DoubleType(), True),
        StructField("extra", DoubleType(), True),
        StructField("mta_tax", DoubleType(), True),
        StructField("tip_amount", DoubleType(), True),
        StructField("tolls_amount", DoubleType(), True),
        StructField("improvement_surcharge", DoubleType(), True),
        StructField("total_amount", DoubleType(), True),
        StructField("congestion_surcharge", DoubleType(), True),
        StructField("airport_fee", DoubleType(), True),
        StructField("ingestion_date", TimestampType(), True),
        StructField("source_file", StringType(), True),
        StructField("batch_id", StringType(), True)
    ])

    # Single example trip
    sample_data = [
        (1, datetime(2024, 1, 1, 8, 30), datetime(2024, 1, 1, 8, 45), 1.0, 3.5, 1.0, "N",
         161, 237, 1, 15.5, 0.5, 0.5, 3.1, 0.0, 0.3, 19.9, 2.5, 0.0,
         datetime.now(), "yellow_2024_01.parquet", "batch_1")
    ]

    return spark.createDataFrame(sample_data, schema)


def _transform_bronze_trips(bronze_df):
    """Rename, cast, enrich and filter bronze trips into the silver column layout."""
    # Rename the source columns to the silver snake_case names
    renamed = (
        bronze_df
        .withColumnRenamed("VendorID", "vendor_id")
        .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
        .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
        .withColumnRenamed("PULocationID", "pickup_location_id")
        .withColumnRenamed("DOLocationID", "dropoff_location_id")
        .withColumnRenamed("RatecodeID", "rate_code_id")
    )

    # Cast to the integer types declared for the silver table; missing
    # passenger counts and rate codes default to 1
    typed = (
        renamed
        .withColumn("vendor_id", col("vendor_id").cast(IntegerType()))
        .withColumn("passenger_count",
                    when(col("passenger_count").isNull(), 1)
                    .otherwise(col("passenger_count").cast(IntegerType())))
        .withColumn("rate_code_id",
                    when(col("rate_code_id").isNull(), 1)
                    .otherwise(col("rate_code_id").cast(IntegerType())))
        .withColumn("payment_type", col("payment_type").cast(IntegerType()))
        .withColumn("pickup_location_id", col("pickup_location_id").cast(IntegerType()))
        .withColumn("dropoff_location_id", col("dropoff_location_id").cast(IntegerType()))
    )

    # Trip duration in seconds, then speed in miles per hour (0.0 when undefined)
    measured = (
        typed
        .withColumn("trip_duration_seconds",
                    (unix_timestamp(col("dropoff_datetime")) -
                     unix_timestamp(col("pickup_datetime"))).cast(DoubleType()))
        .withColumn("trip_speed_mph",
                    when((col("trip_duration_seconds") > 0) & (col("trip_distance") > 0),
                         col("trip_distance") / (col("trip_duration_seconds") / 3600))
                    .otherwise(0.0))
    )

    # Quality flags
    flagged = (
        measured
        .withColumn("is_valid_trip",
                    (col("trip_distance") > 0) &
                    (col("fare_amount") > 0) &
                    (col("trip_duration_seconds") > 0) &
                    (col("trip_duration_seconds") < 3600 * 24) &  # < 24 hours
                    (col("passenger_count") > 0) &
                    (col("passenger_count") <= 6) &
                    (col("trip_speed_mph") < 100))
        .withColumn("is_long_trip", col("trip_distance") > 20)
        # A NULL airport_fee is treated as 0 so the flag is always TRUE/FALSE,
        # never NULL
        .withColumn("is_airport_trip",
                    (col("rate_code_id").isin(2, 3)) |  # JFK/Newark
                    (coalesce(col("airport_fee"), lit(0.0)) > 0))
    )

    # Temporal components, quality score and lineage metadata
    enriched = (
        flagged
        .withColumn("pickup_hour", hour(col("pickup_datetime")))
        .withColumn("pickup_day_of_week", dayofweek(col("pickup_datetime")))
        .withColumn("pickup_month", month(col("pickup_datetime")))
        .withColumn("pickup_date", to_date(col("pickup_datetime")))
        .withColumn("data_quality_score",
                    when(col("is_valid_trip"), 1.0)
                    .otherwise(0.5))
        .withColumn("processed_at", current_timestamp())
        .withColumn("bronze_source", col("source_file"))
    )

    # Columns in the order of the silver table definition
    silver_columns = [
        "vendor_id", "pickup_datetime", "dropoff_datetime",
        "passenger_count", "trip_distance", "rate_code_id",
        "store_and_fwd_flag", "pickup_location_id", "dropoff_location_id",
        "payment_type", "fare_amount", "extra", "mta_tax",
        "tip_amount", "tolls_amount", "improvement_surcharge",
        "total_amount", "congestion_surcharge", "airport_fee",
        "trip_duration_seconds", "trip_speed_mph",
        "is_valid_trip", "is_long_trip", "is_airport_trip",
        "pickup_hour", "pickup_day_of_week", "pickup_month", "pickup_date",
        "bronze_source", "processed_at", "data_quality_score"
    ]

    # Only valid trips are kept in silver
    return enriched.filter(col("is_valid_trip")).select(*silver_columns)


def clean_and_transform_trips():
    """Clean the bronze yellow-taxi trips and write them to the silver table.

    Reads ``iceberg.nyc_taxi.bronze_yellow_trips``; if that fails, a one-row
    example DataFrame is used instead. The transformed rows replace
    ``iceberg.nyc_taxi.silver_trips``; if the replace fails, an append is
    attempted. Write errors are printed, not raised.

    Returns:
        The transformed silver DataFrame (returned even when the write failed).
    """

    print("üìä Lecture des donn√©es bronze...")

    # Read bronze, counting once and reusing the number for both log lines
    try:
        bronze_df = spark.table("iceberg.nyc_taxi.bronze_yellow_trips")
        bronze_count = bronze_df.count()
        print(f"‚úÖ Lignes dans bronze: {bronze_count:,}")
    except Exception as e:
        print(f"‚ùå Erreur lecture table bronze: {e}")
        print("Cr√©ation d'un DataFrame d'exemple pour les tests...")

        # NOTE(review): any read failure ends up replacing silver_trips with
        # this single example row - confirm this is wanted outside of tests.
        bronze_df = _build_sample_bronze_trips()
        bronze_count = bronze_df.count()
        print("‚úÖ DataFrame d'exemple cr√©√©")

    print(f"üìä Lignes avant nettoyage: {bronze_count:,}")

    silver_df = _transform_bronze_trips(bronze_df)

    print(f"üìä Lignes apr√®s nettoyage: {silver_df.count():,}")

    # Write to the silver table
    try:
        # createOrReplace() rebuilds the table from the DataFrame, so the
        # partitioning and table properties of the DDL are restated here;
        # otherwise the replaced table would lose them.
        silver_df.writeTo("iceberg.nyc_taxi.silver_trips") \
            .partitionedBy(col("pickup_date")) \
            .tableProperty("format-version", "2") \
            .tableProperty("write.parquet.compression-codec", "zstd") \
            .createOrReplace()

        print("‚úÖ Donn√©es silver √©crites avec succ√®s")
    except Exception as e:
        print(f"‚ùå Erreur √©criture dans silver: {e}")
        # Best-effort fallback: append to the existing table (its own
        # properties decide the compression codec)
        try:
            silver_df.writeTo("iceberg.nyc_taxi.silver_trips").append()
            print("‚úÖ Donn√©es silver √©crites en mode append")
        except Exception as e2:
            print(f"‚ùå Erreur append aussi: {e2}")

    return silver_df

def process_zones():
    """Build the silver zones table from the bronze taxi-zone lookup.

    Reads ``iceberg.nyc_taxi.bronze_taxi_zones`` and derives ``zone_type``,
    ``is_airport``, ``is_manhattan`` and ``borough_code``. If the bronze table
    cannot be read or transformed, a small hard-coded set of default zones is
    used instead. The result replaces ``iceberg.nyc_taxi.silver_zones``.

    The write happens outside the fallback ``try`` so that a write failure is
    raised as such instead of being reported as "zones unavailable".
    """

    print("üó∫Ô∏è Traitement des donn√©es des zones...")

    used_defaults = False

    # Try to read and enrich the bronze zones
    try:
        zones_df = spark.table("iceberg.nyc_taxi.bronze_taxi_zones")
        print(f"‚úÖ Zones bronze trouv√©es: {zones_df.count()} lignes")

        zones_silver = zones_df \
            .withColumnRenamed("LocationID", "location_id") \
            .withColumnRenamed("Borough", "borough") \
            .withColumnRenamed("Zone", "zone") \
            .withColumn("location_id", col("location_id").cast(IntegerType())) \
            .withColumn("zone_type",
                       when(col("service_zone").contains("Airports"), "Airport")
                       .when(col("zone").contains("NV"), "Dispatch Zone")
                       .otherwise("Regular Zone")) \
            .withColumn("is_airport",
                       col("service_zone").contains("Airports")) \
            .withColumn("is_manhattan",
                       col("borough") == "Manhattan") \
            .withColumn("borough_code",
                       when(col("borough") == "Manhattan", 1)
                       .when(col("borough") == "Queens", 2)
                       .when(col("borough") == "Brooklyn", 3)
                       .when(col("borough") == "Bronx", 4)
                       .when(col("borough") == "Staten Island", 5)
                       .otherwise(0)) \
            .select("location_id", "borough", "zone", "service_zone",
                   "zone_type", "is_airport", "is_manhattan", "borough_code")

    except Exception as e:
        print(f"‚ö†Ô∏è Zones non disponibles: {e}")
        print("Cr√©ation de zones par d√©faut...")
        used_defaults = True

        # Default zones
        # NOTE(review): these location IDs may not match the official TLC zone
        # lookup - confirm before joining trips against this fallback data.
        default_zones = [
            (1, "Manhattan", "Alphabet City", "Yellow Zone", "Regular Zone", False, True, 1),
            (132, "Queens", "JFK Airport", "Airports", "Airport", True, False, 2),
            (138, "Queens", "LaGuardia Airport", "Airports", "Airport", True, False, 2),
            (79, "Manhattan", "Times Square", "Yellow Zone", "Regular Zone", False, True, 1),
            (4, "Manhattan", "Financial District", "Yellow Zone", "Regular Zone", False, True, 1),
        ]

        schema = StructType([
            StructField("location_id", IntegerType(), True),
            StructField("borough", StringType(), True),
            StructField("zone", StringType(), True),
            StructField("service_zone", StringType(), True),
            StructField("zone_type", StringType(), True),
            StructField("is_airport", BooleanType(), True),
            StructField("is_manhattan", BooleanType(), True),
            StructField("borough_code", IntegerType(), True)
        ])

        zones_silver = spark.createDataFrame(default_zones, schema)

    # Single write path; the format version from the DDL is restated because
    # createOrReplace() rebuilds the table definition from the DataFrame
    zones_silver.writeTo("iceberg.nyc_taxi.silver_zones") \
        .tableProperty("format-version", "2") \
        .createOrReplace()

    if used_defaults:
        print("‚úÖ Zones par d√©faut cr√©√©es")
    else:
        print("‚úÖ Zones silver √©crites avec succ√®s")

def _first_available_double(df, candidates):
    """Return the first non-null value, as DOUBLE, among the candidate columns present in ``df``.

    Candidates missing from ``df`` are skipped (names compared
    case-insensitively); if none exists the result is a typed NULL.
    """
    by_lower = {name.lower(): name for name in df.columns}
    present = [
        col(by_lower[candidate.lower()]).cast(DoubleType())
        for candidate in candidates
        if candidate.lower() in by_lower
    ]
    if not present:
        return lit(None).cast(DoubleType())
    return coalesce(*present)


def process_weather():
    """Build the silver weather table from the bronze weather observations.

    Reads ``iceberg.nyc_taxi.bronze_weather``, normalises the column names,
    derives the weather condition label, the rainy/snowy/windy flags and the
    season, then replaces ``iceberg.nyc_taxi.silver_weather``. If the bronze
    table cannot be read or transformed, ``create_synthetic_weather()`` is
    called instead.

    The average temperature comes from ``temp`` or ``tavg`` and the wind speed
    from ``wspd`` or ``awnd``; only the columns that actually exist are
    referenced, so a source providing just one of each pair is still processed.
    """

    print("üå§Ô∏è Traitement des donn√©es m√©t√©o...")

    try:
        # Try to read the bronze weather data
        weather_df = spark.table("iceberg.nyc_taxi.bronze_weather")
        print(f"‚úÖ Donn√©es m√©t√©o bronze trouv√©es: {weather_df.count()} lignes")

        # Normalise the source columns
        weather_silver = weather_df \
            .withColumnRenamed("date", "observation_date") \
            .withColumn("observation_date", to_date(col("observation_date"))) \
            .withColumn("temperature_avg",
                       _first_available_double(weather_df, ["temp", "tavg"])) \
            .withColumn("temperature_min", col("temp_min")) \
            .withColumn("temperature_max", col("temp_max")) \
            .withColumn("precipitation_mm", col("prcp")) \
            .withColumn("snowfall_mm", col("snow")) \
            .withColumn("snow_depth_mm", col("snwd")) \
            .withColumn("wind_speed_mps",
                       _first_available_double(weather_df, ["wspd", "awnd"]))

        # Weather condition label (first matching rule wins) and daily flags
        weather_silver = weather_silver \
            .withColumn("weather_conditions",
                       when(col("precipitation_mm") > 10, "Heavy Rain")
                       .when(col("precipitation_mm") > 0.1, "Rain")
                       .when(col("snowfall_mm") > 10, "Heavy Snow")
                       .when(col("snowfall_mm") > 0.1, "Snow")
                       .when(col("temperature_avg") > 30, "Hot")
                       .when(col("temperature_avg") < 0, "Cold")
                       .when(col("wind_speed_mps") > 10, "Windy")
                       .otherwise("Clear")) \
            .withColumn("is_rainy_day", col("precipitation_mm") > 0.1) \
            .withColumn("is_snowy_day", col("snowfall_mm") > 0.1) \
            .withColumn("is_windy_day", col("wind_speed_mps") > 8)

        # Season from the observation month
        weather_silver = weather_silver \
            .withColumn("season",
                       when((month(col("observation_date")) >= 3) &
                            (month(col("observation_date")) <= 5), "Spring")
                       .when((month(col("observation_date")) >= 6) &
                            (month(col("observation_date")) <= 8), "Summer")
                       .when((month(col("observation_date")) >= 9) &
                            (month(col("observation_date")) <= 11), "Fall")
                       .otherwise("Winter")) \
            .withColumn("processed_at", current_timestamp())

        # Final columns, in the order of the silver table definition
        weather_silver = weather_silver.select(
            "observation_date",
            "temperature_avg",
            "temperature_min",
            "temperature_max",
            "precipitation_mm",
            "snowfall_mm",
            "snow_depth_mm",
            "wind_speed_mps",
            "weather_conditions",
            "is_rainy_day",
            "is_snowy_day",
            "is_windy_day",
            "season",
            "processed_at"
        )

    except Exception as e:
        print(f"‚ö†Ô∏è Donn√©es m√©t√©o non disponibles: {e}")
        # Fall back to synthetic weather data
        create_synthetic_weather()
        return

    # Write outside the fallback so a write failure on real data is raised
    # instead of being replaced by synthetic data. createOrReplace() rebuilds
    # the table definition, hence the restated partitioning and format version.
    weather_silver.writeTo("iceberg.nyc_taxi.silver_weather") \
        .partitionedBy(col("observation_date")) \
        .tableProperty("format-version", "2") \
        .createOrReplace()

    print("‚úÖ Donn√©es m√©t√©o silver √©crites avec succ√®s")

def create_synthetic_weather():
    """Generate synthetic daily weather for 2024 and write it to the silver weather table.

    One row per day from 2024-01-01 to 2024-12-31 is produced from a random
    generator seeded with 42, then the rows replace
    ``iceberg.nyc_taxi.silver_weather``.

    Fixes relative to the naive implementation:
    - ``round`` is taken from ``builtins`` because the notebook's
      ``from pyspark.sql.functions import *`` rebinds ``round`` to the Spark
      column function, which rejects plain numbers.
    - every numeric value is converted to a Python ``float`` (and every flag
      to ``bool``): the explicit schema uses DoubleType/BooleanType and an
      integer ``0`` or a NumPy scalar may be rejected by schema verification.
    - a local ``RandomState`` replaces ``np.random.seed`` so the global NumPy
      generator is not reseeded as a side effect; the draw order is unchanged.
    """
    import builtins
    from datetime import datetime, timedelta

    import numpy as np
    from pyspark.sql.functions import col
    from pyspark.sql.types import (
        StructType, StructField, DateType, DoubleType, StringType, BooleanType, TimestampType
    )

    print("üå§Ô∏è Cr√©ation de donn√©es m√©t√©o synth√©tiques...")

    def _rounded(value):
        """Round to one decimal with Python's round and return a plain float."""
        return float(builtins.round(value, 1))

    # Dates covering all of 2024
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    date_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

    rng = np.random.RandomState(42)
    # Single processing timestamp shared by every generated row
    processed_at = datetime.now()

    weather_data = []
    for day in date_range:
        # Temperature with a yearly sinusoidal cycle plus noise
        day_of_year = day.timetuple().tm_yday
        base_temp = 10      # baseline temperature
        seasonal_amp = 15   # seasonal amplitude

        temp_avg = base_temp + seasonal_amp * np.sin(2 * np.pi * (day_of_year - 105) / 365)
        temp_avg += rng.normal(0, 3)  # noise

        temp_min = temp_avg - rng.uniform(3, 8)
        temp_max = temp_avg + rng.uniform(3, 8)

        # Precipitation
        precip_prob = 0.3 + 0.2 * np.sin(2 * np.pi * (day_of_year - 105) / 365)
        precip = 0.0
        if rng.random_sample() < precip_prob:
            precip = rng.exponential(5)

        # Snow (winter days only, when cold enough)
        snow = 0.0
        if (day_of_year <= 60 or day_of_year >= 335) and temp_avg < 2:
            if rng.random_sample() < 0.1:
                snow = rng.exponential(2)

        # Wind
        wind_speed = rng.weibull(2) * 5

        # Condition label: first matching rule wins
        if precip > 10:
            conditions = "Heavy Rain"
        elif precip > 0.1:
            conditions = "Rain"
        elif snow > 10:
            conditions = "Heavy Snow"
        elif snow > 0.1:
            conditions = "Snow"
        elif temp_avg > 30:
            conditions = "Hot"
        elif temp_avg < 0:
            conditions = "Cold"
        elif wind_speed > 10:
            conditions = "Windy"
        else:
            conditions = "Clear"

        # Season (local name chosen so it does not shadow Spark's month())
        month_num = day.month
        if 3 <= month_num <= 5:
            season = "Spring"
        elif 6 <= month_num <= 8:
            season = "Summer"
        elif 9 <= month_num <= 11:
            season = "Fall"
        else:
            season = "Winter"

        weather_data.append((
            day.date(),
            _rounded(temp_avg),
            _rounded(temp_min),
            _rounded(temp_max),
            _rounded(precip),
            _rounded(snow),
            0.0,  # snow depth
            _rounded(wind_speed),
            conditions,
            bool(precip > 0.1),
            bool(snow > 0.1),
            bool(wind_speed > 8),
            season,
            processed_at
        ))

    schema = StructType([
        StructField("observation_date", DateType(), True),
        StructField("temperature_avg", DoubleType(), True),
        StructField("temperature_min", DoubleType(), True),
        StructField("temperature_max", DoubleType(), True),
        StructField("precipitation_mm", DoubleType(), True),
        StructField("snowfall_mm", DoubleType(), True),
        StructField("snow_depth_mm", DoubleType(), True),
        StructField("wind_speed_mps", DoubleType(), True),
        StructField("weather_conditions", StringType(), True),
        StructField("is_rainy_day", BooleanType(), True),
        StructField("is_snowy_day", BooleanType(), True),
        StructField("is_windy_day", BooleanType(), True),
        StructField("season", StringType(), True),
        StructField("processed_at", TimestampType(), True)
    ])

    weather_df = spark.createDataFrame(weather_data, schema)

    # createOrReplace() rebuilds the table definition from the DataFrame, so
    # the partitioning and format version of the DDL are restated here
    weather_df.writeTo("iceberg.nyc_taxi.silver_weather") \
        .partitionedBy(col("observation_date")) \
        .tableProperty("format-version", "2") \
        .createOrReplace()

    print(f"‚úÖ Donn√©es m√©t√©o synth√©tiques cr√©√©es: {len(weather_data)} jours")

# Execution: run the whole silver pipeline, then print a few sanity checks
if __name__ == "__main__":
    print("üöï D√©marrage de la transformation Silver...")

    # 1. Create the tables
    print("üìã Cr√©ation des tables silver...")
    create_silver_tables()

    # 2. Transform the trips
    print("üöñ Transformation des trajets taxi...")
    silver_trips = clean_and_transform_trips()

    # 3. Process the zones
    print("üó∫Ô∏è Traitement des zones de taxi...")
    process_zones()

    # 4. Process the weather
    print("üå§Ô∏è Traitement des donn√©es m√©t√©o...")
    process_weather()

    # Verification
    print("\n" + "="*50)
    print("‚úÖ TRANSFORMATION SILVER TERMIN√âE")
    print("="*50)

    # Show the tables that now exist
    print("\nüìã Tables disponibles dans nyc_taxi:")
    spark.sql("SHOW TABLES IN iceberg.nyc_taxi").show(truncate=False)

    # Trip statistics. Explicit None check: a DataFrame has no meaningful
    # truth value, so a bare `if silver_trips:` would not guard anything.
    if silver_trips is not None:
        print("\nüìä Statistiques des trajets silver:")
        spark.sql("""
        SELECT 
            COUNT(*) as total_trips,
            AVG(trip_distance) as avg_distance,
            AVG(total_amount) as avg_fare,
            AVG(trip_duration_seconds)/60 as avg_duration_min,
            SUM(CASE WHEN is_airport_trip THEN 1 ELSE 0 END) as airport_trips,
            AVG(data_quality_score) as avg_quality_score
        FROM iceberg.nyc_taxi.silver_trips
        """).show()

    print("\nüîç Aper√ßu des 5 premiers trajets:")
    spark.sql("SELECT * FROM iceberg.nyc_taxi.silver_trips LIMIT 5").show(truncate=False)

In [6]:
# Create the silver tables on their own.
# NOTE: the ParseException recorded below quotes a DDL that still declared
# `trip_id BIGINT GENERATED ALWAYS AS IDENTITY`; the create_silver_tables()
# shown above has no such column, so that output predates it.
create_silver_tables()

ParseException: 
[PARSE_SYNTAX_ERROR] Syntax error at or near 'IDENTITY': missing '('.(line 3, pos 43)

== SQL ==

    CREATE TABLE IF NOT EXISTS iceberg.nyc_taxi.silver_trips (
        trip_id BIGINT GENERATED ALWAYS AS IDENTITY,
-------------------------------------------^^^
        vendor_id INT,
        pickup_datetime TIMESTAMP,
        dropoff_datetime TIMESTAMP,
        passenger_count INT,
        trip_distance DOUBLE,
        rate_code_id INT,
        store_and_fwd_flag STRING,
        pickup_location_id INT,
        dropoff_location_id INT,
        payment_type INT,
        fare_amount DOUBLE,
        extra DOUBLE,
        mta_tax DOUBLE,
        tip_amount DOUBLE,
        tolls_amount DOUBLE,
        improvement_surcharge DOUBLE,
        total_amount DOUBLE,
        congestion_surcharge DOUBLE,
        airport_fee DOUBLE,
        
        -- Colonnes calcul√©es
        trip_duration_seconds DOUBLE,
        trip_speed_mph DOUBLE,
        is_valid_trip BOOLEAN,
        is_long_trip BOOLEAN,
        is_airport_trip BOOLEAN,
        pickup_hour INT,
        pickup_day_of_week INT,
        pickup_month INT,
        pickup_date DATE,
        
        -- M√©tadonn√©es
        bronze_source STRING,
        processed_at TIMESTAMP,
        data_quality_score DOUBLE
    )
    USING iceberg
    PARTITIONED BY (pickup_date)
    TBLPROPERTIES (
        'format-version'='2',
        'write.parquet.compression-codec'='zstd'
    )
    
