In [18]:
from pyspark.sql import functions as F

# ----------------------------------------------------------------------
# Silver-layer cleaning rules.
# Every (low, high) pair is passed to Column.between, so both bounds are
# inclusive. Units are whatever the bronze columns use.
# ----------------------------------------------------------------------
_TRIP_DISTANCE_RANGE = (0.1, 100)    # trip_distance; presumably miles, given the "speed_mph" column - TODO confirm
_TOTAL_AMOUNT_RANGE = (1, 500)       # total_amount
_PASSENGER_COUNT_RANGE = (1, 6)      # passenger_count
_TRIP_DURATION_RANGE = (1, 180)      # trip_duration_minutes
_SPEED_MPH_RANGE = (0, 80)           # speed_mph
_WEEKEND_DAYS = [1, 7]               # F.dayofweek numbering: 1 = Sunday, 7 = Saturday
# 132 and 138 are labelled "JFK Airport" and "LaGuardia Airport" in the zone lookup;
# 1 is presumably Newark Airport - TODO confirm against bronze.taxi_zones.
_AIRPORT_LOCATION_IDS = [1, 132, 138]
_COLD_MAX_TEMP = 5                   # temp <= 5  -> is_cold (presumably degrees Celsius - TODO confirm)
_HOT_MIN_TEMP = 25                   # temp >= 25 -> is_hot


def _clean_trips(trips_df):
    """Drop implausible trips and add duration, speed and calendar columns."""
    plausible_trips = trips_df.filter(
        F.col("trip_distance").between(*_TRIP_DISTANCE_RANGE)
        & F.col("total_amount").between(*_TOTAL_AMOUNT_RANGE)
        & F.col("passenger_count").between(*_PASSENGER_COUNT_RANGE)
        & (F.col("tpep_dropoff_datetime") > F.col("tpep_pickup_datetime"))
    )

    # unix_timestamp works in whole seconds, hence the division by 60 to get minutes.
    duration_minutes = (
        F.unix_timestamp("tpep_dropoff_datetime")
        - F.unix_timestamp("tpep_pickup_datetime")
    ) / 60

    # NOTE(review): a trip whose drop-off is less than one second after its pickup would
    # get a duration of 0 and a division by zero in speed_mph; such rows are removed by
    # the duration filter below, but with ANSI mode enabled the division itself may
    # raise - TODO confirm the session's ANSI setting.
    return (
        plausible_trips
        .withColumn("trip_duration_minutes", duration_minutes)
        .withColumn(
            "speed_mph",
            F.col("trip_distance") / (F.col("trip_duration_minutes") / 60),
        )
        .filter(
            F.col("trip_duration_minutes").between(*_TRIP_DURATION_RANGE)
            & F.col("speed_mph").between(*_SPEED_MPH_RANGE)
        )
        .withColumn("pickup_date", F.to_date("tpep_pickup_datetime"))
        .withColumn("pickup_hour", F.hour("tpep_pickup_datetime"))
        .withColumn("is_weekend", F.dayofweek("tpep_pickup_datetime").isin(_WEEKEND_DAYS))
    )


def _add_zone_attributes(trips_df, zones_df):
    """Left-join the zone lookup twice (pickup and drop-off) and derive borough/airport flags."""
    return (
        trips_df.alias("t")
        .join(zones_df.alias("pz"), F.col("t.PULocationID") == F.col("pz.LocationID"), "left")
        .join(zones_df.alias("dz"), F.col("t.DOLocationID") == F.col("dz.LocationID"), "left")
        .select(
            "t.*",
            F.col("pz.zone").alias("pickup_zone"),
            F.col("pz.borough").alias("pickup_borough"),
            F.col("dz.zone").alias("dropoff_zone"),
            F.col("dz.borough").alias("dropoff_borough"),
        )
        # NULL (not False) when either location had no match in the zone lookup.
        .withColumn(
            "is_same_borough",
            F.col("pickup_borough") == F.col("dropoff_borough"),
        )
        .withColumn(
            "is_airport_trip",
            F.col("PULocationID").isin(_AIRPORT_LOCATION_IDS)
            | F.col("DOLocationID").isin(_AIRPORT_LOCATION_IDS),
        )
    )


def _add_weather_attributes(trips_df, weather_df):
    """Left-join hourly weather on (pickup date, pickup hour) and derive weather flags."""
    hourly_weather = (
        weather_df
        .withColumn("weather_date", F.to_date("time"))
        .withColumn("weather_hour", F.hour("time"))
    )

    # NOTE(review): assumes the weather table holds at most one row per (date, hour);
    # duplicates would multiply trip rows in this join - TODO confirm.
    # NOTE(review): assumes weather "time" is expressed in the same timezone as
    # tpep_pickup_datetime - TODO confirm.
    return (
        trips_df.alias("t")
        .join(
            hourly_weather.alias("w"),
            (F.col("t.pickup_date") == F.col("w.weather_date"))
            & (F.col("t.pickup_hour") == F.col("w.weather_hour")),
            "left",
        )
        .select(
            "t.*",
            F.col("w.temp").alias("temp"),
            F.col("w.rhum").alias("rhum"),
            F.col("w.prcp").alias("prcp"),
            F.col("w.wspd").alias("wspd"),
            F.col("w.pres").alias("pres"),
        )
        # The three flags are NULL for trips without a matching weather row.
        .withColumn("is_rainy", F.col("prcp") > 0)
        .withColumn("is_cold", F.col("temp") <= _COLD_MAX_TEMP)
        .withColumn("is_hot", F.col("temp") >= _HOT_MIN_TEMP)
    )


def _write_monthly_partitioned(df, table_name):
    """Create or replace `table_name` from `df`, partitioned by month of pickup_date."""
    df.writeTo(table_name) \
        .partitionedBy(F.months("pickup_date")) \
        .createOrReplace()


def create_silver_tables(catalog="iceberg", session=None):
    """
    Build the Silver layer (Bronze -> Silver) as Iceberg tables.

    Three tables are created or replaced, each partitioned by month of
    ``pickup_date``:

    * ``{catalog}.silver.trips_enriched``   - cleaned trips plus derived columns
    * ``{catalog}.silver.trips_with_zones`` - the above plus pickup/drop-off zone attributes
    * ``{catalog}.silver.trips_complete``   - the above plus hourly weather attributes

    Inputs are read from ``{catalog}.bronze.taxi_trips_2024``,
    ``{catalog}.bronze.taxi_zones`` and ``{catalog}.bronze.weather_2024``.
    The original author notes the tables are meant to be usable from both
    Spark and Trino.

    Parameters
    ----------
    catalog : str, default "iceberg"
        Name of the catalog holding the ``bronze`` and ``silver`` namespaces.
        It is interpolated into SQL and table identifiers as-is, so pass only
        trusted identifiers.
    session : SparkSession, optional
        Session used for all reads and writes. When omitted, the notebook-global
        ``spark`` is used.

    Returns
    -------
    DataFrame
        The lazy ``trips_complete`` DataFrame (same content as the last table written).

    Raises
    ------
    NameError
        If ``session`` is omitted and no global ``spark`` exists.
    """
    if session is None:
        session = spark  # fall back to the SparkSession defined by the notebook kernel

    print("🚀 Démarrage Silver Layer")

    # ------------------------------------------------------------------
    # Silver namespace
    # ------------------------------------------------------------------
    session.sql(f"CREATE NAMESPACE IF NOT EXISTS {catalog}.silver")

    # NOTE(review): the three DataFrames below form one lazy lineage and nothing is
    # cached, so each createOrReplace() re-executes its upstream steps from the bronze
    # tables. Consider reading back the table just written if this becomes costly.

    # ------------------------------------------------------------------
    # 1. Cleaned trips
    # ------------------------------------------------------------------
    print("📊 Nettoyage des trips")

    trips_cleaned = _clean_trips(session.table(f"{catalog}.bronze.taxi_trips_2024"))
    _write_monthly_partitioned(trips_cleaned, f"{catalog}.silver.trips_enriched")

    print("✅ silver.trips_enriched créée")

    # ------------------------------------------------------------------
    # 2. Zone enrichment
    # ------------------------------------------------------------------
    print("🗺️ Enrichissement zones")

    trips_with_zones = _add_zone_attributes(
        trips_cleaned,
        session.table(f"{catalog}.bronze.taxi_zones"),
    )
    _write_monthly_partitioned(trips_with_zones, f"{catalog}.silver.trips_with_zones")

    print("✅ silver.trips_with_zones créée")

    # ------------------------------------------------------------------
    # 3. Weather enrichment
    # ------------------------------------------------------------------
    print("🌤️ Enrichissement météo")

    trips_complete = _add_weather_attributes(
        trips_with_zones,
        session.table(f"{catalog}.bronze.weather_2024"),
    )
    _write_monthly_partitioned(trips_complete, f"{catalog}.silver.trips_complete")

    print("✅ silver.trips_complete créée")

    print("\n🎉 SILVER LAYER TERMINÉ AVEC SUCCÈS")

    return trips_complete


In [19]:
# Run the Bronze -> Silver build. As the last expression of the cell, the returned
# DataFrame is echoed (schema only) in the cell output.
create_silver_tables()

üöÄ D√©marrage Silver Layer
üìä Nettoyage des trips


                                                                                

‚úÖ silver.trips_enriched cr√©√©e
üó∫Ô∏è Enrichissement zones


26/01/06 13:48:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

‚úÖ silver.trips_with_zones cr√©√©e
üå§Ô∏è Enrichissement m√©t√©o


                                                                                

‚úÖ silver.trips_complete cr√©√©e

üéâ SILVER LAYER TERMIN√â AVEC SUCC√àS


DataFrame[VendorID: int, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: bigint, trip_distance: double, RatecodeID: bigint, store_and_fwd_flag: string, PULocationID: int, DOLocationID: int, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, Airport_fee: double, trip_duration_minutes: double, speed_mph: double, pickup_date: date, pickup_hour: int, is_weekend: boolean, pickup_zone: string, pickup_borough: string, dropoff_zone: string, dropoff_borough: string, is_same_borough: boolean, is_airport_trip: boolean, temp: double, rhum: double, prcp: double, wspd: double, pres: double, is_rainy: boolean, is_cold: boolean, is_hot: boolean]

In [20]:
# Sanity check: list the tables registered in the iceberg.silver namespace
# (the build is expected to leave trips_enriched, trips_with_zones and trips_complete).
spark.sql("show tables from iceberg.silver").show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|   silver|  trips_complete|      false|
|   silver|  trips_enriched|      false|
|   silver|trips_with_zones|      false|
+---------+----------------+-----------+



In [21]:
# Pretty preview of the first 5 rows of the final Silver table
import pandas as pd

# Lift pandas' display limits so the wide frame is rendered without hiding columns.
# These options are set globally and stay in effect for later cells.
pd.set_option("display.max_columns", None)  # show every column
pd.set_option("display.width", 200)         # maximum width of the text rendering

# Only 5 rows are requested from Spark before converting to a pandas DataFrame.
df_preview = (
    spark.table("iceberg.silver.trips_complete")
    .limit(5)
    .toPandas()
)

# Rich (tabular) rendering through the notebook's display hook.
display(df_preview)


                                                                                

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,trip_duration_minutes,speed_mph,pickup_date,pickup_hour,is_weekend,pickup_zone,pickup_borough,dropoff_zone,dropoff_borough,is_same_borough,is_airport_trip,temp,rhum,prcp,wspd,pres,is_rainy,is_cold,is_hot
0,2,2024-12-01 00:12:27,2024-12-01 00:31:12,1,9.76,1,N,138,33,1,38.0,6.0,0.5,4.72,0.0,1.0,51.97,0.0,1.75,18.75,31.232,2024-12-01,0,True,LaGuardia Airport,Queens,Brooklyn Heights,Brooklyn,False,True,1.0,47.0,0.0,17.0,1017.0,False,True,False
1,2,2024-12-01 00:50:35,2024-12-01 01:24:46,4,20.07,2,N,132,236,2,70.0,0.0,0.5,0.0,6.94,1.0,82.69,2.5,1.75,34.183333,35.227694,2024-12-01,0,True,JFK Airport,Queens,Upper East Side North,Manhattan,False,True,1.0,47.0,0.0,17.0,1017.0,False,True,False
2,2,2024-12-01 00:18:16,2024-12-01 00:33:16,3,2.34,1,N,142,186,1,15.6,1.0,0.5,4.12,0.0,1.0,24.72,2.5,0.0,15.0,9.36,2024-12-01,0,True,Lincoln Square East,Manhattan,Penn Station/Madison Sq West,Manhattan,True,False,1.0,47.0,0.0,17.0,1017.0,False,True,False
3,2,2024-12-01 00:56:13,2024-12-01 01:18:25,1,5.05,1,N,107,80,1,26.8,1.0,0.5,5.0,0.0,1.0,36.8,2.5,0.0,22.2,13.648649,2024-12-01,0,True,Gramercy,Manhattan,East Williamsburg,Brooklyn,False,False,1.0,47.0,0.0,17.0,1017.0,False,True,False
4,1,2024-12-01 00:21:17,2024-12-01 00:37:22,1,4.3,1,N,249,141,1,20.5,3.5,0.5,5.1,0.0,1.0,30.6,2.5,0.0,16.083333,16.041451,2024-12-01,0,True,West Village,Manhattan,Lenox Hill West,Manhattan,True,False,1.0,47.0,0.0,17.0,1017.0,False,True,False


In [22]:
# Show 5 rows restricted to the trip metrics and the joined weather columns
(
    spark.table("iceberg.silver.trips_complete")
    .select(
        # join keys
        "pickup_date", "pickup_hour",
        # trip metrics
        "trip_distance", "trip_duration_minutes", "total_amount", "tip_amount",
        # raw weather measurements
        "temp", "rhum", "prcp", "wspd", "pres",
        # derived weather flags
        "is_rainy", "is_cold", "is_hot",
    )
    .show(5, truncate=False)
)


+-----------+-----------+-------------+---------------------+------------+----------+----+----+----+----+------+--------+-------+------+
|pickup_date|pickup_hour|trip_distance|trip_duration_minutes|total_amount|tip_amount|temp|rhum|prcp|wspd|pres  |is_rainy|is_cold|is_hot|
+-----------+-----------+-------------+---------------------+------------+----------+----+----+----+----+------+--------+-------+------+
|2024-12-01 |0          |9.76         |18.75                |51.97       |4.72      |1.0 |47.0|0.0 |17.0|1017.0|false   |true   |false |
|2024-12-01 |0          |20.07        |34.18333333333333    |82.69       |0.0       |1.0 |47.0|0.0 |17.0|1017.0|false   |true   |false |
|2024-12-01 |0          |2.34         |15.0                 |24.72       |4.12      |1.0 |47.0|0.0 |17.0|1017.0|false   |true   |false |
|2024-12-01 |0          |5.05         |22.2                 |36.8        |5.0       |1.0 |47.0|0.0 |17.0|1017.0|false   |true   |false |
|2024-12-01 |0          |4.3          |16