# Architecture Lakehouse - NYC Taxi Data Portfolio

# Pipeline ETL - Step by Step

## √âtape 1: Initialisation Spark avec Iceberg

In [1]:
# notebooks/01_setup_iceberg.py
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Configuration Spark avec Iceberg
spark = SparkSession.builder.appName("NYC_Taxi").config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "2g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# M√™me catalog "iceberg" que Trino, en REST sur MinIO
spark.conf.set("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
spark.conf.set("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
spark.conf.set("spark.sql.catalog.iceberg.uri", "http://rest:8181")
spark.conf.set("spark.sql.catalog.iceberg.warehouse", "s3://warehouse/")

# Acc√®s S3 / MinIO identique au service REST et √† Trino
spark.conf.set("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
spark.conf.set("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
spark.conf.set("spark.sql.catalog.iceberg.s3.path-style-access", "true")
spark.conf.set("spark.sql.catalog.iceberg.s3.access-key-id", "minioadmin")
spark.conf.set("spark.sql.catalog.iceberg.s3.secret-access-key", "minioadmin123")
spark.conf.set("spark.sql.catalog.iceberg.s3.region", "us-east-1")

# Cr√©ation des namespaces Iceberg
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.bronze")
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.silver")
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.gold")

26/01/05 01:29:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


DataFrame[]

## √âtape 2: Ingestion des Donn√©es Brutes (Bronze)

In [2]:
!pip install geopandas

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
# notebooks/02_bronze_ingestion.py
from pyspark.sql import functions as F

def create_bronze_tables():
    """Cr√©e les tables bronze depuis les donn√©es sources"""
    
    # 1. Taxi Trips (Parquet 2024)
    trips_df = spark.read.parquet("/home/iceberg/data/raw/nyc_taxi/taxi_trips/*.parquet")
    
    # Ajout des colonnes de partition
    trips_df = trips_df.withColumn("year", F.year("tpep_pickup_datetime")) \
                      .withColumn("month", F.month("tpep_pickup_datetime")) \
                      .withColumn("day", F.dayofmonth("tpep_pickup_datetime"))
    
    # Repartitionner pour l'√©criture
    trips_df = trips_df.repartition(200, "year", "month")
    
    # Cr√©ation table Iceberg avec partitions
    trips_df.writeTo("iceberg.bronze.taxi_trips_2024") \
            .partitionedBy("year", "month") \
            .createOrReplace()
    
    print(f"‚úÖ Table bronze.taxi_trips_2024 cr√©√©e: {trips_df.count()} lignes")
    
    # 2. Taxi Zones (Shapefile)
    import geopandas as gpd
    import zipfile
    
    # Extraction du shapefile
    with zipfile.ZipFile("/home/iceberg/data/raw/nyc_taxi/taxi_zones/taxi_zones.zip", 'r') as zip_ref:
        zip_ref.extractall("/tmp/taxi_zones")
    
    # Lecture avec geopandas
    gdf = gpd.read_file("/tmp/taxi_zones/taxi_zones.shp")
    
    # Conversion en DataFrame Spark
    zones_df = spark.createDataFrame(gdf.drop(columns='geometry'))
    
    # Ajout des colonnes WKT pour le g√©ospatial
    zones_df = zones_df.withColumn("geometry_wkt", 
                                  F.lit(gdf.geometry.to_wkt().tolist()))
    
    # Repartitionner en 1 pour √©viter trop de fichiers
    zones_df = zones_df.repartition(1)
    
    zones_df.writeTo("iceberg.bronze.taxi_zones").createOrReplace()
    print(f"‚úÖ Table bronze.taxi_zones cr√©√©e: {zones_df.count()} zones")
    
    # 3. Donn√©es M√©t√©o
    weather_df = spark.read.option("header", "true") \
                          .option("inferSchema", "true") \
                          .csv("/home/iceberg/data/raw/nyc_taxi/weather/central_park_weather_2024.csv")
    
    weather_df = weather_df.withColumn("date", F.to_date("datetime"))
    
    # Repartitionner par date pour l'√©criture
    weather_df = weather_df.repartition(50, "date")
    
    weather_df.writeTo("iceberg.bronze.weather_2024") \
              .partitionedBy(F.col("date")) \
              .createOrReplace()
    
    print(f"‚úÖ Table bronze.weather_2024 cr√©√©e: {weather_df.count()} lignes")
    
    return trips_df, zones_df, weather_df

In [7]:
# notebooks/02_bronze_ingestion.py - CORRIG√â
from pyspark.sql import functions as F

def create_bronze_tables2():
    """Cr√©e les tables bronze depuis les donn√©es sources"""
    
    # 1. Taxi Trips (Parquet 2024)
    print("üìä √âtape 1/3: Ingestion des trips taxi...")
    trips_df = spark.read.parquet("/home/iceberg/data/raw/nyc_taxi/taxi_trips/*.parquet")
    
    # V√©rifier le sch√©ma des trips
    print("Sch√©ma des trips:")
    trips_df.printSchema()
    
    # Ajout des colonnes de partition
    trips_df = trips_df.withColumn("year", F.year("tpep_pickup_datetime")) \
                      .withColumn("month", F.month("tpep_pickup_datetime")) \
                      .withColumn("day", F.dayofmonth("tpep_pickup_datetime"))
    
    # Repartitionner pour l'√©criture (moins de partitions pour √©viter OOM)
    trips_df = trips_df.repartition(12, "year", "month")  # 12 partitions pour 12 mois
    
    # Cr√©ation table Iceberg avec partitions
    trips_df.writeTo("iceberg.bronze.taxi_trips_2024") \
            .partitionedBy("year", "month") \
            .createOrReplace()
    
    print(f"‚úÖ Table bronze.taxi_trips_2024 cr√©√©e: {trips_df.count()} lignes")
    
    # 2. Taxi Zones (Shapefile)
    print("\nüó∫Ô∏è  √âtape 2/3: Ingestion des zones taxi...")
    
    # D'abord essayer de lire un CSV si disponible
    try:
        # V√©rifier si un fichier CSV existe
        import os
        csv_path = "/home/iceberg/data/raw/nyc_taxi/taxi_zones/taxi_zones.csv"
        
        if os.path.exists(csv_path):
            zones_df = spark.read.option("header", "true").csv(csv_path)
            print("üìÅ Fichier CSV trouv√©, chargement...")
        else:
            # Sinon utiliser le shapefile
            import geopandas as gpd
            import zipfile
            
            print("üìÅ Fichier CSV non trouv√©, utilisation du shapefile...")
            # Extraction du shapefile
            with zipfile.ZipFile("/home/iceberg/data/raw/nyc_taxi/taxi_zones/taxi_zones.zip", 'r') as zip_ref:
                zip_ref.extractall("/tmp/taxi_zones")
            
            # Lecture avec geopandas
            gdf = gpd.read_file("/tmp/taxi_zones/taxi_zones.shp")
            
            # Conversion en DataFrame Spark
            zones_df = spark.createDataFrame(gdf.drop(columns='geometry'))
            
            # Ajout des colonnes WKT pour le g√©ospatial (optionnel)
            zones_df = zones_df.withColumn("geometry_wkt", 
                                          F.lit(gdf.geometry.to_wkt().tolist()))
    except Exception as e:
        print(f"‚ö†Ô∏è  Erreur avec les zones: {e}")
        print("Cr√©ation d'une table zones vide...")
        from pyspark.sql.types import StructType, StructField, IntegerType, StringType
        
        schema = StructType([
            StructField("LocationID", IntegerType(), True),
            StructField("zone", StringType(), True),
            StructField("borough", StringType(), True)
        ])
        zones_df = spark.createDataFrame([], schema)
    
    # Afficher le sch√©ma des zones
    print("Sch√©ma des zones:")
    zones_df.printSchema()
    
    # Repartitionner en 1 pour √©viter trop de fichiers
    zones_df = zones_df.repartition(1)
    
    zones_df.writeTo("iceberg.bronze.taxi_zones").createOrReplace()
    print(f"‚úÖ Table bronze.taxi_zones cr√©√©e: {zones_df.count()} zones")
    
    # 3. Donn√©es M√©t√©o
    print("\nüå§Ô∏è  √âtape 3/3: Ingestion des donn√©es m√©t√©o...")
    
    weather_df = spark.read.option("header", "true") \
                          .option("inferSchema", "true") \
                          .csv("/home/iceberg/data/raw/nyc_taxi/weather/central_park_weather_2024.csv")
    
    # Afficher le sch√©ma pour voir les colonnes disponibles
    print("Sch√©ma des donn√©es m√©t√©o:")
    weather_df.printSchema()
    print("Colonnes disponibles:", weather_df.columns)
    print("Aper√ßu des donn√©es:")
    weather_df.show(5)
    
    # CORRECTION : La colonne s'appelle probablement 'time' au lieu de 'datetime'
    # V√©rifier quelle colonne contient la date
    if 'time' in weather_df.columns:
        # Si la colonne s'appelle 'time', la convertir en date
        weather_df = weather_df.withColumn("date", F.to_date("time"))
        print("‚úÖ Colonne 'time' convertie en 'date'")
    elif 'date' in weather_df.columns:
        # Si une colonne 'date' existe d√©j√†
        weather_df = weather_df.withColumnRenamed("date", "date_temp").withColumn("date", F.to_date("date_temp"))
        print("‚úÖ Colonne 'date' d√©j√† pr√©sente")
    else:
        # Essayer d'utiliser la premi√®re colonne qui ressemble √† une date
        date_cols = [col for col in weather_df.columns if any(x in col.lower() for x in ['date', 'time', 'datetime', 'timestamp'])]
        if date_cols:
            weather_df = weather_df.withColumn("date", F.to_date(date_cols[0]))
            print(f"‚úÖ Colonne '{date_cols[0]}' utilis√©e comme date")
        else:
            # Si aucune colonne de date n'est trouv√©e, cr√©er une date par d√©faut
            weather_df = weather_df.withColumn("date", F.lit("2024-01-01").cast("date"))
            print("‚ö†Ô∏è  Aucune colonne de date trouv√©e, date par d√©faut utilis√©e")
    
    # Repartitionner par date pour l'√©criture
    weather_df = weather_df.repartition(12, "date")  # 12 partitions pour 12 mois
    
    weather_df.writeTo("iceberg.bronze.weather_2024") \
              .partitionedBy(F.col("date")) \
              .createOrReplace()
    
    print(f"‚úÖ Table bronze.weather_2024 cr√©√©e: {weather_df.count()} lignes")
    
    print("\nüéâ Ingestion Bronze termin√©e avec succ√®s!")
    return trips_df, zones_df, weather_df

In [8]:
create_bronze_tables2()

üìä √âtape 1/3: Ingestion des trips taxi...
Sch√©ma des trips:
root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



                                                                                

‚úÖ Table bronze.taxi_trips_2024 cr√©√©e: 41169720 lignes

üó∫Ô∏è  √âtape 2/3: Ingestion des zones taxi...
üìÅ Fichier CSV non trouv√©, utilisation du shapefile...
Sch√©ma des zones:
root
 |-- OBJECTID: long (nullable = true)
 |-- Shape_Leng: double (nullable = true)
 |-- Shape_Area: double (nullable = true)
 |-- zone: string (nullable = true)
 |-- LocationID: long (nullable = true)
 |-- borough: string (nullable = true)
 |-- geometry_wkt: array (nullable = false)
 |    |-- element: string (containsNull = false)



26/01/03 23:11:51 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
                                                                                

‚úÖ Table bronze.taxi_zones cr√©√©e: 263 zones

üå§Ô∏è  √âtape 3/3: Ingestion des donn√©es m√©t√©o...
Sch√©ma des donn√©es m√©t√©o:
root
 |-- time: timestamp (nullable = true)
 |-- temp: double (nullable = true)
 |-- rhum: integer (nullable = true)
 |-- prcp: double (nullable = true)
 |-- wspd: double (nullable = true)
 |-- pres: double (nullable = true)

Colonnes disponibles: ['time', 'temp', 'rhum', 'prcp', 'wspd', 'pres']
Aper√ßu des donn√©es:
+-------------------+----+----+----+----+------+
|               time|temp|rhum|prcp|wspd|  pres|
+-------------------+----+----+----+----+------+
|2024-01-01 00:00:00| 5.0|  55| 0.0| 9.0|1016.0|
|2024-01-01 01:00:00| 5.0|  55| 0.0| 9.0|1016.0|
|2024-01-01 02:00:00| 5.0|  55| 0.0|13.0|1016.0|
|2024-01-01 03:00:00| 4.0|  60| 0.0| 6.0|1016.0|
|2024-01-01 04:00:00| 4.0|  67| 0.0|10.1|1016.0|
+-------------------+----+----+----+----+------+
only showing top 5 rows

‚úÖ Colonne 'time' convertie en 'date'


                                                                                

‚úÖ Table bronze.weather_2024 cr√©√©e: 8576 lignes

üéâ Ingestion Bronze termin√©e avec succ√®s!


(DataFrame[VendorID: int, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: bigint, trip_distance: double, RatecodeID: bigint, store_and_fwd_flag: string, PULocationID: int, DOLocationID: int, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, Airport_fee: double, year: int, month: int, day: int],
 DataFrame[OBJECTID: bigint, Shape_Leng: double, Shape_Area: double, zone: string, LocationID: bigint, borough: string, geometry_wkt: array<string>],
 DataFrame[time: timestamp, temp: double, rhum: int, prcp: double, wspd: double, pres: double, date: date])

In [2]:
# V√©rification des tables cr√©√©es
print("üìã Liste des tables dans le namespace bronze:")
spark.sql("SHOW TABLES IN iceberg.bronze").show()

# Pour une table sp√©cifique
print("\nüîç Description de la table taxi_trips_2024:")
spark.sql("DESCRIBE EXTENDED iceberg.bronze.taxi_trips_2024").show(truncate=False)

# V√©rifier les donn√©es
print("\nüìä Aper√ßu des donn√©es (5 premi√®res lignes):")
spark.sql("SELECT * FROM iceberg.bronze.taxi_trips_2024 LIMIT 5").show()

# V√©rifier les partitions
print("\nüóÇÔ∏è Partitions de la table taxi_trips_2024:")
spark.sql("SHOW PARTITIONS iceberg.bronze.taxi_trips_2024").show()

üìã Liste des tables dans le namespace bronze:
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+


üîç Description de la table taxi_trips_2024:


AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `iceberg`.`bronze`.`taxi_trips_2024` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 18;
'DescribeRelation true, [col_name#19, data_type#20, comment#21]
+- 'UnresolvedTableOrView [iceberg, bronze, taxi_trips_2024], DESCRIBE TABLE, true
