# Configuration Initial et setup

In [5]:
!pip install geopandas
!pip install delta

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting delta
  Downloading delta-0.4.2.tar.gz (4.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: delta
  Building wheel for delta (setup.py) ... [?25done
[?25h  Created wheel for delta: filename=delta-0.4.2-py3-none-any.whl size=2914 sha256=6144f473517bf4801888882eb07b3b7ea6d540138db6d02fb74b97eb9ba24421
  Stored in directory: /root/.cache/pip/wheels/a8/86/24/a486f14769cf86a2a9ce6b589a82b7414b14657c6fd515dc75
Successfully built delta
Installing collected packages: delta
Successfully installed delta-0.4.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;3

In [6]:
import sys
import os
import zipfile
import pandas as pd
import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta import *
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [8]:
builder = SparkSession.builder \
    .appName("01_ingestion_bronze") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.parquet.compression.codec", "snappy")

In [11]:
# Configuration des chemins
base_path = "/dbfs/mnt/lakehouse"
bronze_path = f"{base_path}/bronze"

# Cr√©ation des r√©pertoires Delta
directories = [
    f"{bronze_path}/weather",
    f"{bronze_path}/taxi_trips", 
    f"{bronze_path}/taxi_zones",
    f"{bronze_path}/taxi_zones_shp"
]


In [12]:
for dir_path in directories:
    os.makedirs(dir_path, exist_ok=True)

print("‚úÖ Configuration initiale termin√©e")
print(f"üìÅ R√©pertoire base: {base_path}")
print(f"üìÅ R√©pertoire bronze: {bronze_path}")

‚úÖ Configuration initiale termin√©e
üìÅ R√©pertoire base: /dbfs/mnt/lakehouse
üìÅ R√©pertoire bronze: /dbfs/mnt/lakehouse/bronze


# Fonctions Utilitaires

In [13]:
def log_ingestion(source_name, file_count, record_count, success=True, error_msg=None):
    """Log des op√©rations d'ingestion"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    log_data = {
        "timestamp": timestamp,
        "source": source_name,
        "file_count": file_count,
        "record_count": record_count,
        "success": success,
        "error_message": error_msg if error_msg else ""
    }
    
    # Sauvegarde dans un DataFrame Spark
    log_df = spark.createDataFrame([log_data])
    
    # √âcriture ou ajout au log Delta
    log_path = f"{bronze_path}/ingestion_log"
    
    if os.path.exists(f"{log_path}/_delta_log"):
        log_df.write.format("delta").mode("append").save(log_path)
    else:
        log_df.write.format("delta").save(log_path)
    
    if success:
        print(f"‚úÖ {source_name}: {record_count:,} enregistrements ing√©r√©s")
    else:
        print(f"‚ùå {source_name}: √âchec - {error_msg}")

def extract_zip(zip_path, extract_to):
    """Extrait un fichier ZIP"""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"üì¶ Fichier ZIP extrait: {zip_path} -> {extract_to}")

def validate_schema(df, expected_columns):
    """Valide le sch√©ma d'un DataFrame"""
    actual_columns = set(df.columns)
    expected_set = set(expected_columns)
    
    missing = expected_set - actual_columns
    extra = actual_columns - expected_set
    
    if missing:
        print(f"‚ö†Ô∏è  Colonnes manquantes: {missing}")
    if extra:
        print(f"‚ö†Ô∏è  Colonnes suppl√©mentaires: {extra}")
    
    return len(missing) == 0

# Ingestion des donn√©es meteos(csv)

In [14]:
print("="*80)
print("üå§Ô∏è  INGESTION DES DONN√âES M√âT√âO")
print("="*80)

# Chemin du fichier CSV
weather_csv_path = "/dbfs/FileStore/shared_uploads/votre_email@domaine.com/central_park_weather_2024.csv"
weather_bronze_path = f"{bronze_path}/weather"

try:
    # Lecture avec Pandas pour inspection
    weather_pd = pd.read_csv(weather_csv_path)
    print(f"üìä Lecture du CSV: {len(weather_pd):,} lignes, {len(weather_pd.columns)} colonnes")
    print("üìã Aper√ßu des colonnes:", weather_pd.columns.tolist())
    print("üìÖ Plage temporelle:", weather_pd['time'].min(), "√†", weather_pd['time'].max())
    
    # Ajout de m√©tadonn√©es
    weather_pd['ingestion_date'] = datetime.now().date()
    weather_pd['source_file'] = 'central_park_weather_2024.csv'
    weather_pd['data_type'] = 'weather_observation'
    
    # Conversion en Spark DataFrame
    weather_df = spark.createDataFrame(weather_pd)
    
    # Correction des types de donn√©es
    weather_df = weather_df \
        .withColumn("time", to_timestamp(col("time"))) \
        .withColumn("temp", col("temp").cast("float")) \
        .withColumn("rhum", col("rhum").cast("int")) \
        .withColumn("prcp", col("prcp").cast("float")) \
        .withColumn("wspd", col("wspd").cast("float")) \
        .withColumn("pres", col("pres").cast("float"))
    
    # Sch√©ma attendu
    expected_weather_cols = ['time', 'temp', 'rhum', 'prcp', 'wspd', 'pres', 
                            'ingestion_date', 'source_file', 'data_type']
    
    if validate_schema(weather_df, expected_weather_cols):
        # √âcriture en Delta avec partition par mois
        weather_df = weather_df \
            .withColumn("year", year(col("time"))) \
            .withColumn("month", month(col("time"))) \
            .withColumn("day", dayofmonth(col("time")))
        
        print("üíæ √âcriture en Delta...")
        weather_df.write \
            .format("delta") \
            .mode("overwrite") \
            .partitionBy("year", "month") \
            .save(weather_bronze_path)
        
        # V√©rification
        delta_weather = spark.read.format("delta").load(weather_bronze_path)
        record_count = delta_weather.count()
        
        log_ingestion(
            source_name="weather_data",
            file_count=1,
            record_count=record_count,
            success=True
        )
        
        print("üìä Statistiques du Delta:")
        delta_weather.select("year", "month").distinct().orderBy("year", "month").show()
        
except Exception as e:
    error_msg = f"Erreur lors de l'ingestion m√©t√©o: {str(e)}"
    log_ingestion(
        source_name="weather_data",
        file_count=1,
        record_count=0,
        success=False,
        error_msg=error_msg
    )
    raise

üå§Ô∏è  INGESTION DES DONN√âES M√âT√âO


Py4JJavaError: An error occurred while calling o68.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:725)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:873)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:260)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 16 more
