In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import upper, col
import os
import utils

In [4]:
#!/usr/bin/env python3
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import upper, col

# Pin Python for Spark workers & driver
os.environ["PYSPARK_PYTHON"]        = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

def main():
    # 1) Determine base POC folder (one level up from CWD)
    cwd = os.getcwd()  
    poc_root = os.path.abspath(os.path.join(cwd, ".."))

    # 2) Landing parquet actually lives under POC/landing/data/landing/turismo_Provincia
    landing_dir = os.path.join(poc_root, "landing", "data", "landing", "turismo_Provincia")
    print("→ Reading landing Parquet from", landing_dir)

    # 3) Start Spark session
    spark = (
        SparkSession.builder
          .appName("TrustedCSVLoad")
          .config("spark.master", "local[*]")
          .config("spark.driver.memory", "2g")
          .getOrCreate()
    )

    try:
        # 4) Read landing Parquet
        if not os.path.isdir(landing_dir):
            raise FileNotFoundError(f"Landing dir not found: {landing_dir}")
        df = spark.read.parquet(landing_dir)

        # 5) Clean & normalize
        df_clean = df.dropna()
        for c in ["CCAA_ORIGEN","PROVINCIA_ORIGEN","CCAA_DESTINO","PROVINCIA_DESTINO"]:
            df_clean = df_clean.withColumn(c, upper(col(c)))

        # 6) Write trusted CSV under POC/trusted/data/trusted/…
        tgt_dir = os.path.join(poc_root, "trusted", "data", "trusted", "turismo_Provincia_clean_csv")
        os.makedirs(tgt_dir, exist_ok=True)
        print("→ Writing trusted CSV to", tgt_dir)

        (
            df_clean
              .repartition(1)
              .write
              .mode("overwrite")
              .option("header", True)
              .csv(tgt_dir)
        )

        print("✅ Trusted zone load complete (CSV).")

    finally:
        spark.stop()

if __name__ == "__main__":
    main()


→ Reading landing Parquet from c:\Users\joaqu\OneDrive\Documents\AAmaster_UPC\TFM\TravelMind\POC\landing\data\landing\turismo_Provincia
→ Writing trusted CSV to c:\Users\joaqu\OneDrive\Documents\AAmaster_UPC\TFM\TravelMind\POC\trusted\data\trusted\turismo_Provincia_clean_csv
✅ Trusted zone load complete (CSV).


In [None]:

# --- Step 1: Read with Iceberg-aware session ---
iceberg_spark = utils.create_context()
try:
    print("→ Reading from Iceberg landing…")
    df = iceberg_spark.table("landing.spark-warehouse.turismo_Provincia")
finally:
    iceberg_spark.stop()

# --- Step 2: Transform & write CSV with a pure-Java session ---
pure_spark = (
    SparkSession.builder
      .appName("TrustedCSVLoad")
      .config("spark.master", "local[*]")
      .config("spark.driver.memory", "2g")
      .getOrCreate()
)

tgt_dir = "./data/trusted/turismo_Provincia_clean_csv"
os.makedirs(tgt_dir, exist_ok=True)

try:
    print("→ Dropping nulls & upper-case…")
    df_clean = df.dropna()
    for c in ["CCAA_ORIGEN","PROVINCIA_ORIGEN","CCAA_DESTINO","PROVINCIA_DESTINO"]:
        df_clean = df_clean.withColumn(c, upper(col(c)))

    print("→ Writing CSV to", tgt_dir)
    (
      df_clean
        .repartition(1)
        .write
        .mode("overwrite")
        .option("header", True)
        .csv(tgt_dir)
    )
    print("✅ CSV write complete.")
finally:
    pure_spark.stop()


→ Reading from Iceberg landing…


ParseException: 
[INVALID_IDENTIFIER] The identifier spark-warehouse is invalid. Please, consider quoting it with back-quotes as `spark-warehouse`.(line 1, pos 13)

== SQL ==
landing.spark-warehouse.turismo_Provincia
-------------^^^
