In [1]:
#!/usr/bin/env python3
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# 0) Point both PySpark interpreter settings (driver and workers) at the
#    Python executable that is running this code.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

def main():
    """Filter the trusted tourism CSV down to Barcelona and write it to the exploitation zone.

    All paths are resolved relative to the parent of the current working
    directory, so this is expected to be run from the ``exploitation/``
    folder of the POC.

    Steps: read the CSV folder with a header row, keep the rows where
    ``PROVINCIA_DESTINO`` equals ``"Barcelona"``, and write the result as a
    single-partition CSV (with header) in overwrite mode.

    Raises:
        FileNotFoundError: if the trusted CSV folder does not exist. The
            check runs before the Spark session is created so that a wrong
            path fails immediately instead of after Spark startup.
    """
    # 1) Determine POC root (one level up from the current working directory)
    poc_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

    # 2) Path to the trusted CSV folder
    trusted_dir = os.path.join(
        poc_root, "trusted", "data", "trusted", "turismo_Provincia_clean_csv"
    )
    print("→ Reading trusted CSV from", trusted_dir)

    # 3) Validate the input first: no point starting Spark for a missing folder
    if not os.path.isdir(trusted_dir):
        raise FileNotFoundError(f"Trusted folder not found: {trusted_dir}")

    # 4) Start Spark (local mode, all cores, 2g driver memory)
    spark = (
        SparkSession.builder
          .appName("ExploitationZoneLoad")
          .config("spark.master", "local[*]")
          .config("spark.driver.memory", "2g")
          .getOrCreate()
    )

    try:
        # 5) Load the trusted CSV (first line of each file is the header)
        df = (
            spark.read
                 .option("header", True)
                 .csv(trusted_dir)
        )

        # 6) Filter for Barcelona (exact, case-sensitive match)
        print("→ Filtering for PROVINCIA_DESTINO = 'Barcelona'")
        df_barcelona = df.filter(col("PROVINCIA_DESTINO") == "Barcelona")

        # 7) Write out exploitation CSV
        exploit_dir = os.path.join(
            poc_root, "exploitation", "data", "exploitation", "turismo_Provincia_barcelona"
        )
        os.makedirs(exploit_dir, exist_ok=True)
        print("→ Writing exploitation CSV to", exploit_dir)

        (
            df_barcelona
              .repartition(1)  # single partition -> a single CSV part file
              .write
              .mode("overwrite")
              .option("header", True)
              .csv(exploit_dir)
        )

        print("✅ Exploitation CSV load complete.")

    finally:
        # Always release the session, even when the read or write fails
        spark.stop()

# Entry point: run the load only when this code is executed as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


→ Reading trusted CSV from c:\Users\joaqu\OneDrive\Documents\AAmaster_UPC\TFM\TravelMind\POC\trusted\data\trusted\turismo_Provincia_clean_csv
→ Filtering for PROVINCIA_DESTINO = 'Barcelona'
→ Writing exploitation CSV to c:\Users\joaqu\OneDrive\Documents\AAmaster_UPC\TFM\TravelMind\POC\exploitation\data\exploitation\turismo_Provincia_barcelona
✅ Exploitation CSV load complete.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import upper, col
import os
import utils
import pyspark
from pyspark.sql import SparkSession,DataFrame
import requests
import json 
from io import BytesIO
import pandas as pd
import os
import sys
import utils as utils

In [None]:
# 1) Create the Spark session used for both the read and the write
#    (presumably Iceberg-enabled; see utils.create_context to confirm)
spark = utils.create_context()

try:
    # 2) Read the source table from the trusted zone
    src_db, src_tbl = "trusted", "turismo_Provincia"
    print(f"→ Reading spark_catalog.{src_db}.{src_tbl}")
    df = utils.read_iceberg_table(spark, src_db, src_tbl)

    # 3) Keep only the rows whose destination province is Barcelona
    #    (exact, case-sensitive match)
    print("→ Filtering for PROVINCIA_DESTINO = 'Barcelona'")
    df_barcelona = df.filter(col("PROVINCIA_DESTINO") == "Barcelona")

    # 4) Write the filtered rows into the exploitation zone
    #    (the helper name suggests the target table is replaced; verify in utils)
    tgt_db, tgt_tbl = "exploitation", "turismo_Provincia"
    print(f"→ Writing spark_catalog.{tgt_db}.{tgt_tbl}")
    utils.overwrite_iceberg_table(spark, df_barcelona, tgt_db, tgt_tbl)

    print("✅ Exploitation load complete.")
finally:
    # Always release the session, even when the read or write fails,
    # matching the try/finally used by main() in the CSV variant.
    spark.stop()