In [None]:
import pyspark
from pyspark.sql import SparkSession,DataFrame
import requests
import json 
from io import BytesIO
import pandas as pd
import os
import sys
import utils as utils

In [2]:
#!/usr/bin/env python3
import os
import requests
from io import StringIO

import pandas as pd
from pyspark.sql import SparkSession
import os, sys

# Point both the Spark driver and the Spark worker processes at the Python
# interpreter that is running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

def main():
    """Download the tourism CSV from the Dataestur API and land it as Parquet.

    Steps:
      1. Start (or reuse) a local Spark session with 4 GB of driver memory.
      2. GET the endpoint with the pre-encoded "Todos" query string.
      3. Decode the payload as latin-1 and parse it with pandas, using ``;``
         as the field separator and ``,`` as the decimal mark.
      4. Convert the pandas frame to a Spark DataFrame.
      5. Overwrite ``data/landing/turismo_Provincia`` (resolved against the
         current working directory) with the result in Parquet format.

    The Spark session is always stopped on exit, including on failure.

    Raises:
        requests.HTTPError: if the API answers with a non-2xx status.
        requests.Timeout: if the API does not connect or respond in time.
    """
    # 1) Spark session (no Iceberg)
    spark = (
        SparkSession.builder
            .appName("LandingZonePureSpark")
            .config("spark.driver.memory", "4g")
            .getOrCreate()
    )

    try:
        # 2) API endpoint & query. The query string is already percent-encoded,
        #    so it is appended verbatim instead of going through `params=`.
        path  = "https://dataestur.azure-api.net/API-SEGITTUR-v1/TURISMO_INTERNO_PROV_CCAA_DL"
        query = "CCAA%20origen=Todos&Provincia%20origen=Todos&CCAA%20destino=Todos&Provincia%20destino=Todos"
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the cell with the Spark session still open.
        # (connect timeout, read timeout) in seconds.
        resp  = requests.get(f"{path}?{query}", timeout=(10, 120))
        resp.raise_for_status()

        # 3) Parse into pandas to handle ; separator and , decimal.
        #    The bytes are decoded here, so read_csv receives text and needs
        #    no `encoding` argument of its own.
        csv_text = resp.content.decode("latin-1")
        pdf = pd.read_csv(
            StringIO(csv_text),
            sep=";",
            decimal=",",
        )

        # 4) Convert to Spark DataFrame
        df = spark.createDataFrame(pdf)

        # 5) Write out as Parquet, replacing any previous landing snapshot
        out_dir = os.path.abspath("data/landing/turismo_Provincia")
        os.makedirs(out_dir, exist_ok=True)
        df.write.mode("overwrite").parquet(out_dir)

        print(f"✅ Landing data written to Parquet at {out_dir}")

    finally:
        spark.stop()

# Run the landing job when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()


✅ Landing data written to Parquet at c:\Users\joaqu\OneDrive\Documents\AAmaster_UPC\TFM\TravelMind\POC\landing\data\landing\turismo_Provincia


In [9]:
# Endpoint and pre-encoded query string passed to utils.get_api_endpoint_data.
path = "https://dataestur.azure-api.net/API-SEGITTUR-v1/TURISMO_INTERNO_PROV_CCAA_DL"
# NOTE(review): `filter` shadows the Python builtin for the rest of the kernel
# session; the name is kept because other cells may reference it.
filter = "CCAA%20origen=Todos&Provincia%20origen=Todos&CCAA%20destino=Todos&Provincia%20destino=Todos"
# https://dataestur.azure-api.net/API-SEGITTUR-v1/TURISMO_INTERNO_PROV_CCAA_DL?CCAA%20origen=Todos&Provincia%20origen=Todos&CCAA%20destino=Todos&Provincia%20destino=Todos
spark = utils.create_context()

try:
    db_name = "landing"
    table_name = "turismo_Provincia"

    # The table is refreshed on every run, whether or not it already exists.
    # An earlier version repeated this fetch + overwrite inside an
    # `if utils.check_table_exists(...)` branch first, which downloaded the
    # data and rewrote the table twice whenever the table was present.
    # NOTE(review): assumes utils.check_table_exists has no side effects and
    # that overwrite_iceberg_table copes with a missing table (the previous
    # unconditional call already relied on the latter) — confirm in utils.
    df = utils.get_api_endpoint_data(spark, path, filter)
    utils.overwrite_iceberg_table(spark, df, db_name, table_name)

finally:
    # Always release the Spark context, even if the fetch or write fails.
    spark.stop()

