In [1]:
import pyspark
from pyspark.sql import SparkSession,DataFrame
import requests
import json 
from io import BytesIO
import pandas as pd
import os
import sys

In [2]:
def create_context() -> SparkSession:
    """Return a SparkSession wired to a local Iceberg catalog.

    Both PySpark interpreter environment variables are pointed at the
    interpreter running this kernel, then a session named
    ``IcebergWritedata`` is obtained through ``getOrCreate()``.

    The session overrides ``spark_catalog`` with Iceberg's
    ``SparkSessionCatalog`` (type ``hadoop``), stores its warehouse under the
    relative path ``../data/warehouse`` and requests the
    ``iceberg-spark-runtime-3.5_2.12:1.4.3`` package.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    # Use the same interpreter as the notebook kernel
    for env_name in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
        os.environ[env_name] = sys.executable

    # Settings are applied in insertion order, one .config() call per entry.
    iceberg_settings = {
        "spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog",
        "spark.sql.catalog.spark_catalog.type": "hadoop",
        "spark.sql.catalog.spark_catalog.warehouse": "../data/warehouse",
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.3",
    }

    builder = SparkSession.builder.appName("IcebergWritedata")
    for option, value in iceberg_settings.items():
        builder = builder.config(option, value)

    return builder.getOrCreate()

In [3]:
def get_api_endpoint_excel(spark:SparkSession,path:str,filter:str = None,timeout:float = 60.0) -> DataFrame:
    """Download an Excel file over HTTP and load it into a Spark DataFrame.

    Args:
        spark: Session used to build the resulting Spark DataFrame.
        path: Endpoint URL without a query string.
        filter: Optional query string (without the leading ``?``) appended to
            ``path`` verbatim, so it must already be URL-encoded. The name
            shadows the ``filter`` builtin inside this function; it is kept so
            existing keyword callers keep working.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the notebook cell indefinitely.

    Returns:
        A Spark DataFrame built from the downloaded workbook (read with
        ``pd.read_excel`` defaults) when the server answers with HTTP 200.
        For any other status code an error line is printed and ``None`` is
        returned, so callers should check the result before using it.

    Raises:
        requests.exceptions.RequestException: If the request itself fails,
            including when ``timeout`` is exceeded.
    """
    # Only append the query separator when there is something to append.
    url = f"{path}?{filter}" if filter else f"{path}"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error {response.status_code}: no se pudo obtener el archivo.")
        return None

    # Read the Excel payload with pandas straight from memory
    df_pandas = pd.read_excel(BytesIO(response.content))

    # Convert the pandas DataFrame into a Spark DataFrame and hand it back
    return spark.createDataFrame(df_pandas)


In [4]:
# AENA endpoint; the query decodes to
# "desde (año)=2004" and "Aeropuerto AENA=JT Barcelona-El Prat".
# NOTE(review): `filter` shadows the builtin of the same name in the kernel
# namespace; the name is kept because other cells assign and read it.
path = "https://dataestur.azure-api.net/API-SEGITTUR-v1/AENA_DESTINOS_DL"
filter = "desde%20%28a%C3%B1o%29=2004&Aeropuerto%20AENA=JT%20Barcelona-El%20Prat"

# One Spark session is created here and reused by the following cells.
spark = create_context()
df = get_api_endpoint_excel(spark=spark, path=path, filter=filter)

# Preview the downloaded rows.
df.show()


+----+---+--------------------+---------------------+
| AÑO|MES|     AEROPUERTO_AENA|PASAJEROS_POR_DESTINO|
+----+---+--------------------+---------------------+
|2004|  1|JT Barcelona-El Prat|              1487280|
|2004|  2|JT Barcelona-El Prat|              1681762|
|2004|  3|JT Barcelona-El Prat|              1943771|
|2004|  4|JT Barcelona-El Prat|              2030420|
|2004|  5|JT Barcelona-El Prat|              2128796|
|2004|  6|JT Barcelona-El Prat|              2203999|
|2004|  7|JT Barcelona-El Prat|              2377152|
|2004|  8|JT Barcelona-El Prat|              2503196|
|2004|  9|JT Barcelona-El Prat|              2262870|
|2004| 10|JT Barcelona-El Prat|              2186597|
|2004| 11|JT Barcelona-El Prat|              1846926|
|2004| 12|JT Barcelona-El Prat|              1711314|
|2005|  1|JT Barcelona-El Prat|              1619040|
|2005|  2|JT Barcelona-El Prat|              1766193|
|2005|  3|JT Barcelona-El Prat|              2159888|
|2005|  4|JT Barcelona-El Pr

In [5]:
# Make sure the target database exists before writing into it.
spark.sql("CREATE DATABASE IF NOT EXISTS spark_catalog.local_db")

# Save the DataFrame as an Iceberg table (created, or replaced if present).
(
    df.writeTo("spark_catalog.local_db.aena_barcelona")
    .using("iceberg")
    .createOrReplace()
)

In [6]:
# EOH_PUNT_TUR_DL endpoint; the query decodes to "Punto turístico=Todos",
# "Lugar de residencia=Todos" and "Provincia=Todos".
# NOTE(review): `path`, `filter` and `df` are rebound here, replacing the
# values assigned by the earlier AENA cell.
path = "https://dataestur.azure-api.net/API-SEGITTUR-v1/EOH_PUNT_TUR_DL"
filter = "Punto%20tur%C3%ADstico=Todos&Lugar%20de%20residencia=Todos&Provincia=Todos"

df = get_api_endpoint_excel(spark=spark, path=path, filter=filter)

In [7]:
# Save the DataFrame as an Iceberg table (created, or replaced if present).
(
    df.writeTo("spark_catalog.local_db.hoteles_punto_turistico")
    .using("iceberg")
    .createOrReplace()
)