In [None]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
import unicodedata
import re
import utils as utils

In [2]:
# Source table coordinates: (database, table).
source_db, source_table = "landing", "aena"
# Target table coordinates: (database, table).
db_name, table_name = "trusted", "f_flujo_aereo"

In [3]:
# Obtain the Spark handle used by every following cell from the project helper.
# NOTE(review): presumably a SparkSession already configured for the Iceberg
# catalog — confirm in utils.create_context.
spark = utils.create_context()

In [None]:
@udf(StringType())
def normalize_text(text: str):
    """Return a canonical upper-case form of ``text`` for use as a Spark UDF.

    The value is stripped of accents, of every character that is not an
    ASCII letter, an ASCII digit or whitespace, and of redundant whitespace,
    then upper-cased.

    Args:
        text: Input string, or ``None``.

    Returns:
        The cleaned string; an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ""

    # NFKD decomposition splits an accented letter into its base letter
    # followed by combining marks, which are then dropped.
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]

    # Keep only ASCII letters, digits and whitespace.
    alnum_only = re.sub(r'[^A-Za-z0-9\s]', '', ''.join(base_chars))

    # split()/join collapses whitespace runs to a single space and trims both ends.
    return ' '.join(alnum_only.split()).upper()

In [None]:

# Read the raw data from the source table (source_db.source_table).
# The target table (db_name.table_name) must only be written to: reading it
# here would feed this cell its own output, whose columns have already been
# renamed by the select below.
df_raw = utils.read_iceberg_table(spark,source_db,source_table)

# Project the raw columns onto the target schema: rename them, normalize the
# airport name and cast the passenger count to an integer.
# NOTE(review): the output names "earopuertoAena" and "passajerosPorDestino"
# look like typos of "aeropuertoAena" / "pasajerosPorDestino". They are kept
# unchanged because they define the schema of the written table — confirm
# that no consumer depends on them before renaming.
df = df_raw.select(
    col("AÑO").alias("año"),
    col("MES").alias("mes"),
    normalize_text(col("AEROPUERTO_AENA")).alias("earopuertoAena"),
    # Cast first and alias last so the alias names the final expression.
    col("PASAJEROS_POR_DESTINO").cast(IntegerType()).alias("passajerosPorDestino")
)

In [None]:
# Hand the transformed DataFrame to the project helper that writes it to
# db_name.table_name.
# NOTE(review): judging by its name the helper replaces existing data in the
# table — confirm in utils.overwrite_iceberg_table.
utils.overwrite_iceberg_table(spark,df,db_name,table_name)