In [1]:
import pyspark
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import upper, col
import utils as utils

In [7]:
# Trusted-zone load for the tourism-by-province table:
#   1. read the landing table,
#   2. drop every row that contains a null,
#   3. upper-case the origin/destination text columns,
#   4. hand the result to utils.overwrite_iceberg_table for the trusted zone.
# The Spark session is always stopped at the end, whether or not the load succeeds.
spark = utils.create_context()

# Source configuration (landing zone)
source_db = "landing"
source_table = "turismo_Provincia"

# Target configuration (trusted zone)
target_db = "trusted"
target_table = "turismo_Provincia_clean"

try:
    # Read data from landing zone
    print(f"Reading data from {source_db}.{source_table}")
    landing_df = spark.table(f"{source_db}.{source_table}")

    # count() is a Spark action (runs a job), so each count is taken once
    # and kept in a variable instead of being recomputed later.
    original_count = landing_df.count()
    print(f"Original record count: {original_count}")

    # Data cleaning and normalization
    print("Applying data cleaning and normalization...")

    # 1. Remove rows with any null values
    clean_df = landing_df.dropna()

    clean_count = clean_df.count()
    print(f"Record count after removing nulls: {clean_count}")

    # 2. Convert text columns to uppercase
    # Columns expected to hold text that must be normalized to upper case
    string_columns = ["CCAA_ORIGEN", "PROVINCIA_ORIGEN", "CCAA_DESTINO", "PROVINCIA_DESTINO"]

    # withColumn replaces a column in place, so the set of column names is
    # stable across the loop and can be computed once.
    available_columns = set(clean_df.columns)
    for column in string_columns:
        if column in available_columns:
            clean_df = clean_df.withColumn(column, upper(col(column)))
        else:
            # Report the skip rather than silently ignoring a schema mismatch.
            print(f"Warning: column {column} not found in {source_db}.{source_table}; skipping uppercase")

    # Optional: Show sample of cleaned data
    print("Sample of cleaned data:")
    clean_df.show(5, truncate=False)

    # Write to trusted zone
    # NOTE(review): the helper's name suggests it replaces the existing table
    # contents — confirm against utils.overwrite_iceberg_table.
    print(f"Writing cleaned data to {target_db}.{target_table}")
    utils.overwrite_iceberg_table(spark, clean_df, target_db, target_table)

    print("Trusted zone processing completed successfully!")
    # The uppercase step does not add or remove rows, so the count taken
    # right after dropna() is the final count; reusing it avoids running
    # another full job over the landing table after the write.
    print(f"Final record count: {clean_count}")

except Exception as e:
    print(f"Error during trusted zone processing: {str(e)}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

finally:
    # Stop Spark session
    spark.stop()

ConnectionRefusedError: [WinError 10061] No se puede establecer una conexión ya que el equipo de destino denegó expresamente dicha conexión