# Exports Historical Formatting

In [6]:
from database_settings.spark_utilities import get_spark_df, spark_session
from pyspark.sql.functions import col, lpad, concat_ws, regexp_replace, trim
from database_settings.postgres_utilities import spark2postgres
from data_formatter import utilities

In [7]:
# Obtain the Spark session from the project's spark_utilities helper.
# It is handed to get_spark_df() when the exports are read further down.
spark = spark_session()

In [8]:
# Headings of interest, as returned by the project's formatter utilities
headings = utilities.get_headings()
# Regex alternation anchored at the start of the string: a value matches when
# it begins with any one of the headings above
headings_filter = rf"^({'|'.join(headings)})"
# Glob for the exports parquet files in HDFS
path = '/thesis/peru/exports/*.parquet'

In [9]:
# Formatting applied:
# - Build a 10-character, zero-padded heading from PART_NANDI and keep only the rows matching headings_filter
# - Merge the description columns into a single column, drop literal 'NaN' tokens and trim white spaces
# - Replace 'No Disponib' in the exporter code (NDOC) by 'unknown'
# - Rename the source columns to their output names and sort by heading
print("Performing query on Persistent Zone...")

# Columns that are concatenated (space separated) into the final description
description_columns = ['DCOM', 'DMER2', 'DMER3', 'DMER4', 'DMER5']

# Matches a whole 'NaN' token together with the separator in front of it.
# Removing the token this way does not leave doubled spaces in the middle of
# the description and does not touch 'NaN' when it is part of a longer word.
nan_token = r"(^| )NaN(?= |$)"

df = (
    get_spark_df(spark_session=spark, file_path=path)
    .select('PART_NANDI', 'VPESNET', 'VPESBRU', 'VFOBSERDOL', 'CPAIDES', 'NDOC', 'FEMB',
            'DCOM', 'DMER2', 'DMER3', 'DMER4', 'DMER5', 'BATCH_WEEK')
    # lpad also truncates to 10 characters if the casted code is longer
    .withColumn("heading", lpad(col("PART_NANDI").cast("string"), 10, "0"))
    .filter(col("heading").rlike(headings_filter))
    # concat_ws skips null columns, so only literal 'NaN' strings need cleaning afterwards
    .withColumn("description", concat_ws(" ", *[col(name) for name in description_columns]))
    .withColumn("description", trim(regexp_replace(col("description"), nan_token, "")))
    .withColumn("NDOC", regexp_replace(col("NDOC"), "No Disponib", "unknown"))
    # Project and rename in one step; column order matches the target table layout used so far
    .select(
        col('heading'),
        col('NDOC').alias('exp_id'),
        col('VPESNET').alias('net_weight'),
        col('VPESBRU').alias('gross_weight'),
        col('VFOBSERDOL').alias('value_usd'),
        col('CPAIDES').alias('country'),
        col('FEMB').alias('boarding_date'),
        col('description'),
        col('BATCH_WEEK').alias('batch_week'),
    )
    .orderBy(col('heading').asc())
)

Performing query on Persistent Zone...


In [10]:
# Write the DataFrame to the PostgreSQL database.
# NOTE: mode('append') adds rows on every execution, so re-running this cell
# inserts the same rows into peru_exports again.
print("Sending data to PostgreSQL...")
# Cache the result so that counting the rows and writing them can share one
# evaluation of the query instead of each re-running it from the parquet files.
df.cache()
try:
    # Counting first materialises the cache and yields the number reported below
    rows_sent = df.count()
    (
        df.write
        .format("jdbc")
        .option("dbtable", "peru_exports")
        .options(**spark2postgres())
        .mode('append')
        .save()
    )
except Exception as e:
    print(f"Error while sending data to Formatted Zone: {e}")
    # Re-raise so a failed load stops the notebook instead of passing silently
    raise
else:
    print("Data sent to Formatted Zone successfully: {} rows added".format(rows_sent))
finally:
    # Release the cached blocks whether or not the write succeeded
    df.unpersist()

Sending data to PostgreSQL
Data sent to Formatted Zone successfully: 1518837 rows added
