# Exports formatting

In [1]:
from database_settings import spark_utilities
from pyspark.sql.functions import col, lpad, concat_ws, regexp_replace, trim
import pandas as pd

### Get the data from the persistent zone

In [2]:
# Heading prefixes to preprocess (see PDF); each entry is the leading two
# digits of the 10-digit heading code.
headings = [
    "07",  # 07XXXXXXXX: edible vegetables, plants, roots and tubers
    "08",  # 08XXXXXXXX: edible fruit and nuts; peel of citrus fruit, melons or watermelons
]

In [3]:
# Filter settings used when formatting the exports.

# Anchored regex alternation: matches codes that START with one of the selected
# heading prefixes; everything else is filtered out.
headings_filter = f"^({'|'.join(headings)})"

# Headings with a low number of observations are filtered out.
observations_threshold = 10

# Exports with a low net weight (samples) are filtered out.
net_weight_thresh = 100

# Exports with a low USD value (samples) are filtered out.
usd_value_thresh = 500

In [9]:
# Formatting applied to the raw 'peru_exports' data:
#   1. Build HEADING: PART_NANDI cast to string and left-padded with '0' to 10 characters.
#   2. Keep only rows whose HEADING matches `headings_filter`.
#   3. Keep only exports strictly above the net-weight and USD-value thresholds.
#   4. Merge the description columns into a single DESCRIPTION column (space
#      separated), remove every occurrence of the text "NaN" and trim the ends.
#   5. Replace the exporter code text 'No Disponib' with 'unknown'.
#   6. Rename the source columns to English names.
#   7. Keep only headings whose row count is strictly above `observations_threshold`.
source_columns = [
    'PART_NANDI', 'VPESNET', 'VPESBRU', 'VFOBSERDOL', 'CPAIDES', 'NDOC', 'FEMB',
    'DCOM', 'DMER2', 'DMER3', 'DMER4', 'DMER5', 'BATCH_WEEK',
]
description_parts = [col(name) for name in ('DCOM', 'DMER2', 'DMER3', 'DMER4', 'DMER5')]
column_renames = {
    'NDOC': 'EXP_ID',
    'VPESNET': 'NET_WEIGHT',
    'VPESBRU': 'GROSS_WEIGHT',
    'VFOBSERDOL': 'VALUE_USD',
    'CPAIDES': 'COUNTRY',
    'FEMB': 'BOARDING_DATE',
}
output_columns = [
    'HEADING', 'EXP_ID', 'NET_WEIGHT', 'GROSS_WEIGHT', 'VALUE_USD',
    'COUNTRY', 'BOARDING_DATE', "DESCRIPTION", 'BATCH_WEEK',
]

# Steps 1-3: load, derive the heading code and apply the row-level filters.
exports_sdf = spark_utilities.get_spark_df('peru_exports').select(*source_columns)
exports_sdf = exports_sdf.withColumn("HEADING", lpad(col("PART_NANDI").cast("string"), 10, "0"))
exports_sdf = exports_sdf.filter(
    col("HEADING").rlike(headings_filter)
    & (col('VPESNET') > net_weight_thresh)
    & (col('VFOBSERDOL') > usd_value_thresh)
)

# Step 4: single cleaned description column.
merged_description = concat_ws(" ", *description_parts)
exports_sdf = exports_sdf.withColumn(
    "DESCRIPTION",
    trim(regexp_replace(merged_description, "NaN", "")),
)

# Step 5: normalise the exporter code while the column still has its source name.
exports_sdf = exports_sdf.withColumn("NDOC", regexp_replace(col("NDOC"), "No Disponib", "unknown"))

# Step 6: rename the source columns one by one.
for source_name, english_name in column_renames.items():
    exports_sdf = exports_sdf.withColumnRenamed(source_name, english_name)

# Step 7: count rows per heading, join the counts back and keep only headings
# above the threshold; the result is collected into a pandas DataFrame.
count_hs = exports_sdf.groupBy('HEADING').count()
df = (
    exports_sdf.join(count_hs, 'HEADING')
    .filter(col('count') > observations_threshold)
    .select(*output_columns)
    .toPandas()
)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\Users\sergio\anaconda3\envs\thesis\lib\site-packages\py4j\clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\sergio\anaconda3\envs\thesis\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\sergio\anaconda3\envs\thesis\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\sergio\anaconda3\envs\thesis\lib\site-packages\py4j\clientserver.py", line 503, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [8]:
# Show the formatted exports ordered by heading code (ascending is the default).
df.sort_values('HEADING')

Unnamed: 0,HEADING,EXP_ID,NET_WEIGHT,GROSS_WEIGHT,VALUE_USD,COUNTRY,BOARDING_DATE,DESCRIPTION
1275551,0701900000,unknown,29000.0,29010.000,3770.0,BO,20221115,PAPA FRESCA EN 250 SACOS DE POLIPROPILENO DE 1...
781404,0701900000,unknown,29000.0,29040.000,12180.0,BO,20221019,PAPA FRESCA EN 290 SACOS DE POLIPROPILENO DE 1...
781403,0701900000,20601566916,28810.0,28810.000,12180.0,BO,20221016,PAPA FRESCA EN 290 SACOS DE POLIPROPILENO DE 1...
781402,0701900000,20609694743,29000.0,29007.000,10788.0,BO,20221020,PAPA FRESCA EN 250 SACOS DE POLIPROPILENO DE 1...
1391103,0701900000,20519791413,930.0,1062.957,2976.0,ES,20230329,PAPA AMARILLA CONGELADA BOLSAS 500 GR CAJAS US...
...,...,...,...,...,...,...,...,...
373272,0814009000,20110378956,149462.0,149840.000,149462.0,ES,20220801,CASCARA DESHIDRATADA DE LIMON EN SACOS SE ACOG...
840940,0814009000,20517142965,14010.0,14280.000,65972.0,DE,20191105,CASCARA DE NARANJA ORGANICA EN SACOS DE 15 K...
473894,0814009000,20517142965,10545.0,10830.000,56943.0,US,20181103,CASCARA DE NARANJA ORGANICA DE 15 KILOS C/U ...
512515,0814009000,20517142965,5010.0,5114.007,20040.0,US,20210701,CASCARA DE NARANJA ORGANICA ORGANIC ORANGE PEE...


In [5]:
# TODO: send the result to Postgres and then implement the incremental load