In [11]:
# %pip install pyspark==3.4.1

In [12]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# from pyspark.sql.functions import array, col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
import datetime

In [13]:
import os
# Current working directory of the kernel process; the dataset path used when
# loading the CSV is built from this value.
cwd = os.getcwd()

In [14]:
# Obtain the Spark session for this notebook. getOrCreate() hands back an
# already-active session when there is one instead of building a second.
# To run without the cluster, pass "local[1]" to master() instead of the
# spark:// URL below.
spark = (
    SparkSession.builder
    .master("spark://127.0.0.1:7077")
    .appName('TradeDetailedTradeMatrix')
    .getOrCreate()
)

In [15]:
# Load the trade matrix CSV; the first line of the file supplies the column
# names. No schema is passed, so Spark's CSV reader keeps its default typing
# (call tdtm_df.printSchema() to inspect the result).
source_csv = f"{cwd}/dataset/Trade_DetailedTradeMatrix_E_All_Data/Trade_DetailedTradeMatrix_E_All_Data.csv"
tdtm_df = spark.read.csv(source_csv, header=True)

In [16]:
# Names of all columns in the loaded frame, in their original order.
# dtypes yields (name, type) pairs; only the name is kept.
col_name = [name for name, _dtype in tdtm_df.dtypes]

In [17]:
# Collect the years that have a value column named exactly "Y<digits>"
# (e.g. "Y1986"). The companion flag columns ("Y1986F") are skipped.
#
# A strict prefix + all-digits check is used rather than substring tests, so
# that any other column that happens to contain an upper-case "Y" without an
# "F" (for instance a suffixed variant such as "Y1986N") cannot reach int()
# and abort the cell with a ValueError.
year_list = [
    int(name[1:])
    for name in col_name
    if name.startswith('Y') and name[1:].isdecimal()
]

In [19]:
# Reshape the wide table (one "Y<year>" value column and one "Y<year>F" flag
# column per year) into a long table with one row per source row and year.
#
# The whole reshape is expressed as two select() calls instead of one
# withColumn()/drop() pair per year: every withColumn() adds another projection
# to the query plan, and the PySpark documentation advises against calling it
# in a loop for that reason. The explicit drops are unnecessary because the
# select() below already keeps only the columns that are needed.

# Captured once on the driver so every row of this run carries the same stamp.
transformed_date = datetime.datetime.now()

# One [value, flag, year] triple per year, gathered into a single array column.
# NOTE(review): each triple mixes the year columns with an integer literal; if
# the year columns are strings (as when read from CSV without a schema), Spark
# is expected to coerce all three elements to string. Confirm that string-typed
# value/year columns are what the warehouse table should contain.
year_triples = F.array(*[
    F.array(F.col(f"Y{year}"), F.col(f"Y{year}F"), F.lit(year))
    for year in year_list
])

exploded_df = tdtm_df.select(
    F.col('Reporter Country Code').cast('int').alias('reporter_country_id'),
    F.col('Partner Country Code').cast('int').alias('partner_country_id'),
    F.col('Item Code').cast('int').alias('item_prod_id'),
    F.col('Element Code').cast('int').alias('element_trade_id'),
    # explode() emits one output row per array element, i.e. one per year.
    F.explode(year_triples).alias('year_col'),
)

# Unpack the triple into named columns and attach the load timestamp; the
# intermediate 'year_col' is simply not selected, so no drop() is required.
tdtm_df = exploded_df.select(
    'reporter_country_id',
    'partner_country_id',
    'item_prod_id',
    'element_trade_id',
    F.col('year_col')[0].alias('value'),
    F.col('year_col')[1].alias('flag'),
    F.col('year_col')[2].alias('year'),
    F.lit(transformed_date).alias('transformed_date'),
)
tdtm_df.show()

+-------------------+------------------+------------+----------------+-----+----+----+--------------------+
|reporter_country_id|partner_country_id|item_prod_id|element_trade_id|value|flag|year|    transformed_date|
+-------------------+------------------+------------+----------------+-----+----+----+--------------------+
|                  2|                 4|         230|            5910| null|null|1986|2023-07-25 16:50:...|
|                  2|                 4|         230|            5910| null|null|1987|2023-07-25 16:50:...|
|                  2|                 4|         230|            5910| null|null|1988|2023-07-25 16:50:...|
|                  2|                 4|         230|            5910| null|null|1989|2023-07-25 16:50:...|
|                  2|                 4|         230|            5910| null|null|1990|2023-07-25 16:50:...|
|                  2|                 4|         230|            5910| null|null|1991|2023-07-25 16:50:...|
|                  2|       

In [20]:
# Persist the transformed table as Parquet on HDFS. "overwrite" replaces
# whatever is already stored at the target path.
warehouse_path = "hdfs://127.0.0.1:9000/FAOSTAT_prj/DataWarehouse/Trade_Detailed_Matrix"
tdtm_df.write.parquet(warehouse_path, mode="overwrite")

In [23]:
# Stop the Spark session as the final step of the notebook.
spark.stop()