In [2]:
# %pip install pyspark==3.4.1

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# from pyspark.sql.functions import array, col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
import datetime

In [4]:
import os
# Working directory of the notebook kernel; used below as the base folder of the dataset path.
cwd = os.getcwd()

In [5]:
# Create (or reuse) the Spark session for this notebook, attached to the Spark
# master listening at spark://127.0.0.1:7077.
# To run without a cluster, swap the master URL for "local[1]".
spark = (
    SparkSession.builder
    .master("spark://127.0.0.1:7077")
    .appName('AnnualValue')
    .getOrCreate()
)

In [6]:
# Load the Production_Crops_Livestock CSV from the dataset folder under the working
# directory. The first line is treated as the header; no schema is supplied or
# inferred here, so columns keep the reader's default types.
csv_path = f"{cwd}/dataset/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_All_Data.csv"
pcl_df = spark.read.csv(csv_path, header=True)
# pcl_df.printSchema()

In [7]:
# Names of every column in the raw frame (dtypes yields (name, type) pairs).
col_name = [name for name, _dtype in pcl_df.dtypes]
# col_name

In [8]:
# Collect the years that have a value column. Value columns are named exactly
# "Y<digits>" (the matching flag column is "Y<year>F"). Matching that exact shape
# skips any other column that merely contains a "Y" without an "F", which would
# otherwise reach int() and raise ValueError.
year_list = [
    int(name[1:])
    for name in col_name
    if name.startswith('Y') and name[1:].isdecimal()
]
# year_list

In [9]:
# Build the element lookup table: one row per element code with its name and unit.
# Deduplicate first and sort last: dropDuplicates() gives no ordering guarantee,
# so sorting before it neither guarantees sorted output nor avoids wasted work.
# NOTE(review): dropDuplicates(["id"]) keeps an arbitrary row per id; this assumes
# element/unit are constant for a given Element Code — confirm against the data.
ElementProd_df = (
    pcl_df.select(
        pcl_df['Element Code'].cast('int').alias('id'),
        pcl_df['Element'].cast('string').alias('element'),
        pcl_df['Unit'].cast('string').alias('unit'),
    )
    .dropDuplicates(["id"])
    .orderBy('id')
)
ElementProd_df.show()

+----+--------------------+---------+
|  id|             element|     unit|
+----+--------------------+---------+
|5111|              Stocks|     Head|
|5112|              Stocks|1000 Head|
|5114|              Stocks|       No|
|5312|      Area harvested|       ha|
|5313|              Laying|1000 Head|
|5314|        Prod Popultn|       No|
|5318|        Milk Animals|     Head|
|5320|Producing Animals...|     Head|
|5321|Producing Animals...|1000 Head|
|5410|               Yield| 100mg/An|
|5413|               Yield|    No/An|
|5417|Yield/Carcass Weight|    hg/An|
|5419|               Yield|    hg/ha|
|5420|               Yield|    hg/An|
|5422|               Yield|       hg|
|5424|Yield/Carcass Weight|  0.1g/An|
|5510|          Production|   tonnes|
|5513|          Production|  1000 No|
+----+--------------------+---------+



In [10]:
# Reshape the wide yearly columns (Y<year> value / Y<year>F flag) into one row per
# (country, item, element, year).
#
# The result is built with two select() calls instead of withColumn()/drop() inside
# loops over every year: each of those calls adds a projection to the logical plan,
# so looping makes the plan grow with the number of years.
year_cols = [
    # One [value, flag, year] triple per year. array() holds a single element type,
    # so the integer year literal is expected to be coerced to the type of the
    # value/flag columns — TODO confirm the resulting type of `year`.
    F.array(F.col(f"Y{year}"), F.col(f"Y{year}F"), F.lit(year))
    for year in year_list
]
# One timestamp, taken once, stamped on every output row.
transformed_date = datetime.datetime.now()
pcl_df = pcl_df.select(
    F.col('Area Code').cast('int').alias('country_id'),
    F.col('Item Code').cast('int').alias('item_prod_id'),
    F.col('Element Code').cast('int').alias('element_prod_id'),
    # explode() emits one output row per year triple.
    F.explode(F.array(*year_cols)).alias('year_col'),
)
# Unpack the triple into named columns and drop the temporary array column.
pcl_df = pcl_df.select(
    'country_id',
    'item_prod_id',
    'element_prod_id',
    F.col('year_col')[0].alias('value'),
    F.col('year_col')[1].alias('flag'),
    F.col('year_col')[2].alias('year'),
    F.lit(transformed_date).alias('transformed_date'),
)
pcl_df.show()

+----------+------------+---------------+-----------+----+----+--------------------+
|country_id|item_prod_id|element_prod_id|      value|flag|year|    transformed_date|
+----------+------------+---------------+-----------+----+----+--------------------+
|         2|         221|           5312|       null|null|1961|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1962|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1963|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1964|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1965|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1966|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1967|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1968|2023-07-26 19:57:...|
|         2|         221|           5312|       null|null|1969|20

In [11]:
# Write pcl_df as Parquet to HDFS, replacing whatever already exists at the target path.
pcl_df.write.parquet(
    "hdfs://127.0.0.1:9000/FAOSTAT_prj/DataWarehouse/Production_Crops_Livestock",
    mode="overwrite",
)

In [12]:
# Write the element lookup table as Parquet to HDFS, replacing any existing output.
ElementProd_df.write.parquet(
    "hdfs://127.0.0.1:9000/FAOSTAT_prj/DataWarehouse/Element_Prod",
    mode="overwrite",
)

In [13]:
# Shut down the Spark session; no further Spark work can run in this notebook after this cell.
spark.stop()