In [39]:
# %pip install pyspark==3.4.1

In [40]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# from pyspark.sql.functions import array, col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
import datetime

In [41]:
import os
cwd = os.getcwd()

In [42]:
# spark = SparkSession.builder \
#                     .master("spark://127.0.0.1:7077") \
#                     .appName('Deflator') \
#                     .getOrCreate()
spark = SparkSession.builder.master("local[1]") \
                    .appName('Deflator') \
                    .getOrCreate()

In [43]:
d_df = spark.read.option("header", True)\
    .csv(f"{cwd}/dataset/Deflators_E_All_Data/Deflators_E_All_Data.csv")
# d_df.printSchema()

In [44]:
col_name = []
for col in d_df.dtypes:
    col_name.append(col[0])
# col_name

In [45]:
year_list = []
for x in col_name:
    if 'Y' in x and 'F' not in x:
        year_list.append(int(x.replace('Y', '')))
# year_list

In [46]:
ElementDeflator_df = d_df.select(
    d_df['Element Code'].cast('int').alias('id'),
    d_df['Element'].cast('string').alias('element'),
    d_df['Unit'].cast('string').alias('unit'),
).orderBy('id')
ElementDeflator_df = ElementDeflator_df.dropDuplicates(["id"])
ElementDeflator_df.show()

+----+--------------------+----+
|  id|             element|unit|
+----+--------------------+----+
|6179|Value US$, 2015 p...| US$|
|6180|Value Local Curre...| LCU|
+----+--------------------+----+



In [47]:
ItemDeflator_df = d_df.select(
    d_df['Item Code'].cast('int').alias('id'),
    d_df['Item'].cast('string').alias('item'),
).orderBy('id')
ItemDeflator_df = ItemDeflator_df.dropDuplicates(["id"])
ItemDeflator_df.show()

+-----+--------------------+
|   id|                item|
+-----+--------------------+
|22024|        GDP Deflator|
|22025|Gross Fixed Capit...|
|22026|Value Added Defla...|
|22028|Value Added Defla...|
+-----+--------------------+



In [48]:
for year in year_list:
    d_df = d_df.withColumn(f"NewColumn_{year}", F.array(f"Y{year}", f"Y{year}F", F.lit(year)))
    d_df = d_df.drop(f"Y{year}", f"Y{year}F")
    
d_df = d_df.withColumn(f"merge_column", F.array(*[f"NewColumn_{year}" for year in year_list]))
transformed_date = datetime.datetime.now()
for year in year_list:
    d_df = d_df.drop(f"NewColumn_{year}")
d_df = d_df.select(
    d_df['Area Code'].cast('int').alias('country_id'),
    d_df['Item Code'].cast('int').alias('item_deflator_id'),
    d_df['Element Code'].cast('int').alias('element_deflator_id'),
    F.explode('merge_column').alias('year_col')
)
d_df = d_df.withColumn('value', F.lit(d_df.year_col[0])) \
            .withColumn('flag', F.lit(d_df.year_col[1])) \
            .withColumn('year', F.lit(d_df.year_col[2])) \
            .withColumn('transformed_date', F.lit(transformed_date))
d_df = d_df.drop('year_col')
d_df.show()

+----------+----------------+-------------------+------------+----+----+--------------------+
|country_id|item_deflator_id|element_deflator_id|       value|flag|year|    transformed_date|
+----------+----------------+-------------------+------------+----+----+--------------------+
|         2|           22024|               6180|0.0121732610|   E|1970|2023-07-25 16:29:...|
|         2|           22024|               6180|0.0127808140|   E|1971|2023-07-25 16:29:...|
|         2|           22024|               6180|0.0133249140|   E|1972|2023-07-25 16:29:...|
|         2|           22024|               6180|0.0140519150|   E|1973|2023-07-25 16:29:...|
|         2|           22024|               6180|0.0162203650|   E|1974|2023-07-25 16:29:...|
|         2|           22024|               6180|0.0172100080|   E|1975|2023-07-25 16:29:...|
|         2|           22024|               6180|0.0176550000|   E|1976|2023-07-25 16:29:...|
|         2|           22024|               6180|0.020703027

In [49]:
# d_df.write\
#         .partitionBy("area_code") \
#         .mode("overwrite") \
#         .csv(f"{cwd}/Transform_Data/Deflators_area_code")

In [50]:
# d_df.write\
#         .partitionBy("year") \
#         .mode("overwrite") \
#         .csv(f"{cwd}/Transform_Data/Deflators_year")

In [51]:
spark.stop()