In [2]:
# %pip install pyspark==3.4.1

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# from pyspark.sql.functions import array, col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [4]:
import os
cwd = os.getcwd()

In [5]:
# spark = SparkSession.builder.master("local[1]") \
#                     .appName('Country') \
#                     .getOrCreate()
spark = SparkSession.builder \
                    .master("spark://127.0.0.1:7077") \
                    .appName('Country') \
                    .getOrCreate()

In [6]:
pcl_df = spark.read.option("header", True)\
    .csv(f"{cwd}/dataset/UNSD_Methodology.csv")
area_code_df = spark.read.option("header", True)\
    .csv(f"{cwd}/dataset/ConsumerPriceIndices_E_All_Data/ConsumerPriceIndices_E_AreaCodes.csv")
pcl_df.printSchema()
area_code_df.printSchema()

root
 |-- Global Code: string (nullable = true)
 |-- Global Name: string (nullable = true)
 |-- Region Code: string (nullable = true)
 |-- Region Name: string (nullable = true)
 |-- Sub-region Code: string (nullable = true)
 |-- Sub-region Name: string (nullable = true)
 |-- Intermediate Region Code: string (nullable = true)
 |-- Intermediate Region Name: string (nullable = true)
 |-- Country or Area: string (nullable = true)
 |-- M49 Code: string (nullable = true)
 |-- ISO-alpha2 Code: string (nullable = true)
 |-- ISO-alpha3 Code: string (nullable = true)
 |-- Least Developed Countries (LDC): string (nullable = true)
 |-- Land Locked Developing Countries (LLDC): string (nullable = true)
 |-- Small Island Developing States (SIDS): string (nullable = true)

root
 |-- Area Code: string (nullable = true)
 |-- M49 Code: string (nullable = true)
 |-- Area: string (nullable = true)



In [7]:
pcl_df = pcl_df.select(
    pcl_df['M49 Code'].alias('m49_id'),
    pcl_df['Country or Area'].alias('country'),
    pcl_df['Sub-region Code'].alias('sub_region_code'),
    pcl_df['Sub-region Name'].alias('sub_region_name'),
    pcl_df['Region Code'].alias('region_code'),
    pcl_df['Region Name'].alias('region_name'),
)
pcl_df.show()

+------+--------------------+---------------+------------------+-----------+-----------+
|m49_id|             country|sub_region_code|   sub_region_name|region_code|region_name|
+------+--------------------+---------------+------------------+-----------+-----------+
|    12|             Algeria|             15|   Northern Africa|          2|     Africa|
|   818|               Egypt|             15|   Northern Africa|          2|     Africa|
|   434|               Libya|             15|   Northern Africa|          2|     Africa|
|   504|             Morocco|             15|   Northern Africa|          2|     Africa|
|   729|               Sudan|             15|   Northern Africa|          2|     Africa|
|   788|             Tunisia|             15|   Northern Africa|          2|     Africa|
|   732|      Western Sahara|             15|   Northern Africa|          2|     Africa|
|    86|British Indian Oc...|            202|Sub-Saharan Africa|          2|     Africa|
|   108|             

In [8]:
area_code_df = area_code_df.select(
    area_code_df['Area Code'].alias('id'),
    F.translate(F.col("M49 Code"), "'", "").cast(IntegerType()).alias("m49_id"),
    area_code_df['Area'].alias('country'),
)
area_code_df.show()

+----+------+--------------------+
|  id|m49_id|             country|
+----+------+--------------------+
|   2|     4|         Afghanistan|
|5100|     2|              Africa|
| 284|   248|       �land Islands|
|   3|     8|             Albania|
|   4|    12|             Algeria|
|5200|    19|            Americas|
|   6|    20|             Andorra|
|   7|    24|              Angola|
| 258|   660|            Anguilla|
|   8|    28| Antigua and Barbuda|
|   9|    32|           Argentina|
|   1|    51|             Armenia|
|  22|   533|               Aruba|
|5300|   142|                Asia|
|  10|    36|           Australia|
|5501|    53|Australia and New...|
|  11|    40|             Austria|
|  52|    31|          Azerbaijan|
|  12|    44|             Bahamas|
|  13|    48|             Bahrain|
+----+------+--------------------+
only showing top 20 rows



In [9]:
final_df = pcl_df.join(area_code_df, pcl_df.m49_id == area_code_df.m49_id, 'inner').select(
    area_code_df.id,
    pcl_df.country,
    pcl_df.sub_region_code,
    pcl_df.sub_region_name,
    pcl_df.region_code,
    pcl_df.region_name,
)
final_df.show()

+---+-----------+---------------+------------------+-----------+-----------+
| id|    country|sub_region_code|   sub_region_name|region_code|region_name|
+---+-----------+---------------+------------------+-----------+-----------+
|  4|    Algeria|             15|   Northern Africa|          2|     Africa|
| 59|      Egypt|             15|   Northern Africa|          2|     Africa|
|124|      Libya|             15|   Northern Africa|          2|     Africa|
|143|    Morocco|             15|   Northern Africa|          2|     Africa|
|276|      Sudan|             15|   Northern Africa|          2|     Africa|
|222|    Tunisia|             15|   Northern Africa|          2|     Africa|
| 29|    Burundi|            202|Sub-Saharan Africa|          2|     Africa|
| 45|    Comoros|            202|Sub-Saharan Africa|          2|     Africa|
| 72|   Djibouti|            202|Sub-Saharan Africa|          2|     Africa|
|238|   Ethiopia|            202|Sub-Saharan Africa|          2|     Africa|

In [10]:
final_df.write\
    .mode("overwrite") \
    .parquet("hdfs://127.0.0.1:9000/FAOSTAT_prj/DataWarehouse/Country")

In [11]:
spark.stop()