Data Analysis on " International Trade Dataset : June 2023 Quarter " by Stat Government of New Zealand

Link to dataset : https://www.stats.govt.nz/assets/Uploads/International-trade/International-trade-June-2023-quarter/Download-data/international-trade-june-2023-quarter.zip

In [84]:
#Importing all the necessary library 

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import   avg, rank, concat
from pyspark.sql.types import  FloatType, DoubleType
from pyspark.sql.window import Window
from pyspark.sql.functions import udf

In [54]:
#Initiation of Spark session

spark = SparkSession.builder.appName("International_Trade").getOrCreate()

Loading the dataset

In [55]:
#revised_data
data_path = "/Users/anujkhadka/Fusemachines47/ALL SPARK/Spark_Group_project/data/revised_final_data1.csv"  
revised_final = spark.read.csv(data_path, header=True, inferSchema=True)

#goods_classifications
data_path = "/Users/anujkhadka/Fusemachines47/ALL SPARK/Spark_Group_project/data/goods_classification.csv"  
goods_final = spark.read.csv(data_path, header=True, inferSchema=True)

#country_classification
data_path = "/Users/anujkhadka/Fusemachines47/ALL SPARK/Spark_Group_project/data/country_classification.csv"  
country_final = spark.read.csv(data_path, header=True, inferSchema=True)

#services_classification
data_path = "/Users/anujkhadka/Fusemachines47/ALL SPARK/Spark_Group_project/data/services_classification.csv"  
services_final = spark.read.csv(data_path, header=True, inferSchema=True)

                                                                                

1. Generate a pivot table that shows the total import values of Services for the top 4 importing countries in June 2023, with  "Transportation" service categories as columns.

In [56]:
from pyspark.sql.functions import col, sum

#Using filter for the revised dataset for importing

import_df = revised_df.filter((col("account") == "Imports") & (col("time_ref") == "2023-06-30"))

In [59]:
# Joining the dataframe "import_df" & "service_df" on the code(key) column

joined_df = import_df.join(services_final, import_df["code"] == services_final["code"])

In [60]:

# Using 'Group by' for country_code and service_label

grouped_df = joined_df.groupBy("country_code", "service_label").sum("value")

In [61]:
 # ranking of countries 
 
ranked_df = grouped_df.orderBy(col("sum(value)").desc())

In [62]:
#Top 4 importing countries extraction

top_countries = ranked_df.select("country_code").limit(4)

In [63]:
# creating pivot table

pivot_table = ranked_df \
    .join(top_countries, ["country_code"], "inner") \
    .groupBy("country_code").pivot("service_label", ["Services", "Transportation"]) \
    .sum("sum(value)")

In [64]:

pivot_table.show()

                                                                                

+------------+-------------+--------------+
|country_code|     Services|Transportation|
+------------+-------------+--------------+
|          AU|1.881982847E9|  2.49329771E8|
|          US|1.018642986E9|   7.6008244E7|
|          XX| 6.81217569E8|   6.3025441E7|
|          SG| 9.09750687E8|  3.26007453E8|
+------------+-------------+--------------+



2. Analyze the total import value of "Services" for each country in June 2023-06-30, and ranking the countries in descending order of import value.

In [65]:
# Data is stored in 'revised_df' DataFrame 

import_df = revised_final


In [66]:
#'Services' column & import in 2023-06-30 is filtered

import_df = import_df.filter((col("account") == "Imports") & (col("time_ref") == "2023-06-30") & (col("product_type") == "Services"))


In [67]:
 # groupBy 'country_code' 
 
import_df = import_df.groupBy("country_code").agg(sum("value").alias("Total Import Value"))

# descending order  ranking

import_df = import_df.orderBy(col("Total Import Value").desc())

# Columns renaming
import_df = import_df.withColumnRenamed("country_code", "Country Code")

In [68]:

import_df.show()

+------------+------------------+
|Country Code|Total Import Value|
+------------+------------------+
|          AU|     6.571340755E9|
|          US|      3.46553267E9|
|          XX|      2.03107177E9|
|          SG|     1.831464711E9|
|          CH|     1.136472799E9|
|          GB|     1.116098864E9|
|          DK|     1.080059039E9|
|          CN|      6.56124353E8|
|          IE|      3.62344309E8|
|          IN|      3.32702047E8|
|          CK|      2.83971901E8|
|          JP|      2.45157897E8|
|          BM|      2.13337641E8|
|          PH|      1.95167659E8|
|          WS|      1.93461138E8|
|          FR|      1.78350741E8|
|          ID|      1.72441524E8|
|          HK|      1.69274501E8|
|          DE|      1.52909324E8|
|          CA|      1.51301869E8|
+------------+------------------+
only showing top 20 rows



3. Quantify the supreme 20 countries with the highest average export value of "Goods" for each month in the updated time_ref & use window functions to calculate the average and rank the countries accordingly

In [69]:
# Dataframe filtering

df= df.filter(df['country_code'] != 'TOT (OMT CIF)')

df = df.filter(df['country_code'] != 'TOT (OMT VFD)')

In [70]:
# updating the time_ref 

filtered_df = df.filter((col("product_type") == "Goods") & (col("time_ref") == "2023-06-30"))

In [74]:
# Window specification 

window_spec = Window.partitionBy("time_ref").orderBy(col("avg_export_value").desc())

In [75]:
# average export value 

avg_export_df = (
    filtered_df
    .groupBy("time_ref", "country_code")
    .agg(avg(col("value")).alias("avg_export_value"))
    .withColumn("rank", rank().over(window_spec))
)

# top 3 countries 

top3_countries_by_month = avg_export_df.filter(col("rank") <= 20)

In [76]:
top3_countries_by_month.show()



+----------+------------+--------------------+----+
|  time_ref|country_code|    avg_export_value|rank|
+----------+------------+--------------------+----+
|2023-06-30|          DZ|        7.94222382E7|   1|
|2023-06-30|          ZB|               6.9E7|   2|
|2023-06-30|          ZE| 4.138399009090909E7|   3|
|2023-06-30|          CN|1.7777409388304863E7|   4|
|2023-06-30|          LY|         1.4830665E7|   5|
|2023-06-30|          ZP|         1.4671178E7|   6|
|2023-06-30|          TT|1.1182984674418604E7|   7|
|2023-06-30|          US|   7460452.486175115|   8|
|2023-06-30|          AU|   6452173.105574741|   9|
|2023-06-30|          JP|   6377873.931760741|  10|
|2023-06-30|          KR|  5682210.5516528925|  11|
|2023-06-30|          SA|   5121354.310880829|  12|
|2023-06-30|          SG|  4614410.8317460315|  13|
|2023-06-30|          NG|   4525148.837837838|  14|
|2023-06-30|          MY|   3657571.621019108|  15|
|2023-06-30|          ID|   3116630.939914163|  16|
|2023-06-30|

                                                                                

4. Computing the total import value for each country in the 'revised_final' table & presenting the peak 10 countries with the highest total export values.

In [77]:

from pyspark.sql.functions import sum, desc

In [78]:
revised_final = revised_final.filter(revised_final['country_code'] != 'TOT (OMT CIF)')

In [79]:
revised_final = revised_final.filter(revised_final['country_code'] != 'TOT (OMT VFD)')

In [80]:
# Filtering

import_data = revised_final.filter((revised_final["account"] == "Imports") & (revised_final["time_ref"] == "2023-06-30"))

In [81]:

# total import value 

total_imports = import_data.groupBy("country_code").agg(sum("value").alias("total_import_value"))

In [82]:
# countries with the highest total import values

top_10_importing_countries = total_imports.orderBy(desc("total_import_value")).limit(10)

In [83]:

top_10_importing_countries.show()

+------------+------------------+
|country_code|total_import_value|
+------------+------------------+
|          AU|   1.2831898537E10|
|          CN|   1.1493681947E10|
|          US|     9.651088289E9|
|          SG|     4.870192374E9|
|          JP|     4.309482252E9|
|          KR|     3.588781722E9|
|          GB|      2.65543796E9|
|          MY|     2.601556325E9|
|          DE|      2.59538621E9|
|          TH|     2.213766428E9|
+------------+------------------+



5. Compute the total export value for each product type i.e 'Goods & Services' in the "revised_csv" file for the June 2023 quarter using a UDF.

In [85]:
# Defining a UDF 

def calculate_total_export_value(values):
    total = 0.0
    
    # Iteration
    
    for val in values:
        try:
            total += float(val)
        except (ValueError, TypeError):
            pass  
    
    return total

# Registering the UDF

calculate_total_export_udf = udf(calculate_total_export_value, DoubleType())

# total export value

result_df = df.filter(df.time_ref == "2023-06-30") \
    .groupBy("product_type") \
    .agg(calculate_total_export_udf(F.collect_list("value")).alias("total_export_value"))


result_df = result_df.withColumn("total_export_value", F.format_number("total_export_value", 1))

In [86]:
result_df.show()

+------------+------------------+
|product_type|total_export_value|
+------------+------------------+
|    Services|  38,639,599,055.0|
|       Goods| 114,311,488,643.0|
+------------+------------------+



                                                                                

6. Identify countries where the export value for 'Financial Services' exceeds the import value for 'Financial Services' in 2023-06-30. Implement a UDF for this comparison.

In [88]:
# Filtering

time_ref = "2023-06-30"
revised_df_filtered = revised_df.filter(col("time_ref") == time_ref)
df_joined = services_final.join(revised_df_filtered, revised_df_filtered['code']==services_final['code'])

df_imp = df_joined.where(col('account')=='Imports')
df_exp  = df_joined.where(col('account')=='Exports')
df_exp = df_exp.withColumnRenamed('value',"value_exp")
df_joined = df_imp.join(df_exp,df_imp['country_code']==df_exp['country_code'])


df_joined.show()

                                                                                

+-------+--------------------+----------+-------+-------+------------+------------+--------+------+----+-------------+----------+-------+----+------------+------------+---------+------+
|   code|       service_label|  time_ref|account|   code|country_code|product_type|   value|status|code|service_label|  time_ref|account|code|country_code|product_type|value_exp|status|
+-------+--------------------+----------+-------+-------+------------+------------+--------+------+----+-------------+----------+-------+----+------------+------------+---------+------+
|A120423|Other personal tr...|2023-06-30|Imports|A120423|          DZ|    Services|110323.0|     F| A12|     Services|2023-06-30|Exports| A12|          DZ|    Services| 330185.0|     F|
| A12042|     Personal travel|2023-06-30|Imports| A12042|          DZ|    Services|110323.0|     F| A12|     Services|2023-06-30|Exports| A12|          DZ|    Services| 330185.0|     F|
| A12041|     Business travel|2023-06-30|Imports| A12041|          DZ|

In [89]:
df_joined = df_joined.withColumn("Comparison", df_joined['value_exp']-df_joined['value'])

In [90]:
from pyspark.sql.functions import lit
def compare_services(comp):
    if comp>lit(0.0):
        return "Export > Import"
    else:
        return "export<import"
# Register the UDF
Financial_services = udf(compare_services, FloatType())


In [91]:
result_df = df_joined.withColumn('Financial_service', compare_services('Comparison'))


ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [92]:
df_joined.printSchema()

root
 |-- code: string (nullable = true)
 |-- service_label: string (nullable = true)
 |-- time_ref: date (nullable = true)
 |-- account: string (nullable = true)
 |-- code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- value: double (nullable = true)
 |-- status: string (nullable = true)
 |-- code: string (nullable = true)
 |-- service_label: string (nullable = true)
 |-- time_ref: date (nullable = true)
 |-- account: string (nullable = true)
 |-- code: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- value_exp: double (nullable = true)
 |-- status: string (nullable = true)
 |-- Comparison: double (nullable = true)



Because of multiple error while extracting data from separate import and export, I couldnot to complete question no. 6