In [100]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, regexp_replace

In [101]:
spark = SparkSession.builder \
    .appName("Read CSV with Header and Delimiter") \
    .getOrCreate()

In [102]:
pricing_path='./pricing_project_dataset.csv'
platform_path='./platform_number.csv'


In [103]:
df = spark.read \
    .option("header", "true") \
    .option("delimiter", ",") \
    .csv(pricing_path)

df.show()


+------------+--------------+--------------+--------------+---------------+------------------+-------------------+----------------------+------------------+----------------------+--------------+------------+-----------------------------------+
|grass_region|category_group|   seller_type|shopee_item_id|shopee_model_id|competitor_item_id|competitor_model_id|shopee_model_price_usd|shopee_model_price|competitor_model_price|shopee_gmv_usd|shopee_order|shopee_model_competitiveness_status|
+------------+--------------+--------------+--------------+---------------+------------------+-------------------+----------------------+------------------+----------------------+--------------+------------+-----------------------------------+
|          ID|   Electronics|Long Tail (LT)|    5646734211|     3953474697|        9153508336|          992466002|           50.33644918|            779233|                647903|   8758.537644|         174|                       Shopee > CPT|
|          SG|          

In [104]:
df1 = df.select("grass_region", "shopee_model_competitiveness_status")

df1.show()

+------------+-----------------------------------+
|grass_region|shopee_model_competitiveness_status|
+------------+-----------------------------------+
|          ID|                       Shopee > CPT|
|          SG|                       Shopee < CPT|
|          TH|                       Shopee < CPT|
|          PH|                       Shopee > CPT|
|          TH|                       Shopee < CPT|
|          ID|                       Shopee < CPT|
|          TH|                       Shopee > CPT|
|          ID|                       Shopee > CPT|
|          MY|                       Shopee > CPT|
|          ID|                       Shopee > CPT|
|          ID|                       Shopee < CPT|
|          TH|                       Shopee > CPT|
|          ID|                       Shopee < CPT|
|          MY|                       Shopee > CPT|
|          ID|                       Shopee < CPT|
|          VN|                       Shopee > CPT|
|          ID|                 

In [105]:
country_list = ['SG','TH','VN','ID','PH','MY']

for country in country_list:
    result = df1.filter(df1["grass_region"] == country). \
                groupBy(df1["grass_region"], df1["shopee_model_competitiveness_status"]). \
                count(). \
                orderBy(df1["grass_region"])
    #result = #df1.groupBy(df1["grass_region"], df1["shopee_model_competitiveness_status"]).count().orderBy(df1["grass_region"])
    result.show()

+------------+-----------------------------------+-----+
|grass_region|shopee_model_competitiveness_status|count|
+------------+-----------------------------------+-----+
|          SG|                       Shopee > CPT| 1085|
|          SG|                       Shopee = CPT|  159|
|          SG|                       Shopee < CPT|  950|
+------------+-----------------------------------+-----+

+------------+-----------------------------------+-----+
|grass_region|shopee_model_competitiveness_status|count|
+------------+-----------------------------------+-----+
|          TH|                       Shopee > CPT| 2045|
|          TH|                       Shopee = CPT|   23|
|          TH|                       Shopee < CPT| 2069|
+------------+-----------------------------------+-----+

+------------+-----------------------------------+-----+
|grass_region|shopee_model_competitiveness_status|count|
+------------+-----------------------------------+-----+
|          VN|               

In [106]:
df2 = df.select("grass_region", "shopee_order"). \
        groupBy(df["grass_region"]). \
        agg(sum(col("shopee_order")).alias("shopee_order_sum"))

df2.show()

+------------+----------------+
|grass_region|shopee_order_sum|
+------------+----------------+
|          ID|        848601.0|
|          MY|        207739.0|
|          SG|        219624.0|
|          PH|        202688.0|
|          TH|        408024.0|
|          VN|        213484.0|
+------------+----------------+



In [107]:
df_platform = spark.read \
    .option("header", "true") \
    .option("delimiter", ",") \
    .csv(platform_path)

df_platform.show()


+------+--------------+----------------+
|region|platform order|platform_gmv_usd|
+------+--------------+----------------+
|    ID|  1,553,364.13|  243,883,794.95|
|    MY|    333,184.27|   73,416,837.53|
|    PH|    392,687.73|   75,383,303.11|
|    SG|    526,181.77|   99,170,298.94|
|    TH|  1,018,358.54|  173,400,899.62|
|    VN|    413,006.15|   74,549,163.52|
+------+--------------+----------------+



In [108]:
joined_df = df2.join(df_platform, df2["grass_region"] == df_platform["region"], "inner")
joined_df.show()

+------------+----------------+------+--------------+----------------+
|grass_region|shopee_order_sum|region|platform order|platform_gmv_usd|
+------------+----------------+------+--------------+----------------+
|          ID|        848601.0|    ID|  1,553,364.13|  243,883,794.95|
|          MY|        207739.0|    MY|    333,184.27|   73,416,837.53|
|          SG|        219624.0|    SG|    526,181.77|   99,170,298.94|
|          PH|        202688.0|    PH|    392,687.73|   75,383,303.11|
|          TH|        408024.0|    TH|  1,018,358.54|  173,400,899.62|
|          VN|        213484.0|    VN|    413,006.15|   74,549,163.52|
+------------+----------------+------+--------------+----------------+

