### Uploaded a sample e-commerce CSV file from Kaggle
Source: https://www.kaggle.com/datasets/tuannguyenvananh/sample-ecommerce/data



In [0]:
# Load the sample CSV into a DataFrame: the first line is treated as the
# header row and column types are inferred from the data.
df = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data_sample/sample_df.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Quick look at the first five rows of the freshly loaded data.
df.show(n=5)

In [0]:
# Report the shape of the DataFrame (row count triggers a Spark job).
rows, cols = df.count(), len(df.columns)
print(f"Rows: {rows}, Columns: {cols}")


In [0]:
from pyspark.sql.functions import count

# Number of rows recorded for each distinct event_type value.
events_per_type = df.groupBy('event_type').agg(count('*'))
events_per_type.show()

In [0]:
# Peek at the raw category_code values before splitting them up.
category_codes = df.select('category_code')
category_codes.show(n=5)

### Created two new columns, category and sub_category, from the category_code column.

In [0]:
from pyspark.sql.functions import col, split, when, lit, size

# Derive `category` and `sub_category` from the dot-separated `category_code`.
#
# The code is split once on the first dot only (limit=2), so
#   "a.b.c" -> category "a", sub_category "b.c"
# which matches the original behaviour of both columns.
category_parts = split(col("category_code"), r"\.", 2)

df = df.withColumn(
    "category",
    # A missing category_code is labelled "Unknown" instead of being left NULL.
    when(col("category_code").isNull(), lit("Unknown"))
    .otherwise(category_parts[0])
).withColumn(
    "sub_category",
    # Only index element 1 when the split actually produced a second part.
    # A code with no dot (or a NULL code) would otherwise index past the end
    # of the array, which gives NULL -- or an error when ANSI mode is on --
    # rather than the intended "Unknown" placeholder.
    when(size(category_parts) > 1, category_parts[1])
    .otherwise(lit("Unknown"))
)


In [0]:
# Re-check the DataFrame shape after adding the derived columns.
rows, cols = df.count(), len(df.columns)
print(f"Rows: {rows}, Columns: {cols}")


In [0]:
# Preview the first five rows, now including the derived columns.
df.show(n=5)

In [0]:
# Show only the columns used by the analyses below.
preview_columns = ["event_type", "category", "sub_category", "brand", "price"]
df.select(*preview_columns).show(n=10)

### Top 10 Brands by Number of Events

In [0]:
# Ten brands with the most rows (all event types), ignoring rows without a brand.
branded_events = df.filter(col('brand').isNotNull())
events_per_brand = branded_events.groupBy('brand').agg(count('*').alias('count'))
top_10 = events_per_brand.orderBy(col('count').desc()).limit(10)
top_10.show()

### Top 10 Brands by Total Purchase Value


In [0]:
# Import the functions module under an alias instead of pulling `sum` and
# `round` into the notebook namespace, where they would shadow the Python
# builtins of the same name for every later cell.
from pyspark.sql import functions as F

# Ten brands with the highest total price across 'purchase' events.
# Rows without a brand are ignored. The event_type filter is applied before
# the aggregation so only purchase rows are grouped and sorted; the resulting
# rows and columns are the same as filtering after the aggregation.
top_10_by_purchase = (
    df
    .filter(F.col('event_type') == 'purchase')
    .filter(F.col('brand').isNotNull())
    .groupBy('event_type', 'brand')
    # Sum of price per brand, rounded to a whole number.
    .agg(F.round(F.sum('price')).alias('total_purchase_value'))
    .orderBy(F.col('total_purchase_value').desc())
    .limit(10)
)
top_10_by_purchase.show()

### Top 10 Brands by Total Cart Value

In [0]:
# Import the functions module under an alias instead of pulling `sum` and
# `round` into the notebook namespace, where they would shadow the Python
# builtins of the same name for every later cell.
from pyspark.sql import functions as F

# Ten brands with the highest total price across 'cart' events.
# Rows without a brand are ignored. The event_type filter is applied before
# the aggregation so only cart rows are grouped and sorted.
top_10_by_cart = (
    df
    .filter(F.col('event_type') == 'cart')
    .filter(F.col('brand').isNotNull())
    .groupBy('event_type', 'brand')
    # Sum of price per brand, rounded to a whole number. The column is named
    # for what it holds (cart value); it was previously labelled
    # 'total_purchase_value', copied over from the purchase analysis.
    .agg(F.round(F.sum('price')).alias('total_cart_value'))
    .orderBy(F.col('total_cart_value').desc())
    .limit(10)
)
top_10_by_cart.show()

In [0]:
# Save the current DataFrame as CSV with a header row, replacing any output
# that already exists at the target path.
df.write.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data_sample/sample_ecommerce_dataV1",
    mode="overwrite",
    header=True,
)
