In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType

# Explicit schema applied when the monthly event CSV files are read below.
# Every column is declared nullable.
schema = (
    StructType()
    .add("event_time", TimestampType(), True)
    .add("event_type", StringType(), True)
    .add("product_id", LongType(), True)
    .add("category_id", LongType(), True)
    .add("category_code", StringType(), True)
    .add("brand", StringType(), True)
    .add("price", DoubleType(), True)
    .add("user_id", LongType(), True)
    .add("user_session", StringType(), True)
)

In [0]:
# Load the two monthly CSV extracts with the explicit schema; the first row of
# each file is treated as a header.
df_october = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

df_november = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)


In [0]:
# Show which distinct event_type values occur in the October data.
october_event_types = df_october.select("event_type").distinct()
display(october_event_types)

In [0]:
# Stack both months into one frame, matching columns by name; both inputs were
# read with the same schema, so no missing columns are tolerated.
df = df_november.unionByName(df_october, allowMissingColumns=False)

In [0]:
from pyspark.sql.functions import avg, count

# Ten (category_code, brand) groups with the most 'purchase' rows, together
# with each group's average price. Rows without a category_code are excluded.
Top10Purchase = (
    df.where((df["event_type"] == "purchase") & df["category_code"].isNotNull())
    .groupBy("category_code", "brand", "event_type")
    .agg(
        avg("price").alias("avg_price"),
        count("category_code").alias("category_count"),
    )
    .sort("category_count", ascending=False)
    .limit(10)
)

# To preview the result interactively: display(Top10Purchase)

In [0]:
%sql
-- List the databases (schemas) visible to this session.
SHOW DATABASES

In [0]:
%sql
-- Make `ecommerce` the current database for unqualified table names.
USE DATABASE ecommerce

In [0]:
# Expose the combined events frame to the %sql cells. Temporary views are
# session-scoped and cannot carry a database/schema prefix, so the view is
# registered under the bare name that the SQL queries reference
# (`FROM ecommerce_tbl`).
df.createOrReplaceTempView("ecommerce_tbl")


In [0]:
%sql
-- Materialise the ten (category_code, brand) groups of 'cart' events with the
-- highest average price, plus the number of rows in each group.
-- GROUP BY already yields one row per group, so no DISTINCT is required.
-- NOTE(review): ranking here is by avg_price, whereas the purchase summary
-- built in Python ranks by category_count; confirm which ordering is intended.
CREATE OR REPLACE TABLE ecommerce.top10Cart AS
SELECT
  category_code,
  brand,
  event_type,
  AVG(price) AS avg_price,
  COUNT(category_code) AS category_count
FROM ecommerce_tbl
WHERE event_type = 'cart'
  AND category_code IS NOT NULL
GROUP BY category_code, brand, event_type
ORDER BY avg_price DESC
LIMIT 10;

In [0]:
%sql
-- Materialise the ten (category_code, brand) groups of 'view' events with the
-- highest average price, plus the number of rows in each group.
-- GROUP BY already yields one row per group, so no DISTINCT is required.
-- NOTE(review): ranking here is by avg_price, whereas the purchase summary
-- built in Python ranks by category_count; confirm which ordering is intended.
CREATE OR REPLACE TABLE ecommerce.top10View AS
SELECT
  category_code,
  brand,
  event_type,
  AVG(price) AS avg_price,
  COUNT(category_code) AS category_count
FROM ecommerce_tbl
WHERE event_type = 'view'
  AND category_code IS NOT NULL
GROUP BY category_code, brand, event_type
ORDER BY avg_price DESC
LIMIT 10;

In [0]:
%sql
-- List the tables and views in the current database.
SHOW TABLES

In [0]:
%sql
-- Inspect the materialised cart summary.
SELECT *
FROM ecommerce.top10cart


In [0]:
%sql
-- Inspect the materialised view-event summary.
SELECT *
FROM ecommerce.top10view


In [0]:
# Persist each top-10 summary as a Delta dataset, overwriting whatever an
# earlier run left at the same location.
cart_summary_table = spark.table("ecommerce.top10Cart")
cart_summary_table.write.save(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Cart",
    format="delta",
    mode="overwrite",
)

view_summary_table = spark.table("ecommerce.top10View")
view_summary_table.write.save(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10View",
    format="delta",
    mode="overwrite",
)

Top10Purchase.write.save(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Purchase",
    format="delta",
    mode="overwrite",
)

In [0]:
# Reload the three persisted summaries and stack them, matching columns by name.
cart_summary_delta = spark.read.load(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Cart", format="delta"
)
view_summary_delta = spark.read.load(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10View", format="delta"
)
purchase_summary_delta = spark.read.load(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Purchase", format="delta"
)

Top10category = (
    cart_summary_delta
    .unionByName(view_summary_delta)
    .unionByName(purchase_summary_delta)
)

# Show the combined rows, largest category_count first.
display(Top10category.sort("category_count", ascending=False))