In [0]:
// Column layout (Spark DDL string) applied when reading the data set.
// One entry per column; the entries are joined into a single DDL string.
val schema = Seq(
  "add_to_cart_order INT",
  "reordered INT",
  "order_number INT",
  "order_dow INT",
  "order_hour_of_day INT",
  "days_since_prior_order DOUBLE",
  "product_name STRING",
  "aisle STRING",
  "department STRING"
).mkString(", ")

In [1]:
// Read the parquet data set, applying the column layout declared in `schema`.
val cleanDF = spark.read
  .schema(schema)
  .parquet("instacart/instacart-clean.parquet")

In [2]:
// Display the contents of cleanDF through the notebook's `z.show` helper.
z.show(cleanDF)

In [3]:
// Print the column names and types of cleanDF.
// printSchema is side-effecting (it writes to the console), so it is called with
// explicit parentheses; omitting them relies on deprecated auto-application.
cleanDF.printSchema()

## Orders vs Days of the Week



In [5]:
// Row count per day-of-week value (counts rows with a non-null order_dow).
val orderDayDF = cleanDF
  .groupBy("order_dow")
  .agg(count("order_dow").as("order_count"))

// Same count, restricted to rows where reordered == 1.
val reorderedDayDF = cleanDF
  .filter(col("reordered") === 1)
  .groupBy("order_dow")
  .agg(count("order_dow").as("order_count_reordered"))

In [6]:
// Combine the total and reordered counts per day of the week.
// A left join keeps every day present in orderDayDF: an inner join would silently
// drop a day that has orders but no reorders. Such days get a reorder count of 0.
// Sorting by the key gives a deterministic row order for display.
val joined = orderDayDF
  .join(reorderedDayDF, Seq("order_dow"), "left")
  .na.fill(0L, Seq("order_count_reordered"))
  .orderBy("order_dow")

z.show(joined)

## Orders vs Hour of the Day

In [8]:
// Row count per hour-of-day value (counts rows with a non-null order_hour_of_day).
val orderHourDF = cleanDF
  .groupBy("order_hour_of_day")
  .agg(count("order_hour_of_day").as("order_count"))

// Same count, restricted to rows where reordered == 1.
val reorderedHourDF = cleanDF
  .filter(col("reordered") === 1)
  .groupBy("order_hour_of_day")
  .agg(count("order_hour_of_day").as("order_count_reordered"))

In [9]:
// Combine the total and reordered counts per hour of the day.
// A left join keeps every hour present in orderHourDF: an inner join would silently
// drop an hour that has orders but no reorders. Such hours get a reorder count of 0.
// Sorting by the key gives a deterministic row order for display.
val joined = orderHourDF
  .join(reorderedHourDF, Seq("order_hour_of_day"), "left")
  .na.fill(0L, Seq("order_count_reordered"))
  .orderBy("order_hour_of_day")

z.show(joined)

## Frequently ordered product departments and aisles

In [11]:
// Row count per department, largest count first.
val orderDepartmentsDF = cleanDF
  .groupBy("department")
  .agg(count("department").as("order_count"))
  .sort(desc("order_count"))

In [12]:
// Display the per-department counts through the notebook's `z.show` helper.
z.show(orderDepartmentsDF)

In [13]:
// Row count per aisle, largest count first.
val orderAisleDF = cleanDF
  .groupBy("aisle")
  .agg(count("aisle").as("order_count"))
  .sort(desc("order_count"))

In [14]:
// Display only the first 25 rows of orderAisleDF (limit(25)).
z.show(orderAisleDF.limit(25))

## Most commonly reordered products

In [16]:
// Keep only the rows flagged as reorders (reordered == 1).
val reorderedItems = cleanDF.where(col("reordered") === 1)

// Count the reorder rows per product name, largest count first.
val productReorderCounts = reorderedItems
  .groupBy("product_name")
  .agg(count("reordered").as("reorder_count"))
  .sort(desc("reorder_count"))

In [17]:
// Display only the first 20 rows of productReorderCounts (limit(20)).
z.show(productReorderCounts.limit(20))

## Position in the shopping cart vs Reorder Rate

In [19]:
// For each add_to_cart_order value, compute:
//   total_items     - number of rows with a non-null `reordered` value
//   reordered_items - sum of the `reordered` column
//   reorder_rate    - reordered_items divided by total_items
val positionReorderRate = cleanDF
  .groupBy("add_to_cart_order")
  .agg(
    count("reordered").as("total_items"),
    sum("reordered").as("reordered_items")
  )
  .withColumn("reorder_rate", col("reordered_items").divide(col("total_items")))


In [20]:
// Display the per-cart-position reorder rates through the notebook's `z.show` helper.
z.show(positionReorderRate)