In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Reuse the already-running SparkContext if there is one, otherwise start one.
sc = SparkContext.getOrCreate()
# Same get-or-create pattern for the SparkSession; `ss` is the handle used by
# the read.csv and sql cells in this notebook.
ss = SparkSession.builder.getOrCreate()

In [4]:
# Location of the consolidated Instacart CSV. Set the INSTACART_CSV environment
# variable to point at another copy of the file; when it is unset the original
# local path is used, so existing behaviour is unchanged.
instacart_csv_path = os.environ.get(
    'INSTACART_CSV',
    '/Users/christopherolley/data/consolidated_df.csv',
)

# header=True takes column names from the first row; inferSchema=True asks
# Spark to guess each column's type from the data.
# NOTE(review): the printed schema shows an unnamed `_c0` column and types
# aisle_id / department_id as strings while product_id is a double -- worth
# confirming the file's quoting/escaping matches the CSV reader's defaults.
instacart = ss.read.csv(instacart_csv_path, header=True, inferSchema=True)

In [6]:
# Print the column names and the types Spark inferred while reading the CSV.
instacart.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- add_to_cart_order: double (nullable = true)
 |-- reordered: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- department: string (nullable = true)
 |-- aisle: string (nullable = true)



In [32]:
# Save the DataFrame as a table named 'Instacart' so that the ss.sql(...) cells
# can refer to it by name.
# mode='overwrite' makes this cell safe to re-run: the writer's default mode
# raises an error when the table already exists, which breaks a second
# execution of the notebook.
instacart.write.saveAsTable('Instacart', mode='overwrite')

# Product Features

Number of times a product was ordered:

In [14]:
# Count the 'prior' rows for each product and list the most frequently
# ordered products first.
(
    instacart
    .where(instacart['eval_set'] == 'prior')
    .groupBy('product_id')
    .count()
    .sort('count', ascending=False)
    .show()
)

+----------+------+
|product_id| count|
+----------+------+
|   24852.0|472565|
|   13176.0|379450|
|   21137.0|264683|
|   21903.0|241921|
|   47209.0|213584|
|   47766.0|176815|
|   47626.0|152657|
|   16797.0|142951|
|   26209.0|140627|
|   27845.0|137905|
|   27966.0|137057|
|   22935.0|113426|
|   24964.0|109778|
|   45007.0|104823|
|   39275.0|100060|
|   49683.0| 97315|
|   28204.0| 89632|
|    5876.0| 87746|
|    8277.0| 85020|
|   40706.0| 84255|
+----------+------+
only showing top 20 rows



Average position in cart:

In [19]:
# Mean add_to_cart_order per product over the 'prior' rows, smallest average
# (earliest position in the cart) first.
(
    instacart
    .where(instacart['eval_set'] == 'prior')
    .groupBy('product_id', 'product_name')
    .agg({'add_to_cart_order': 'avg'})
    .sort('avg(add_to_cart_order)')
    .show()
)

+----------+--------------------+----------------------+
|product_id|        product_name|avg(add_to_cart_order)|
+----------+--------------------+----------------------+
|   37088.0|Lndbrg White Quin...|                   1.0|
|   39759.0|Indoor & Outdoor ...|                   1.0|
|   44495.0|Lowfat Cherry Lim...|                   1.0|
|   25441.0|       Spicy Falafel|                   1.0|
|   41800.0|       Easter Basket|                   1.0|
|   43144.0|Flax Oil, Organic...|                   1.0|
|   33440.0|Rosa Mosqueta Ros...|                   1.0|
|    2769.0|Pappardelle Nests...|                   1.0|
|   12841.0|      King Crab Legs|                   1.0|
|   34962.0|  Prenatal Nutrients|                   1.0|
|   25888.0|Super Nutrition V...|                   1.0|
|   27863.0|        Cooking Fuel|                   1.0|
|   47680.0| Indian Wells Merlot|                   1.0|
|    2796.0|      American Blend|                   1.0|
|   36597.0|Vanilla Flavor Mu..

Reorder rate: number of reorders/total number of times ordered

# User x Product Features

Reorder count: number of times each user has reordered a product:

In [31]:
# Sum the `reordered` column for every (product, user) pair in the 'prior'
# rows.
(
    instacart
    .where(instacart['eval_set'] == 'prior')
    .groupBy('product_id', 'user_id')
    .agg({'reordered': 'sum'})
    .show()
)

+----------+-------+--------------+
|product_id|user_id|sum(reordered)|
+----------+-------+--------------+
|   35547.0|    385|           0.0|
|   27845.0|    433|           1.0|
|   34050.0|    601|           4.0|
|   44949.0|    853|           9.0|
|   35221.0|   1123|           0.0|
|   13113.0|   1607|           2.0|
|   18224.0|   2187|           3.0|
|   30620.0|   2451|           0.0|
|    5612.0|   2736|           7.0|
|   33198.0|   2954|           0.0|
|   24852.0|   2991|           6.0|
|   15424.0|   3686|           2.0|
|   30779.0|   3781|           1.0|
|   45989.0|   4381|          10.0|
|   14705.0|   4630|           1.0|
|   35547.0|   5104|           4.0|
|   38387.0|   5117|           2.0|
|   37687.0|   5245|           2.0|
|    4987.0|   5387|           4.0|
|    6377.0|   5470|           1.0|
+----------+-------+--------------+
only showing top 20 rows



Number of orders since a user last ordered a given item (work in progress)

In [74]:
# Attach a per-user order sequence number to every row of the Instacart table.
#   iq  : the distinct (order_id, user_id) pairs among the 'prior' rows.
#   rhs : those pairs numbered 1, 2, 3, ... within each user via row_number(),
#         ordered by order_id; the result is exposed as order_num.
# The left join keeps every Instacart row; rows whose (order_id, user_id) pair
# is not in rhs (i.e. not a 'prior' order) get a null order_num.
# NOTE(review): the numbering is ordered by order_id, while the table also has
# an order_number column -- confirm which one reflects the real order sequence.
ss.sql("select Instacart.order_id, Instacart.user_id, Instacart.product_id, rhs.order_num from Instacart\
        left join\
       (select order_id, user_id, row_number() over (partition by user_id order by order_id) as order_num from\
        (select distinct order_id, user_id from Instacart where eval_set = 'prior') as iq) as rhs\
        on Instacart.order_id=rhs.order_id and Instacart.user_id=rhs.user_id").show(50)

+--------+-------+----------+---------+
|order_id|user_id|product_id|order_num|
+--------+-------+----------+---------+
|     150| 126892|   14127.0|        1|
|     150| 126892|   45645.0|        1|
|     150| 126892|   21506.0|        1|
|     229|  47358|   21108.0|        1|
|     229|  47358|   34276.0|        1|
|     229|  47358|   27730.0|        1|
|     229|  47358|   42495.0|        1|
|     229|  47358|   11782.0|        1|
|     229|  47358|   41690.0|        1|
|     229|  47358|   17821.0|        1|
|     229|  47358|   18266.0|        1|
|     229|  47358|   25435.0|        1|
|     229|  47358|   14579.0|        1|
|     229|  47358|   40824.0|        1|
|     789| 143268|   18918.0|        1|
|     789| 143268|    5077.0|        1|
|     789| 143268|   16797.0|        1|
|     789| 143268|    1481.0|        1|
|     789| 143268|    9163.0|        1|
|     789| 143268|   13287.0|        1|
|     789| 143268|    6073.0|        1|
|     981|  91324|   11520.0|        1|


# User Features

Maximum number of times a user has ordered any product:

In [54]:
# For each user, the largest number of 'prior' rows belonging to any single
# product.
#   iq    : row count per (product_id, user_id) pair among the 'prior' rows.
#   outer : max of those counts per user.
# `group by 1,2` and `group by 1` refer to positions in the select list.
ss.sql("select uid, max(ct) from\
       (select product_id as pid, user_id as uid, count(*) as ct from Instacart\
       where eval_set = 'prior'\
       group by 1,2) as iq\
       group by 1").show()

+------+-------+
|   uid|max(ct)|
+------+-------+
| 11141|      2|
| 46266|     10|
| 16386|     27|
|160492|     10|
|156366|     33|
| 11748|      4|
| 17679|     56|
|  7993|      7|
| 63271|     34|
|156197|      6|
| 57380|     14|
|  3794|     38|
|  4900|     13|
| 43103|      4|
| 43714|     16|
|  9376|     20|
|156749|      4|
| 43527|     30|
| 43302|      8|
|160563|      7|
+------+-------+
only showing top 20 rows

