In [37]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import asc, desc, dense_rank, col
from pyspark.sql.window import Window

In [2]:
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [None]:
schema = StructType([StructField("order_id", IntegerType(), True),
StructField("user_id", IntegerType(), True),
StructField("eval_set", StringType(), True),
StructField("order_number", IntegerType(), True),
StructField("order_dow", IntegerType(), True),
StructField("order_hour_of_day", IntegerType(), True),
StructField("days_since_prior_order", IntegerType(), True),
StructField("product_id", IntegerType(), True),
StructField("add_to_cart_order", IntegerType(), True),
StructField("reordered", IntegerType(), True),
StructField("product_name", StringType(), True),
StructField("aisle_id", IntegerType(), True),
StructField("department_id", IntegerType(), True),
StructField("department", StringType(), True),
StructField("aisle", StringType(), True)
])
# did not work with ss.read.csv
# inferSchema accomplishes the same col types

In [3]:
df = ss.read.csv('../consolidated_df.csv', header=True, inferSchema=True)

In [4]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- add_to_cart_order: double (nullable = true)
 |-- reordered: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- department: string (nullable = true)
 |-- aisle: string (nullable = true)



### Reference Stats

In [46]:
# Number of unique prior user_id-product_id combinations
df.filter("eval_set == 'prior'").groupby('user_id', 'product_id').count().count()

13307953

In [7]:
# Number of train (order_id-product_id) rows
df.filter("eval_set == 'train'").count()

1384617

In [8]:
# Number of prior (order_id-product_id) rows
df.filter("eval_set == 'prior'").count()

32434489

### Constructing Target Variable

In [49]:
last_order = df.filter("eval_set == 'train'").select('user_id', 'product_id') \
    .withColumnRenamed('user_id','user_id2') \
    .withColumnRenamed('product_id','product_id2')

final_df = df.filter("eval_set == 'prior'").select(
    'user_id', 'product_id').distinct()

In [50]:
#join leaves out new products never ordered before in train order
df3 = final_df \
    .join(last_order, 
          on=(final_df.user_id == last_order.user_id2) 
          & (final_df.product_id == last_order.product_id2), how="left")
df3.show()

+-------+----------+--------+-----------+
|user_id|product_id|user_id2|product_id2|
+-------+----------+--------+-----------+
|      7|    8277.0|    null|       null|
|      7|   27156.0|    null|       null|
|      7|   40852.0|       7|    40852.0|
|      8|   34358.0|    null|       null|
|     14|   40540.0|    null|       null|
|     18|   40723.0|    null|       null|
|     27|    5322.0|    null|       null|
|     31|   21131.0|    null|       null|
|     31|   45104.0|    null|       null|
|     32|   49478.0|    null|       null|
|     38|   11078.0|    null|       null|
|     41|   19678.0|    null|       null|
|     42|    1263.0|    null|       null|
|     52|   35561.0|    null|       null|
|     58|   43115.0|    null|       null|
|     61|    6187.0|    null|       null|
|     63|   38061.0|    null|       null|
|     71|   41408.0|    null|       null|
|     79|   16616.0|      79|    16616.0|
|     79|   28204.0|    null|       null|
+-------+----------+--------+-----

In [51]:
df4 = df3.withColumn('ordered_true', df3.user_id2.isNotNull()) \
    .select('user_id','product_id','ordered_true')
df4.show()

+-------+----------+------------+
|user_id|product_id|ordered_true|
+-------+----------+------------+
|      7|    8277.0|       false|
|      7|   27156.0|       false|
|      7|   40852.0|        true|
|      8|   34358.0|       false|
|     14|   40540.0|       false|
|     18|   40723.0|       false|
|     27|    5322.0|       false|
|     31|   21131.0|       false|
|     31|   45104.0|       false|
|     32|   49478.0|       false|
|     38|   11078.0|       false|
|     41|   19678.0|       false|
|     42|    1263.0|       false|
|     52|   35561.0|       false|
|     58|   43115.0|       false|
|     61|    6187.0|       false|
|     63|   38061.0|       false|
|     71|   41408.0|       false|
|     79|   16616.0|        true|
|     79|   28204.0|       false|
+-------+----------+------------+
only showing top 20 rows



### Number of times a user ordered a product

In [47]:
num_prod_ord = df.filter("eval_set == 'prior'") \
    .groupby('user_id', 'product_id') \
    .count()

In [48]:
# Sample output
num_prod_ord.sort(desc('count')).show(100)

+-------+----------+-----+
|user_id|product_id|count|
+-------+----------+-----+
|  41356|   38652.0|   99|
|  17997|    4210.0|   99|
| 141736|   25133.0|   99|
|  41356|    6583.0|   99|
|  41356|   14366.0|   99|
| 120897|   12013.0|   98|
|  41356|   29671.0|   98|
| 103593|   28204.0|   98|
|  98085|     196.0|   97|
|  99707|   24852.0|   97|
|  84478|   31981.0|   97|
|  69919|   24852.0|   96|
| 141736|   14947.0|   96|
|  99753|   27845.0|   96|
| 103593|    4920.0|   95|
|  76678|   19660.0|   95|
|  99753|   38689.0|   95|
| 123746|    1160.0|   95|
| 178107|   24852.0|   95|
| 140440|   18926.0|   95|
|  36335|   11784.0|   95|
|  84478|   45190.0|   94|
|  75124|   18926.0|   94|
|  75124|   40571.0|   94|
|  74315|   31981.0|   94|
| 147173|   15424.0|   94|
|  75124|   24954.0|   94|
|  75124|   22124.0|   94|
|  75124|    4957.0|   94|
|  75124|    8061.0|   94|
|  75124|   11925.0|   94|
| 126311|   19660.0|   93|
| 141736|   42342.0|   93|
|  75124|   19156.0|   93|
|

### Number of times user ordered products in last 5 orders

In [62]:
df4.cache()

window = Window.partitionBy('user_id') \
                .orderBy(desc('order_number')) \
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)

num_prod_ordl5 = df.filter("eval_set == 'prior'") \
    .select('user_id', 'product_id', 'order_number',  
          dense_rank().over(window).alias('rank')) \
                .filter(col('rank') <= 5) \
                .groupby('user_id', 'product_id') \
                .count()

num_prod_ordl5.cache()
num_prod_ordl5 = num_prod_ordl5 \
    .withColumnRenamed('user_id','user_id2') \
    .withColumnRenamed('product_id','product_id2')

num_prod_ordl5 = df4.join(num_prod_ordl5, 
          on=(df4.user_id == num_prod_ordl5.user_id2) 
          & (df4.product_id == num_prod_ordl5.product_id2), how="left")

num_prod_ordl5.select('user_id','product_id','count','ordered_true').show()

+-------+----------+-----+------------+
|user_id|product_id|count|ordered_true|
+-------+----------+-----+------------+
|      7|    8277.0|    1|       false|
|      7|   27156.0|    1|       false|
|      7|   40852.0|    3|        true|
|      8|   34358.0|    2|       false|
|     14|   40540.0| null|       false|
|     18|   40723.0| null|       false|
|     27|    5322.0| null|       false|
|     31|   21131.0| null|       false|
|     31|   45104.0|    1|       false|
|     32|   49478.0|    1|       false|
|     38|   11078.0|    1|       false|
|     41|   19678.0|    1|       false|
|     42|    1263.0|    1|       false|
|     52|   35561.0|    1|       false|
|     58|   43115.0| null|       false|
|     61|    6187.0|    1|       false|
|     63|   38061.0| null|       false|
|     71|   41408.0|    1|       false|
|     79|   16616.0| null|        true|
|     79|   28204.0| null|       false|
+-------+----------+-----+------------+
only showing top 20 rows

