In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import asc, desc, dense_rank, col
from pyspark.sql.window import Window

In [6]:
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [7]:
schema = StructType([StructField("order_id", IntegerType(), True),
StructField("user_id", IntegerType(), True),
StructField("eval_set", StringType(), True),
StructField("order_number", IntegerType(), True),
StructField("order_dow", IntegerType(), True),
StructField("order_hour_of_day", IntegerType(), True),
StructField("days_since_prior_order", IntegerType(), True),
StructField("product_id", IntegerType(), True),
StructField("add_to_cart_order", IntegerType(), True),
StructField("reordered", IntegerType(), True),
StructField("product_name", StringType(), True),
StructField("aisle_id", IntegerType(), True),
StructField("department_id", IntegerType(), True),
StructField("department", StringType(), True),
StructField("aisle", StringType(), True)
])
# did not work with ss.read.csv
# inferSchema accomplishes the same col types

In [9]:
df = ss.read.csv('../shardonnay697.csv', header=True, inferSchema=True)

In [10]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- add_to_cart_order: double (nullable = true)
 |-- reordered: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: double (nullable = true)
 |-- department_id: double (nullable = true)
 |-- department: string (nullable = true)
 |-- aisle: string (nullable = true)



### Reference Stats

In [11]:
# Number of unique prior user_id-product_id combinations
df.filter("eval_set == 'prior'").groupby('user_id', 'product_id').count().count()

470

In [12]:
# Number of train (order_id-product_id) rows
df.filter("eval_set == 'train'").count()

104

In [13]:
# Number of prior (order_id-product_id) rows
df.filter("eval_set == 'prior'").count()

894

### Constructing Target Variable

In [14]:
last_order = df.filter("eval_set == 'train'").select('user_id', 'product_id') \
    .withColumnRenamed('user_id','user_id2') \
    .withColumnRenamed('product_id','product_id2')

final_df = df.filter("eval_set == 'prior'").select(
    'user_id', 'product_id').distinct()

In [15]:
#join leaves out new products never ordered before in train order
df3 = final_df \
    .join(last_order, 
          on=(final_df.user_id == last_order.user_id2) 
          & (final_df.product_id == last_order.product_id2), how="left")
df3.show()

+-------+----------+--------+-----------+
|user_id|product_id|user_id2|product_id2|
+-------+----------+--------+-----------+
|      7|   40852.0|       7|    40852.0|
|      7|    8277.0|    null|       null|
|      7|   27156.0|    null|       null|
|      8|   34358.0|    null|       null|
|      2|    2002.0|    null|       null|
|      2|   16521.0|    null|       null|
|     11|   47912.0|    null|       null|
|      3|   14992.0|    null|       null|
|     10|   40706.0|    null|       null|
|      2|   47766.0|    null|       null|
|      8|   39812.0|    null|       null|
|     10|   32299.0|    null|       null|
|      9|     311.0|    null|       null|
|      3|   40604.0|    null|       null|
|      3|   48523.0|    null|       null|
|      5|   24535.0|    null|       null|
|      9|    8952.0|    null|       null|
|     10|   44359.0|    null|       null|
|      2|   19240.0|    null|       null|
|      8|   39110.0|    null|       null|
+-------+----------+--------+-----

In [16]:
df4 = df3.withColumn('ordered_true', df3.user_id2.isNotNull()) \
    .select('user_id','product_id','ordered_true')
df4.show()

+-------+----------+------------+
|user_id|product_id|ordered_true|
+-------+----------+------------+
|      7|   40852.0|        true|
|      7|    8277.0|       false|
|      7|   27156.0|       false|
|      8|   34358.0|       false|
|      2|    2002.0|       false|
|      2|   16521.0|       false|
|     11|   47912.0|       false|
|      3|   14992.0|       false|
|     10|   40706.0|       false|
|      2|   47766.0|       false|
|      8|   39812.0|       false|
|     10|   32299.0|       false|
|      9|     311.0|       false|
|      3|   40604.0|       false|
|      3|   48523.0|       false|
|      5|   24535.0|       false|
|      9|    8952.0|       false|
|     10|   44359.0|       false|
|      2|   19240.0|       false|
|      8|   39110.0|       false|
+-------+----------+------------+
only showing top 20 rows



### Number of times a user ordered a product

In [17]:
num_prod_ord = df.filter("eval_set == 'prior'") \
    .groupby('user_id', 'product_id') \
    .count()

In [27]:
# Sample output
f1 = num_prod_ord.sort(desc('count')).withColumnRenamed('count','usr_prod_ct')
f1.show(10)

+-------+----------+-----------+
|user_id|product_id|usr_prod_ct|
+-------+----------+-----------+
|      7|   40852.0|         13|
|      7|   37602.0|         12|
|      7|   17638.0|         11|
|      1|     196.0|         10|
|      1|   12427.0|         10|
|      3|   39190.0|         10|
|      7|   21137.0|         10|
|      3|   47766.0|          9|
|      2|   32792.0|          9|
|      1|   10258.0|          9|
+-------+----------+-----------+
only showing top 10 rows



### Number of times user ordered products in last 5 orders

In [36]:
df4.cache()

window = Window.partitionBy('user_id') \
                .orderBy(desc('order_number')) \
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)

num_prod_ordl5 = df.filter("eval_set == 'prior'") \
    .select('user_id', 'product_id', 'order_number',  
          dense_rank().over(window).alias('rank')) \
                .filter(col('rank') <= 5) \
                .groupby('user_id', 'product_id') \
                .count()

num_prod_ordl5.cache()
num_prod_ordl5 = num_prod_ordl5 \
    .withColumnRenamed('user_id','user_id2') \
    .withColumnRenamed('product_id','product_id2') \
    .withColumnRenamed('count','new_count')

num_prod_ordl5 = df4.join(num_prod_ordl5, 
          on=(df4.user_id == num_prod_ordl5.user_id2) 
          & (df4.product_id == num_prod_ordl5.product_id2), how="left")

f2 = num_prod_ordl5.select('user_id2','product_id2','new_count','ordered_true')
f2.show()

+--------+-----------+---------+------------+
|user_id2|product_id2|new_count|ordered_true|
+--------+-----------+---------+------------+
|       7|    40852.0|        3|        true|
|       7|     8277.0|        1|       false|
|       7|    27156.0|        1|       false|
|       8|    34358.0|        2|       false|
|       2|     2002.0|        2|       false|
|       2|    16521.0|        1|       false|
|      11|    47912.0|        1|       false|
|    null|       null|     null|       false|
|      10|    40706.0|        3|       false|
|    null|       null|     null|       false|
|       8|    39812.0|        1|       false|
|      10|    32299.0|        1|       false|
|       9|      311.0|        1|       false|
|    null|       null|     null|       false|
|       3|    48523.0|        2|       false|
|       5|    24535.0|        3|       false|
|       9|     8952.0|        1|       false|
|      10|    44359.0|        1|       false|
|       2|    19240.0|        1|  

Median Days Since last order


In [41]:
f3 = df.groupBy('user_id', 'product_id').avg('days_since_prior_order').withColumnRenamed('user_id','user_id3').withColumnRenamed('product_id','product_id3')
#f3.show()

f1.join(f2, on=(f1.user_id == f2.user_id2) & (f1.product_id == f2.product_id2), how="left") \
.join(f3, on=(f1.user_id == f3.user_id3) & (f1.product_id == f3.product_id3), how="left") \
.drop('user_id2','product_id2', 'user_id3','product_id3').show()




+-------+----------+-----------+---------+------------+---------------------------+
|user_id|product_id|usr_prod_ct|new_count|ordered_true|avg(days_since_prior_order)|
+-------+----------+-----------+---------+------------+---------------------------+
|      7|    8277.0|          3|        1|       false|         11.666666666666666|
|      7|   27156.0|          1|        1|       false|                        7.0|
|      7|   40852.0|         13|        3|        true|         12.285714285714286|
|      8|   34358.0|          2|        2|       false|                       30.0|
|      2|    2002.0|          4|        2|       false|                      17.75|
|      2|   16521.0|          1|        1|       false|                       30.0|
|     11|   47912.0|          1|        1|       false|                       null|
|      3|   14992.0|          2|     null|        null|                        7.0|
|     10|   40706.0|          3|        3|       false|         16.333333333