In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [None]:
# Obtain the SparkContext (reusing a running one if it exists, otherwise
# creating it) and then the SparkSession used by the rest of the notebook.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [2]:
# Hand-written column layout for consolidated_df.csv; every field is nullable.
# NOTE: this schema is not handed to the CSV reader. The original author noted
# that passing it to ss.read.csv did not work, so the load relies on
# inferSchema=True instead.
# NOTE(review): the recorded printSchema() output does not match the types
# declared here (inference reports double for days_since_prior_order,
# product_id, add_to_cart_order and reordered, and string for aisle_id and
# department_id) -- confirm which set of types is actually intended.
schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("user_id", IntegerType(), True)
    .add("eval_set", StringType(), True)
    .add("order_number", IntegerType(), True)
    .add("order_dow", IntegerType(), True)
    .add("order_hour_of_day", IntegerType(), True)
    .add("days_since_prior_order", IntegerType(), True)
    .add("product_id", IntegerType(), True)
    .add("add_to_cart_order", IntegerType(), True)
    .add("reordered", IntegerType(), True)
    .add("product_name", StringType(), True)
    .add("aisle_id", IntegerType(), True)
    .add("department_id", IntegerType(), True)
    .add("department", StringType(), True)
    .add("aisle", StringType(), True)
)

In [3]:
# Load the consolidated orders CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
df = (
    ss.read
    .options(header=True, inferSchema=True)
    .csv('consolidated_df.csv')
)

In [4]:
# Print the column names, types and nullability Spark assigned to the loaded frame.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- add_to_cart_order: double (nullable = true)
 |-- reordered: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- department: string (nullable = true)
 |-- aisle: string (nullable = true)



### Number of times a user ordered a product

In [5]:
# Count rows per (user_id, product_id) pair among the 'train' rows and print
# the first rows of the result.
# NOTE(review): only 'train' rows are counted and every count in the recorded
# output is 1 -- confirm whether the count was meant to run over 'prior' rows.
(
    df.where(df.eval_set == 'train')
    .groupBy('user_id', 'product_id')
    .count()
    .show()
)

+-------+----------+-----+
|user_id|product_id|count|
+-------+----------+-----+
|     86|   45007.0|    1|
|    435|     812.0|    1|
|    685|   27966.0|    1|
|   1015|   43065.0|    1|
|   1073|     663.0|    1|
|   1407|   43452.0|    1|
|   1779|    6184.0|    1|
|   2000|   32331.0|    1|
|   2118|   39059.0|    1|
|   2333|   17794.0|    1|
|   2339|   28476.0|    1|
|   2472|    4913.0|    1|
|   2563|   19660.0|    1|
|   2581|   17953.0|    1|
|   2732|   28204.0|    1|
|   2753|   36196.0|    1|
|   3044|   14366.0|    1|
|   3157|   24852.0|    1|
|   3194|   31717.0|    1|
|   3345|   36334.0|    1|
+-------+----------+-----+
only showing top 20 rows



### Constructing Target Variable

In [6]:
# (user, product) pairs taken from the 'train' rows. The columns carry a "2"
# suffix, presumably so they stay distinguishable from final_df's columns when
# the two frames are joined.
last_order = df.where(df.eval_set == 'train').select(
    df.user_id.alias('user_id2'),
    df.product_id.alias('product_id2'),
)
# Unique (user, product) pairs that appear in the 'prior' rows.
final_df = (
    df.where(df.eval_set == 'prior')
    .select('user_id', 'product_id')
    .distinct()
)

In [7]:
# Left-join every prior (user, product) pair to the matching 'train' pair.
# Pairs without a match keep null in user_id2 / product_id2.
df3 = final_df.join(
    last_order,
    on=[
        # A list of Column predicates is combined with AND.
        final_df.user_id == last_order.user_id2,
        final_df.product_id == last_order.product_id2,
    ],
    how="left",
)
df3.show()

+-------+----------+--------+-----------+
|user_id|product_id|user_id2|product_id2|
+-------+----------+--------+-----------+
|      7|    8277.0|    null|       null|
|      7|   27156.0|    null|       null|
|      7|   40852.0|       7|    40852.0|
|      8|   34358.0|    null|       null|
|     14|   40540.0|    null|       null|
|     18|   40723.0|    null|       null|
|     27|    5322.0|    null|       null|
|     31|   21131.0|    null|       null|
|     31|   45104.0|    null|       null|
|     32|   49478.0|    null|       null|
|     38|   11078.0|    null|       null|
|     41|   19678.0|    null|       null|
|     42|    1263.0|    null|       null|
|     52|   35561.0|    null|       null|
|     58|   43115.0|    null|       null|
|     61|    6187.0|    null|       null|
|     63|   38061.0|    null|       null|
|     71|   41408.0|    null|       null|
|     79|   16616.0|      79|    16616.0|
|     79|   28204.0|    null|       null|
+-------+----------+--------+-----------+

In [8]:
# Label each prior (user, product) pair: ordered_true is true when the left
# join found a matching 'train' row (user_id2 is not null), false otherwise.
df4 = df3.select(
    'user_id',
    'product_id',
    df3.user_id2.isNotNull().alias('ordered_true'),
)
df4.show()

+-------+----------+------------+
|user_id|product_id|ordered_true|
+-------+----------+------------+
|      7|    8277.0|       false|
|      7|   27156.0|       false|
|      7|   40852.0|        true|
|      8|   34358.0|       false|
|     14|   40540.0|       false|
|     18|   40723.0|       false|
|     27|    5322.0|       false|
|     31|   21131.0|       false|
|     31|   45104.0|       false|
|     32|   49478.0|       false|
|     38|   11078.0|       false|
|     41|   19678.0|       false|
|     42|    1263.0|       false|
|     52|   35561.0|       false|
|     58|   43115.0|       false|
|     61|    6187.0|       false|
|     63|   38061.0|       false|
|     71|   41408.0|       false|
|     79|   16616.0|        true|
|     79|   28204.0|       false|
+-------+----------+------------+
only showing top 20 rows

