In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import asc, desc, dense_rank, col, when, count, avg, sum
from pyspark.sql.window import Window

In [2]:
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [3]:
schema = StructType([StructField("order_id", IntegerType(), True),
StructField("user_id", IntegerType(), True),
StructField("eval_set", StringType(), True),
StructField("order_number", IntegerType(), True),
StructField("order_dow", IntegerType(), True),
StructField("order_hour_of_day", IntegerType(), True),
StructField("days_since_prior_order", IntegerType(), True),
StructField("product_id", IntegerType(), True),
StructField("add_to_cart_order", IntegerType(), True),
StructField("reordered", IntegerType(), True),
StructField("product_name", StringType(), True),
StructField("aisle_id", IntegerType(), True),
StructField("department_id", IntegerType(), True),
StructField("department", StringType(), True),
StructField("aisle", StringType(), True)
])
# did not work with ss.read.csv
# inferSchema accomplishes the same col types

## Change data_path variable to local consolidated_df.csv file

In [4]:
data_path = '/Users/sankeerti/Documents/data/consolidated_df.csv'

In [5]:
df = ss.read.csv(data_path, header=True, inferSchema=True)

In [6]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- add_to_cart_order: double (nullable = true)
 |-- reordered: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- department: string (nullable = true)
 |-- aisle: string (nullable = true)



In [7]:
df.write.saveAsTable('Instacart') # saving as table for complex queries

### Reference Stats

In [8]:
# Number of unique prior user_id-product_id combinations
df.filter("eval_set == 'prior'").groupby('user_id', 'product_id').count().count()

13307953

In [9]:
# Number of train (order_id-product_id) rows
df.filter("eval_set == 'train'").count()

1384617

In [11]:
# Number of prior (order_id-product_id) rows
df.filter("eval_set == 'prior'").count()

32434489

# ~~ Where I stopped running the code last night ~~~

### Constructing Target Variable

In [12]:
last_orders_df = df.filter("eval_set == 'train'").select('user_id', 'product_id') \
    .withColumnRenamed('user_id','user_id2') \
    .withColumnRenamed('product_id','product_id2')

final_index_df = df.filter("eval_set == 'prior'").select(
    'user_id', 'product_id').distinct()

In [14]:
#join leaves out new products never ordered before in train order
df3 = final_index_df \
    .join(last_orders_df, 
          on=(final_index_df.user_id == last_orders_df.user_id2) 
          & (final_index_df.product_id == last_orders_df.product_id2), how="left")
df3.show()

+-------+----------+--------+-----------+
|user_id|product_id|user_id2|product_id2|
+-------+----------+--------+-----------+
|      7|    8277.0|    null|       null|
|      7|   27156.0|    null|       null|
|      7|   40852.0|       7|    40852.0|
|      8|   34358.0|    null|       null|
|     14|   40540.0|    null|       null|
|     18|   40723.0|    null|       null|
|     27|    5322.0|    null|       null|
|     31|   21131.0|    null|       null|
|     31|   45104.0|    null|       null|
|     32|   49478.0|    null|       null|
|     38|   11078.0|    null|       null|
|     41|   19678.0|    null|       null|
|     42|    1263.0|    null|       null|
|     52|   35561.0|    null|       null|
|     58|   43115.0|    null|       null|
|     61|    6187.0|    null|       null|
|     63|   38061.0|    null|       null|
|     71|   41408.0|    null|       null|
|     79|   16616.0|      79|    16616.0|
|     79|   28204.0|    null|       null|
+-------+----------+--------+-----

In [15]:
feature_target_df = df3.withColumn('ordered_true', df3.user_id2.isNotNull()) \
    .select('user_id','product_id','ordered_true')
# feature_target_df.show()

In [16]:
priors_df = df.filter("eval_set == 'prior'")

priors_df.cache()

DataFrame[order_id: int, user_id: int, eval_set: string, order_number: int, order_dow: int, order_hour_of_day: int, days_since_prior_order: double, product_id: double, add_to_cart_order: double, reordered: double, product_name: string, aisle_id: string, department_id: string, department: string, aisle: string]

### Number of times a user ordered a product
### Average days since prior order
### Number of times user reordered each product

In [17]:
features = priors_df.groupby('user_id', 'product_id') \
    .agg(count('order_id').alias('usr_prod_ct'),
         avg('days_since_prior_order').alias('avg_days_since_ord_wnull'),
         sum('reordered').alias('num_reordered'))

features = features.withColumn('avg_days_since_ord', when(col('avg_days_since_ord_wnull').isNull(),365).otherwise(col('avg_days_since_ord_wnull'))) \
                    .drop('user_id2', 'product_id2','avg_days_since_ord_wnull')

features.cache()
# features.show()

DataFrame[user_id: int, product_id: double, usr_prod_ct: bigint, num_reordered: double, avg_days_since_ord: double]

### Number of times user ordered products in last 5 orders

In [18]:
window = Window.partitionBy('user_id') \
                .orderBy(desc('order_number')) \
                .rowsBetween(Window.unboundedPreceding, Window.currentRow)

num_prod_ordl5 = priors_df.select('user_id', 'product_id', 'order_number',  
                                  dense_rank().over(window).alias('rank')) \
                .filter(col('rank') <= 5) \
                .groupby('user_id', 'product_id') \
                .count() \
                .withColumnRenamed('user_id','user_id2') \
                .withColumnRenamed('product_id','product_id2')

features = features.join(num_prod_ordl5, 
          on=(features.user_id == num_prod_ordl5.user_id2) 
          & (features.product_id == num_prod_ordl5.product_id2), how="left")

features = features.withColumn('num_prod_ordl5', when(col('count').isNull(),0).otherwise(col('count'))) \
                    .drop('user_id2', 'product_id2','count')


# features.show()

### Ratio of orders user ordered products in last 5 orders

In [19]:
ratio_prod_ordl5 = priors_df.select('user_id', 'product_id', 'order_number',  
          dense_rank().over(window).alias('rank')) \
                .filter(col('rank') <= 5) \
                .groupby('user_id', 'product_id') \
                .agg((count('order_number')/5).alias('ratio_wnull'))\
                .withColumnRenamed('user_id','user_id2') \
                .withColumnRenamed('product_id','product_id2')

features = features.join(ratio_prod_ordl5, 
          on=(features.user_id == ratio_prod_ordl5.user_id2) 
          & (features.product_id == ratio_prod_ordl5.product_id2), how="left")

features = features.withColumn('last5_ratio', when(col('ratio_wnull').isNull(),0).otherwise(col('ratio_wnull'))) \
                    .drop('user_id2', 'product_id2','ratio_wnull')

# features.show()

### Number of orders since a user last ordered a given item
done by generating chrononological order_num from order_id, and returns max order_num (grouped by user) - max order_num (grouped by user and product)

In [20]:
num_ords_since_last = ss.sql("select distinct product_id as product_id2, user_id as user_id2,\
        max(order_num) over (partition by user_id) - max(order_num) over (partition by user_id, product_id) as num_ords_since_last from\
        (select Instacart.order_id, Instacart.user_id, Instacart.product_id, rhs.order_num\
        from Instacart\
        left join\
        (select order_id, user_id, row_number() over (partition by user_id order by order_id) as order_num from\
        (select distinct order_id, user_id from Instacart where eval_set = 'prior') as iq) as rhs\
        on Instacart.order_id=rhs.order_id and Instacart.user_id=rhs.user_id\
        where eval_set = 'prior') as iq2")

features = features.join(num_ords_since_last, 
          on=(features.user_id == num_ords_since_last.user_id2) 
          & (features.product_id == num_ords_since_last.product_id2), how="left")

features = features.drop('user_id2', 'product_id2')

In [21]:
# features.show()

### Rate of user item reorder: # of reorders of an item / # of orders since first time ordering item.
Get max(order_num) grouped by user_id, then min(order_num) grouped by user_id and product, subtract the two to get number of orders since first purchase of an item. Then sum(reordered) grouped by item, user to get the number of times an item was reordered by a user

In [22]:
reorder_rate = ss.sql("select product_id as product_id2, user_id as user_id2, \
        num_reorders/orders_since_first as reorder_rate_wnull from\
        (select distinct product_id, user_id,\
        max(order_num) over (partition by user_id) - min(order_num) over (partition by user_id, product_id) as orders_since_first,\
        sum(reordered) over (partition by user_id, product_id) as num_reorders from\
        (select Instacart.order_id, Instacart.user_id, Instacart.product_id, Instacart.reordered, rhs.order_num\
        from Instacart\
        left join\
        (select order_id, user_id, row_number() over (partition by user_id order by order_id) as order_num from\
        (select distinct order_id, user_id from Instacart where eval_set = 'prior') as iq) as rhs\
        on Instacart.order_id=rhs.order_id and Instacart.user_id=rhs.user_id\
        where eval_set = 'prior') as iq2) as iq3")

features = features.join(reorder_rate, 
          on=(features.user_id == reorder_rate.user_id2) 
          & (features.product_id == reorder_rate.product_id2), how="left")

features = features.withColumn('reorder_rate', when(col('reorder_rate_wnull').isNull(),0).otherwise(col('reorder_rate_wnull'))) \
                    .drop('reorder_rate_wnull')

features = features.drop('user_id2', 'product_id2')

In [23]:
features = features.withColumn('reorder_rate_new', when(col('reorder_rate').isNull(),0).otherwise(col('reorder_rate'))) \
                    .drop('reorder_rate')
# features.show()

In [24]:
features.count()

13307953

In [25]:
features.show(3)

+-------+----------+-----------+-------------+------------------+--------------+-----------+-------------------+-------------------+
|user_id|product_id|usr_prod_ct|num_reordered|avg_days_since_ord|num_prod_ordl5|last5_ratio|num_ords_since_last|   reorder_rate_new|
+-------+----------+-----------+-------------+------------------+--------------+-----------+-------------------+-------------------+
|      7|    8277.0|          3|          2.0|11.666666666666666|             1|        0.2|                  2|0.10526315789473684|
|      7|   27156.0|          1|          0.0|               7.0|             1|        0.2|                  4|                0.0|
|      7|   40852.0|         13|         12.0| 12.76923076923077|             3|        0.6|                  0|  0.631578947368421|
+-------+----------+-----------+-------------+------------------+--------------+-----------+-------------------+-------------------+
only showing top 3 rows



In [26]:
feature_target_df.show(3)

+-------+----------+------------+
|user_id|product_id|ordered_true|
+-------+----------+------------+
|      7|    8277.0|       false|
|      7|   27156.0|       false|
|      7|   40852.0|        true|
+-------+----------+------------+
only showing top 3 rows



In [28]:
feature_target_df.count()

13307953

In [36]:
feature_target_df2 = feature_target_df.withColumnRenamed('user_id', 'user_id2')\
                                      .withColumnRenamed('product_id', 'product_id2')
final_df = features.join(feature_target_df2, 
          on=(features.user_id == feature_target_df2.user_id2) 
          & (features.product_id == feature_target_df2.product_id2), how="left")
final_df = final_df.drop('user_id2', 'product_id2')

In [37]:
final_df.show(3)

+-------+----------+-----------+-------------+------------------+--------------+-----------+-------------------+-------------------+------------+
|user_id|product_id|usr_prod_ct|num_reordered|avg_days_since_ord|num_prod_ordl5|last5_ratio|num_ords_since_last|   reorder_rate_new|ordered_true|
+-------+----------+-----------+-------------+------------------+--------------+-----------+-------------------+-------------------+------------+
|      7|    8277.0|          3|          2.0|11.666666666666666|             1|        0.2|                  2|0.10526315789473684|       false|
|      7|   27156.0|          1|          0.0|               7.0|             1|        0.2|                  4|                0.0|       false|
|      7|   40852.0|         13|         12.0| 12.76923076923077|             3|        0.6|                  0|  0.631578947368421|        true|
+-------+----------+-----------+-------------+------------------+--------------+-----------+-------------------+------------

In [41]:
final_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- product_id: double (nullable = true)
 |-- usr_prod_ct: long (nullable = false)
 |-- num_reordered: double (nullable = true)
 |-- avg_days_since_ord: double (nullable = true)
 |-- num_prod_ordl5: long (nullable = true)
 |-- last5_ratio: double (nullable = true)
 |-- num_ords_since_last: integer (nullable = true)
 |-- reorder_rate_new: double (nullable = true)
 |-- ordered_true: boolean (nullable = true)



# Random Forest Modeling

In [38]:
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

In [39]:
# Converting strings to numeric values.
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    # variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        #For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        sm = si.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        #and then drops the original columns.
        #and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

In [40]:
dfnum = indexStringColumns(final_df, ['ordered_true'])

IllegalArgumentException: 'requirement failed: The input column ordered_true must be either string type or numeric type, but got BooleanType.'