# Product Bundle Recommendation

### Load Data

In [2]:
import pandas as pd

Order_Products_Prior_DF = pd.read_csv('../data/order_products_prior.csv')
ordersDF = pd.read_csv('../data/orders.csv')
productsDF = pd.read_csv('../data/products.csv')

### Data Preprocessing

In [3]:
# orders in prior merged with product names
Order_Product_Name_Prior = pd.merge(Order_Products_Prior_DF, 
                                    productsDF, how='left', on='product_id')
# Prior orders with user_id, product_id, product_name
Prior_User_Order_Product = pd.merge(Order_Product_Name_Prior, 
                                    ordersDF, how='left', on='order_id')

Prior_User_Order_Product['product_name'][0]

'Organic Egg Whites'

As we can see, the product name is a string seperated with whitespace. We want to replace all whitespace with underscore "_", so that each product name is actually one word with no space in between.

In [4]:
products = Prior_User_Order_Product['product_name']
product_no_space = []
for product in products:
    product = product.replace(" ", "_")
    product_no_space.append(product)

# drop original column, replace it with one with no space
Prior_User_Order_Product.drop(['product_name'], axis=1)
Prior_User_Order_Product['product_name'] = product_no_space

Now we want to have a dataframe with each row correspons to one order. The first column is each order_id. The second column is the names of all products correspond to each order_id.

In [5]:
# add product name to each user
name_list = []
for p_name in Prior_User_Order_Product.groupby('order_id')['product_name']:
    name_list.append(' '.join(p_name[1]))
    
order_id = Prior_User_Order_Product.groupby('order_id')['product_name'].agg('count').index
order_products = pd.DataFrame({'order_id':order_id, 'products':name_list})

Take a glimpse of our dataframe:

In [6]:
order_products.head()

Unnamed: 0,order_id,products
0,2,Organic_Egg_Whites Michigan_Organic_Kale Garli...
1,3,Total_2%_with_Strawberry_Lowfat_Greek_Strained...
2,4,Plain_Pre-Sliced_Bagels Honey/Lemon_Cough_Drop...
3,5,"Bag_of_Organic_Bananas Just_Crisp,_Parmesan Fr..."
4,6,Cleanse Dryer_Sheets_Geranium_Scent Clean_Day_...


As we are going to use PySpark to extract bigrams, we need to prepare the dataframe in a format required by PySpark.

In [7]:
dataFrameList = []
index = 0
for row in order_products['products']:
    productsName = row.split(' ')
    tup = (index, productsName)
    dataFrameList.append(tup)
    index += 1

We split the data into train and test datasets.

In [8]:
# randomly split data into train (70%) and test (30%)
import random
import numpy
random.shuffle(dataFrameList)

train_data = dataFrameList[:2250411]
test_data = dataFrameList[2250411:]

### Use PySpark to Extract Bigrams

First convert the data to spark dataframe. To reduce computation, we will read 10000 lines each time, and then combine all into one spark dataframe.

In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec

spark = SparkSession.builder.appName("Bigram").getOrCreate()

# spark dataframe： read in batch of 10000 due to large computation
N = len(train_data)//10000
mod = len(train_data) % 10000
trainDF = spark.createDataFrame(dataFrameList[0:10000], ['id',"product_name"])

for i in range(1,N):
    trainDF_sub = spark.createDataFrame(train_data[10000*i:10000*(i+1)], ['id',"product_name"])
    traintDF = trainDF.union(trainDF_sub)
    
trainDF_sub = spark.createDataFrame(train_data[10000*N:len(train_data)], ['id',"product_name"])
trainDF = trainDF.union(trainDF_sub)

Then train bigrams.

In [10]:
# get bigram
from pyspark.ml.feature import NGram

ngram = NGram(n=2, inputCol="product_name", outputCol="bigrams")
ngramDataFrame = ngram.transform(trainDF)

ngramDataFrame.head()

Row(id=3072566, product_name=['Twelve_Essentials_Fruit_and_Vegetable_Juice', 'Juice,_Vegetable_&_Fruit,_Fuel', 'Juice,_Vegetable_&_Fruit,_Purify'], bigrams=['Twelve_Essentials_Fruit_and_Vegetable_Juice Juice,_Vegetable_&_Fruit,_Fuel', 'Juice,_Vegetable_&_Fruit,_Fuel Juice,_Vegetable_&_Fruit,_Purify'])

Now, after we got the bigrams, we start counting the frequency of each.

Bigrams are stored in a nested dictionary:
+ first layer key is the first word in a bigram 
+ second layer key is the second word in a bigram
+ the second layer value is the frequency. 


In [11]:
# count frequency:
# Bigrams are stored in a nested dictionary:
# first layer key is the first word in a bigram 
# second layer key is the second word in a bigram
# the second layer value is the frequency. 
# {'Organic_Mint_Bunch': {'Organic_Navel_Orange':2, 'c':2}}

bigrams = ngramDataFrame.toPandas()['bigrams']
table = {}
total = len(bigrams)
completion = 0
for bigram in bigrams:
    for combination in bigram:
        components = combination.split(' ')
        key = components[0]
        valKey = components[1]
        if key in table:
            valueDict = table[key]
            if valKey in valueDict:
                valueDict[valKey] = valueDict[valKey] + 1
            else:
                valueDict[valKey] = 1
        else:
            # create new value for key
            valueDict =  {valKey: 1}
            table[key] = valueDict
    completion += 1
#     print("==>", float(completion / total) * 100, "%")

Let's see which combination appears more than 20 times:

In [12]:
for firstWord in table:
    for secondWord in table[firstWord]:
        if table[firstWord][secondWord] > 20:
            print(firstWord, " + ", secondWord, ": ", table[firstWord][secondWord])

Organic_Hass_Avocado  +  Bag_of_Organic_Bananas :  26
Banana  +  Organic_Avocado :  36
Banana  +  Organic_Fuji_Apple :  25
Banana  +  Honeycrisp_Apple :  23
Banana  +  Organic_Strawberries :  27
Bag_of_Organic_Bananas  +  Organic_Strawberries :  31
Bag_of_Organic_Bananas  +  Organic_Hass_Avocado :  30
Bag_of_Organic_Bananas  +  Organic_Baby_Spinach :  23
Organic_Avocado  +  Banana :  22
Large_Lemon  +  Limes :  24


### Generate Recommendations

This part will use the frequencies above to generate recommendations for each product. We define functions to realize this.

In [14]:
def getPureData(prodName):
    
    '''sort the bigram frequencies in descending order, 
       then return merely the corresponding product names in the same order'''
    
    if prodName not in table:
        return []
    sortedOringalList = sorted(table[prodName].items(), key=lambda x: x[1], reverse=True)
#     print(sortedOringalList)
    data = {}
    for tp in sortedOringalList:
        product = tp[0]
        number = tp[1]
        if number in data:
            productList = data[number]
            productList.append(product)
        else:
            productList = [product]
        data[number] = productList
#     print(data)
#     print("==> Get pure data name:")
    pureData = data.values()
#     print(pureData)
    return list(pureData)

def pickRecommendProds(pureData, numberOfRecommend):
    
    '''Pick certain number of products from the sorted product names'''
    
    recommendProds = []
    for prods in pureData:
        if len(prods) <= numberOfRecommend:
            recommendProds += prods
            numberOfRecommend -= len(prods)
        else:
            recommendProds += random.sample(prods, numberOfRecommend)
            numberOfRecommend = 0

        if numberOfRecommend == 0:
            break
    
    return recommendProds

# recommend products bought together with 'name'
# name: the product to start with
def getRecommend(name, numberOfRecommend):
    
    '''Recommend certain number of products bought after the given input name'''
    
    # numberOfRecommend = 10
    recommendProducts = []
    productName = name
    index = 0

    while (numberOfRecommend):
#         print("->Target: ", productName)
#         print("->numberOfRecommend: ", numberOfRecommend)
#         print("->Index: ", index)
        data = getPureData(productName)
    #     print("Pure data:", data)
        intermediate = pickRecommendProds(data, numberOfRecommend)
        recommendProducts += intermediate
#         print("Recommend: ", recommendProducts)
#         print("Recommend: ", recommendProducts)
        if len(intermediate) == 0 and index == len(recommendProducts):
            break
        numberOfRecommend -= len(intermediate)
        if numberOfRecommend > 0:
#             print("Still left: ", numberOfRecommend)
            productName = recommendProducts[index]
            index += 1

#         print("==================")

    return recommendProducts

Try an example: 15 Products recommended after "Organic_Mint_Bunch".

In [16]:
print(getRecommend("Organic_Mint_Bunch", 15))

['Organic_Italian_Parsley_Bunch', 'Garlic', 'Organic_Carrot_Bunch', 'Fresh_Cauliflower', 'Organic_Cilantro', 'Organic_Baby_Spinach_Salad', 'Organic_Cilantro_Bunch', 'Organic_Thyme', '100%_Pressed_Apple__Fruit_Juice', 'Organic_Mountain_Forest_Honey_Light_Amber', 'Organic_Cilantro', 'Large_Lemon', 'Organic_Mint', 'Organic_Basil', 'Organic_Garlic']


### Evaluation

Now we see how it performs on test data.

We evaluate it by seeing in each order, how many products bought are in recommend list.

For example, test_order_1 in test data contains 10 products.

We start by recommend what can be bought with the first product, and we will give 10 recommendations (which is of same size as the actual order). We compare the next 9 actually bought products with this 10 recommendations. If there's a match, we will add 1 to the total score. Then we move to the second actually bought product, and give another 10 recommendations bought with the second product. Compare again, and compute total scores. ... After iterate through all actually bought products in this order, we have the total score, and divide the score by the order size to get the final_score_1 for test order 1.

We save all final scores in one list and compute the average score in the end.

In [18]:
def TestScore(test_data):
    
    scores = []

    for order_info in test_data:
        this_order = order_info[1]
        order_len = len(this_order)
        #print('order:', this_order)
        #print('length of order', order_len)
        i = 0
        this_score = 0

        while (i < order_len):
            if this_order[i] in table:
                # use original order length as the num of recommendation
                recommends = getRecommend(this_order[i], order_len)
                #print('====> recommends of ', this_order[i], " : ", recommends)
                laterProds = this_order[i:]
                # check if the recommended products is included in order
                for prod in laterProds:
                    if prod in recommends:
                        #print("-->", prod)
                        this_score += 1
                i += 1
            else:
                # if the product is not trained in model, skip
                i += 1
                order_len -= 1

        #print(this_score)
        if not order_len == 0:
            scores.append(this_score/order_len)
        #print(scores)
        
    # return a list of predicted scores
    return(scores)

In [19]:
scores = TestScore(test_data)
print("======> Mean Test Scores: ", numpy.mean(scores))

