# General Products-in-same-order, graph-based recommendation models


* Recommends products based on which products are generally purchased together (globablly).
* 



Options:

* Same order: Undirectional graph
* Previous orders: Directed graph

Data model can be either 

* Adjacency list: `dict[node1][node2] = value`, or
* Edgelist: table with three columns (node1, node2, value).



In [1]:
import pandas as pd

from collections import defaultdict
import timeit
import itertools
import random
import matplotlib
from matplotlib import pyplot
from pprint import pprint
from importlib import reload

In [2]:
# Load data
df = df_raw = pd.read_csv("../data/raw/data.csv", keep_default_na=False)
print(f"Table rows:", len(df))
df.head()

Table rows: 541909


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom


In [24]:
# Remove bad data:
df = df_raw
print("Rows before QA filtering:", len(df))

# Remove lines with N/A values:
df = df_dropna = df.dropna()
print("Rows after dropping N/A:", len(df))

print("Columns with empty values:")
for column in df.columns:
    df = df[~(df[column] == '')]
print("Rows after dropping rows with empty values:", len(df))

# Remove lines that does not represent an actual product:
print("Non-product stock codes:")
non_product_stock_codes = ['BANK CHARGES', 'C2', 'CRUK', 'D', 'DOT', 'M', 'PADS', 'POST']
df = df[~df['StockCode'].isin(['BANK CHARGES', 'C2', 'CRUK', 'D', 'DOT', 'M', 'PADS', 'POST'])]
print("Rows after dropping non-product lines:", len(df))

print("Rows after QA filtering:", len(df))

Rows before QA filtering: 541909
Rows after dropping N/A: 541909
Columns with empty values:
Rows after dropping rows with empty values: 406829
Non-product stock codes:
Rows after dropping non-product lines: 404909
Rows after QA filtering: 404909


In [26]:
from src.data import data_utils 
reload(data_utils)
products_df = data_utils.build_products_df_from_sales(df)
# products_df.head(5)

## Pt. 1: Same-order recommendation model:

Same-order graph: Undirectional graph representing products that were purchased 



In [4]:
# Build same-order undirected graph:
# dict[StockCode1][StockCode2] = count

grouped_by_invoiceno = df.groupby("InvoiceNo")

print("Total orders (invoices):", grouped_by_invoiceno.ngroups)
print("Dataset:")
print(f"- {len(df)} rows,")
print(f"- {df.groupby('InvoiceNo').ngroups} orders/invoices,")
print(f"- {len(df['StockCode'].unique())} unique StockCodes.")

def build_sameorder_product_dod_using_cartesian_forloop(df):
    # We use a double default-dict, but since most StockCodes are present, 
    # it is probably faster to pre-populate the outer dict with *all* StockCodes.
    # Also, we only have 3684, so it would be possible to use an adjacency matrix, 
    # instead of adjacency list - only about 10M values so about 40-80 MB. 
    
    sameorder_dod = defaultdict(lambda: defaultdict(int))
    for invoiceno, order_df in df.groupby("InvoiceNo"):
        # Can use either itertools.product or itertools.combinations:
        # itertools.combinations only gives unique combinations, but can do more than two.
        for stock_code1, stock_code2 in itertools.product(order_df['StockCode'], repeat=2):
            if stock_code1 == stock_code2:
                continue
            sameorder_dod[stock_code1][stock_code2] += 1
    return sameorder_dod



print("Computing sameorder graph dict-of-dict...")
t1 = timeit.default_timer()
sameorder_g_dod = build_sameorder_product_dod_using_cartesian_forloop(df)
t2 = timeit.default_timer()
ttc = t2 - t1

print(f"\nTime to compute same-order dict-of-dict, "
      f"using for-loop on cartesian product of StockCodes in each order: {ttc:.01f} sec")
print(" - single-threaded and inefficient for-loop.")
print(" - number of nodes:", len(sameorder_g_dod), "(single-product orders do not contribute to same-order graph)")
# sameorder_g_dod = build_sameorder_product_dod(df)



Total orders (invoices): 21788
Dataset:
- 404909 rows,
- 21788 orders/invoices,
- 3676 unique StockCodes.
Computing sameorder graph dict-of-dict...

Time to compute same-order dict-of-dict, using for-loop on cartesian product of StockCodes in each order: 5.7 sec
 - single-threaded and inefficient for-loop.
 - number of nodes: 3667 (single-product orders do not contribute to same-order graph)


In [5]:
# Recommendation based on sameorder graph:

def recommend_sameorder_stockcodes(sameorder_dod, basket, k=1):
    if isinstance(basket, str):
        basket = [basket]
    item_weights = defaultdict(int)
    for item in basket:
        # self.sameorder_dod[item] = {stockcode: weight}
        if item not in sameorder_dod:
            print(f"NOTICE: Item stockcode {item} not present in SameOrder graph.")
            print(f"Perhaps the item has never been bought together with other items?")
        for stockcode, count in sameorder_dod[item].items():
            item_weights[stockcode] += count
    item_weights = dict(item_weights)
#     print("Combined recommender weights for basket items:", basket)
#     pprint(item_weights)
    codes, weights = zip(*item_weights.items())
    return random.choices(codes, weights=weights, k=k)


### SameOrderGraphRecommender class

In [36]:
# Model class:
import numpy as np
import random
import timeit


class SameOrderGraphRecommender:

    def __init__(self, sales_df):

        # Build same-order undirected graph:
        # dict[StockCode1][StockCode2] = count
        print("Dataset:")
        print(f"- {len(sales_df)} rows/lines,")
        print(f"- {sales_df.groupby('InvoiceNo').ngroups} total orders/invoices,")
        print(f"- {len(sales_df['StockCode'].unique())} unique StockCodes.")

        print("Building SameOrder graph (dict-of-dict)... This should take about 5-10 seconds.")
        t1 = timeit.default_timer()
        self.sameorder_dod = build_sameorder_product_dod_using_cartesian_forloop(sales_df)
        ttc = timeit.default_timer() - t1
        print(f" - Done ({ttc:.2f} s).")
    
    def get_items_weights_for_basket(self, basket):
        if isinstance(basket, str):
            basket = [basket]
        item_weights = defaultdict(int)
        for item in basket:
            # self.sameorder_dod[item] = {stockcode: weight}
            if item not in self.sameorder_dod:
                print(f"NOTICE: Item stockcode {item} not present in SameOrder graph.")
                print(f"Perhaps the item has never been bought together with other items?")
            for stockcode, count in self.sameorder_dod[item].items():
                item_weights[stockcode] += count
        return item_weights
    
    def recommend_stockcodes(self, basket, k=1):
        return recommend_sameorder_stockcodes(self.sameorder_dod, basket, k=k)
    
    def recommend_top_stockcodes(self, basket, k=1):
        item_weights = self.get_items_weights_for_basket(basket)
        item_weights = dict(item_weights)
        codes, weights = zip(*item_weights.items())
        codes, weights = np.array(codes), np.array(weights)
        sidxs = np.argsort(weights)
        codes = codes[sidxs]
        return codes[:k]


sameorder_recommender = SameOrderGraphRecommender(df)

random_recommendation = sameorder_recommender.recommend_stockcodes(basket='85123A', k=5)
assert len(random_recommendation) == 5
top_recommendation = sameorder_recommender.recommend_top_stockcodes(basket='85123A', k=6)
assert len(top_recommendation) == 6


Dataset:
- 404909 rows,
- 21788 orders/invoices,
- 3676 unique StockCodes.
Building SameOrder graph (dict-of-dict)... This should take about 5-10 seconds.
 - Done.


In [40]:
# Eksempel: Brug SameOrderGraphRecommender til at anbefale 5 produkter for produkt '85123A'

print("Produkt '85123A':")
display(products_df.loc[['85123A']])

recommended_items = sameorder_recommender.recommend_stockcodes(basket='85123A', k=5)

all_items_weights = sameorder_recommender.get_items_weights_for_basket(basket='85123A')
recommended_weights = [all_items_weights[item] for item in recommended_items]

print("Recommended items:  ", recommended_items)
print("Recommended weights:", recommended_weights)
display(products_df.loc[recommended_items])

Produkt '85123A':


Unnamed: 0_level_0,Description,InvoiceDate,UnitPrice,OrdersCount,UsersCount
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
85123A,WHITE HANGING HEART T-LIGHT HOLDER,12/1/2010 8:26,2.55,2077,858


Recommended items:   ['22960', '22114', '17096', '23199', '21584']
Recommended weights: [136, 64, 3, 94, 33]


Unnamed: 0_level_0,Description,InvoiceDate,UnitPrice,OrdersCount,UsersCount
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22960,JAM MAKING SET WITH JARS,12/1/2010 8:34,4.25,974,574
22114,HOT WATER BOTTLE TEA AND SYMPATHY,12/1/2010 9:32,3.45,524,325
17096,ASSORTED LAQUERED INCENSE HOLDERS,12/12/2010 12:03,0.17,13,12
23199,JUMBO BAG APPLES,4/11/2011 16:47,2.08,747,344
21584,RETROSPOT SMALL TUBE MATCHES,12/1/2010 12:23,1.65,160,119


In [38]:
# Eksempel: Brug SameOrderGraphRecommender til at anbefale 10 produkter for produkt '21928'
basket = ['21928']

print("Basket:", basket)
display(products_df.loc[basket])

# recommend_sameorder_stockcodes(sameorder_recommender.sameorder_dod, basket='85123A', k=10)
recommended_items = sameorder_recommender.recommend_stockcodes(basket=basket, k=10)

print("Recommended items:", recommended_items)
display(products_df.loc[recommended_items])

Basket: ['21928']


Unnamed: 0_level_0,Description,InvoiceDate,UnitPrice,OrdersCount,UsersCount
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21928,JUMBO BAG SCANDINAVIAN PAISLEY,12/1/2010 14:11,1.95,559,264


Recommended items: ['20725', '23088', '22192', '22714', '23321', '22844', '22028', '22596', '22274', '22752']


Unnamed: 0_level_0,Description,InvoiceDate,UnitPrice,OrdersCount,UsersCount
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20725,LUNCH BAG RED RETROSPOT,12/1/2010 9:37,1.65,1359,532
23088,ZINC HEART FLOWER T-LIGHT HOLDER,5/13/2011 12:13,1.25,57,52
22192,BLUE DINER WALL CLOCK,12/1/2010 10:03,8.5,345,180
22714,CARD BIRTHDAY COWBOY,12/1/2010 11:41,0.42,156,125
23321,SMALL WHITE HEART OF WICKER,6/9/2011 12:21,1.69,609,378
22844,VINTAGE CREAM DOG FOOD CONTAINER,12/3/2010 13:40,8.5,304,176
22028,PENNY FARTHING BIRTHDAY CARD,12/2/2010 10:39,0.42,188,124
22596,CHRISTMAS STAR WISH LIST CHALKBOARD,1/27/2011 12:00,0.72,246,197
22274,FELTCRAFT DOLL EMILY,12/1/2010 11:49,2.95,240,168
22752,SET 7 BABUSHKA NESTING BOXES,12/1/2010 8:26,7.65,306,188


In [22]:
# Eksempel: Brug SameOrderGraphRecommender.recommend_top_stockcodes til at anbefale top-produkter (i stedet for tilfældigt)
basket = ['21928']
print("Basket:", basket)
display(products_df.loc[basket])

recommended_items = sameorder_recommender.recommend_top_stockcodes(basket=basket, k=10)

print("Recommended items:", recommended_items)
display(products_df.loc[recommended_items])


Basket: ['21928']


Unnamed: 0_level_0,Description,InvoiceDate,UnitPrice,OrdersCount,UsersCount
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21928,JUMBO BAG SCANDINAVIAN PAISLEY,12/1/2010 14:11,1.95,559,264


Recommended items: ['23576' '22873' '22874' '22875' '22876' '22877' '84637' '84865' '21634'
 '35241']


Unnamed: 0_level_0,Description,InvoiceDate,UnitPrice,OrdersCount,UsersCount
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23576,SNACK TRAY RED VINTAGE DOILY,11/27/2011 12:13,1.95,30,28
22873,NUMBER TILE COTTAGE GARDEN 5,12/15/2010 15:47,1.95,13,12
22874,NUMBER TILE COTTAGE GARDEN 6,12/15/2010 15:47,1.95,12,10
22875,NUMBER TILE COTTAGE GARDEN 7,12/15/2010 15:47,1.95,14,14
22876,NUMBER TILE COTTAGE GARDEN 8,12/1/2010 14:54,1.95,11,11
22877,NUMBER TILE COTTAGE GARDEN 9,12/2/2010 17:14,1.95,14,13
84637,KITCHEN FLOWER POTS WALL PLAQUE,12/10/2010 11:19,5.95,13,11
84865,NEW BAROQUE BLACK PHOTO ALBUM,12/3/2010 15:19,8.5,12,8
21634,ASSORTED MINI MADRAS NOTEBOOK,3/4/2011 10:21,1.25,23,22
35241,ENAMEL BLUE RIM BISCUIT BIN,12/5/2010 10:36,4.95,30,27


In [None]:
# TBD: Visualize sameorder graph (network diagram; undirected, weighted):

## Pt. 2: Previous-orders recommendation model:

This model takes a time-directional customer-based approach:

Denne model bruger en tids-afhængig retningsbestemt tilgang:

* HVIS en kunde har købt produkt X, hvilke produkter købte kunden efterfølgende?
* Summér over alle kunder.
* Anbefal disse produkter til kunder som kigger på produkt X (eller har lagt produkt X i sin indkøbsvogn). 

OBS: Dette er stadig ikke en "personaliseret" recommendation model. Vi sammenligner ikke kunder mod hinanden, kun produkt-køb.
