In [1]:
from neo4j import GraphDatabase
import getpass
import pandas as pd
import numpy as np
from itertools import combinations
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [2]:
# Bolt endpoint and user name of the Neo4j instance this notebook reads from.
uri = "bolt://localhost:7687"
username = "neo4j"
# Prompt for the password at run time instead of hard-coding it in the notebook.
password = getpass.getpass("Enter your Neo4j password: ")

# Driver handle that is passed to execute_query for the queries below.
driver = GraphDatabase.driver(uri, auth=(username, password))

def execute_query(driver, query):
    """Run ``query`` in a fresh session and return every record as a list.

    Parameters
    ----------
    driver
        Object whose ``session()`` method returns a context manager
        exposing ``run(query)``.
    query : str
        Query text handed to ``session.run`` unchanged.

    Returns
    -------
    list
        All records produced by the query, materialised while the session
        is still open.
    """
    with driver.session() as db_session:
        # Drain the result inside the ``with`` block: the session is closed
        # as soon as the block exits.
        return list(db_session.run(query))

# Fetch the graph used for the analysis: every Order whose eval_set is not
# 'test', every Product, and the ORDERED relationships of those orders.

query_order_nodes = """
MATCH (o:Order)
WHERE o.eval_set <> 'test'
RETURN id(o) as node_id, labels(o) as labels, properties(o) as properties
"""

query_product_nodes = """
MATCH (p:Product)
RETURN id(p) as node_id, labels(p) as labels, properties(p) as properties
"""

query_edges =   """
                MATCH (o:Order)-[r:ORDERED]->(p:Product)
                WHERE o.eval_set <> 'test'
                RETURN id(o) as start_node, id(p) as end_node, properties(r) as properties
                """

try:
    order_nodes = execute_query(driver, query_order_nodes)
    product_nodes = execute_query(driver, query_product_nodes)
    edges = execute_query(driver, query_edges)
finally:
    # Close the driver even when one of the queries raises, so a failed run
    # does not leave it open.
    driver.close()




# Product association rules

## A-priori Algorithm 

I used the `mlxtend` implementation instead of writing Apriori from scratch, since a custom implementation did not seem necessary for this analysis.

In [3]:
# Lookup tables: Neo4j node id -> product name / order id.
product_id_mapping = {
    product['node_id']: product['properties']['product_name']
    for product in product_nodes
}
order_id_mapping = {
    order['node_id']: order['properties']['order_id']
    for order in order_nodes
}

# Group the product names by the order they were bought in.
transactions_dict = {}

for edge in edges:
    order_id = order_id_mapping[edge['start_node']]
    product_name = product_id_mapping[edge['end_node']]
    transactions_dict.setdefault(order_id, []).append(product_name)


# One basket (list of product names) per order; each basket is sorted in
# place, so the lists held by transactions_dict end up sorted as well.
transactions = list(transactions_dict.values())
for transaction in transactions:
    transaction.sort()

In [4]:
# Peek at the first three baskets to check the transaction format.
transactions[:3]

[['Creamy Almond Butter',
  'Organic String Cheese',
  'Original Beef Jerky',
  'Pistachios',
  'Soda'],
 ['Bag of Organic Bananas',
  'Bartlett Pears',
  'Honeycrisp Apples',
  'Organic Fuji Apples',
  'Organic String Cheese',
  'Original Beef Jerky',
  'Pistachios',
  'Soda'],
 ['Cinnamon Toast Crunch',
  'Organic String Cheese',
  'Original Beef Jerky',
  'Pistachios',
  'Soda']]

In [5]:
# Sanity check: how many of the fetched orders have no ORDERED relationship?

orders_with_products = {rel['start_node'] for rel in edges}
orders_without_products = set(order_id_mapping).difference(orders_with_products)

print(f"Number of orders: {len(order_id_mapping)}")
print(f"Number of orders with products: {len(orders_with_products)}")
print(f"Number of orders without products: {len(orders_without_products)}")



Number of orders: 3346083
Number of orders with products: 1279158
Number of orders without products: 2066925


In [6]:

# One-hot encode the baskets: one row per transaction, one boolean column per
# product name.
# NOTE(review): transform() is called without sparse=True, so this is
# presumably a dense matrix -- confirm it fits in memory for the full data set.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

df.head()

Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#2 Mechanical Pencils,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,'Swingtop' Premium Lager,(70% Juice!) Mountain Raspberry Juice Squeeze,+Energy Black Cherry Vegetable & Fruit Juice,".5\"" Waterproof Tape",0 Calorie Acai Raspberry Water Beverage,...,with Twist Ties Sandwich & Storage Bags,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Unwrapped Spearmint 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:

# Support cut-off handed to apriori(); tune to get more or fewer itemsets.
min_support_threshold = 0.01
frequent_itemsets = apriori(
    df,
    min_support=min_support_threshold,
    use_colnames=True,
)

# Confidence cut-off handed to association_rules(); tune to get more or fewer rules.
min_confidence_threshold = 0.04
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=min_confidence_threshold,
)


In [8]:
# Display the association rules derived from the Apriori itemsets.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bag of Organic Bananas),(Organic Baby Spinach),0.118126,0.074927,0.015717,0.133049,1.775708,0.006866,1.067042,0.495359
1,(Organic Baby Spinach),(Bag of Organic Bananas),0.074927,0.118126,0.015717,0.209758,1.775708,0.006866,1.115953,0.472227
2,(Bag of Organic Bananas),(Organic Hass Avocado),0.118126,0.065256,0.019182,0.162387,2.488453,0.011474,1.115961,0.678265
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.065256,0.118126,0.019182,0.293951,2.488453,0.011474,1.249027,0.639901
4,(Organic Raspberries),(Bag of Organic Bananas),0.042407,0.118126,0.012622,0.29765,2.51976,0.007613,1.255604,0.629847
5,(Bag of Organic Bananas),(Organic Raspberries),0.118126,0.042407,0.012622,0.106855,2.51976,0.007613,1.072159,0.683926
6,(Bag of Organic Bananas),(Organic Strawberries),0.118126,0.082734,0.019693,0.166709,2.014992,0.00992,1.100774,0.571193
7,(Organic Strawberries),(Bag of Organic Bananas),0.082734,0.118126,0.019693,0.238023,2.014992,0.00992,1.15735,0.549154
8,(Large Lemon),(Banana),0.048882,0.14692,0.013071,0.2674,1.820038,0.005889,1.164456,0.473717
9,(Banana),(Large Lemon),0.14692,0.048882,0.013071,0.088967,1.820038,0.005889,1.044,0.528158


In [9]:

# Keep only the itemsets that contain at least two products.
filtered_frequent_itemsets = frequent_itemsets[
    frequent_itemsets['itemsets'].apply(lambda itemset: len(itemset) >= 2)
]

filtered_frequent_itemsets


Unnamed: 0,support,itemsets
103,0.015717,"(Bag of Organic Bananas, Organic Baby Spinach)"
104,0.019182,"(Bag of Organic Bananas, Organic Hass Avocado)"
105,0.012622,"(Organic Raspberries, Bag of Organic Bananas)"
106,0.019693,"(Bag of Organic Bananas, Organic Strawberries)"
107,0.013071,"(Large Lemon, Banana)"
108,0.016487,"(Banana, Organic Avocado)"
109,0.015904,"(Banana, Organic Baby Spinach)"
110,0.010385,"(Organic Fuji Apple, Banana)"
111,0.017375,"(Banana, Organic Strawberries)"
112,0.01308,"(Banana, Strawberries)"


## FP growth Algorithm

As with Apriori, I used the `mlxtend` implementation (`fpgrowth`) instead of writing the algorithm from scratch.


In [10]:
# Same thresholds as the Apriori run, this time mined with FP-growth.
# Note: this rebinds frequent_itemsets and rules from the Apriori cells.
frequent_itemsets = fpgrowth(
    df,
    min_support=min_support_threshold,
    use_colnames=True,
)
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=min_confidence_threshold,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bag of Organic Bananas),(Organic Hass Avocado),0.118126,0.065256,0.019182,0.162387,2.488453,0.011474,1.115961,0.678265
1,(Organic Hass Avocado),(Bag of Organic Bananas),0.065256,0.118126,0.019182,0.293951,2.488453,0.011474,1.249027,0.639901
2,(Organic Strawberries),(Organic Hass Avocado),0.082734,0.065256,0.012555,0.151753,2.325492,0.007156,1.101971,0.621394
3,(Organic Hass Avocado),(Organic Strawberries),0.065256,0.082734,0.012555,0.192398,2.325492,0.007156,1.135789,0.609775
4,(Organic Baby Spinach),(Organic Hass Avocado),0.074927,0.065256,0.010616,0.141678,2.171106,0.005726,1.089036,0.583095
5,(Organic Hass Avocado),(Organic Baby Spinach),0.065256,0.074927,0.010616,0.162675,2.171106,0.005726,1.104796,0.577062
6,(Organic Raspberries),(Bag of Organic Bananas),0.042407,0.118126,0.012622,0.29765,2.51976,0.007613,1.255604,0.629847
7,(Bag of Organic Bananas),(Organic Raspberries),0.118126,0.042407,0.012622,0.106855,2.51976,0.007613,1.072159,0.683926
8,(Organic Raspberries),(Organic Strawberries),0.042407,0.082734,0.010759,0.253701,3.06646,0.00725,1.229086,0.703734
9,(Organic Strawberries),(Organic Raspberries),0.082734,0.042407,0.010759,0.130039,3.06646,0.00725,1.100731,0.734674


In [13]:

def eclat(prefix, items, frequent_itemsets, min_support, transactions_len):
    """Depth-first ECLAT search over ``(item, tidset)`` pairs.

    Every itemset formed by extending ``prefix`` with items from ``items``
    whose support is at least ``min_support`` is appended to
    ``frequent_itemsets`` as an ``(itemset, support)`` tuple.

    Parameters
    ----------
    prefix : set
        Items already fixed on the current search branch (not modified).
    items : list of (item, set)
        Candidate extensions; each tidset holds the ids of the transactions
        that contain ``prefix`` plus that item. The list is only read: it is
        no longer consumed with ``pop(0)``, which was O(n) per item and
        emptied the caller's list.
    frequent_itemsets : list
        Output accumulator, extended in place.
    min_support : float
        Minimum support as a fraction of ``transactions_len``.
    transactions_len : int
        Total number of transactions; must be non-zero when ``items`` is
        non-empty (a ZeroDivisionError is raised otherwise).

    Returns
    -------
    None
        Results are delivered through ``frequent_itemsets``.
    """
    for position, (item, tidset) in enumerate(items):
        support = len(tidset) / transactions_len
        if support >= min_support:
            itemset = prefix | {item}
            frequent_itemsets.append((itemset, support))

            # Candidate extensions: items after the current one whose
            # transactions overlap with it often enough.
            suffix = []
            for other_item, other_tidset in items[position + 1:]:
                new_tidset = tidset & other_tidset  # Intersection of tidsets
                # Use the same ``count / total >= min_support`` test as above.
                # Comparing against ``min_support * total`` can disagree
                # because of float rounding (0.07 * 100 is 7.000000000000001),
                # which dropped itemsets sitting exactly on the threshold.
                if len(new_tidset) / transactions_len >= min_support:
                    suffix.append((other_item, new_tidset))

            eclat(itemset, suffix, frequent_itemsets, min_support, transactions_len)


def run_eclat(transactions, min_support):
    """Mine frequent itemsets from ``transactions`` with ECLAT.

    Parameters
    ----------
    transactions : sequence of iterables
        One iterable of items per transaction.
    min_support : float
        Minimum support, forwarded to ``eclat`` together with the number of
        transactions.

    Returns
    -------
    list of (set, float)
        The ``(itemset, support)`` tuples collected by ``eclat``.
    """
    transactions_len = len(transactions)

    # Vertical layout: item -> set of indices of the transactions containing it.
    tidsets = {}
    for tid, basket in enumerate(transactions):
        for product in basket:
            tidsets.setdefault(product, set()).add(tid)

    # Most frequent items first.
    candidates = sorted(
        tidsets.items(),
        key=lambda entry: len(entry[1]),
        reverse=True,
    )

    found = []
    eclat(set(), candidates, found, min_support, transactions_len)
    return found


# NOTE: this cell rebinds two names used by the mlxtend cells above:
# min_support_threshold (0.01 there) and frequent_itemsets (a DataFrame there,
# a list of (itemset, support) tuples here). Re-running those cells after this
# one will pick up the values set here.
min_support_threshold = 0.007
frequent_itemsets = run_eclat(transactions, min_support_threshold)

# run_eclat returns each itemset's support (fraction of transactions that
# contain it), not a rule confidence, so the value is labelled "Support".
for itemset, support in frequent_itemsets:
    if len(itemset) >= 2:
        print(f"itemset: {itemset} --> Support: {round(support, 3)}")

itemset: {'Banana', 'Organic Strawberries'} --> Confidence: 0.017
itemset: {'Banana', 'Organic Baby Spinach'} --> Confidence: 0.016
itemset: {'Banana', 'Organic Hass Avocado'} --> Confidence: 0.01
itemset: {'Banana', 'Organic Avocado'} --> Confidence: 0.016
itemset: {'Large Lemon', 'Banana'} --> Confidence: 0.013
itemset: {'Banana', 'Strawberries'} --> Confidence: 0.013
itemset: {'Banana', 'Limes'} --> Confidence: 0.01
itemset: {'Banana', 'Organic Raspberries'} --> Confidence: 0.007
itemset: {'Banana', 'Organic Whole Milk'} --> Confidence: 0.01
itemset: {'Banana', 'Cucumber Kirby'} --> Confidence: 0.01
itemset: {'Organic Fuji Apple', 'Banana'} --> Confidence: 0.01
itemset: {'Banana', 'Seedless Red Grapes'} --> Confidence: 0.008
itemset: {'Banana', 'Honeycrisp Apple'} --> Confidence: 0.009
itemset: {'Organic Strawberries', 'Bag of Organic Bananas'} --> Confidence: 0.02
itemset: {'Bag of Organic Bananas', 'Organic Baby Spinach'} --> Confidence: 0.016
itemset: {'Bag of Organic Bananas', '