# Association rule mining

## 1️⃣ Load Parquet Files

In [1]:
import pandas as pd
import os
# notebook is in: project_root/notebooks
PROJECT_ROOT = os.path.abspath("..")

INTERIM_DIR = os.path.join(PROJECT_ROOT, "data", "interim")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")

op_train = pd.read_parquet(os.path.join(PROCESSED_DIR, "op_train_temporal.parquet"))
op_test  = pd.read_parquet(os.path.join(PROCESSED_DIR, "op_test_temporal.parquet"))

print(op_train.shape, op_test.shape)

(32434489, 4) (1384617, 4)


### 2️⃣ Prepare Transactions

In [2]:
train_transactions = op_train.groupby("order_id")["product_id"].apply(list)
test_transactions  = op_test.groupby("order_id")["product_id"].apply(list)

train_transactions.head()

order_id
2    [33120, 28985, 9327, 45918, 30035, 17794, 4014...
3    [33754, 24838, 17704, 21903, 17668, 46667, 174...
4    [46842, 26434, 39758, 27761, 10054, 21351, 225...
5    [13176, 15005, 47329, 27966, 23909, 48370, 132...
6                                [40462, 15873, 41897]
Name: product_id, dtype: object

## 3️⃣ Convert to One-Hot Encoding (Needed for Apriori & FP-Growth)
We will use `mlxtend` for this encoding, we can install `pip install mlxtend`. 

In [3]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_array = te.fit(train_transactions).transform(train_transactions)

df_train_encoded = pd.DataFrame(te_array, columns=te.columns_)

MemoryError: Unable to allocate 149. GiB for an array with shape (3214874, 49677) and data type bool

In [6]:
#basket_sizes = transactions_df["items"].apply(len)
#basket_sizes.describe(percentiles=[0.5, 0.75, 0.9, 0.95])

In [3]:
item_counter = Counter()

for items in transactions_df["items"]:
    item_counter.update(set(items))

item_freq_df = (
    pd.DataFrame(item_counter.items(), columns=["item", "count"])
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

item_freq_df.head(15)


Unnamed: 0,item,count
0,Banana,473023
1,Bag of Organic Bananas,376928
2,Organic Strawberries,267943
3,Organic Baby Spinach,244707
4,Organic Hass Avocado,216041
5,Organic Avocado,180406
6,Large Lemon,157288
7,Limes,143889
8,Strawberries,142586
9,Organic Raspberries,139544


In [4]:
pair_counter = Counter()

for items in transactions_df["items"]:
    unique_items = sorted(set(items))
    for pair in combinations(unique_items, 2):
        pair_counter[pair] += 1

pair_freq_df = (
    pd.DataFrame(pair_counter.items(), columns=["item_pair", "count"])
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

pair_freq_df.head(10)


Unnamed: 0,item_pair,count
0,"(Bag of Organic Bananas, Organic Hass Avocado)",64524
1,"(Bag of Organic Bananas, Organic Strawberries)",64401
2,"(Banana, Organic Strawberries)",58147
3,"(Banana, Organic Avocado)",55442
4,"(Banana, Organic Baby Spinach)",53271
5,"(Bag of Organic Bananas, Organic Baby Spinach)",52343
6,"(Banana, Strawberries)",43010
7,"(Banana, Large Lemon)",42915
8,"(Organic Hass Avocado, Organic Strawberries)",42207
9,"(Bag of Organic Bananas, Organic Raspberries)",42127


In [5]:
num_transactions = len(transactions_df)

# Map item -> count
item_count_map = dict(item_freq_df[["item", "count"]].values)

rules = []

for (item_a, item_b), pair_count in pair_counter.items():
    support = pair_count / num_transactions

    confidence_a_to_b = pair_count / item_count_map[item_a]
    confidence_b_to_a = pair_count / item_count_map[item_b]

    lift = support / (
        (item_count_map[item_a] / num_transactions) *
        (item_count_map[item_b] / num_transactions)
    )

    rules.append({
        "antecedent": item_a,
        "consequent": item_b,
        "pair_count": pair_count,
        "support": support,
        "confidence": confidence_a_to_b,
        "lift": lift
    })

rules_df = pd.DataFrame(rules)
rules_df.sort_values("lift", ascending=False).head(10)


Unnamed: 0,antecedent,consequent,pair_count,support,confidence,lift
260458,Unsweetened Whole Milk Peach Greek Yogurt,Unsweetened Whole Milk Strawberry Yogurt,985,0.000371,0.496472,642.461856
41981,Oh My Yog! Organic Wild Quebec Blueberry Cream...,Oh My Yog! Pacific Coast Strawberry Trilayer Y...,1521,0.000574,0.661592,608.682772
205226,"Mighty 4 Kale, Strawberry, Amaranth & Greek Yo...","Mighty 4 Sweet Potato, Blueberry, Millet & Gre...",1188,0.000448,0.529176,543.003126
31938,Mighty 4 Purple Carrot Blackberry Quinoa & Gre...,"Mighty 4 Sweet Potato, Blueberry, Millet & Gre...",948,0.000358,0.480974,493.541807
96751,"Fiber & Protein Organic Pears, Raspberries, Bu...",Organic Fiber & Protein Pear Blueberry & Spina...,1099,0.000414,0.493046,458.70926
184102,Organic Stage 2 Carrots Baby Food,Sweet Potatoes Stage 2,835,0.000315,0.359914,452.927509
12989,Raspberry Essence Water,Unsweetened Blackberry Water,1064,0.000401,0.522337,452.461898
110404,"Mighty 4 Kale, Strawberry, Amaranth & Greek Yo...",Mighty 4 Purple Carrot Blackberry Quinoa & Gre...,755,0.000285,0.336303,452.416802
65503,Oh My Yog! Madagascar Vanilla Trilayer Yogyurt,Oh My Yog! Organic Wild Quebec Blueberry Cream...,1008,0.00038,0.385763,444.914843
46875,Oh My Yog! Madagascar Vanilla Trilayer Yogyurt,Oh My Yog! Pacific Coast Strawberry Trilayer Y...,1252,0.000472,0.479143,440.824462


In [6]:
filtered_rules_df = rules_df[
    (rules_df["support"] >= 0.001) &      # appears in at least 0.1% of orders
    (rules_df["confidence"] >= 0.3) &     # decent implication strength
    (rules_df["lift"] >= 1.5)             # real positive association
].sort_values("lift", ascending=False)

print("Filtered rules:", filtered_rules_df.shape)
filtered_rules_df.head(15)


Filtered rules: (20, 6)


Unnamed: 0,antecedent,consequent,pair_count,support,confidence,lift
33030,Almond Milk Blueberry Yogurt,Almond Milk Strawberry Yogurt,2762,0.001042,0.571961,260.131328
71329,Organic Whole Milk Strawberry Beet Berry Yogur...,Yotoddler Organic Pear Spinach Mango Yogurt,2875,0.001084,0.444771,191.199556
12392,Blueberry on the Bottom Nonfat Greek Yogurt,Strawberry on the Bottom Nonfat Greek Yogurt,2847,0.001074,0.437394,149.261197
12426,Peach on the Bottom Nonfat Greek Yogurt,Strawberry on the Bottom Nonfat Greek Yogurt,2751,0.001038,0.346168,118.130244
61967,Fat Free Blueberry Yogurt,Total 0% Raspberry Yogurt,2703,0.001019,0.369464,78.876112
69396,Fat Free Strawberry Yogurt,Total 0% Raspberry Yogurt,3108,0.001172,0.310211,66.226192
42278,Icelandic Style Skyr Blueberry Non-fat Yogurt,Non Fat Raspberry Yogurt,7413,0.002796,0.375513,59.376204
37181,Non Fat Acai & Mixed Berries Yogurt,Non Fat Raspberry Yogurt,3257,0.001228,0.364685,57.664064
18826,Nonfat Icelandic Style Strawberry Yogurt,Vanilla Skyr Nonfat Yogurt,3972,0.001498,0.364437,51.863026
44047,Icelandic Style Skyr Blueberry Non-fat Yogurt,Vanilla Skyr Nonfat Yogurt,6803,0.002566,0.344613,49.041821


In [7]:
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

filtered_rules_df.to_csv(
    os.path.join(OUTPUT_DIR, "association_rules_named.csv"),
    index=False
)

print("✅ Saved association_rules_named.csv")


✅ Saved association_rules_named.csv
