# Apriori notebook
This notebook loads the cleaned data saved by `preprocessing.ipynb`, builds the basket, runs Apriori, saves rules for the Streamlit app, and provides an in-notebook recommendation helper.

In [2]:
import pandas as pd
from pathlib import Path

# Read the cleaned dataset that preprocessing.ipynb writes to disk;
# stop with an actionable error when that file has not been generated yet.
cleaned_path = Path("cleaned_retail.csv")
if cleaned_path.exists():
    data = pd.read_csv(cleaned_path)
else:
    raise FileNotFoundError(f"{cleaned_path} not found. Run preprocessing.ipynb first to produce cleaned_retail.csv")
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Year,Month,Day,Hour,DayOfWeek
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010,12,1,8,Wednesday
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010,12,1,8,Wednesday
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010,12,1,8,Wednesday
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010,12,1,8,Wednesday
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010,12,1,8,Wednesday


In [3]:
# Build the invoice x product matrix that Apriori consumes
basket_data = data[["InvoiceNo", "Description"]].copy()
# Count the lines per (invoice, product) pair, then pivot products into columns;
# pairs that never occur are filled with 0.
basket = basket_data.groupby(["InvoiceNo", "Description"]).size().unstack(fill_value=0)
# Boolean encoding: True where the invoice contains the product at least once
# (mlxtend accepts boolean frames as well as 1/0)
basket = basket > 0
basket.shape

(18511, 3447)

In [4]:
# Install mlxtend if needed (uncomment to run; %pip installs into the running kernel's environment)
# %pip install mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

# Mine every itemset whose support (share of basket rows containing it) is at
# least min_support; lower the threshold to surface rarer combinations.
frequent_itemsets = apriori(basket, use_colnames=True, min_support=0.02)
# Preview the best-supported itemsets
frequent_itemsets.sort_values(by='support', ascending=False).head()

Unnamed: 0,support,itemsets
294,0.10275,(white hanging heart t-light holder)
213,0.102047,(regency cakestand 3 tier)
114,0.097077,(jumbo bag red retrospot)
171,0.082924,(party bunting)
139,0.077792,(lunch bag red retrospot)


In [5]:
# Derive candidate rules from the frequent itemsets, using lift with a threshold of 1
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Keep only the strong rules: confidence above 0.1 and lift above 2 (both tunable)
strong_rules = rules.loc[rules['confidence'].gt(0.1) & rules['lift'].gt(2)].copy()
strong_rules.shape

(166, 14)

In [6]:
# Persist the strong rules for the Streamlit app.
# The pickle keeps antecedents/consequents as frozenset objects.
strong_rules.to_pickle("strong_rules.pkl")


def _pipe_join(items):
    """Return the members of an itemset, sorted and joined with '|'."""
    return '|'.join(sorted(items))


# The CSV copy additionally carries each itemset as a pipe-separated string
sr = strong_rules.copy()
sr['antecedents_str'] = sr['antecedents'].apply(_pipe_join)
sr['consequents_str'] = sr['consequents'].apply(_pipe_join)
sr.to_csv("strong_rules.csv", index=False)
print("Saved strong_rules.pkl and strong_rules.csv")

Saved strong_rules.pkl and strong_rules.csv


In [7]:
# Recommendation helper (in-notebook use)
def recommend_items(item_name, rules_df, top_n=5):
    """Recommend items that the rules associate with ``item_name``.

    A rule is used when any item in its antecedent equals ``item_name``
    after trimming whitespace and ignoring case. Matching rules are ranked
    by confidence, then lift (both descending), and their consequent items
    are returned in that order without duplicates.

    Parameters
    ----------
    item_name : str
        Item to look up; surrounding whitespace and letter case are ignored.
    rules_df : pandas.DataFrame
        Rules table with ``antecedents`` and ``consequents`` columns holding
        iterables of items (e.g. frozensets) plus numeric ``confidence`` and
        ``lift`` columns. An empty table is accepted.
    top_n : int or None, default 5
        Maximum number of recommendations. ``None`` returns all of them;
        zero or a negative value returns an empty list.

    Returns
    -------
    list of str
        Whitespace-trimmed consequent items, best rule first, excluding the
        queried item itself.
    """
    query = item_name.strip().upper()
    if top_n is not None and top_n <= 0:
        return []

    def mentions_query(itemset):
        # Trim each antecedent item the same way the query and the returned
        # consequents are trimmed, so stray whitespace cannot hide a match.
        return any(str(item).strip().upper() == query for item in itemset)

    # On an empty frame .apply() yields an object-dtype Series, which pandas
    # does not treat as a boolean mask; the explicit cast keeps row selection
    # working so the function returns [] instead of raising KeyError.
    mask = rules_df['antecedents'].apply(mentions_query).astype(bool)
    ranked = rules_df.loc[mask].sort_values(['confidence', 'lift'], ascending=False)

    seen = set()
    recommendations = []
    for consequent in ranked['consequents']:
        # Set iteration order is not stable across interpreter runs; sorting
        # the items of one consequent makes the result reproducible.
        for name in sorted(str(item).strip() for item in consequent):
            if name.upper() != query and name not in seen:
                seen.add(name)
                recommendations.append(name)
    return recommendations[:top_n]

# Example usage (uncomment to try)
# print(recommend_items('white hanging heart t-light holder', strong_rules))