In [2]:
import pandas as pd
from itertools import combinations, chain

In [3]:
# Toy market-basket data: each inner list is one transaction (the items bought together).
dataset = [
    ['Coffee', 'Donut', 'Sandwich'],
    ['Coffee', 'Donut'],
    ['Coffee', 'Sandwich'],
    ['Coffee', 'Muffin'],
    ['Donut', 'Muffin']
]
# Number of transactions; reused later for the Q1 summary line and the one-hot row labels.
n_trans = len(dataset)

In [4]:
def fmt_itemset(it):
    """Render an itemset as a parenthesised, comma-separated string.

    Args:
        it: Iterable of items (e.g. a tuple of item names). Each item is
            converted with ``str`` so non-string labels are accepted as well.

    Returns:
        A string such as ``"(Coffee, Donut)"``; an empty iterable gives ``"()"``.
    """
    return "(" + ", ".join(str(item) for item in it) + ")"

In [5]:
def support_count(itemset, transactions):
    """Count the transactions that contain every item of ``itemset``.

    Args:
        itemset: Iterable of items to look for.
        transactions: Sized collection of transactions, each an iterable of items.

    Returns:
        A ``(cnt, sup)`` tuple: ``cnt`` is the number of transactions containing
        all items of ``itemset`` and ``sup`` is ``cnt / len(transactions)``.
        When ``transactions`` is empty the result is ``(0, 0.0)`` instead of a
        ZeroDivisionError.
    """
    total = len(transactions)
    if total == 0:
        # No transactions: nothing can be supported, and the ratio is undefined.
        return 0, 0.0
    wanted = set(itemset)
    # issubset accepts any iterable, so each transaction need not be copied into a set first.
    cnt = sum(1 for t in transactions if wanted.issubset(t))
    return cnt, cnt / total

In [6]:
# Alphabetically sorted list of every distinct item that occurs in any transaction.
all_items = sorted(set(chain.from_iterable(dataset)))

In [7]:
# Q1: list the raw transactions, labelled T1..Tn, followed by the total count.
print("Q1. Transactions:")
for idx, basket in enumerate(dataset, start=1):
    print(f" T{idx}: {basket}")
print(f"Total transactions: {n_trans}\n")

Q1. Transactions:
 T1: ['Coffee', 'Donut', 'Sandwich']
 T2: ['Coffee', 'Donut']
 T3: ['Coffee', 'Sandwich']
 T4: ['Coffee', 'Muffin']
 T5: ['Donut', 'Muffin']
Total transactions: 5



In [8]:
# One dict per transaction, mapping every known item to a 0/1 presence flag.
ohe_rows = [
    {item: int(item in basket) for item in all_items}
    for basket in dataset
]
# Row labels T1..Tn line up with the transaction numbering printed for Q1.
df_ohe = pd.DataFrame(
    ohe_rows,
    index=[f"T{k}" for k in range(1, n_trans + 1)],
)
print("Q2. One-hot encoded DataFrame (rows = transactions, columns = items):")
print(df_ohe)
print("\nExplanation: each ROW = a transaction (T1..). each COLUMN = an item. 1 => present, 0 => absent\n")

Q2. One-hot encoded DataFrame (rows = transactions, columns = items):
    Coffee  Donut  Muffin  Sandwich
T1       1      1       0         1
T2       1      1       0         0
T3       1      0       0         1
T4       1      0       1         0
T5       0      1       1         0

Explanation: each ROW = a transaction (T1..). each COLUMN = an item. 1 => present, 0 => absent



In [9]:
min_support = 0.4
# Enumerate every non-empty itemset over the catalogue (brute force: 2^n - 1 candidates).
items = all_items
all_itemsets = list(chain.from_iterable(
    combinations(items, r) for r in range(1, len(items) + 1)
))

# One record per candidate itemset; "support" is rounded to 3 decimals for display only.
support_list = []
for itm in all_itemsets:
    cnt, sup = support_count(itm, dataset)
    support_list.append({"itemset": itm, "count": cnt, "support": round(sup, 3)})

support_df = pd.DataFrame(support_list).sort_values(["support","itemset"], ascending=[False, True]).reset_index(drop=True)
# Apply the threshold to the exact ratio count / N rather than to the rounded column:
# rounding first would let a true support just below min_support (e.g. 0.3997 -> 0.4) slip through.
freq_itemsets = support_df[support_df['count'] / len(dataset) >= min_support].reset_index(drop=True)

print(f"Q3. Frequent itemsets with min_support = {min_support}:")
print(freq_itemsets.to_string(index=False))
print()

Q3. Frequent itemsets with min_support = 0.4:
           itemset  count  support
         (Coffee,)      4      0.8
          (Donut,)      3      0.6
   (Coffee, Donut)      2      0.4
(Coffee, Sandwich)      2      0.4
         (Muffin,)      2      0.4
       (Sandwich,)      2      0.4



In [10]:
rules = []
# Rounded supports keyed by itemset; kept because later cells look supports up here.
support_lookup = {tuple(row['itemset']): row['support'] for _, row in support_df.iterrows()}
# Raw counts keyed by itemset. Confidence and lift are derived from these so they are
# not distorted by the 3-decimal rounding already applied to the "support" column.
count_lookup = dict(zip(support_df['itemset'], support_df['count']))
n_total = len(dataset)

for _, row in freq_itemsets.iterrows():
    itemset = row['itemset']
    if len(itemset) < 2:
        # A rule needs a non-empty antecedent and a non-empty consequent.
        continue
    cnt_itemset = row['count']
    sup_itemset = cnt_itemset / n_total
    # Every non-empty proper subset A is an antecedent; the remaining items form B.
    for r in range(1, len(itemset)):
        for A in combinations(itemset, r):
            B = tuple(sorted(set(itemset) - set(A)))
            cnt_A = count_lookup.get(A, 0)
            cnt_B = count_lookup.get(B, 0)
            if cnt_A == 0 or cnt_B == 0:
                continue
            confidence = cnt_itemset / cnt_A          # P(B | A)
            lift = confidence / (cnt_B / n_total)     # P(B | A) / P(B)
            rules.append({
                "antecedent": A,
                "consequent": B,
                "itemset": itemset,
                "support": round(sup_itemset, 3),
                "confidence": round(confidence, 3),
                "lift": round(lift, 3)
            })

# Explicit columns keep the frame sortable (and filterable by later cells) even when no
# rule was generated; without them sort_values raises KeyError on an empty frame.
rule_columns = ["antecedent", "consequent", "itemset", "support", "confidence", "lift"]
rules_df = pd.DataFrame(rules, columns=rule_columns).sort_values(['confidence','lift'], ascending=[False, False]).reset_index(drop=True)

print("Q4. All association rules (support, confidence, lift):")
if rules_df.empty:
    print(" No rules (no frequent itemsets of size>=2).")
else:
    # Pretty-print rows
    for i, r in rules_df.iterrows():
        print(f" Rule {i+1}: {fmt_itemset(r['antecedent'])} -> {fmt_itemset(r['consequent'])} | "
              f"support={r['support']}, confidence={r['confidence']}, lift={r['lift']}")
print()

Q4. All association rules (support, confidence, lift):
 Rule 1: (Sandwich) -> (Coffee) | support=0.4, confidence=1.0, lift=1.25
 Rule 2: (Donut) -> (Coffee) | support=0.4, confidence=0.667, lift=0.833
 Rule 3: (Coffee) -> (Sandwich) | support=0.4, confidence=0.5, lift=1.25
 Rule 4: (Coffee) -> (Donut) | support=0.4, confidence=0.5, lift=0.833



In [11]:
min_confidence = 0.6
# Keep only the rules that clear both the support and the confidence threshold.
qualified = rules_df.loc[
    lambda frame: (frame['support'] >= min_support) & (frame['confidence'] >= min_confidence)
].reset_index(drop=True)

print(f"Q5. Rules with support >= {min_support} and confidence >= {min_confidence}:")
if not qualified.empty:
    for idx, rule in qualified.iterrows():
        lhs = fmt_itemset(rule['antecedent'])
        rhs = fmt_itemset(rule['consequent'])
        print(f" Rule {idx+1}: {lhs} -> {rhs} | "
              f"support={rule['support']}, confidence={rule['confidence']}, lift={rule['lift']}")
else:
    print(" No rules meet both thresholds.")
print()

Q5. Rules with support >= 0.4 and confidence >= 0.6:
 Rule 1: (Sandwich) -> (Coffee) | support=0.4, confidence=1.0, lift=1.25
 Rule 2: (Donut) -> (Coffee) | support=0.4, confidence=0.667, lift=0.833



In [12]:
if not qualified.empty:
    # Pick the strongest rule: highest confidence first, ties broken by highest lift.
    chosen = qualified.sort_values(['confidence','lift'], ascending=[False, False]).iloc[0]
    A = chosen['antecedent']
    B = chosen['consequent']
    print("Q6. Interpretation of a strong rule:")
    print(f" Rule: {fmt_itemset(A)} -> {fmt_itemset(B)}")
    print(f" Support = {chosen['support']}, Confidence = {chosen['confidence']}, Lift = {chosen['lift']}")
    # Format the itemsets the same way as the rule line instead of leaking the raw tuple repr.
    print(f" Interpretation: If a customer buys {fmt_itemset(A)}, they are likely to also buy {fmt_itemset(B)}.")
else:
    print("Q6. No qualified rules to interpret.")
print()

Q6. Interpretation of a strong rule:
 Rule: (Sandwich) -> (Coffee)
 Support = 0.4, Confidence = 1.0, Lift = 1.25
 Interpretation: If a customer buys ('Sandwich',), they are likely to also buy ('Coffee',).



In [13]:
def experiment(ms, mc):
    """Count frequent itemsets and passing rules for one pair of thresholds.

    Reads the notebook-level ``support_df`` (columns ``itemset`` and ``count``)
    and ``dataset``. Support and confidence are computed from the raw counts,
    not from the rounded ``support`` column, so values sitting exactly on a
    threshold are not misclassified by rounding.

    Args:
        ms: Minimum support, as a fraction of all transactions.
        mc: Minimum confidence.

    Returns:
        ``(n_frequent_itemsets, n_rules)``: the number of itemsets with
        support >= ``ms`` and the number of rules A -> B built from frequent
        itemsets of size >= 2 whose confidence is >= ``mc``.
    """
    n_total = len(dataset)
    counts = dict(zip(support_df['itemset'], support_df['count']))
    freq = support_df[support_df['count'] / n_total >= ms]
    rcount = 0
    for it, cnt_it in zip(freq['itemset'], freq['count']):
        if len(it) < 2:
            continue
        # Each non-empty proper subset A yields one candidate rule A -> (it - A).
        for r in range(1, len(it)):
            for A in combinations(it, r):
                cnt_A = counts.get(A, 0)
                if cnt_A == 0:
                    continue
                # The itemset is already frequent, so only confidence is left to check.
                if cnt_it / cnt_A >= mc:
                    rcount += 1
    return len(freq), rcount

# (min_support, min_confidence) pairs to compare.
tests = [(0.4,0.6),(0.4,0.7),(0.5,0.6),(0.6,0.6),(0.3,0.5)]
print("Q7. Experiments (min_support, min_confidence) -> (n_frequent_itemsets, n_rules_passing_both):")
for thresholds in tests:
    outcome = experiment(*thresholds)
    print(f" ({thresholds[0]}, {thresholds[1]}) -> ({outcome[0]}, {outcome[1]})")
print()
print("Observation: Raising min_support reduces frequent itemsets; raising min_confidence reduces accepted rules.\n")

Q7. Experiments (min_support, min_confidence) -> (n_frequent_itemsets, n_rules_passing_both):
 (0.4, 0.6) -> (6, 2)
 (0.4, 0.7) -> (6, 1)
 (0.5, 0.6) -> (2, 0)
 (0.6, 0.6) -> (2, 0)
 (0.3, 0.5) -> (6, 4)

Observation: Raising min_support reduces frequent itemsets; raising min_confidence reduces accepted rules.



In [14]:
# Q8: static explanation of lift, one bullet per output line, followed by a blank line.
print(
    "Q8. Why Lift > 1 indicates a good association rule:",
    " - Lift = support(A∪B) / (support(A) * support(B)).",
    " - If A and B were independent, lift = 1.",
    " - Lift > 1: A and B occur together more frequently than expected under independence -> positive association.",
    " - Lift < 1: they co-occur less often than expected -> negative association.",
    sep="\n",
)
print()

Q8. Why Lift > 1 indicates a good association rule:
 - Lift = support(A∪B) / (support(A) * support(B)).
 - If A and B were independent, lift = 1.
 - Lift > 1: A and B occur together more frequently than expected under independence -> positive association.
 - Lift < 1: they co-occur less often than expected -> negative association.

