In [66]:
import csv

# Load the dataset, keeping only well-formed rows of non-cancelled invoices.
# A row is kept when it has exactly 8 fields and its first field (the invoice
# number) does not start with 'C', the marker used here for cancellations.
rows = []
# newline='' is what the csv module asks for: the reader then handles line
# endings itself, so quoted fields containing line breaks are parsed intact.
with open('online_retail.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row (no error if the file is empty)
    for cols in reader:
        if len(cols) == 8 and not cols[0].startswith('C'):
            rows.append(cols)


In [67]:
from collections import defaultdict

# Collect, for every invoice number, the set of distinct item descriptions
# that appear on it (duplicates within one invoice collapse automatically).
invoices = defaultdict(set)
for record in rows:
    invoice_no, description = record[0], record[2]  # columns 0 and 2 of the CSV
    invoices[invoice_no].add(description)


In [68]:
import pandas as pd

# Sorted vocabulary of every distinct item found on any invoice
all_items = sorted(set().union(*invoices.values()))

# One row per invoice, one column per item: 1 if the invoice holds the item
presence_matrix = []
for basket in invoices.values():
    presence_matrix.append([1 if item in basket else 0 for item in all_items])

# Wrap the 0/1 matrix in a DataFrame, then derive the boolean version that
# the frequent-pattern miners are given later on
df = pd.DataFrame(presence_matrix, columns=all_items)
df_bool = df.astype(bool)


In [69]:
from mlxtend.frequent_patterns import fpgrowth

# Report how many frequent itemsets FP-Growth finds at each minimum support
support_levels = (0.5, 0.1, 0.05, 0.02, 0.01)
for minsup in support_levels:
    freq_itemsets = fpgrowth(df_bool, min_support=minsup)
    print(f"Support: {minsup}, Itemsets: {len(freq_itemsets)}")


Support: 0.5, Itemsets: 0
Support: 0.1, Itemsets: 1
Support: 0.05, Itemsets: 23
Support: 0.02, Itemsets: 303
Support: 0.01, Itemsets: 1472


In [70]:
# Mine frequent itemsets at 2% minimum support and show only those that
# contain more than one item. The itemsets are printed as integer column
# positions of df_bool rather than item names.
freq_itemsets = fpgrowth(df_bool, min_support=0.02)
itemset_sizes = freq_itemsets['itemsets'].map(len)
multi_itemsets = freq_itemsets.loc[itemset_sizes > 1]
print(multi_itemsets)

      support            itemsets
246  0.021392        (1824, 1825)
247  0.029007          (162, 166)
248  0.021302          (165, 166)
249  0.024429        (3970, 3966)
250  0.022435        (3904, 2837)
251  0.026242        (1858, 2046)
252  0.037391        (1856, 1858)
253  0.023341        (1856, 1871)
254  0.032814        (1858, 1871)
255  0.026514        (1858, 1846)
256  0.023477        (1857, 1858)
257  0.020531         (2387, 115)
258  0.022027        (3547, 2861)
259  0.030819        (1858, 1869)
260  0.023749        (1869, 1871)
261  0.021211        (1856, 1869)
262  0.022344        (1701, 1702)
263  0.020350        (3904, 1858)
264  0.025018        (2436, 2431)
265  0.025381        (2049, 2046)
266  0.023205        (2049, 2038)
267  0.027466        (2045, 2046)
268  0.024656        (2045, 2038)
269  0.022752        (2041, 2045)
270  0.023840        (2046, 2055)
271  0.021710        (2049, 2055)
272  0.020123        (2038, 2055)
273  0.020305        (2041, 2055)
274  0.029052 

In [71]:
import numpy as np

# Hand-computed confidence of the two rules between the items stored in
# columns 2656 and 1599 of df_bool.
M = df_bool.values  # presence matrix: rows = invoices, columns = items

n_invoices = len(M)
has_2656 = M[:, 2656] == 1
has_1599 = M[:, 1599] == 1

# Support = fraction of invoices containing the item(s). Counting on the
# column masks avoids indexing M with a mask, which would copy every selected
# row of the matrix just to take its length.
support_2656 = np.count_nonzero(has_2656) / n_invoices
support_1599 = np.count_nonzero(has_1599) / n_invoices
support_both = np.count_nonzero(has_2656 & has_1599) / n_invoices

# confidence(A => B) = support(A and B) / support(A)
print(f"Confidence 2656 => 1599: {support_both / support_2656}")
print(f"Confidence 1599 => 2656: {support_both / support_1599}")


Confidence 2656 => 1599: 0.8263707571801567
Confidence 1599 => 2656: 0.6236453201970443


In [72]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the mined itemsets, keeping only those whose
# confidence reaches the threshold below
min_confidence = 0.85
rules = association_rules(
    freq_itemsets,
    metric='confidence',
    min_threshold=min_confidence,
)
print(rules)


    antecedents consequents  antecedent support  consequent support   support  \
0  (2656, 3003)      (1599)            0.027148            0.046003  0.024565   
1  (2656, 1599)      (3003)            0.028689            0.048314  0.024565   

   confidence       lift  leverage  conviction  zhangs_metric  
0    0.904841  19.669380  0.023316   10.025342       0.975647  
1    0.856240  17.722404  0.023179    6.619970       0.971444  


In [73]:
from mlxtend.frequent_patterns import apriori
import timeit

# Time a single run of each miner on the same matrix at 2% minimum support
for label, miner in (("FP-Growth:", fpgrowth), ("Apriori:", apriori)):
    elapsed = timeit.timeit(lambda: miner(df_bool, 0.02), number=1)
    print(label, elapsed)


FP-Growth: 1.1358538750000662
Apriori: 0.9845136250000905
