In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [2]:
# Step 1: Data Collection
data_frame = pd.read_excel('data_retail.xlsx')
data_frame.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [20]:
# Step 2: Data Cleaning
clean_data = data_frame.dropna(subset=["Description", "StockCode", "Invoice"])
filtered_data = clean_data[~clean_data['StockCode'].astype(str).str.startswith('C')]
grouped_transactions = filtered_data.groupby('Invoice')['Description'].apply(list).values

# Convert transactions to a list and remove duplicates
unique_transactions = [list(set(transaction)) for transaction in grouped_transactions]
print(f"======> Total Transactions: {len(unique_transactions)} rows  <======")

# Display a sample of transactions
sample_transactions = unique_transactions[:7]
for transaction in sample_transactions:
    print(transaction)

['SAVE THE PLANET MUG', 'STRAWBERRY CERAMIC TRINKET BOX', '15CM CHRISTMAS GLASS BALL 20 LIGHTS', 'PINK DOUGHNUT TRINKET POT ', 'RECORD FRAME 7" SINGLE SIZE ', ' WHITE CHERRY LIGHTS', 'PINK CHERRY LIGHTS', 'FANCY FONT HOME SWEET HOME DOORMAT']
['LUNCHBOX WITH CUTLERY FAIRY CAKES ', 'CAT BOWL ', 'HEART MEASURING SPOONS LARGE', 'DOG BOWL , CHASING BALL DESIGN']
['AREA PATROLLED METAL SIGN', 'CHRISTMAS CRAFT WHITE FAIRY ', 'HOME BUILDING BLOCK WORD', 'HEART FILIGREE DOVE LARGE', 'CLASSIC WHITE FRAME', 'LOVE BUILDING BLOCK WORD', 'BATH BUILDING BLOCK WORD', 'SMALL MARSHMALLOWS PINK BOWL', 'PLEASE ONE PERSON  METAL SIGN', 'HEART IVORY TRELLIS LARGE', 'FULL ENGLISH BREAKFAST PLATE', 'DOOR MAT BLACK FLOCK ', ' PEACE WOODEN BLOCK LETTERS', 'BISCUITS SMALL BOWL LIGHT BLUE', 'PIZZA PLATE IN BOX', 'SCOTTIE DOG HOT WATER BOTTLE', 'ASSORTED COLOUR BIRD ORNAMENT', 'BLACK DINER WALL CLOCK', 'SET OF 3 BLACK FLYING DUCKS']
['WOODEN BOX ADVENT CALENDAR ', 'PACK OF 6 SKULL PAPER CUPS', 'BLUE PADDED SOFT M

In [28]:

# Step 3: Create Binary Matrix
str_transactions = [[str(item) for item in set(transaction)] for transaction in unique_transactions]
encoder = TransactionEncoder()
binary_data = encoder.fit(sample_transactions).transform(sample_transactions)
binary_matrix = pd.DataFrame(binary_data, columns=encoder.columns_)
print("======> Binary Matrix <======\n", binary_matrix.astype(int))

     PEACE WOODEN BLOCK LETTERS   VINTAGE DESIGN GIFT TAGS  \
0                            0                          0   
1                            0                          0   
2                            1                          0   
3                            0                          0   
4                            0                          0   
5                            0                          1   
6                            0                          0   

    WHITE CHERRY LIGHTS  15CM CHRISTMAS GLASS BALL 20 LIGHTS  \
0                     1                                    1   
1                     0                                    0   
2                     0                                    0   
3                     0                                    0   
4                     0                                    0   
5                     0                                    0   
6                     0                                    0  

In [29]:
# Step 4: Generate Itemsets
transaction_list = [[str(item) for item in set(transaction)] for transaction in grouped_transactions]
print("Total transactions:", len(transaction_list))

Total transactions: 25880


In [30]:
# Step 5: Set Minimum Support and Confidence
min_support_threshold = 0.02
min_confidence_threshold = 0.3
min_lift_threshold = 1.0

In [31]:
# Step 6: Generate Frequent Itemsets
frequent_itemsets = apriori(transaction_list, min_support=min_support_threshold, 
                             min_confidence=min_confidence_threshold, 
                             min_lift=min_lift_threshold)

# Convert results to a list for readability
frequent_results = list(frequent_itemsets)
print(f"Total rules generated: {len(frequent_results)}")

Total rules generated: 10


In [32]:
# Step 7: Output Association Rules
print("\n=== Association Rules ===")
for rule in frequent_results:
    items = [x for x in rule.items]
    print(f"Rule: {items}")
    print(f"Support: {rule.support:.4f}")
    for ordered_stat in rule.ordered_statistics:
        print(f"Confidence: {ordered_stat.confidence:.4f}")
        print(f"Lift: {ordered_stat.lift:.4f}")
    print("-" * 30)


=== Association Rules ===
Rule: ['PACK OF 60 PINK PAISLEY CAKE CASES', '60 TEATIME FAIRY CAKE CASES']
Support: 0.0230
Confidence: 0.4434
Lift: 9.8917
Confidence: 0.5129
Lift: 9.8917
------------------------------
Rule: ['PACK OF 72 RETRO SPOT CAKE CASES', '60 TEATIME FAIRY CAKE CASES']
Support: 0.0233
Confidence: 0.4501
Lift: 8.2085
Confidence: 0.4257
Lift: 8.2085
------------------------------
Rule: ['HEART OF WICKER SMALL', 'HEART OF WICKER LARGE']
Support: 0.0213
Confidence: 0.5009
Lift: 12.4889
Confidence: 0.5318
Lift: 12.4889
------------------------------
Rule: ['HOME BUILDING BLOCK WORD', 'LOVE BUILDING BLOCK WORD']
Support: 0.0227
Confidence: 0.4377
Lift: 10.2707
Confidence: 0.5322
Lift: 10.2707
------------------------------
Rule: ['PACK OF 60 PINK PAISLEY CAKE CASES', 'PACK OF 72 RETRO SPOT CAKE CASES']
Support: 0.0206
Confidence: 0.4603
Lift: 8.3959
Confidence: 0.3763
Lift: 8.3959
------------------------------
Rule: ['WHITE HANGING HEART T-LIGHT HOLDER', 'RED HANGING HEART

In [33]:
# Step 8: Display Frequent Itemsets
results_df = pd.DataFrame([(tuple(result.items), result.support) 
                            for result in frequent_results], 
                           columns=['Itemset', 'Support'])

print("\nFrequent Itemsets:")
print(results_df)


Frequent Itemsets:
                                             Itemset   Support
0  (PACK OF 60 PINK PAISLEY CAKE CASES, 60 TEATIM...  0.022991
1  (PACK OF 72 RETRO SPOT CAKE CASES, 60 TEATIME ...  0.023338
2     (HEART OF WICKER SMALL, HEART OF WICKER LARGE)  0.021329
3  (HOME BUILDING BLOCK WORD, LOVE BUILDING BLOCK...  0.022682
4  (PACK OF 60 PINK PAISLEY CAKE CASES, PACK OF 7...  0.020634
5  (WHITE HANGING HEART T-LIGHT HOLDER, RED HANGI...  0.030139
6  (SWEETHEART CERAMIC TRINKET BOX, STRAWBERRY CE...  0.031607
7  (WHITE HANGING HEART T-LIGHT HOLDER, STRAWBERR...  0.021020
8  (WHITE HANGING HEART T-LIGHT HOLDER, WOODEN FR...  0.020672
9  (WOODEN PICTURE FRAME WHITE FINISH, WOODEN FRA...  0.023532
