In [1]:
# [Student: Hans] 1. Import required libraries and simulate supermarket transactions
import pandas as pd                       # For working with DataFrames (tables)
import random                             # For generating random selections
from mlxtend.frequent_patterns import apriori, association_rules  # For frequent itemset mining
from mlxtend.preprocessing import TransactionEncoder              # For converting transactions to binary matrix
from IPython.display import display        # For better DataFrame display in notebooks

# Catalogue of 30 distinct supermarket products.
# The order matters: random.sample draws by position, so reordering would change the simulated data.
item_pool = [
    "Milk", "Bread", "Eggs", "Butter", "Cheese",
    "Juice", "Soda", "Chips", "Chocolate", "Cookies",
    "Cereal", "Yogurt", "Coffee", "Tea", "Sugar",
    "Salt", "Rice", "Pasta", "Tomatoes", "Onions",
    "Apples", "Bananas", "Oranges", "Chicken", "Beef",
    "Fish", "Toilet Paper", "Soap", "Shampoo", "Detergent",
]

# Simulation settings
SEED = 42              # fixed seed so every run produces the same baskets
N_TRANSACTIONS = 3000  # number of baskets to generate
MIN_BASKET_SIZE = 2    # smallest basket (inclusive)
MAX_BASKET_SIZE = 7    # largest basket (inclusive)

# Build the baskets: for each one, draw a size between 2 and 7, then pick that many
# distinct items from the pool (the size is drawn before the items, once per basket).
random.seed(SEED)
transactions = [
    random.sample(item_pool, random.randint(MIN_BASKET_SIZE, MAX_BASKET_SIZE))
    for _ in range(N_TRANSACTIONS)
]



In [2]:
# [Student: Vivian] Convert list of transactions to DataFrame using one-hot encoding
# Encode the baskets as a binary array (one row per transaction) and wrap it in a
# DataFrame whose column labels are the encoder's `columns_`.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# For transparency, also keep the raw baskets as text: one comma-separated string per row
basket_strings = [', '.join(basket) for basket in transactions]
df_raw = pd.DataFrame({'transaction': basket_strings})
df_raw.to_csv('supermarket_transactions.csv', index=False)

# Preview the first few baskets in their readable (un-encoded) form
print("Sample Transactions:")
print(df_raw.head())

Sample Transactions:
                                         transaction
0  Butter, Milk, Chicken, Chocolate, Chips, Fish,...
1  Butter, Bananas, Chicken, Pasta, Eggs, Tomatoe...
2                                         Milk, Eggs
3                                Chips, Rice, Onions
4                                        Pasta, Soda


In [3]:
# [Student: Rosamistica] Generate frequent itemsets with min_support = 0.05
# Run Apriori on the encoded DataFrame with min_support=0.05, record each itemset's
# size in a 'length' column, and order the result from highest to lowest support.
frequent_itemsets = (
    apriori(df, min_support=0.05, use_colnames=True)
    .assign(length=lambda found: found['itemsets'].apply(len))
    .sort_values(by='support', ascending=False)
)

# Persist the full table of frequent itemsets
frequent_itemsets.to_csv('frequent_itemsets.csv', index=False)

# Show the ten itemsets with the highest support
print("Top 10 Frequent Itemsets:")
top_itemsets = frequent_itemsets.head(10)
print(top_itemsets)

Top 10 Frequent Itemsets:
     support   itemsets  length
1   0.170000  (Bananas)       1
4   0.162333   (Butter)       1
21  0.161000     (Salt)       1
0   0.159333   (Apples)       1
6   0.159000   (Cheese)       1
18  0.156667  (Oranges)       1
26  0.154333      (Tea)       1
7   0.154000  (Chicken)       1
25  0.154000    (Sugar)       1
10  0.152667   (Coffee)       1


In [4]:
# [Student: Faith] Identify closed frequent itemsets
# Closed itemset: No superset has the same support value
def is_closed(itemset, support, itemsets_df=None):
    """Return True if `itemset` is closed within a table of frequent itemsets.

    An itemset is closed when no proper superset in the table has exactly the
    same support value.

    Parameters
    ----------
    itemset : frozenset
        The itemset to test (compared with `<`, i.e. proper-subset).
    support : float
        The support value of `itemset`.
    itemsets_df : pandas.DataFrame, optional
        Table with 'itemsets' and 'support' columns to search. When omitted,
        the notebook-level `frequent_itemsets` table is used, so existing
        two-argument calls behave as before.

    Returns
    -------
    bool
        False if some row holds a proper superset with equal support,
        True otherwise (including when the table is empty).
    """
    table = frequent_itemsets if itemsets_df is None else itemsets_df
    # Walk the two columns directly; iterrows() would build a full Series for
    # every row on every call, which is needlessly slow when this is applied
    # to each row of the same table.
    # Support is compared with exact equality, as in the original check.
    return not any(
        itemset < candidate and support == candidate_support
        for candidate, candidate_support in zip(table['itemsets'], table['support'])
    )

# Evaluate the closed-itemset test once per row and store the verdict alongside the itemset
closed_mask = frequent_itemsets.apply(
    lambda rec: is_closed(rec['itemsets'], rec['support']),
    axis=1,
)
frequent_itemsets['is_closed'] = closed_mask

# Keep only the rows flagged as closed
closed_itemsets = frequent_itemsets[frequent_itemsets['is_closed']]

# Write the closed itemsets out for later inspection
closed_itemsets.to_csv('closed_itemsets.csv', index=False)

# Preview the first few closed itemsets
print("\nClosed Frequent Itemsets:")
closed_preview = closed_itemsets.head()
print(closed_preview)


Closed Frequent Itemsets:
     support   itemsets  length  is_closed
1   0.170000  (Bananas)       1       True
4   0.162333   (Butter)       1       True
21  0.161000     (Salt)       1       True
0   0.159333   (Apples)       1       True
6   0.159000   (Cheese)       1       True


In [5]:
# [Student: Innocent] Identify maximal frequent itemsets
# Maximal itemset: No superset is frequent
def is_maximal(itemset, itemsets_df=None):
    """Return True if `itemset` is maximal within a table of frequent itemsets.

    An itemset is maximal when the table contains no proper superset of it.

    Parameters
    ----------
    itemset : frozenset
        The itemset to test (compared with `<`, i.e. proper-subset).
    itemsets_df : pandas.DataFrame, optional
        Table with an 'itemsets' column to search. When omitted, the
        notebook-level `frequent_itemsets` table is used, so existing
        one-argument calls (e.g. via `Series.apply`) behave as before.

    Returns
    -------
    bool
        False if any row holds a proper superset of `itemset`,
        True otherwise (including when the table is empty).
    """
    table = frequent_itemsets if itemsets_df is None else itemsets_df
    # Only the 'itemsets' column is needed, so scan it directly instead of
    # materialising every row with iterrows().
    return not any(itemset < candidate for candidate in table['itemsets'])

# Evaluate the maximal-itemset test for every frequent itemset and store the verdict
maximal_mask = frequent_itemsets['itemsets'].apply(is_maximal)
frequent_itemsets['is_maximal'] = maximal_mask

# Keep only the rows flagged as maximal
maximal_itemsets = frequent_itemsets[frequent_itemsets['is_maximal']]

# Write the maximal itemsets out for later inspection
maximal_itemsets.to_csv('maximal_itemsets.csv', index=False)

# Preview the first ten maximal itemsets
print("\nMaximal Frequent Itemsets:")
maximal_preview = maximal_itemsets.head(10)
print(maximal_preview)


Maximal Frequent Itemsets:
     support   itemsets  length  is_closed  is_maximal
1   0.170000  (Bananas)       1       True        True
4   0.162333   (Butter)       1       True        True
21  0.161000     (Salt)       1       True        True
0   0.159333   (Apples)       1       True        True
6   0.159000   (Cheese)       1       True        True
18  0.156667  (Oranges)       1       True        True
26  0.154333      (Tea)       1       True        True
7   0.154000  (Chicken)       1       True        True
25  0.154000    (Sugar)       1       True        True
10  0.152667   (Coffee)       1       True        True
