In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

In [2]:
# 1. Generate Synthetic Data
# ----------------------------
# Build a small catalogue of items and draw random transactions from it.
items = ['A', 'B', 'C', 'D', 'E', 'F']
n_transactions = 100

# Seed the global `random` generator so the draws below are repeatable.
random.seed(42)

# Each transaction holds between 1 and len(items) distinct items: the size is
# drawn first, then that many items are sampled without replacement.
transactions = [
    random.sample(items, random.randint(1, len(items)))
    for _ in range(n_transactions)
]

# Show the first few transactions, followed by two blank lines.
print("Sample transactions:", transactions[:5], "\n", sep="\n")

Sample transactions:
[['A', 'F', 'C', 'E', 'D', 'B'], ['A', 'E', 'F', 'C', 'B', 'D'], ['A'], ['B', 'E'], ['A', 'E', 'B', 'C', 'D']]




In [3]:
# 2. Preprocessing
# ----------------------------
# Fit a TransactionEncoder on the transactions, then use it to turn them into
# a one-hot encoded array with one column per item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a DataFrame labelled with the encoder's item columns.
df = pd.DataFrame(data=te_ary, columns=te.columns_)

print("One-hot encoded DataFrame (first 5 rows):", df.head(), "\n", sep="\n")

One-hot encoded DataFrame (first 5 rows):
       A      B      C      D      E      F
0   True   True   True   True   True   True
1   True   True   True   True   True   True
2   True  False  False  False  False  False
3  False   True  False  False   True  False
4   True   True   True   True   True  False




In [4]:
# 3. Train-Test Split
# ----------------------------
# Hold out 30% of the encoded transactions for evaluation; the fixed
# random_state keeps the split the same on every run.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)
print(
    f"Training set size: {train_df.shape[0]} transactions",
    f"Test set size: {test_df.shape[0]} transactions\n",
    sep="\n",
)

Training set size: 70 transactions
Test set size: 30 transactions



In [5]:
# 4. Train FP-Growth Model
# ----------------------------
# Mine the itemsets whose support in the training transactions reaches the
# minimum support threshold (here 0.3, i.e. 30%).
min_support = 0.3
freq_itemsets = fpgrowth(
    train_df,
    min_support=min_support,
    use_colnames=True,  # report itemsets by item name rather than column index
)
print("Frequent itemsets from training data:", freq_itemsets, "\n", sep="\n")

Frequent itemsets from training data:
     support      itemsets
0   0.642857           (B)
1   0.614286           (A)
2   0.585714           (C)
3   0.571429           (D)
4   0.557143           (E)
5   0.485714           (F)
6   0.400000        (A, B)
7   0.442857        (C, B)
8   0.385714        (A, C)
9   0.314286     (A, C, B)
10  0.428571        (D, B)
11  0.385714        (C, D)
12  0.385714        (A, D)
13  0.328571     (C, D, B)
14  0.300000     (A, D, B)
15  0.457143        (C, E)
16  0.442857        (E, B)
17  0.400000        (A, E)
18  0.357143        (D, E)
19  0.385714     (C, E, B)
20  0.342857     (A, C, E)
21  0.328571     (A, E, B)
22  0.300000  (A, C, E, B)
23  0.314286     (D, E, B)
24  0.314286     (C, D, E)
25  0.300000     (A, D, E)
26  0.371429        (F, B)
27  0.357143        (D, F)
28  0.314286        (A, F)
29  0.300000        (E, F)
30  0.300000        (C, F)




In [6]:
# 5. Evaluate on Test Data
# ----------------------------
# Define a helper function to compute support on a given DataFrame.
def compute_support(dataframe, itemset):
    """Return the share of rows in ``dataframe`` that contain every item of ``itemset``.

    Args:
        dataframe: One-hot encoded transactions, one row per transaction and
            one column per item.
        itemset: Iterable of item labels; each label is used as a column
            selector on ``dataframe``.

    Returns:
        The fraction of rows in which all selected item columns are truthy.
    """
    item_columns = list(itemset)
    # A row counts only when every selected item column is truthy.
    contains_all = dataframe[item_columns].all(axis=1)
    # The mean of a boolean Series is the fraction of True values.
    return contains_all.mean()

# Score every mined itemset against the held-out transactions, then store the
# result next to the training support.
test_support = freq_itemsets['itemsets'].apply(
    lambda itemset: compute_support(test_df, itemset)
)
freq_itemsets['test_support'] = test_support

print("Frequent itemsets with training and test support:", freq_itemsets, "\n", sep="\n")

Frequent itemsets with training and test support:
     support      itemsets  test_support
0   0.642857           (B)      0.700000
1   0.614286           (A)      0.766667
2   0.585714           (C)      0.733333
3   0.571429           (D)      0.633333
4   0.557143           (E)      0.633333
5   0.485714           (F)      0.566667
6   0.400000        (A, B)      0.600000
7   0.442857        (C, B)      0.566667
8   0.385714        (A, C)      0.566667
9   0.314286     (A, C, B)      0.500000
10  0.428571        (D, B)      0.500000
11  0.385714        (C, D)      0.533333
12  0.385714        (A, D)      0.533333
13  0.328571     (C, D, B)      0.466667
14  0.300000     (A, D, B)      0.466667
15  0.457143        (C, E)      0.533333
16  0.442857        (E, B)      0.566667
17  0.400000        (A, E)      0.566667
18  0.357143        (D, E)      0.466667
19  0.385714     (C, E, B)      0.466667
20  0.342857     (A, C, E)      0.500000
21  0.328571     (A, E, B)      0.500000
22  0.3

In [7]:
# 6. Predict New Data
# ----------------------------
# Draw a handful of fresh random transactions from the same item list.
# NOTE: these draws continue the global `random` stream (this cell does not
# reseed), so the exact transactions depend on how many draws were made before
# this cell ran.
n_new_transactions = 5
new_transactions = [
    random.sample(items, random.randint(1, len(items)))
    for _ in range(n_new_transactions)
]

print("New transactions:", new_transactions, "\n", sep="\n")

New transactions:
[['E', 'B', 'C', 'A'], ['D', 'A', 'B', 'F', 'C', 'E'], ['C', 'E', 'F', 'D', 'A', 'B'], ['C', 'D', 'F', 'B', 'E'], ['B', 'F', 'D', 'C', 'E']]




In [8]:
# Encode the new transactions with the already-fitted encoder `te`, and label
# the resulting array with that encoder's item columns.
new_ary = te.transform(new_transactions)
new_df = pd.DataFrame(data=new_ary, columns=te.columns_)

In [9]:
# For each new transaction, find which frequent itemsets (from training) are contained in it.
def find_frequent_itemsets_in_transaction(transaction, freq_itemsets_df):
    """Return the frequent itemsets that are fully contained in a transaction.

    Args:
        transaction: Iterable of hashable item labels making up one
            transaction. It is consumed exactly once, so a one-shot iterator
            is handled correctly.
        freq_itemsets_df: Object whose ``'itemsets'`` entry iterates over item
            collections (for example the ``itemsets`` column of the frequent
            itemsets DataFrame).

    Returns:
        A list of the itemsets whose items all occur in ``transaction``, in
        the order they appear in ``freq_itemsets_df['itemsets']``.
    """
    # Build the lookup set once instead of re-scanning the transaction for
    # every item of every itemset.
    transaction_items = set(transaction)
    return [
        itemset
        for itemset in freq_itemsets_df['itemsets']
        if transaction_items.issuperset(itemset)
    ]

predictions = []
for row_mask in new_df.to_numpy():
    # Each row is a boolean mask over the item columns; indexing the column
    # labels with it recovers the items present in that transaction.
    transaction_items = list(new_df.columns[row_mask])
    matching_itemsets = find_frequent_itemsets_in_transaction(transaction_items, freq_itemsets)
    record = {
        'transaction': transaction_items,
        'matching_frequent_itemsets': matching_itemsets
    }
    predictions.append(record)

# Print one prediction record per line.
print("Predictions on new transactions:")
for prediction in predictions:
    print(prediction)

Predictions on new transactions:
{'transaction': ['A', 'B', 'C', 'E'], 'matching_frequent_itemsets': [frozenset({'B'}), frozenset({'A'}), frozenset({'C'}), frozenset({'E'}), frozenset({'A', 'B'}), frozenset({'C', 'B'}), frozenset({'A', 'C'}), frozenset({'A', 'C', 'B'}), frozenset({'C', 'E'}), frozenset({'E', 'B'}), frozenset({'A', 'E'}), frozenset({'C', 'E', 'B'}), frozenset({'A', 'C', 'E'}), frozenset({'A', 'E', 'B'}), frozenset({'A', 'C', 'E', 'B'})]}
{'transaction': ['A', 'B', 'C', 'D', 'E', 'F'], 'matching_frequent_itemsets': [frozenset({'B'}), frozenset({'A'}), frozenset({'C'}), frozenset({'D'}), frozenset({'E'}), frozenset({'F'}), frozenset({'A', 'B'}), frozenset({'C', 'B'}), frozenset({'A', 'C'}), frozenset({'A', 'C', 'B'}), frozenset({'D', 'B'}), frozenset({'C', 'D'}), frozenset({'A', 'D'}), frozenset({'C', 'D', 'B'}), frozenset({'A', 'D', 'B'}), frozenset({'C', 'E'}), frozenset({'E', 'B'}), frozenset({'A', 'E'}), frozenset({'D', 'E'}), frozenset({'C', 'E', 'B'}), frozenset({'A