In [254]:
import random

# Define the database sizes
D1K_SIZE = 1000
D10K_SIZE = 10000
D50K_SIZE = 50000
D100K_SIZE = 100000

In [255]:

# Define the database file names
D1K_FILE = "D1K.txt"
D10K_FILE = "D10K.txt"
D50K_FILE = "D50K.txt"
D100K_FILE = "D100K.txt"


In [256]:

# Define the function to generate a random transaction
def gen_transaction(num):
    transaction = []
    for i in range(num):
        # Generate a random item index between 0 and 99
        item_index = random.randint(0, 99)
        transaction.append(item_index)
    return transaction


In [257]:

# Define the function to generate a transactional database
def gen_database(file_name, db_size):
    with open(file_name, 'w') as db_file:
        for i in range(db_size):
            # Generate a random transaction size between 5 and 15
            transaction_size = random.randint(5, 15)
            # Generate a random transaction and write it to the file
            transaction = gen_transaction(transaction_size)
            db_file.write(" ".join([f"i{item}" for item in transaction]) + "\n")


In [258]:

# Generate the four transactional databases
gen_database(D1K_FILE, D1K_SIZE)
gen_database(D10K_FILE, D10K_SIZE)
gen_database(D50K_FILE, D50K_SIZE)
gen_database(D100K_FILE, D100K_SIZE)



In [259]:

from collections import defaultdict
from itertools import combinations

# Define a function to read a transactional database from a file
def read_database(file_name):
    with open(file_name, 'r') as db_file:
        transactions = [set(line.strip().split()) for line in db_file]
    return transactions

In [260]:
# Define the Apriori algorithm function
def apriori(db, min_support):
    # Step 1: Find frequent 1-itemsets
    item_counts = defaultdict(int)
    for transaction in db:
        for item in transaction:
            item_counts[item] += 1
    num_transactions = len(db)
    freq_k_itemsets = {frozenset([item]): count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}
    frequent_itemsets = freq_k_itemsets
    # Step 2: Generate k-itemsets and find frequent ones until no more frequent itemsets are found
    k = 1
    while len(freq_k_itemsets) > 0:
        k += 1
        # Generate candidate k-itemsets from frequent (k-1)-itemsets
        candidate_itemsets = set([itemset1.union(itemset2) for itemset1 in frequent_itemsets for itemset2 in frequent_itemsets if len(itemset1.union(itemset2)) == k])
        # Count the support of each candidate itemset
        item_counts = defaultdict(int)
        for transaction in db:
            for candidate_itemset in candidate_itemsets:
                if candidate_itemset.issubset(transaction):
                    item_counts[candidate_itemset] += 1
        # Find frequent k-itemsets
        freq_k_itemsets = {itemset: count / num_transactions for itemset, count in item_counts.items() if count / num_transactions >= min_support}
        frequent_itemsets.update(freq_k_itemsets)
    return frequent_itemsets, k
# db = read_database('D1K.txt')
# frequent_itemsets = apriori(db, 0.01)

In [261]:
# Define a function to write frequent itemsets to a file
def write_frequent_itemsets(file_name, frequent_itemsets):
    with open(file_name, 'w') as freq_file:
        for itemset, support in frequent_itemsets.items():
            freq_file.write("{" + ", ".join([f"i{item}" for item in itemset]) + "} " + f"{support:.2%}\n")


In [307]:
# Define the main function to run the Apriori algorithm on a database and save frequent itemsets to a file
def main(db_file_name, min_support, apriori_fn=apriori):
    print("1", db_file_name)
    db = read_database(db_file_name)
    frequent_itemsets, k = apriori_fn(db, min_support)
    print(f'k = {k-1}')
    freq_file_name = db_file_name.replace(".txt", f"_AprioriAlgo_{apriori_fn.__name__}_{int(min_support*10000)}.freq")
    write_frequent_itemsets(freq_file_name, frequent_itemsets)
    print(f"Number of itemsets found: {len(frequent_itemsets)}\n")


---

### min_support = 8%

In [310]:
main("D1K.txt", 0.08)

1 D1K.txt
k = 1
Number of itemsets found: 98



In [311]:
main("D10K.txt", 0.08)

1 D10K.txt
k = 1
Number of itemsets found: 100



In [312]:
main("D50K.txt", 0.08)

1 D50K.txt
k = 1
Number of itemsets found: 100



In [313]:
main("D100K.txt", 0.08)

1 D100K.txt
k = 1
Number of itemsets found: 100



---

### min_support = 9.5%

In [314]:
main("D1K.txt", 0.095)

1 D1K.txt
k = 1
Number of itemsets found: 55



In [315]:
main("D10K.txt", 0.095)

1 D10K.txt
k = 1
Number of itemsets found: 47



In [316]:
main("D50K.txt", 0.095)

1 D50K.txt
k = 1
Number of itemsets found: 58



In [317]:
main("D100K.txt", 0.095)

1 D100K.txt
k = 1
Number of itemsets found: 62



---

### min_support = 9.6%

In [318]:
main("D1K.txt", 0.096)

1 D1K.txt
k = 1
Number of itemsets found: 50



In [319]:
main("D10K.txt", 0.096)

1 D10K.txt
k = 1
Number of itemsets found: 35



In [320]:
main("D50K.txt", 0.096)

1 D50K.txt
k = 1
Number of itemsets found: 27



In [321]:
main("D100K.txt", 0.096)

1 D100K.txt
k = 1
Number of itemsets found: 15



---

### min_support = 9.7%

In [322]:
main("D1K.txt", 0.097)

1 D1K.txt
k = 1
Number of itemsets found: 48



In [323]:
main("D10K.txt", 0.097)

1 D10K.txt
k = 1
Number of itemsets found: 22



In [324]:
main("D50K.txt", 0.097)

1 D50K.txt
k = 1
Number of itemsets found: 4



In [325]:
main("D100K.txt", 0.097)

1 D100K.txt
k = 1
Number of itemsets found: 2



---

### min_support = 9.75%

In [326]:
main("D1K.txt", 0.0975)

1 D1K.txt
k = 1
Number of itemsets found: 41



In [327]:
main("D10K.txt", 0.0975)

1 D10K.txt
k = 1
Number of itemsets found: 15



In [328]:
main("D50K.txt", 0.0975)

1 D50K.txt
k = 1
Number of itemsets found: 3



In [329]:
main("D100K.txt", 0.0975)

1 D100K.txt
k = 1
Number of itemsets found: 1



---
---

# Idea1

In [330]:
# Define the Apriori algorithm function
def apriori_idea1(db, min_support):
    # Step 1: Find frequent 1-itemsets
    item_counts = defaultdict(int)
    for transaction in db:
        for item in transaction:
            item_counts[item] += 1
    num_transactions = len(db)
    freq_k_itemsets_all = {frozenset([item]): count / num_transactions for item, count in item_counts.items()}
    freq_k_itemsets = {frozenset([item]): count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}
    db_all_freq1 = []  # db with all transactions with freq itemset
    for transaction in db:
        satisfy_supp = False
        for it in transaction:
            if freq_k_itemsets_all[frozenset({it})] >= min_support:
                satisfy_supp = True
        if satisfy_supp:
            db_all_freq1.append(transaction)
            
    
    ########
    print(len(db), len(db_all_freq1))
    item_counts = defaultdict(int)
    for transaction in db_all_freq1:
        for item in transaction:
            item_counts[item] += 1
    num_transactions = len(db_all_freq1)
    freq_k_itemsets = {frozenset([item]): count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}
    ########

    frequent_itemsets = freq_k_itemsets
    # Step 2: Generate k-itemsets and find frequent ones until no more frequent itemsets are found
    k = 1
    while len(freq_k_itemsets) > 0:
        k += 1

        # Generate candidate k-itemsets from frequent (k-1)-itemsets
        candidate_itemsets = set([itemset1.union(itemset2) for itemset1 in frequent_itemsets for itemset2 in frequent_itemsets if len(itemset1.union(itemset2)) == k])
        # Count the support of each candidate itemset
        item_counts = defaultdict(int)
        for transaction in db_all_freq1:
            for candidate_itemset in candidate_itemsets:
                if candidate_itemset.issubset(transaction):
                    item_counts[candidate_itemset] += 1
        # Find frequent k-itemsets
        freq_k_itemsets = {itemset: count / num_transactions for itemset, count in item_counts.items() if count / num_transactions >= min_support}
        frequent_itemsets.update(freq_k_itemsets)

    return frequent_itemsets, k

---

### min_support = 8%

In [331]:
a1 = main('D1K.txt', 0.08, apriori_fn=apriori_idea1)
a1


1 D1K.txt
1000 1000
k = 1
Number of itemsets found: 98



In [332]:
b1 = main('D10K.txt', 0.08, apriori_fn=apriori_idea1)
b1

1 D10K.txt
10000 10000
k = 1
Number of itemsets found: 100



In [333]:
c1 = main('D50K.txt', 0.08, apriori_fn=apriori_idea1)
c1

1 D50K.txt
50000 50000
k = 1
Number of itemsets found: 100



In [334]:
d1 = main('D100K.txt', 0.08, apriori_fn=apriori_idea1)
d1

1 D100K.txt
100000 100000
k = 1
Number of itemsets found: 100



---

### min_support = 9.5%

In [335]:
a15 = main('D1K.txt', 0.095, apriori_fn=apriori_idea1)
a15

1 D1K.txt
1000 1000
k = 1
Number of itemsets found: 55



In [336]:
b15 = main('D10K.txt', 0.095, apriori_fn=apriori_idea1)
b15

1 D10K.txt
10000 9920
k = 1
Number of itemsets found: 48



In [337]:
c15 = main('D50K.txt', 0.095, apriori_fn=apriori_idea1)
c15

1 D50K.txt
50000 49898
k = 1
Number of itemsets found: 58



In [338]:
d15 = main('D100K.txt', 0.095, apriori_fn=apriori_idea1)
d15

1 D100K.txt
100000 99898
k = 1
Number of itemsets found: 62



### min_support = 9.6%

In [339]:
a5 = main('D1K.txt', 0.096, apriori_fn=apriori_idea1)
a5


1 D1K.txt
1000 999
k = 1
Number of itemsets found: 50



In [340]:
b5 = main('D10K.txt', 0.096, apriori_fn=apriori_idea1)
b5

1 D10K.txt
10000 9727
k = 1
Number of itemsets found: 38



In [351]:
c5 = main('D50K.txt', 0.096, apriori_fn=apriori_idea1)
c5

1 D50K.txt
50000 46801
k = 1
Number of itemsets found: 32



In [352]:
d5 = main('D100K.txt', 0.096, apriori_fn=apriori_idea1)
d5

1 D100K.txt
100000 78096
k = 1
Number of itemsets found: 27



---

### min_support = 9.7%

In [343]:
a8 = main('D1K.txt', 0.097, apriori_fn=apriori_idea1)
a8

1 D1K.txt
1000 999
k = 1
Number of itemsets found: 48



In [344]:
b8 = main('D10K.txt', 0.097, apriori_fn=apriori_idea1)
b8


1 D10K.txt
10000 9029
k = 1
Number of itemsets found: 28



In [345]:
c8 = main('D50K.txt', 0.097, apriori_fn=apriori_idea1)
c8



1 D50K.txt
50000 16875
k = 1
Number of itemsets found: 21



In [346]:
d8 = main('D100K.txt', 0.097, apriori_fn=apriori_idea1)
d8

1 D100K.txt
100000 18509
k = 1
Number of itemsets found: 21



---

### min_support = 9.75%

In [347]:
a10 = main('D1K.txt', 0.0975, apriori_fn=apriori_idea1)
a10


1 D1K.txt
1000 994
k = 1
Number of itemsets found: 42



In [348]:
b10 = main('D10K.txt', 0.0975, apriori_fn=apriori_idea1)
b10


1 D10K.txt
10000 7949
k = 1
Number of itemsets found: 23



In [349]:
c10 = main('D50K.txt', 0.0975, apriori_fn=apriori_idea1)
c10



1 D50K.txt
50000 13294
k = 1
Number of itemsets found: 21



In [350]:
d10 = main('D100K.txt', 0.0975, apriori_fn=apriori_idea1)
d10

1 D100K.txt
100000 9777
k = 2
Number of itemsets found: 47



---

# fin