In [85]:
import random

# Define the database sizes
D1K_SIZE = 1000
D10K_SIZE = 10000
D50K_SIZE = 50000
D100K_SIZE = 100000

In [86]:

# Define the database file names
D1K_FILE = "D1K.txt"
D10K_FILE = "D10K.txt"
D50K_FILE = "D50K.txt"
D100K_FILE = "D100K.txt"


In [87]:

# Define the function to generate a random transaction
def gen_transaction(num):
    transaction = []
    for i in range(num):
        # Generate a random item index between 0 and 99
        item_index = random.randint(0, 99)
        transaction.append(item_index)
    return transaction


In [88]:

# Define the function to generate a transactional database
def gen_database(file_name, db_size):
    with open(file_name, 'w') as db_file:
        for i in range(db_size):
            # Generate a random transaction size between 5 and 15
            transaction_size = random.randint(5, 15)
            # Generate a random transaction and write it to the file
            transaction = gen_transaction(transaction_size)
            db_file.write(" ".join([f"i{item}" for item in transaction]) + "\n")


In [89]:

# Generate the four transactional databases
gen_database(D1K_FILE, D1K_SIZE)
gen_database(D10K_FILE, D10K_SIZE)
gen_database(D50K_FILE, D50K_SIZE)
gen_database(D100K_FILE, D100K_SIZE)



In [90]:

from collections import defaultdict
from itertools import combinations

# Define a function to read a transactional database from a file
def read_database(file_name):
    with open(file_name, 'r') as db_file:
        transactions = [set(line.strip().split()) for line in db_file]
    return transactions

In [91]:
# Define the Apriori algorithm function
def apriori(db, min_support):
    # Step 1: Find frequent 1-itemsets
    item_counts = defaultdict(int)
    for transaction in db:
        for item in transaction:
            item_counts[item] += 1
    num_transactions = len(db)
    freq_k_itemsets = {frozenset([item]): count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}
    frequent_itemsets = freq_k_itemsets
    # Step 2: Generate k-itemsets and find frequent ones until no more frequent itemsets are found
    k = 1
    while len(freq_k_itemsets) > 0:
        k += 1
        # Generate candidate k-itemsets from frequent (k-1)-itemsets
        candidate_itemsets = set([itemset1.union(itemset2) for itemset1 in frequent_itemsets for itemset2 in frequent_itemsets if len(itemset1.union(itemset2)) == k])
        # Count the support of each candidate itemset
        item_counts = defaultdict(int)
        for transaction in db:
            for candidate_itemset in candidate_itemsets:
                if candidate_itemset.issubset(transaction):
                    item_counts[candidate_itemset] += 1
        # Find frequent k-itemsets
        freq_k_itemsets = {itemset: count / num_transactions for itemset, count in item_counts.items() if count / num_transactions >= min_support}
        frequent_itemsets.update(freq_k_itemsets)
    return frequent_itemsets, k
# db = read_database('D1K.txt')
# frequent_itemsets = apriori(db, 0.01)

In [92]:
# Define a function to write frequent itemsets to a file
def write_frequent_itemsets(file_name, frequent_itemsets):
    with open(file_name, 'w') as freq_file:
        for itemset, support in frequent_itemsets.items():
            freq_file.write("{" + ", ".join([f"i{item}" for item in itemset]) + "} " + f"{support:.2%}\n")


In [93]:

# Define the main function to run the Apriori algorithm on a database and save frequent itemsets to a file
# def main(db_file_name, min_support):
#     print("1", db_file_name)
#     db = read_database(db_file_name)
#     frequent_itemsets, k = apriori(db, min_support)
#     print(f'k = {k-1}')
#     freq_file_name = db_file_name.replace(".txt", f"_Apriori_{int(min_support*100)}.freq")
#     write_frequent_itemsets(freq_file_name, frequent_itemsets)
#     print(f"Number of scans: {len(frequent_itemsets)}\n")


# Define the main function to run the Apriori algorithm on a database and save frequent itemsets to a file
def main(db_file_name, min_support, apriori_fn=apriori):
    print("1", db_file_name)
    db = read_database(db_file_name)
    frequent_itemsets, k = apriori_fn(db, min_support)
    print(f'k = {k-1}')
    freq_file_name = db_file_name.replace(".txt", f"_AprioriAlgo_{apriori_fn.__name__}_{int(min_support*100)}.freq")
    write_frequent_itemsets(freq_file_name, frequent_itemsets)
    print(f"Number of itemsets found: {len(frequent_itemsets)}\n")


In [110]:
# Test the program using the first two databases with min_support = 1%
main("D1K.txt", 0.01)
main("D10K.txt", 0.01)
main("D50K.txt", 0.01)
main("D100K.txt", 0.01)




1 D1K.txt
k = 2
Number of itemsets found: 2136

1 D10K.txt
k = 2
Number of itemsets found: 885

1 D50K.txt
k = 2
Number of itemsets found: 151

1 D100K.txt
k = 2
Number of itemsets found: 103



In [111]:
# Test the Apriori algorithm using the four databases with min_support = 5%
main("D1K.txt", 0.05)
main("D10K.txt", 0.05)
main("D50K.txt", 0.05)
main("D100K.txt", 0.05)


1 D1K.txt
k = 1
Number of itemsets found: 100

1 D10K.txt
k = 1
Number of itemsets found: 100

1 D50K.txt
k = 1
Number of itemsets found: 100

1 D100K.txt
k = 1
Number of itemsets found: 100



In [112]:
# Test the Apriori algorithm using the four databases with min_support = 8%
main("D1K.txt", 0.08)
main("D10K.txt", 0.08)
main("D50K.txt", 0.08)
main("D100K.txt", 0.08)

1 D1K.txt
k = 1
Number of itemsets found: 96

1 D10K.txt
k = 1
Number of itemsets found: 100

1 D50K.txt
k = 1
Number of itemsets found: 100

1 D100K.txt
k = 1
Number of itemsets found: 100



In [113]:
# Test the Apriori algorithm using the four databases with min_support = 10%
main("D1K.txt", 0.1)
main("D10K.txt", 0.1)
main("D50K.txt", 0.1)
main("D100K.txt", 0.1)

1 D1K.txt
k = 1
Number of itemsets found: 34

1 D10K.txt
k = 1
Number of itemsets found: 2

1 D50K.txt
k = 0
Number of itemsets found: 0

1 D100K.txt
k = 0
Number of itemsets found: 0



In [116]:
# Test the Apriori algorithm using the four databases with min_support = 15%
main("D1K.txt", 0.15)
main("D10K.txt", 0.15)
main("D50K.txt", 0.15)
main("D100K.txt", 0.15)

1 D1K.txt
k = 0
Number of itemsets found: 0

1 D10K.txt
k = 0
Number of itemsets found: 0

1 D50K.txt
k = 0
Number of itemsets found: 0

1 D100K.txt
k = 0
Number of itemsets found: 0



In [106]:
# main("D50K.txt", 0.097)


1 D50K.txt
k = 1
Number of itemsets found: 6



In [98]:
import os

# Get the current working directory
cwd = os.getcwd()
print("Current working directory:", cwd)

# Construct the output file path
output_file_path = f"D1K.txt_Apriori_1.freq"
output_file_path = os.path.join(cwd, output_file_path)
print("Output file path:", output_file_path)


Current working directory: c:\Users\vps12\Documents\Assignments\4_Spring23\4310\Project\git\Apriori
Output file path: c:\Users\vps12\Documents\Assignments\4_Spring23\4310\Project\git\Apriori\D1K.txt_Apriori_1.freq


# Idea1

In [99]:
# Define the Apriori algorithm function
def apriori_idea1(db, min_support):
    # Step 1: Find frequent 1-itemsets
    item_counts = defaultdict(int)
    for transaction in db:
        for item in transaction:
            item_counts[item] += 1
    num_transactions = len(db)
    freq_k_itemsets_all = {frozenset([item]): count / num_transactions for item, count in item_counts.items()}
    freq_k_itemsets = {frozenset([item]): count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}
    db_all_freq1 = []  # db with all transactions with freq itemset
    for transaction in db:
        satisfy_supp = False
        for it in transaction:
            if freq_k_itemsets_all[frozenset({it})] >= min_support:
                satisfy_supp = True
        if satisfy_supp:
            db_all_freq1.append(transaction)
            
    
    ########
    print(len(db), len(db_all_freq1))
    item_counts = defaultdict(int)
    for transaction in db_all_freq1:
        for item in transaction:
            item_counts[item] += 1
    num_transactions = len(db_all_freq1)
    freq_k_itemsets = {frozenset([item]): count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}
    ########

    frequent_itemsets = freq_k_itemsets
    # Step 2: Generate k-itemsets and find frequent ones until no more frequent itemsets are found
    k = 1
    while len(freq_k_itemsets) > 0:
        k += 1

        # Generate candidate k-itemsets from frequent (k-1)-itemsets
        candidate_itemsets = set([itemset1.union(itemset2) for itemset1 in frequent_itemsets for itemset2 in frequent_itemsets if len(itemset1.union(itemset2)) == k])
        # Count the support of each candidate itemset
        item_counts = defaultdict(int)
        for transaction in db_all_freq1:
            for candidate_itemset in candidate_itemsets:
                if candidate_itemset.issubset(transaction):
                    item_counts[candidate_itemset] += 1
        # Find frequent k-itemsets
        freq_k_itemsets = {itemset: count / num_transactions for itemset, count in item_counts.items() if count / num_transactions >= min_support}
        frequent_itemsets.update(freq_k_itemsets)

    return frequent_itemsets, k

In [100]:
# Test the Idea 1 using the four databases with min_support = 1%
a1 = main('D1K.txt', 0.01, apriori_fn=apriori_idea1)
b1 = main('D10K.txt', 0.01, apriori_fn=apriori_idea1)
c1 = main('D50K.txt', 0.01, apriori_fn=apriori_idea1)
d1 = main('D100K.txt', 0.01, apriori_fn=apriori_idea1)

a1
b1
c1
d1

1 D1K.txt
1000 981
k = 1
Number of itemsets found: 36



In [None]:
# Test the Idea 1 using the four databases with min_support = 5%
a5 = main('D1K.txt', 0.05, apriori_fn=apriori_idea1)
b5 = main('D10K.txt', 0.05, apriori_fn=apriori_idea1)
c5 = main('D50K.txt', 0.05, apriori_fn=apriori_idea1)
d5 = main('D100K.txt', 0.05, apriori_fn=apriori_idea1)

a5
b5
c5
d5

In [None]:
# Test the Idea 1 using the four databases with min_support = 8%
a8 = main('D1K.txt', 0.08, apriori_fn=apriori_idea1)
b8 = main('D10K.txt', 0.08, apriori_fn=apriori_idea1)
c8 = main('D50K.txt', 0.08, apriori_fn=apriori_idea1)
d8 = main('D100K.txt', 0.08, apriori_fn=apriori_idea1)

a8
b8
c8
d8

In [None]:
# Test the Idea 1 using the four databases with min_support = 10%
a10 = main('D1K.txt', 0.10, apriori_fn=apriori_idea1)
b10 = main('D10K.txt', 0.10, apriori_fn=apriori_idea1)
c10 = main('D50K.txt', 0.10, apriori_fn=apriori_idea1)
d10 = main('D100K.txt', 0.10, apriori_fn=apriori_idea1)

a10
b10
c10
d10

In [None]:
# Test the Idea 1 using the four databases with min_support = 15%
a15 = main('D1K.txt', 0.15, apriori_fn=apriori_idea1)
b15 = main('D10K.txt', 0.15, apriori_fn=apriori_idea1)
c15 = main('D50K.txt', 0.15, apriori_fn=apriori_idea1)
d15 = main('D100K.txt', 0.15, apriori_fn=apriori_idea1)

a15
b15
c15
d15

In [101]:
# z = main("D10K.txt", 0.1, apriori_fn=apriori_idea1)
# z

1 D10K.txt
10000 1930
k = 1
Number of itemsets found: 27



In [102]:
# a= main("D50K.txt", 0.097, apriori_fn=apriori_idea1)
# a

1 D50K.txt
50000 22978
k = 1
Number of itemsets found: 19



In [None]:
# b= main("D100K.txt", 0.05, apriori_fn=apriori_idea1)
# b