# Read the data

In [1]:
data_file = "data/mushroom/data_2_10.dat"

sup = 0.2

# read the data
data = sc.textFile(data_file)

# configurations
SUPPORT = int(sup * data.count())

# Optimize by eliminating infrequent items in the itemsets

In [2]:
import time

# compute frequent-1 itemset

# Create the rdd which stores all the transactions
transactions = data.map(lambda line: line.strip().split())

# Merge all the transactions together
items=transactions.flatMap(lambda x:x)

# calculate the time
start_time_1 = time.time()

# count the frequency of all the items in the merged rdd list
item_counts=items.map(lambda x:(x, 1)).reduceByKey(lambda x,y:x+y)

# get the frequent-1 itemset
L1=item_counts.filter(lambda x:x[1]>=SUPPORT).map(lambda x:x[0])

# print(L1.count())
# transactions.flatMap(lambda x:x).count()

freq=sc.broadcast(set(L1.collect()))

end_time_1 = time.time()

elapsed_time_1 = end_time_1 - start_time_1

print("Time for computing frequent-1 itemset: ", elapsed_time_1)


Time for computing frequent-1 itemset:  0.25696778297424316


In [3]:


# eliminate all the infrequent items
def purge_itemset(itemsets):
    # Only keep frequent items in 
    return [item for item in itemsets if item in freq.value]
transactions=transactions.map(purge_itemset)
transactions.flatMap(lambda x:x).count()

15707

In [4]:
# Big data algorithm, with pruning
def join_and_pruning(L_k):
    if not L_k:
        return sc.parallelize([])  # Return an empty RDD if L_k is empty

    k = len(L_k[0])
    threshold = k * (k + 1) // 2  # Calculate the threshold for pruning

    # Parallelize the list and use flatMap to generate (k+1)-itemsets
    rdd = sc.parallelize(L_k)
    candidate_rdd = rdd.flatMap(lambda x: [
        (frozenset(x).union(frozenset(y)), 1) 
        for y in L_k if len(frozenset(x).union(frozenset(y))) == k + 1
    ])

    # Reduce by key to count occurrences of each (k+1)-itemset
    candidate_count_rdd = candidate_rdd.reduceByKey(lambda a, b: a + b)

    # Filter candidates based on the threshold
    filtered_candidates_rdd = candidate_count_rdd.filter(lambda x: x[1] >= threshold)

    # Convert each frozenset back to list
    final_candidates_rdd = filtered_candidates_rdd.map(lambda x: list(x[0]))

    return final_candidates_rdd.collect()

# Example usage
# L_k = [['108', '200'], ['108', '300'], ['200', '300'], ['200', '400']]
# rdd_result = join_and_pruning(L_k)
# print(rdd_result)


# Generate $L_{k}$ from $C_{k}$

In [5]:
itemset_broadcast=sc.broadcast(transactions.map(lambda x: set(x)).collect())

def support_count(itemset):
    # Count how many transactions contain the itemset
    count = sum(1 for transaction in itemset_broadcast.value if set(itemset).issubset(transaction))
    return (itemset, count)

def get_frequent_set(C_k):
    C_k_rdd=sc.parallelize(C_k)
    # Map and filter step
    L_k = C_k_rdd \
        .map(support_count) \
        .filter(lambda x: x[1] >= SUPPORT) \
        .map(lambda x: x[0]) \
        .collect()
    return L_k

# Finally do the job

In [6]:
import time

L_k = [[item] for item in L1.collect()]
k = 2

# store the elapsed time for each iteration
time_C = [0]
time_L = [elapsed_time_1]

# store the number of frequent itemsets for each iteration
num_frequent_itemsets_C = [item_counts.map(lambda x:x[0]).collect()]
num_frequent_itemsets_L = [len(L_k)]

while L_k:
    print(f'k={k}')
    
    start_time_C = time.time()  # Start timing for join_and_pruning
    
    C_k = join_and_pruning(L_k)
    
    end_time_C = time.time()  # End timing for join_and_pruning
    elapsed_time_C = end_time_C - start_time_C
    time_C.append(elapsed_time_C)
    num_frequent_itemsets_C.append(len(C_k))
    
    
    start_time_L = time.time()  # Start timing for get_frequent_set
    
    L_k = get_frequent_set(C_k)
    
    end_time_L = time.time()  # End timing for get_frequent_set
    elapsed_time_L = end_time_L - start_time_L
    time_L.append(elapsed_time_L)
    num_frequent_itemsets_L.append(len(L_k))
    
    k += 1


k=2
k=3
k=4
k=5


                                                                                

k=6


                                                                                

k=7


                                                                                

k=8


                                                                                

k=9


                                                                                

k=10


                                                                                

k=11


                                                                                

k=12
k=13
k=14
k=15
k=16


In [7]:
# print the time and the number of freq_items for each iteration
print("Time for computing C_k: ", [round(t, 3) for t in time_C])
print("Time for computing L_k: ", [round(t, 3) for t in time_L])

print("Number of frequent itemsets for C_k: ", num_frequent_itemsets_C)
print("Number of frequent itemsets for L_k: ", num_frequent_itemsets_L)


Time for computing C_k:  [0, 0.118, 0.123, 0.254, 1.412, 7.348, 13.853, 18.12, 16.648, 8.502, 2.997, 0.592, 0.136, 0.137, 0.128, 0.132]
Time for computing L_k:  [0.257, 0.048, 0.125, 0.189, 0.31, 0.45, 0.425, 0.498, 0.311, 0.214, 0.134, 0.085, 0.078, 0.078, 0.061, 0.068]
Number of frequent itemsets for C_k:  [15707, 903, 1816, 4912, 6686, 8977, 10214, 9515, 7012, 4004, 1729, 546, 119, 16, 1, 0]
Number of frequent itemsets for L_k:  [43, 377, 1483, 3600, 6342, 8884, 10205, 9506, 7012, 4004, 1729, 546, 119, 16, 1, 0]
