In [1]:
import re

def read_data(data_file, param_file):
    """Load the transactions and the MIS/SDC parameters into module globals.

    Sets four globals (nothing is returned):
      * ``transactions``   - list of transactions, each a list of int items
      * ``n_transactions`` - ``len(transactions)``
      * ``MIS``            - dict mapping item -> minimum item support
      * ``sdc``            - float from the ``SDC`` line, or ``None`` when the
                             parameter file has no usable SDC value

    Args:
        data_file: path of a text file with one comma-separated transaction
            per line; blank lines are ignored.
        param_file: path of a text file whose lines contain ``MIS`` followed
            by an item number and a value, ``MIS`` with ``rest`` and a value,
            or ``SDC`` and a value. Values may be written with or without a
            decimal point (``0.05``, ``.05``, ``1``).

    Malformed parameter lines are reported with a printed warning and skipped.
    """
    # Declare global variables
    global transactions, n_transactions, MIS, sdc

    # Reset every global up front so a re-run (or a parameter file without an
    # SDC line) can never silently reuse a value left over from an earlier call.
    transactions, MIS, sdc = [], {}, None

    # Read the transaction data file
    with open(data_file, encoding='utf-8-sig') as input_file1:
        for line in input_file1:
            line = line.strip()
            if line:  # Ensure non-empty lines
                # Strip each token so blanks between commas ("1, ,2") are
                # dropped instead of crashing float(' ').
                tokens = (token.strip() for token in line.split(','))
                transactions.append([int(float(token)) for token in tokens if token])

    # Number of transactions
    n_transactions = len(transactions)

    # Every distinct item seen in the data; used to expand the 'rest' MIS line
    all_items = {item for transaction in transactions for item in transaction}

    # Decimal alternative first so "0.43" is one match rather than "0" and "43";
    # the integer alternative lets values such as "1" or "0" parse as well.
    number = re.compile(r'\d*\.\d+|\d+')

    # Read the parameter data file
    with open(param_file, encoding='utf-8-sig') as input_file2:
        for line in input_file2:
            line = line.strip()
            if not line:
                continue

            # Handle MIS values for specific items
            if 'MIS' in line and 'rest' not in line:
                item_match = re.search(r'\d+', line)
                # The value is the first number located after the item number.
                value_match = number.search(line, item_match.end()) if item_match else None
                if value_match:
                    MIS[int(item_match.group())] = float(value_match.group())
                else:
                    print(f"Warning: MIS value not found or incorrectly formatted: {line}")

            # Handle the 'rest' MIS case
            elif 'MIS' in line and 'rest' in line:
                value_match = number.search(line)
                if value_match:
                    rest_value = float(value_match.group())
                    # Only items without an MIS so far receive the default;
                    # item-specific lines that come later still override it.
                    for item in all_items - set(MIS):
                        MIS[item] = rest_value
                else:
                    print(f"Warning: MIS value not found or incorrectly formatted: {line}")

            # Handle SDC value
            elif 'SDC' in line:
                value_match = number.search(line)
                if value_match:
                    sdc = float(value_match.group())
                else:
                    print("Warning: SDC value not found or incorrectly formatted.")

            # Warn if the line doesn't match expected patterns
            else:
                print(f"Warning: Unrecognized line format in parameter file: {line}")

    if sdc is None:
        print("Warning: no SDC value was read from the parameter file; sdc is None.")

    # Print parsed values for debugging
    print("Transactions:", transactions)
    print("MIS:", MIS)
    print("SDC:", sdc)


In [None]:
def init_pass(M):
    """Count item supports and build the seed list L.

    Rebinds the global ``support_count`` to a dict mapping each item of ``M``
    that occurs in at least one transaction to the number of transactions
    containing it, then returns L:

      * the first item of ``M`` (in order) whose support
        ``support_count[item] / n_transactions`` reaches its own ``MIS``;
      * followed by every later item of ``M`` whose support reaches the MIS
        of that first item.

    Reads the globals ``transactions``, ``n_transactions`` and ``MIS``.
    Returns an empty list when no item reaches its own MIS.
    NOTE(review): presumably ``M`` is expected to be sorted by ascending MIS -
    confirm against the caller.
    """
    global support_count

    total = len(M)
    support_count = {}

    # One entry per item that actually occurs; absent items get no key at all.
    for item in M:
        hits = sum(1 for basket in transactions if item in basket)
        if hits:
            support_count[item] = support_count.get(item, 0) + hits

    # Position of the first item that satisfies its own MIS (``total`` if none).
    seed = next(
        (
            pos
            for pos in range(total)
            if M[pos] in support_count
            and support_count[M[pos]] / n_transactions >= MIS[M[pos]]
        ),
        total,
    )
    if seed == total:
        return []

    # Everything after the seed is measured against the seed's MIS.
    floor = MIS[M[seed]]
    selected = [M[seed]]
    selected.extend(
        M[pos]
        for pos in range(seed + 1, total)
        if M[pos] in support_count
        and support_count[M[pos]] / n_transactions >= floor
    )
    return selected

In [None]:
def level2_candidate_gen(L):
    """Return C2, the list of candidate 2-itemsets built from ``L``.

    A pair ``(first, second)`` with ``first`` positioned before ``second`` in
    ``L`` is emitted when:

      * the support of ``first`` reaches ``MIS[first]``;
      * the support of ``second`` also reaches ``MIS[first]``;
      * the absolute difference of the two supports is at most ``sdc``.

    Support is ``support_count[item] / n_transactions``. Reads the globals
    ``support_count``, ``n_transactions``, ``MIS`` and ``sdc``.
    """
    candidates = []
    size = len(L)

    for pos in range(size - 1):
        first = L[pos]
        first_count = support_count[first]
        min_sup = MIS[first]

        # An item that misses its own MIS cannot lead a candidate pair.
        if not first_count / n_transactions >= min_sup:
            continue

        for second in (L[k] for k in range(pos + 1, size)):
            second_count = support_count[second]
            if second_count / n_transactions < min_sup:
                continue
            if abs((first_count - second_count) / n_transactions) <= sdc:
                candidates.append((first, second))

    return candidates