In [2]:
import numpy as np
import re

In [8]:
# Rule files to compare. compare_ccfd_files reports rules found only in the
# first file as "miss" and rules found only in the second as "false positive",
# i.e. the first file is treated as the reference result.
filename_1 = 'CFDMiner_ccfd.txt'
filename_2 = 'FGC_Stream_CFDMiner_ccfd.txt'

In [9]:
def parse_ccfd(ccfd_str):
    """Split one textual CCFD rule into its four components.

    The expected layout is ``[a1, a2] => b, (v1, v2 || w)``; anything after
    the closing parenthesis (such as a trailing newline) is ignored.

    Args:
        ccfd_str: A single rule line.

    Returns:
        tuple: ``(lhs_attributes, rhs_attribute, lhs_values, rhs_value)``,
        each a ``set`` of strings. The two RHS sets always hold one element.

    Raises:
        ValueError: If ``ccfd_str`` does not start with a rule in the
            expected layout.

    Note:
        LHS attributes and LHS values are stored in two independent sets, so
        their positional pairing and any repeated values are not kept: rules
        that differ only in which attribute carries which value yield equal
        results.
    """
    parsed = re.match(r'\[(.*?)\] => (.*?), \((.*?) \|\| (.*?)\)', ccfd_str)
    if parsed is None:
        raise ValueError(f"Invalid CCFD format: {ccfd_str}")

    lhs_attr_text, rhs_attr_text, lhs_value_text, rhs_value_text = parsed.groups()

    # LHS lists are ', '-separated; the RHS side is always a single item.
    return (
        set(lhs_attr_text.split(', ')),
        {rhs_attr_text},
        set(lhs_value_text.split(', ')),
        {rhs_value_text},
    )

In [10]:
def parse_ccfd_file(filename):
    """Parse every CCFD rule in a text file, one rule per line.

    Args:
        filename: Path of the text file to read.

    Returns:
        list: One ``parse_ccfd`` result per non-blank line, in file order.

    Raises:
        ValueError: Propagated from ``parse_ccfd`` when a non-blank line is
            not a well-formed rule.
    """
    with open(filename, 'r') as file:
        # Blank or whitespace-only lines carry no rule, so skip them instead
        # of letting one empty line abort the whole file with a ValueError.
        return [parse_ccfd(line) for line in file if line.strip()]

In [None]:
def compare_ccfd_files(filename_1, filename_2):
    """Print how the CCFD rules of two files overlap.

    File 1 is treated as the reference: rules only in file 1 are reported as
    "miss", rules only in file 2 as "false positive", and rules of file 1
    that also occur in file 2 as "hit". Duplicate lines within a file are
    counted once per occurrence.

    Args:
        filename_1: Path of the reference rule file.
        filename_2: Path of the rule file being compared against it.

    Returns:
        None. The three counts are written to standard output.

    Raises:
        ValueError: Propagated from parsing when a file holds a malformed rule.
    """
    def as_key(ccfd):
        # Parsed rules are tuples of (unhashable) sets; freezing each part
        # gives a hashable key with the same equality semantics.
        return tuple(frozenset(part) for part in ccfd)

    ccfd_list_1 = parse_ccfd_file(filename_1)
    ccfd_list_2 = parse_ccfd_file(filename_2)

    # Hash-based lookups replace repeated linear scans of the other list.
    keys_1 = {as_key(ccfd) for ccfd in ccfd_list_1}
    keys_2 = {as_key(ccfd) for ccfd in ccfd_list_2}

    # CCFDs in file 1 but not in file 2
    ccfd_1T2F = [ccfd for ccfd in ccfd_list_1 if as_key(ccfd) not in keys_2]
    print(f"number of CCFDs in file 1 but not in file 2 (miss): {len(ccfd_1T2F)}")
    # CCFDs in file 2 but not in file 1
    ccfd_1F2T = [ccfd for ccfd in ccfd_list_2 if as_key(ccfd) not in keys_1]
    print(f"number of CCFDs in file 2 but not in file 1 (false positive): {len(ccfd_1F2T)}")
    # CCFDs in both files
    ccfd_1T2T = [ccfd for ccfd in ccfd_list_1 if as_key(ccfd) in keys_2]
    print(f"number of CCFDs in both files (hit): {len(ccfd_1T2T)}")


In [12]:
# Print the miss / false-positive / hit counts for the two configured rule files.
# NOTE(review): the cell defining compare_ccfd_files has no execution count and
# the saved output starts with a list dump that the current definition does not
# print -- re-run from a fresh kernel to confirm the numbers below.
compare_ccfd_files(filename_1, filename_2)

[({'att7'}, {'att2'}, {'sixteen'}, {'1'}), ({'att2'}, {'att1'}, {'4'}, {'d'}), ({'att5', 'att1'}, {'att2'}, {'c', 'b'}, {'1'}), ({'att5', 'att1'}, {'att2'}, {'b', 'a'}, {'1'}), ({'att1'}, {'att2'}, {'a'}, {'1'}), ({'att5', 'att1'}, {'att2'}, {'b'}, {'1'}), ({'att7', 'att1'}, {'att2'}, {'fifteen', 'd'}, {'1'})]
number of CCFDs in file 1 but not in file 2 (miss): 4
number of CCFDs in file 2 but not in file 1 (false positive): 404
number of CCFDs in both files (hit): 3
