In [1]:
from collections import defaultdict
from pathlib import Path

import pandas as pd

# Directory holding the exported dataset. The path is built with pathlib so
# the notebook also runs on non-Windows systems, where a backslash is an
# ordinary file-name character rather than a path separator.
input_path = Path('data') / 'machinelearning_100_top_all_20221206_151014'

# Both frames are sorted by their index right after loading.
data_df = pd.read_json(input_path / 'data.json').sort_index()
tokens_map_df = pd.read_json(input_path / 'tokens_map.json').sort_index()

In [2]:
def get_candidate_id(*ids: int) -> str:
    """Build an order-independent identifier for a group of ids.

    The ids are sorted in ascending order and joined with ``'.'``, so the
    same ids always produce the same string regardless of argument order.
    Calling it without arguments returns an empty string.
    """
    return '.'.join(map(str, sorted(ids)))

In [3]:
# Token ids are taken from the index of the token map.
all_tokens_ids: set[int] = set(tokens_map_df.index)
num_transactions: int = len(data_df)

# Diffset of a token: ids of the transactions that do NOT contain it.
dif_map: dict[int, set[int]] = defaultdict(set)

for transaction_id, tokens in data_df['tokens'].items():
    for token_id in all_tokens_ids - set(tokens):
        dif_map[token_id].add(transaction_id)

# A token contained in every transaction is never visited by the loop above,
# so it would have no entry at all and be lost even though its support is
# maximal. Give every such token an explicit empty diffset.
for token_id in all_tokens_ids:
    dif_map.setdefault(token_id, set())

# Freeze into a plain dict so later lookups of unknown ids raise KeyError
# instead of silently creating entries.
dif_map = dict(dif_map)

In [4]:
# Support threshold. The comparison is strict: a token is kept only when its
# support (num_transactions minus the size of its diffset) is greater than
# MIN_SUP, so a support of exactly MIN_SUP is discarded.
MIN_SUP = 2
dif_map: dict[int, set[int]] = {
    token_id: absent_in
    for token_id, absent_in in dif_map.items()
    if len(absent_in) < num_transactions - MIN_SUP
}

In [5]:
class Candidate:
    """A node of the candidate tree.

    Attributes:
        tokens_ids: the tokens that make up this candidate.
        sup: the support count stored for this candidate.
        dif_list: the diffset stored for this candidate.
        children: candidates attached below this one via ``add_child``.
    """

    def __init__(self, tokens_ids: list[int], sup: int, dif_list: set[int]) -> None:
        self.tokens_ids: list[int] = tokens_ids
        self.sup: int = sup
        self.dif_list = dif_list
        self.children: list[Candidate] = []

    def __repr__(self, layer = 0) -> str:
        """Render this node and its whole subtree, one node per line.

        Each line is ``<sup> - <tokens_ids>`` indented by two spaces per
        level; ``layer`` is the indentation level of this node.
        """
        own_line = f"{'  ' * layer}{self.sup} - {self.tokens_ids}\n"
        subtree = ''.join(child.__repr__(layer + 1) for child in self.children)
        return own_line + subtree

    def __str__(self) -> str:
        """Same text as ``__repr__`` at indentation level 0."""
        return self.__repr__()

    def add_child(self, child: "Candidate") -> None:
        """Append ``child`` to this node's children."""
        self.children.append(child)

In [6]:
def declat_layer(layer: list[Candidate]) -> None:
    """Grow the candidate tree level by level, starting from ``layer``.

    Within a level, every candidate is paired with each candidate that comes
    after it and shares the same prefix (all tokens except the last). For
    such a pair the extension gets:

    * diffset = later candidate's diffset minus the earlier candidate's diffset
    * support = earlier candidate's support minus the size of that diffset

    Extensions whose support is strictly greater than ``MIN_SUP`` are
    attached as children of the earlier candidate and form the next level.
    The process stops when a level produces no extensions. The tree is
    modified in place; nothing is returned.
    """
    current_level = layer

    while len(current_level) > 0:
        next_level: list[Candidate] = []

        for position, left in enumerate(current_level):
            prefix = left.tokens_ids[:-1]

            for right in current_level[position + 1:]:
                # Only candidates with an identical prefix can be merged.
                if right.tokens_ids[:-1] != prefix:
                    continue

                extended_dif_list = right.dif_list - left.dif_list
                extended_sup = left.sup - len(extended_dif_list)
                if extended_sup <= MIN_SUP:
                    continue

                extended = Candidate(left.tokens_ids + [right.tokens_ids[-1]], extended_sup, extended_dif_list)
                left.add_child(extended)
                next_level.append(extended)

        current_level = next_level


# The root stands for the empty token list with support num_transactions.
# Its diffset is an empty *set* (the original `{}` literal is an empty dict),
# matching the set type every other candidate carries.
root_candidate = Candidate([], num_transactions, set())

# One child per token that passed the MIN_SUP filter; its support is the
# number of transactions minus the size of its diffset.
for token_id, dif_list in dif_map.items():
    root_candidate.add_child(Candidate([token_id], num_transactions - len(dif_list), dif_list))

# First level handed to declat_layer. This is the same list object as
# root_candidate.children, not a copy.
layer: list[Candidate] = root_candidate.children

In [7]:
# Grows the tree in place below the single-token candidates in `layer`, then
# displays it through Candidate.__repr__.
# NOTE: not idempotent. Running this cell again on the same `layer` appends
# the same extensions a second time as duplicate children; rebuild
# root_candidate / layer before re-running.
declat_layer(layer)
root_candidate

100 - []
  22 - [10]
    8 - [10, 11]
      4 - [10, 11, 13]
      3 - [10, 11, 19]
      4 - [10, 11, 63]
        4 - [10, 11, 63, 64]
      4 - [10, 11, 64]
      3 - [10, 11, 67]
      3 - [10, 11, 3]
    11 - [10, 13]
      3 - [10, 13, 62]
        3 - [10, 13, 62, 63]
          3 - [10, 13, 62, 63, 64]
            3 - [10, 13, 62, 63, 64, 65]
          3 - [10, 13, 62, 63, 65]
        3 - [10, 13, 62, 64]
          3 - [10, 13, 62, 64, 65]
        3 - [10, 13, 62, 65]
      5 - [10, 13, 63]
        5 - [10, 13, 63, 64]
          3 - [10, 13, 63, 64, 65]
        3 - [10, 13, 63, 65]
      5 - [10, 13, 64]
        3 - [10, 13, 64, 65]
      4 - [10, 13, 65]
      3 - [10, 13, 98]
    4 - [10, 19]
    3 - [10, 22]
    4 - [10, 24]
    4 - [10, 44]
    3 - [10, 47]
    3 - [10, 62]
      3 - [10, 62, 63]
        3 - [10, 62, 63, 64]
          3 - [10, 62, 63, 64, 65]
        3 - [10, 62, 63, 65]
      3 - [10, 62, 64]
        3 - [10, 62, 64, 65]
      3 - [10, 62, 65]
    9 - [10, 63

In [8]:
def decode_declat(root_candidate) -> None:
    """Replace token ids with token strings on every node below the root.

    Every descendant of ``root_candidate`` gets its ``tokens_ids`` rebuilt by
    looking each id up in the ``token`` column of ``tokens_map_df``. The root
    node itself is left untouched. Nodes are visited depth-first, parents
    before children, in child order. The tree is modified in place; nothing
    is returned.
    """
    # Explicit stack; children are pushed reversed so they pop in their
    # original order.
    pending = list(reversed(root_candidate.children))

    while pending:
        node = pending.pop()
        node.tokens_ids = [tokens_map_df.loc[token_id]['token'] for token_id in node.tokens_ids]
        pending.extend(reversed(node.children))

# Swaps token ids for token strings in place on every node below the root,
# then displays the decoded tree.
# NOTE(review): not re-runnable. A second run would look up the already
# decoded strings as index labels in tokens_map_df (presumably failing);
# rebuild the tree before running this cell again.
decode_declat(root_candidate)
root_candidate

100 - []
  22 - ['d']
    8 - ['d', 'a']
      4 - ['d', 'a', 'of']
      3 - ['d', 'a', 'the']
      4 - ['d', 'a', 'machin']
        4 - ['d', 'a', 'machin', 'learn']
      4 - ['d', 'a', 'learn']
      3 - ['d', 'a', 'ha']
      3 - ['d', 'a', 'to']
    11 - ['d', 'of']
      3 - ['d', 'of', 'type']
        3 - ['d', 'of', 'type', 'machin']
          3 - ['d', 'of', 'type', 'machin', 'learn']
            3 - ['d', 'of', 'type', 'machin', 'learn', 'paper']
          3 - ['d', 'of', 'type', 'machin', 'paper']
        3 - ['d', 'of', 'type', 'learn']
          3 - ['d', 'of', 'type', 'learn', 'paper']
        3 - ['d', 'of', 'type', 'paper']
      5 - ['d', 'of', 'machin']
        5 - ['d', 'of', 'machin', 'learn']
          3 - ['d', 'of', 'machin', 'learn', 'paper']
        3 - ['d', 'of', 'machin', 'paper']
      5 - ['d', 'of', 'learn']
        3 - ['d', 'of', 'learn', 'paper']
      4 - ['d', 'of', 'paper']
      3 - ['d', 'of', 'and']
    4 - ['d', 'the']
    3 - ['d', 'convolut'

In [9]:
# NOTE(review): `b` and `c` are not referenced anywhere else in the visible
# notebook; this looks like a leftover scratch cell. Confirm, then remove it
# or give the names a meaning.
# `c` is the "token" column; `b` is a plain dict built from that column.
c = tokens_map_df["token"]
b = dict(c)