In [1]:
import os
from collections import defaultdict
from typing import NewType

import pandas as pd

# Directory that holds the exported dataset files read below.
# The path is assembled with os.path.join instead of hard-coded backslashes so
# the notebook also runs on platforms whose separator is '/'; on Windows the
# resulting string is identical to the previous literal.
input_path = os.path.join('data', 'python_100_top_all_20221130_230247')

# data.json: one row per transaction; later cells read its 'tokens' column.
# tokens_map.json: its index is used later as the set of all known token ids.
# Both frames are sorted by index right after loading.
data_df = pd.read_json(os.path.join(input_path, 'data.json')).sort_index()
tokens_map_df = pd.read_json(os.path.join(input_path, 'tokens_map.json')).sort_index()

In [2]:
def get_candidate_id(*ids: int) -> str:
    """Return an order-independent identifier for a group of ids.

    The ids are sorted in ascending order and joined with '.', so
    ``get_candidate_id(3, 1, 2)`` yields ``'1.2.3'``. Calling it without
    arguments yields the empty string.
    """
    return '.'.join(map(str, sorted(ids)))

In [3]:
# Build, for every token id, the set of transaction ids in which the token
# does NOT occur. A token's support can then be derived as
# num_transactions - len(dif_map[token_id]).
all_tokens_ids: set[int] = set(tokens_map_df.index)
num_transactions: int = len(data_df)
dif_map: dict[int, set[int]] = defaultdict(set)

# Only the 'tokens' column is needed, so iterate over it directly rather than
# building a full row Series per transaction with iterrows().
for transaction_id, transaction_tokens in data_df['tokens'].items():
    non_existing_tokens_ids: set[int] = all_tokens_ids - set(transaction_tokens)
    for token_id in non_existing_tokens_ids:
        dif_map[token_id].add(transaction_id)

# A token that occurs in every transaction is never visited by the loop above
# and would therefore be missing from the map entirely, even though its
# support equals num_transactions. Register it with an empty set. These keys
# are appended after the existing ones, so the previous key order is kept.
for token_id in all_tokens_ids.difference(dif_map):
    dif_map[token_id] = set()

# Convert to a plain dict so that looking up an unknown id raises KeyError
# instead of silently inserting a new empty entry.
dif_map = dict(dif_map)

In [4]:
# Support threshold: a token is kept only when its support
# (num_transactions minus the number of transactions it is absent from)
# is strictly greater than MIN_SUP.
MIN_SUP = 2
dif_map = {
    token_id: absent_in
    for token_id, absent_in in dif_map.items()
    if len(absent_in) < num_transactions - MIN_SUP
}

In [5]:
from typing import Union


class Candidate:
    """A group of token ids together with its support and originating candidate.

    Attributes:
        tokens_ids: token ids that make up this candidate (stored as given,
            not copied).
        sup: support value associated with the candidate.
        parent: the candidate this one was created from, or None for a root.
    """

    def __init__(self, tokens_ids: list[int], sup: int, parent: Union['Candidate' , None]) -> None:
        """Store the given token ids, support and parent on the instance."""
        self.parent = parent
        self.sup = sup
        self.tokens_ids = tokens_ids

    def __repr__(self) -> str:
        """Format as '<sup> - <tokens_ids> - <parent tokens_ids or None>'."""
        if self.parent is None:
            parent_tokens = None
        else:
            parent_tokens = self.parent.tokens_ids
        return f'{self.sup} - {self.tokens_ids} - {parent_tokens}'

    def __str__(self) -> str:
        """Use the same text as __repr__."""
        return repr(self)

In [6]:
# Seed the candidate list with a root candidate (no tokens, support equal to
# the number of transactions, no parent), followed by one single-token
# candidate per entry in dif_map, each parented to the root.
root_candidate = Candidate([], num_transactions, None)
candidates: list[Candidate] = [root_candidate]
candidates.extend(
    Candidate([single_token], num_transactions - len(absent_in), root_candidate)
    for single_token, absent_in in dif_map.items()
)

# Print one candidate per line using Candidate's string form.
print(*candidates, sep='\n')

100 - [] - None
3 - [16] - []
32 - [17] - []
22 - [19] - []
3 - [21] - []
11 - [22] - []
3 - [23] - []
3 - [25] - []
13 - [28] - []
3 - [30] - []
3 - [31] - []
10 - [32] - []
27 - [33] - []
4 - [34] - []
19 - [35] - []
3 - [36] - []
27 - [39] - []
14 - [41] - []
6 - [42] - []
3 - [45] - []
3 - [47] - []
7 - [49] - []
3 - [51] - []
4 - [52] - []
4 - [54] - []
21 - [56] - []
3 - [60] - []
3 - [65] - []
4 - [67] - []
14 - [71] - []
6 - [73] - []
5 - [75] - []
21 - [76] - []
4 - [77] - []
3 - [94] - []
7 - [96] - []
13 - [101] - []
4 - [102] - []
8 - [106] - []
4 - [107] - []
3 - [117] - []
5 - [125] - []
3 - [132] - []
3 - [134] - []
17 - [136] - []
21 - [138] - []
3 - [143] - []
3 - [144] - []
3 - [145] - []
4 - [147] - []
4 - [153] - []
3 - [154] - []
5 - [159] - []
4 - [181] - []
4 - [186] - []
3 - [197] - []
7 - [199] - []
6 - [219] - []
4 - [224] - []
6 - [227] - []
4 - [238] - []
3 - [246] - []
5 - [250] - []
3 - [257] - []
4 - [281] - []
3 - [282] - []
3 - [284] - []
4 - [330] - []