In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

from docile.dataset import Dataset


docile_dataset="docile221124-3"

DATASET_EXPORTS_DEFAULT_PATH = Path("/datasets")
dataset_path = DATASET_EXPORTS_DEFAULT_PATH / docile_dataset

docile_prefinal = Dataset.load(dataset_path, "all")
docile_prefinal_train = Dataset.load(dataset_path, "train")
docile_prefinal_val = Dataset.load(dataset_path, "val")
docile_prefinal_test = Dataset.load(dataset_path, "test")

Loading documents for docile221124-3/all: 100%|██████████| 5902/5902 [00:00<00:00, 56916.05it/s]
Loading documents for docile221124-3/train: 100%|██████████| 4402/4402 [00:00<00:00, 76252.51it/s]
Loading documents for docile221124-3/val: 100%|██████████| 500/500 [00:00<00:00, 13432.17it/s]
Loading documents for docile221124-3/test: 100%|██████████| 1000/1000 [00:00<00:00, 72778.61it/s]


In [18]:
import _collections_abc
from collections import Counter


class RecursiveCounter(Counter):
    """Counter that can have RecursiveCounters as values."""
    def __missing__(self, elem):
        if elem not in self:
            return self.__class__({})

    def __iadd__(self, other):
        if isinstance(other, int) and not self:
            return other
        for key, val in other.items():
            self[key] += val
        return self
    
    def total_sum(self):
        total_sum = 0
        for val in self.values():
            if isinstance(val, int):
                total_sum += val
            else:
                total_sum += val.total_sum()
        return total_sum
    
    def __hash__(self) -> int:
        return hash(tuple(sorted(self.items())))
                
    def __le__(self, other: "RecursiveCounter") -> bool:
        return self.total_sum() <= other.total_sum()
    
    def update(self, iterable=None, /, **kwds):
        '''Like dict.update() but add counts instead of replacing them.
        Source can be an iterable, a dictionary, or another Counter instance.
        >>> c = Counter('which')
        >>> c.update('witch')           # add elements from another iterable
        >>> d = Counter('watch')
        >>> c.update(d)                 # add elements from another counter
        >>> c['h']                      # four 'h' in which, witch, and watch
        4
        '''
        # The regular dict.update() operation makes no sense here because the
        # replace behavior results in the some of original untouched counts
        # being mixed-in with all of the other counts for a mismash that
        # doesn't have a straight-forward interpretation in most counting
        # contexts.  Instead, we implement straight-addition.  Both the inputs
        # and outputs are allowed to contain zero and negative counts.

        if iterable is not None:
            if isinstance(iterable, _collections_abc.Mapping):
                if self:
                    if isinstance(iterable, RecursiveCounter):
                        self += iterable
                    else:
                        self += RecursiveCounter(iterable)
                else:
                    # fast path when counter is empty
                    super().update(iterable)
            else:
                for elem in iterable:
                    if isinstance(elem, RecursiveCounter):
                        self += elem
                    else:
                        self[elem] += 1
        if kwds:
            self.update(kwds)
    
    def most_common_recursive(self, max_count: int = 0, reverse: bool = False):
        items_sorted = []
        to = max_count if max_count else len(self)
        for key, val in sorted(self.items(), key=lambda kv: kv[1], reverse=not reverse)[:to]:
            if isinstance(val, int):
                items_sorted.append([f"{key}:{val}"])
            else:
                items_sorted.append((key, val.total_sum(), val.most_common_recursive(max_count, reverse)))
        return items_sorted

    def _keep_positive(self):
        pass

In [7]:
stats = RecursiveCounter()

datasets = [docile_prefinal_train, docile_prefinal_val, docile_prefinal_test]

for doc in docile_prefinal:
    for dataset in datasets:
        try:
            _d = dataset[doc.docid]
        except:
            pass
        else:
            # print([{f.fieldtype: RecursiveCounter([dataset.split_name])} for f in doc.annotation.fields])
            # rc = RecursiveCounter([RecursiveCounter({f.fieldtype: RecursiveCounter([dataset.split_name])}) for f in doc.annotation.fields])
            stats["KILE-fieldtypes"] += RecursiveCounter(RecursiveCounter({f.fieldtype: RecursiveCounter([dataset.split_name])}) for f in doc.annotation.fields)
            stats["LIR-fieldtypes"] += RecursiveCounter(RecursiveCounter({f.fieldtype: RecursiveCounter([dataset.split_name])}) for f in doc.annotation.li_fields)

In [8]:
stats.most_common_recursive(True)

[('KILE-fieldtypes',
  72947,
  [('iban', 5, ['train:4', 'test:1']),
   ('recipient_vat_id', 9, ['train:8', 'val:1']),
   ('recipient_ic', 12, ['train:6', 'test:4', 'val:2']),
   ('bic', 27, ['train:22', 'test:4', 'val:1']),
   ('sender_vat_id', 33, ['train:33']),
   ('recipient_dic', 41, ['train:21', 'val:14', 'test:6']),
   ('sender_ic', 53, ['train:46', 'test:7']),
   ('tax_detail_rate', 68, ['train:59', 'test:6', 'val:3']),
   ('bank_num', 96, ['train:81', 'test:11', 'val:4']),
   ('account_num', 136, ['train:109', 'test:14', 'val:13']),
   ('var_sym', 195, ['train:137', 'test:39', 'val:19']),
   ('sender_order_id', 228, ['train:173', 'test:28', 'val:27']),
   ('customer_delivery_addrline', 333, ['train:269', 'test:44', 'val:20']),
   ('customer_other_addrline', 349, ['train:267', 'test:60', 'val:22']),
   ('recipient_delivery_name', 354, ['train:283', 'test:47', 'val:24']),
   ('amount_paid', 445, ['train:358', 'test:64', 'val:23']),
   ('tax_detail_base', 516, ['train:409', 'test

In [22]:
from docile.dataset import Field

header_stats = RecursiveCounter()

for doc in docile_prefinal:
    # print([{f.fieldtype: RecursiveCounter([dataset.split_name])} for f in doc.annotation.fields])
    # rc = RecursiveCounter([RecursiveCounter({f.fieldtype: RecursiveCounter([dataset.split_name])}) for f in doc.annotation.fields])
    header_fields = [Field.from_annotation(a) for a in doc.annotation.content["line_item_headers"]]
    header_stats += RecursiveCounter(RecursiveCounter({f.fieldtype: RecursiveCounter([f.text.lower()])}) for f in header_fields)

In [23]:
len(header_stats.keys())

21

In [24]:
header_stats.most_common_recursive(25)

[('table_column_other',
  13597,
  [['length:1305'],
   ['time:1236'],
   ['len:834'],
   ['type:498'],
   ['day:324'],
   ['start/end time:297'],
   ['days:251'],
   ['times:246'],
   ['dur:206'],
   ['daypart:199'],
   ['start:187'],
   ['station:179'],
   ['m:170'],
   ['mtwtfss:143'],
   ['time period:123'],
   ['dp:120'],
   ['end:116'],
   ['ch:104'],
   ['c/t:103'],
   ['start time:94'],
   ['run times:93'],
   ['tu:93'],
   ['daypart\ncode:91'],
   ['date:90'],
   ['scheduled time:90']]),
 ('table_column_date',
  4444,
  [['date:1342'],
   ['end date:488'],
   ['start date:471'],
   ['start:276'],
   ['dates:231'],
   ['end:175'],
   ['end\ndate:128'],
   ['start\ndate:126'],
   ['air date:107'],
   ['run date:104'],
   ['run dates:102'],
   ['stop:77'],
   ['effective dates:70'],
   ['day:68'],
   ['days:49'],
   ['date range:46'],
   ['dates to run:38'],
   ['period:33'],
   ['from:32'],
   ['thru:32'],
   ['days of:31'],
   ['invoice date:30'],
   ['order date:29'],
   ['shi