In [1]:
import sys

import rics

# Record what this benchmark ran against: the rics package version, the Python
# interpreter, and the most recent git commit (one-line, abbreviated hash).
print(f"{rics.__version__=}")
print(f"{sys.version=}")
!git log --pretty=oneline --abbrev-commit -1

rics.__version__='0.1.0'
sys.version='3.8.10 (default, Nov 26 2021, 20:14:08) \n[GCC 9.3.0]'
[33m299dcc8[m[33m ([m[1;36mHEAD -> [m[1;32mid-translation[m[33m)[m Add docs and jupyterlab folder (perftest+demo)


In [2]:
from rics.logutils import basicConfig, logging

# General log level INFO; `rics_level` is presumably the level applied to rics'
# own loggers (the recorded output shows rics DEBUG messages) - confirm in rics.logutils.
basicConfig(level=logging.INFO, rics_level=logging.DEBUG)

# Benchmarks for `FormatApplier` implementations

In [3]:
# Dataset name passed to `load_imdb`; also used as the source key of the
# TranslationMap built in `run_translate`.
DATASET = "name.basics"

## Load data

In [None]:
%%time
from data import load_imdb

# Load the benchmark frame together with the names of its ID columns, then
# print both so the size of the workload is visible next to the timing.
df, id_columns = load_imdb(dataset=DATASET)
print(f"{id_columns=}")
print(f"{df.shape=}")

  from .autonotebook import tqdm as notebook_tqdm
2022-03-06T15:50:24.813 [rics.utils.get_local_or_remote:DEBUG] Local file path: '/home/dev/git/private-rics/jupyterlab/data-cache/name.basics.tsv.gz'.
2022-03-06T15:50:24.813 [rics.utils.get_local_or_remote:DEBUG] Remote file path: 'https://datasets.imdbws.com/name.basics.tsv.gz'.
2022-03-06T15:50:24.816 [rics.utils.get_local_or_remote:INFO] Local processed file path: '/home/dev/git/private-rics/jupyterlab/data-cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2022-03-06T15:50:24.817 [rics.utils.get_local_or_remote:INFO] Running <function clean_and_fix_ids at 0x7f34ae8778b0>..


In [None]:
import tqdm

# Display the installed tqdm version as the cell result.
tqdm.__version__

In [None]:
from typing import Dict

from rics.translation.offline import DefaultFormatApplier, Format, FormatApplier, TranslationMap
from rics.translation.offline.types import IdType, NameType, PlaceholdersDict, PlaceholdersTuple, TranslatedIds

In [None]:
# Template for one translated row. The dagger between the birth and death year
# had been stored as the mis-decoded sequence "â€ " (UTF-8 bytes read as cp1252).
fmt = Format("{id}:{name} (*{birthYear}†{deathYear}) | Profession: {primaryProfession}")

## Define the test procedure
Force reinitialization every time; this is how the translator will usually do it.

In [None]:
def run_translate(clazz, id_key) -> TranslatedIds:
    """Build a fresh ``TranslationMap`` and format every row of the dataset.

    A new map is created on each call, so the candidate's initialization cost
    is part of whatever is being timed.

    Args:
        clazz: The ``FormatApplier`` implementation handed to the map.
        id_key: Column of the global ``df`` that is renamed to ``"id"``.

    Returns:
        The result of looking up ``(DATASET, fmt)`` in the map.
    """
    renamed = df.rename(columns={id_key: "id", "primaryName": "name"})
    # Constructing the map is the data-preparation step.
    tmap = TranslationMap({DATASET: renamed}, format_applier_type=clazz)
    # The lookup triggers the actual formatting.
    return tmap[(DATASET, fmt)]

## Define candidates

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Implementations to benchmark. Starts with the library default; the classes
# defined in this cell are appended once they exist.
candidates = [DefaultFormatApplier]


class BasicFormatApplier(FormatApplier):
    """Candidate that works directly on the placeholder mapping, using only builtins."""

    def __init__(self, source: NameType, placeholders: PlaceholdersDict) -> None:
        super().__init__(source, placeholders)
        # Keep the mapping as given; nothing is copied or converted up front.
        self._placeholders = placeholders

    def _apply(self, fstring: str, placeholders: PlaceholdersTuple) -> TranslatedIds:
        """Format one string per ID.

        Args:
            fstring: Format string that is filled with positional arguments.
            placeholders: Names of the value sequences to use, in argument order.

        Returns:
            A dict mapping each ID to its formatted string.
        """
        id_values = self._placeholders["id"]
        columns = [self._placeholders[name] for name in placeholders]
        # zip(*columns) turns the per-placeholder sequences into one tuple per row.
        rows = zip(*columns)
        return {idx: fstring.format(*row) for idx, row in zip(id_values, rows)}

    @property
    def positional(self) -> bool:
        """Always ``True``: ``_apply`` fills the format string positionally."""
        return True


class NumpyFormatApplier(FormatApplier):
    """Candidate that keeps all placeholder values in one 2-D numpy string array."""

    def __init__(self, source: NameType, placeholders: PlaceholdersDict) -> None:
        super().__init__(source, placeholders)
        self._ids = placeholders["id"]
        # One array row per placeholder. ``dtype=str`` lets numpy derive the
        # fixed unicode width from the data; the earlier hard-coded "<U64"
        # silently truncated every value longer than 64 characters.
        self._arr = np.array([placeholders[name] for name in self.placeholders], dtype=str)

    def _apply(self, fstring: str, placeholders: PlaceholdersTuple) -> TranslatedIds:
        """Format one string per ID.

        Args:
            fstring: Format string that is filled with positional arguments.
            placeholders: Names of the array rows to use, in argument order.

        Returns:
            A dict mapping each ID to its formatted string.
        """
        # NOTE(review): searchsorted needs ``self._placeholder_names`` to be sorted
        # and aligned with the row order of ``self._arr``; both come from the base
        # class, which is not visible here - confirm.
        placeholder_idx = np.searchsorted(self._placeholder_names, placeholders)
        # Indexing with an index array copies the selected rows, which may be
        # where this candidate loses time. The transpose yields one row per ID.
        sliced = self._arr[placeholder_idx].T
        return {idx: fstring.format(*row) for idx, row in zip(self._ids, sliced)}

    @property
    def positional(self) -> bool:
        """Always ``True``: ``_apply`` fills the format string positionally."""
        return True


class PandasFormatApplier(FormatApplier):
    """Candidate that formats rows through ``DataFrame.apply``."""

    def __init__(self, source: NameType, placeholders: PlaceholdersDict) -> None:
        super().__init__(source, placeholders)
        frame = pd.DataFrame.from_dict(placeholders)
        # Index by ID so the result of ``_apply`` is keyed by ID as well.
        frame.index = frame["id"]
        self._df = frame
        # Not read anywhere in this class.
        self._range = range(len(frame.index))

    def _apply(self, fstring: str, placeholders: PlaceholdersTuple) -> TranslatedIds:
        """Format one string per row of the selected placeholder columns.

        NOTE(review): this returns a ``pd.Series`` indexed by ID rather than a
        dict, despite the ``TranslatedIds`` annotation - confirm that callers
        accept it.
        """
        selected = self._df[list(placeholders)]
        # raw=True passes each row as a plain ndarray instead of a Series.
        return selected.apply(lambda row: fstring.format(*row), raw=True, axis=1)

    @property
    def positional(self) -> bool:
        """Always ``True``: ``_apply`` fills the format string positionally."""
        return True


# Register the alternatives defined in this cell, then display the full list.
candidates.extend([BasicFormatApplier, NumpyFormatApplier, PandasFormatApplier])
candidates

## Sample output and verification

In [None]:
id_key = "int_id_nconst"
reference = run_translate(DefaultFormatApplier, id_key)

# Print the first translation only, so the output format can be checked by eye.
for sample in reference.values():
    print(f"Total translations: {len(reference)}. Sample translation:\n    {sample}")
    break

# Every candidate must reproduce the default implementation's output exactly.
expected = pd.Series(reference)
for cand in candidates:
    actual = pd.Series(run_translate(cand, id_key))
    matches = expected == actual
    # Element-wise comparison gives a Series; a plain bool is accepted as well.
    all_equal = matches if isinstance(matches, bool) else matches.all()
    assert all_equal, f"Bad candidate: {cand}"

## Run performance comparison

In [None]:
from rics.utils import tname

In [None]:
# Time every candidate against both ID columns (presumably integer and string
# IDs, going by the column names).
for cand in candidates:
    print(f"{tname(cand)}:")
    for id_key in ["int_id_nconst", "str_id_nconst"]:
        print(f"    {id_key=}")
        # %timeit evaluates the statement in the notebook namespace, so `cand`
        # and `id_key` have to be these top-level loop variables.
        # -r 5 -n 5: five repeats of five calls each.
        %timeit -r 5 -n 5 run_translate(cand, id_key)
    print("=" * 80)

# Conclusion
The `BasicFormatApplier` seems to be the best choice *for this use case* (likely because it doesn't copy as much data?). There are certainly better ways to use both Pandas and numpy, but `BasicFormatApplier` has the added benefit of being easy to understand and requiring no external dependencies.