# Fetching data using `PandasFetcher`
Translating using pickle files.

In [1]:
import sys

import rics

import id_translation

# Show which library and interpreter versions produced the outputs in this notebook.
print(f"{id_translation.__version__=}", f"{sys.version=}", sep="\n")
# NOTE(review): presumably raises the id_translation log level to DEBUG; the
# cells below do emit DEBUG records from id_translation loggers.
rics.configure_stuff(id_translation_level="DEBUG")

id_translation.__version__='0.15.0.dev1'
sys.version='3.11.13 (main, Jun  4 2025, 08:57:30) [GCC 13.3.0]'
👻 Configured some stuff just the way I like it!


## Make local Pickle files
We'll download data from https://datasets.imdbws.com and clean it so that no values are missing.

In [2]:
# Names of the IMDb datasets to prepare; each one is passed to `load_imdb` below.
sources = [
    "name.basics",
    "title.basics",
]

In [3]:
from data import load_imdb

# Prepare every dataset listed in `sources`. The return value of `load_imdb` is
# discarded; the call is made so that the processed pickle file exists locally
# (its path is reported in the log output of this cell).
for dataset in sources:
    load_imdb(dataset)

2025-07-20T16:26:36.659 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2025-07-20T16:26:36.786 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'.


## Create translator from config
See [config.toml](config.toml) for the configuration used below.

In [4]:
from id_translation import Translator

# Build the translator from the TOML file `config.toml` (relative path). The
# DEBUG record below shows that the read function, `pandas.read_pickle`, is
# derived from the `.pkl` suffix of the configured path format.
translator = Translator.from_config("config.toml")

2025-07-20T16:26:36.849 [id_translation.fetching:DEBUG] Derived read_function='pandas.read_pickle' based on suffix='.pkl' found in read_path_format='~/.id-translation/notebooks/cache/clean_and_fix_ids/{}.tsv.pkl'.


In [5]:
# Initialize the fetcher's sources; the log output shows the path pattern
# matching the two pickle files. As the last expression of the cell, the
# returned translator is displayed (still `online=True` at this point).
translator.initialize_sources()

2025-07-20T16:26:36.861 [id_translation.fetching:DEBUG] Path pattern='~/.id-translation/notebooks/cache/clean_and_fix_ids/*.tsv.pkl' matched 2 files: {'name.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/name.basics.tsv.pkl', 'title.basics': '~/.id-translation/notebooks/cache/clean_and_fix_ids/title.basics.tsv.pkl'}
2025-07-20T16:26:37.062 [id_translation.fetching:INFO] Finished initialization of 'PandasFetcher' in 205 ms: PandasFetcher(sources=['name.basics', 'title.basics'])


Translator(online=True: fetcher=PandasFetcher(sources=['name.basics', 'title.basics']))

In [6]:
# Fetch all IDs from both sources and switch the translator to offline mode.
# The displayed result changes from `online=True: fetcher=...` to
# `online=False: cache=TranslationMap(...)`.
translator.go_offline()

2025-07-20T16:26:37.076 [id_translation.Translator:DEBUG] Begin going offline with 2 sources provided by: PandasFetcher(sources=['name.basics', 'title.basics'])
2025-07-20T16:26:37.079 [id_translation.fetching:DEBUG] Begin fetching all IDs for placeholders=('id', 'name', 'original_name', 'from', 'to') for 2/2: ['name.basics', 'title.basics'].
2025-07-20T16:26:37.080 [id_translation.fetching:DEBUG] Begin mapping of wanted placeholders={'from', 'id', 'name', 'to'} to actual placeholders={'primaryProfession', 'int_id_nconst', 'deathYear', 'nconst', 'primaryName', 'knownForTitles', 'birthYear'} for source='name.basics'.
2025-07-20T16:26:37.085 [id_translation.fetching.map:DEBUG] Computed 4x7 match scores in context='name.basics' in 142 μs:
candidates  primaryProfession  int_id_nconst  deathYear  nconst  primaryName  knownForTitles  birthYear
values                                                                                                 
from                     -inf           -inf  

Translator(online=False: cache=TranslationMap('name.basics': 199200 IDs, 'title.basics': 64264 IDs))

In [7]:
# Preview the first three cached translations of every source. The sources
# hold many IDs (see the TranslationMap repr above), so printing everything
# would flood the output.
tmap = translator.cache
for source in tmap:
    translations = tmap[source]
    print(f"Translations for {source=};")
    # Reuse the mapping fetched above instead of looking up `tmap[source]` again.
    for i, (idx, translation) in enumerate(translations.items()):
        print(f"    {idx!r} -> {translation!r}")
        if i == 2:  # Rows 0, 1 and 2 have been printed.
            break

Translations for source='name.basics';
    'nm0000001' -> 'nm0000001:Fred Astaire *1899†1987'
    'nm0000002' -> 'nm0000002:Lauren Bacall *1924†2014'
    'nm0000004' -> 'nm0000004:John Belushi *1949†1982'
Translations for source='title.basics';
    'tt0038276' -> 'tt0038276:You Are an Artist (original: You Are an Artist) *1946†1955'
    'tt0039120' -> 'tt0039120:Americana (original: Americana) *1947†1949'
    'tt0039121' -> 'tt0039121:Birthday Party (original: Birthday Party) *1947†1949'
