# Fetching data using `PandasFetcher`
Translating using pickle files.

In [1]:
import sys
import rics
import id_translation

# Print relevant versions
print(f"{rics.__version__=}")
print(f"{id_translation.__version__=}")
print(f"{sys.version=}")
!git log --pretty=oneline --abbrev-commit -1

rics.__version__='0.17.0.dev1'
id_translation.__version__='0.1.0.dev0'
sys.version='3.10.6 (main, Nov  2 2022, 18:53:38) [GCC 11.3.0]'
[33m1ba612a[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m, [m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m Spelling and formatting


In [2]:
from rics.utility.logs import basic_config, logging

# Configure logging: INFO as the general level, DEBUG for the rics and
# id_translation levels, so their debug messages show up in the cell outputs below.
basic_config(
    level=logging.INFO, rics_level=logging.DEBUG, id_translation_level=logging.DEBUG
)

## Make local Pickle files
We'll download data from https://datasets.imdbws.com and clean it to make sure all values are given (which means that all actors are dead and all titles have stopped airing).

In [3]:
# Names of the IMDb datasets to download and translate.
sources = [
    "name.basics",
    "title.basics",
]

In [4]:
from data import load_imdb

# Fetch every dataset listed in `sources`. Per the log output below, each call
# downloads the .tsv.gz file, runs clean_and_fix_ids and serializes the result
# to a pickle file; this took roughly four minutes per dataset in the logged run.
# NOTE(review): presumably the download/cleaning is skipped once the cached
# files exist -- confirm against load_imdb.
for dataset in sources:
    load_imdb(dataset)

2022-11-26T15:36:53.703 [rics.utility.misc.get_local_or_remote:DEBUG] Local file path: '/home/dev/git/id-translation/jupyterlab/id-translation/data-cache/name.basics.tsv.gz'.
2022-11-26T15:36:53.723 [rics.utility.misc.get_local_or_remote:DEBUG] Remote file path: 'https://datasets.imdbws.com/name.basics.tsv.gz'.
2022-11-26T15:36:53.947 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/name.basics.tsv.gz'..


https://datasets.imdbws.com/name.basics.tsv.gz:   0%|          | 0.00/226M [00:00<?, ?iB/s]

2022-11-26T15:37:16.023 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/git/id-translation/jupyterlab/id-translation/data-cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2022-11-26T15:37:16.027 [rics.utility.misc.get_local_or_remote:INFO] Running clean_and_fix_ids..
2022-11-26T15:40:42.551 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/git/id-translation/jupyterlab/id-translation/data-cache/clean_and_fix_ids/name.basics.tsv.pkl'..
2022-11-26T15:40:43.959 [rics.utility.misc.get_local_or_remote:DEBUG] Local file path: '/home/dev/git/id-translation/jupyterlab/id-translation/data-cache/title.basics.tsv.gz'.
2022-11-26T15:40:43.979 [rics.utility.misc.get_local_or_remote:DEBUG] Remote file path: 'https://datasets.imdbws.com/title.basics.tsv.gz'.
2022-11-26T15:40:43.991 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/title.basics.tsv.gz'..


https://datasets.imdbws.com/title.basics.tsv.gz:   0%|          | 0.00/157M [00:00<?, ?iB/s]

2022-11-26T15:41:05.423 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/git/id-translation/jupyterlab/id-translation/data-cache/clean_and_fix_ids/title.basics.tsv.pkl'.
2022-11-26T15:41:05.447 [rics.utility.misc.get_local_or_remote:INFO] Running clean_and_fix_ids..
  df = pd.read_csv(input_path, sep="\t", header=0, engine="c")
2022-11-26T15:45:10.719 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/git/id-translation/jupyterlab/id-translation/data-cache/clean_and_fix_ids/title.basics.tsv.pkl'..


## Create translator from config
Click [here](config.toml) to see the file.

In [5]:
from id_translation import Translator

# Build the Translator from the TOML configuration file linked above.
translator = Translator.from_config("config.toml")
# Bare last expression: the notebook displays the translator's repr.
translator

Translator(online=True: fetcher=PandasFetcher(sources=['name.basics', 'title.basics']))

In [6]:
# Calling store() triggers the mapping work shown in the debug log below; its
# `.cache` attribute is kept for inspection. The next cell indexes `tmap` by
# source name and iterates the `.items()` of each entry.
# NOTE(review): presumably `.cache` holds the fetched translations per source --
# confirm against the id_translation documentation.
tmap = translator.store().cache

2022-11-26T15:45:12.239 [rics.mapping.Mapper:DEBUG] Begin computing match scores for values=('name', 'to', 'original_name', 'from', 'id') in context='name.basics' to candidates=('primaryName', 'deathYear', 'int_id_nconst', 'knownForTitles', 'primaryProfession', 'birthYear', 'nconst') using HeuristicScore([force_lower_case()] -> AbstractFetcher.default_score_function).
2022-11-26T15:45:12.323 [rics.mapping.Mapper:DEBUG] Computed 5x7 match scores in 0.0200166 sec:
candidates     primaryName  deathYear  int_id_nconst  knownForTitles  primaryProfession  birthYear  nconst
values                                                                                                    
name                   inf       -inf           -inf            -inf               -inf       -inf    -inf
to                    -inf        inf           -inf            -inf               -inf       -inf    -inf
original_name     0.181818   0.022222       0.076923             0.0           0.015385        0.0     0.

In [7]:
from itertools import islice

# Number of example translations to print per source.
MAX_EXAMPLES = 3

# Show a few ID -> translation pairs for every source in the translation map.
for source in tmap:
    translations = tmap[source]
    print(f"Translations for {source=};")
    # islice stops after MAX_EXAMPLES items without walking the rest of the mapping.
    for idx, translation in islice(translations.items(), MAX_EXAMPLES):
        print(f"    {idx!r} -> {translation!r}")

Translations for source='title.basics';
    'tt0025509' -> 'tt0025509:Les Misérables (original: Les misérables) *1934†1934'
    'tt0035803' -> 'tt0035803:The German Weekly Review (original: Die Deutsche Wochenschau) *1940†1945'
    'tt0038276' -> 'tt0038276:You Are an Artist (original: You Are an Artist) *1946†1955'
Translations for source='name.basics';
    'nm0000001' -> 'nm0000001:Fred Astaire *1899†1987'
    'nm0000002' -> 'nm0000002:Lauren Bacall *1924†2014'
    'nm0000004' -> 'nm0000004:John Belushi *1949†1982'
