# Fetching data using `PandasFetcher`
Translating using pickle files.

In [1]:
import sys

import rics

# Print relevant versions: the installed `rics` package and the Python
# interpreter, so the outputs in this notebook can be tied to a specific setup.
print(f"{rics.__version__=}")
print(f"{sys.version=}")
# IPython shell escape: show the most recent commit as one abbreviated line.
!git log --pretty=oneline --abbrev-commit -1

rics.__version__='0.3.1'
sys.version='3.8.10 (default, Nov 26 2021, 20:14:08) \n[GCC 9.3.0]'
[33mc2fccc8[m[33m ([m[1;36mHEAD -> [m[1;32mrandom-fixups[m[33m)[m Revert DIO changes in Translator, break out fetch


In [2]:
from rics.utility.misc import get_local_or_remote
from rics.utility.logs import basic_config, logging

# Log at INFO by default; `rics_level` is set to DEBUG separately, and the
# output of later cells shows DEBUG records coming from `rics.*` loggers.
basic_config(level=logging.INFO, rics_level=logging.DEBUG)

## Make local Pickle files
We'll download data from https://datasets.imdbws.com and clean it to make sure all values are given (which means that all actors are dead and all titles have stopped airing).

In [3]:
# IMDb dataset names; each one is used both as a download file stem
# (`<name>.tsv.gz`) and as a translation source name later on.
sources = [
    "name.basics",  # people: primaryName, birthYear, deathYear, ...
    "title.basics",  # titles: primaryTitle, originalTitle, startYear, endYear, ...
]

In [4]:
from data import load_imdb

# Called for its side effects only; the return value is discarded here.
# Per the log output below, each call fetches `<dataset>.tsv.gz`, runs
# `clean_and_fix_ids` on it and serializes the processed data to a `.pkl` file.
for dataset in sources:
    load_imdb(dataset)

2022-03-17T21:29:58.348 [rics.utility.misc.get_local_or_remote:DEBUG] Local file path: '/home/dev/git/private-rics/jupyterlab/data-cache/name.basics.tsv.gz'.
2022-03-17T21:29:58.349 [rics.utility.misc.get_local_or_remote:DEBUG] Remote file path: 'https://datasets.imdbws.com/name.basics.tsv.gz'.
2022-03-17T21:29:58.447 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/name.basics.tsv.gz'..


https://datasets.imdbws.com/name.basics.tsv.gz:   0%|          | 0.00/214M [00:00<?, ?iB/s]

2022-03-17T21:30:04.073 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/git/private-rics/jupyterlab/data-cache/clean_and_fix_ids/name.basics.tsv.pkl'.
2022-03-17T21:30:04.074 [rics.utility.misc.get_local_or_remote:INFO] Running <function clean_and_fix_ids at 0x7ffa400b2550>..
2022-03-17T21:30:33.844 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/git/private-rics/jupyterlab/data-cache/clean_and_fix_ids/name.basics.tsv.pkl'..
2022-03-17T21:30:34.112 [rics.utility.misc.get_local_or_remote:DEBUG] Local file path: '/home/dev/git/private-rics/jupyterlab/data-cache/title.basics.tsv.gz'.
2022-03-17T21:30:34.113 [rics.utility.misc.get_local_or_remote:DEBUG] Remote file path: 'https://datasets.imdbws.com/title.basics.tsv.gz'.
2022-03-17T21:30:34.114 [rics.utility.misc.get_local_or_remote:INFO] Fetching data from 'https://datasets.imdbws.com/title.basics.tsv.gz'..


https://datasets.imdbws.com/title.basics.tsv.gz:   0%|          | 0.00/146M [00:00<?, ?iB/s]

2022-03-17T21:30:38.259 [rics.utility.misc.get_local_or_remote:INFO] Local processed file path: '/home/dev/git/private-rics/jupyterlab/data-cache/clean_and_fix_ids/title.basics.tsv.pkl'.
2022-03-17T21:30:38.261 [rics.utility.misc.get_local_or_remote:INFO] Running <function clean_and_fix_ids at 0x7ffa400b2550>..
  df = pd.read_csv(input_path, sep="\t", header=0, engine="c")
2022-03-17T21:31:05.202 [rics.utility.misc.get_local_or_remote:INFO] Serializing processed data to '/home/dev/git/private-rics/jupyterlab/data-cache/clean_and_fix_ids/title.basics.tsv.pkl'..


## Create translator from config

In [5]:
from rics.translation import Translator

# Build the translator from `config.yaml`, resolved relative to the working directory.
translator = Translator.from_config("config.yaml")
# NOTE(review): `_fetcher` is a private attribute; it is read here for display only.
print(f"Fetcher: {translator._fetcher}")
# Keep the object returned by store(); the next cell iterates it per source.
# The log output reports it as a TranslationMap covering both sources.
tmap = translator.store()

2022-03-17T21:31:06.138 [rics.translation.fetching.PandasFetcher:DEBUG] Sources initialized: ['name.basics', 'title.basics']
2022-03-17T21:31:06.138 [rics.translation.fetching.Fetcher:DEBUG] Overrides for source='name.basics': {'id': 'nconst', 'name': 'primaryName', 'from': 'birthYear', 'to': 'deathYear'}; unique to source.


Fetcher: PandasFetcher(read_function=read_pickle)


2022-03-17T21:31:06.485 [rics.translation.fetching.Fetcher:DEBUG] Fetched ('nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles', 'int_id_nconst') for 163739 IDS from 'name.basics' in 0.344963 sec.
2022-03-17T21:31:06.486 [rics.translation.fetching.Fetcher:DEBUG] Overrides for source='title.basics': {'id': 'tconst', 'name': 'primaryTitle', 'original_name': 'originalTitle', 'from': 'startYear', 'to': 'endYear'}; unique to source.
2022-03-17T21:31:06.554 [rics.translation.fetching.Fetcher:DEBUG] Fetched ('tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres', 'int_id_tconst') for 42152 IDS from 'title.basics' in 0.0673213 sec.
2022-03-17T21:31:06.555 [rics.translation.Translator:INFO] Store TranslationMap('title.basics': 42152 IDs, 'name.basics': 163739 IDs)


In [6]:
# Preview the first three translations of every source in the stored map.
for source in tmap:
    translations = tmap[source]
    print(f"Translations for {source=};")
    # Iterate the mapping fetched above instead of looking up tmap[source] again.
    for i, (idx, translation) in enumerate(translations.items()):
        print(f"    {idx!r} -> {translation!r}")
        if i == 2:  # Three entries have been printed; stop.
            break

Translations for source='name.basics';
    'nm0000001' -> 'nm0000001:Fred Astaire *1899†1987'
    'nm0000002' -> 'nm0000002:Lauren Bacall *1924†2014'
    'nm0000004' -> 'nm0000004:John Belushi *1949†1982'
Translations for source='title.basics';
    'tt0025509' -> 'tt0025509:Les Misérables (original: Les misérables) *1934†1934'
    'tt0035803' -> 'tt0035803:The German Weekly Review (original: Die Deutsche Wochenschau) *1940†1945'
    'tt0038276' -> 'tt0038276:You Are an Artist (original: You Are an Artist) *1946†1955'


## Prepare for `SqlFetcher` demo

PostgreSQL must be running locally, with a user called `postgres` using password `your_password` and a database called `imdb` created.
```python
import sqlalchemy

# Connects as user `postgres` with the placeholder password `your_password`
# to the `imdb` database on localhost:5432; substitute your own credentials.
engine = sqlalchemy.create_engine("postgresql://postgres:your_password@localhost:5432/imdb")

# `sources` and `load_imdb` are not defined in this snippet; they come from
# the notebook cells above, so run it in the same kernel.
for source in sources:
    # Element 0 of the load_imdb result is used as the DataFrame
    # (presumably the processed data; verify against `data.load_imdb`).
    df = load_imdb(source)[0]
    # Table name is the source name with "." replaced by "_";
    # if_exists="replace" overwrites a table that already exists.
    df.to_sql(source.replace(".", "_"), engine, if_exists="replace")
```
Copy and paste, then run this snippet to load data into the SQL database.