In [1]:
import contextlib

print('working')
import pydal.objects
# %run ~/work/__init__edwh__new.ipynb
#!pip install pydal httpx[http2] trio
from pydal import DAL
import os, pathlib, httpx, trio
if workdir := pathlib.Path('/home/jovyan/work').exists():
    workdir.chdir()
elif pathlib.Path('.jupyterlab/notebooks/DUO').exists():
    os.chdir('.jupyterlab/notebooks')
elif pathlib.Path('.').absolute().name == 'DUO':
    os.chdir('..')
elif pathlib.Path('.').absolute().name == 'notebooks':
    pass
else:
    raise ValueError(os.getcwd())
print(os.getcwd())

working
/home/remco/PycharmProjects/omgeving/jupyterlab/notebooks


[Datamodel](https://www.edustandaard.nl/app/uploads/2022/11/Canoniek-model-RIO-Generiek-20221121.pdf)

Bronnen:
https://onderwijsdata.duo.nl/datasets/rio_nfo_po_vo_vavo_mbo_ho

 * [Onderwijslocatiegebruiken](https://onderwijsdata.duo.nl/datasets/rio_nfo_po_vo_vavo_mbo_ho/resources/a86ef529-66dd-4fee-94f2-a947d4fc4617)
   `https://onderwijsdata.duo.nl/api/3/action/datastore_search?resource_id=a86ef529-66dd-4fee-94f2-a947d4fc4617&limit=5`
   > Op deze pagina vindt u de bestanden met betrekking tot de registratie instellingen en opleidingen (nfo, po, vo, vavo, mbo en ho). Deze bestanden kunt u handmatig downloaden of via de API (Application Programming Interface) te raadplegen. Voor vragen of uitgebreidere documentatie, waaronder het overkoepelend relatiemodel, kunt u contact opnemen met gegevensmagazijn@duo.nl.
 * [informatiemodel en begrippen](https://www.rio-onderwijs.nl/informatiemodel-en-begrippen)


Betere alternatieven:
https://datascience.stackexchange.com/questions/63101/collaborating-on-jupyter-notebooks

In [2]:
# duodb.mbo_relaties_opleidingseenheden_erkenningen.truncate()
import ipywidgets as widgets
import contextlib
output = widgets.Output()
display(output)

table_progress = widgets.IntProgress()
concurrent_requests = widgets.IntProgress(max=15)
table_progress.value = 0

with output:
    display(table_progress, concurrent_requests)

fieldmap  = dict(int='integer', timestamp='datetime', numeric='float',int4='integer')
import slugify
from functools import partial
fieldname = partial(slugify.slugify, separator='_')

def define_table(db, name, js):
    fields = [pydal.Field(fieldname(f['id']), fieldmap.get(f['type'], f['type'])) for f in js['result']['fields'] if f['id']!='_id']
    with contextlib.suppress(SyntaxError):
        return db.define_table(name, *fields)
    return db[name]

async def get(url, client, output):
    for retry in range(15):
        try:
            # print('requesting', url)
            concurrent_requests.value += 1
            js =  (await client.get(url, timeout=5)).json()
            # print('received', url)
            with output:
                print('+', len(js['result']['records']))
            return js
        except (httpx.HTTPError, httpx.RemoteProtocolError) as e:
            with output:
                print('retry',retry,':', url )
        finally:
            concurrent_requests.value -= 1
    return {}


async def load_table(table:pydal.objects.Table, lock:trio.Lock,  client:httpx.Client, url, total, max_retries=5):
    table_progress.value += 1
    table_output = widgets.Output()
    pbar = widgets.IntProgress(max=total)
    display(pbar, table_output)
    with table_output:
        display(pbar, f'Loading {table._tablename} from  {url}')
    check_for_trunk = True # flag to check only after the first request
    while True:
        js = await get(url, client, table_output)
        if check_for_trunk:
            async with lock:
                actual_rows_in_table = table._db(table).count()
                with table_output:
                    print(f'expecting {total} rows in table, actual: {actual_rows_in_table}')
                    if total != actual_rows_in_table:
                        print('Truncating table', table._tablename, 'because', actual_rows_in_table, '!=', total)
                        table.truncate()

            check_for_trunk = False
        if (nr_of_records := len(js['result']['records'])) == 0:
            break
        # assert len(js['result']['records']) == js['result']['total'], "Niet alle data is in de resultset beschikbaar"
        # with table_output:
        #     print('inserting', nr_of_records, 'records into', table._tablename)
        async with lock:
            for record in js['result']['records']:
                rec = {fieldname(k):v for k,v in record.items() if k != '_id'}
                for f in js['result']['fields']:
                    if f['type'] == 'timestamp':
                        lower_id = fieldname(f['id'])
                        rec[lower_id] = rec[lower_id].replace('T',' ') if rec[lower_id] else None
                    table.insert(**rec)
            pbar.value += nr_of_records
#            duodb.commit()
        url = 'https://onderwijsdata.duo.nl' + js['result']['_links']['next']
    table_output.clear_output()
    with table_output:
        print(f'Done with {table._tablename} excepted', total, 'records, inserted', pbar.value, 'records, for a total of ', table._db(table).count(), 'records')
    pbar.close()
    #pbar.close()
    table_progress.value += 1


#resource_map = dict(
#    onderwijslocatiegebruiken = 'a86ef529-66dd-4fee-94f2-a947d4fc4617',
#    onderwijslocaties = 'a7e3f323-6e46-4dca-a834-369d9d520aa8',
#)
api = httpx.get('https://onderwijsdata.duo.nl/api/3/action/package_show?id=rio_nfo_po_vo_vavo_mbo_ho').json()
from pprint import pp
resource_map = {resource['name']:resource['id'] for resource in  api['result']['resources']}
resource_map = {k:v for k,v in resource_map.items() if not k.endswith('cohorten')}
# resource_map = {resource['name']:resource['id'] for resource in  api['result']['resources'] if resource['name'].startswith(('aangeboden','onderwijslocatie','onderwijsbesturen','onderwijsaanbieders')) and not resource['name'].endswith('cohorten') }
if 'duodb' not in locals():
    duodb = pydal.DAL('sqlite://duo-data.sqlite3', folder='./DUO')
resource_map |= {
    'leerlingen_po_per_vestiging':'9278ae97-4014-49f4-91fc-8cc255c2595d',
    'leerlingen_vo_per_vestiging':'d49219cc-2f36-4c2d-8007-b385ba44ec8d'
}


async def main():
    concurrent_table_limiter = trio.CapacityLimiter(concurrent_requests.max)
    duodb_lock = trio.Lock()
    table_progress.max = len(resource_map) * 2 # 1 for starting the table, 1 for finishing the table
    async with httpx.AsyncClient(timeout=5, http2=True, limits=httpx.Limits(max_connections=concurrent_requests.max, max_keepalive_connections=concurrent_requests.max)) as client, trio.open_nursery() as nursery:
        for idx, (naam, gid) in enumerate(resource_map.items()):
            # print(idx,'/',len(resource_map),':',naam,':', gid)
            js = await get(f'https://onderwijsdata.duo.nl/api/3/action/datastore_search?resource_id={gid}&limit=1', client, output)
            if 'result' not in js:
                pp(js)
                continue
            table = define_table(duodb, naam, js)
            async with concurrent_table_limiter:
                nursery.start_soon(
                    load_table, table, duodb_lock, client, f'https://onderwijsdata.duo.nl/api/3/action/datastore_search?resource_id={gid}&limit=500', int(js['result']['total'])
                )
trio.run(main)

Output()

IntProgress(value=0, max=8264)

Output()

IntProgress(value=0, max=1492)

Output()

IntProgress(value=0, max=253)

Output()

IntProgress(value=0, max=28653)

Output()

IntProgress(value=0, max=213)

Output()

IntProgress(value=0, max=181115)

Output()

IntProgress(value=0, max=678)

Output()

IntProgress(value=0, max=595)

Output()

IntProgress(value=0, max=22464)

Output()

IntProgress(value=0, max=4024)

Output()

IntProgress(value=0, max=12988)

Output()

IntProgress(value=0, max=8412)

Output()

IntProgress(value=0, max=5988)

Output()

IntProgress(value=0, max=113413)

Output()

IntProgress(value=0, max=41869)

Output()

IntProgress(value=0, max=13470)

Output()

IntProgress(value=0, max=316)

Output()

IntProgress(value=0, max=147)

Output()

IntProgress(value=0, max=10253)

Output()

IntProgress(value=0, max=16)

Output()

IntProgress(value=0, max=646)

Output()

IntProgress(value=0, max=143041)

Output()

IntProgress(value=0, max=3724)

Output()

IntProgress(value=0, max=15801)

Output()

IntProgress(value=0, max=6509)

Output()

IntProgress(value=0, max=86177)

Output()

IntProgress(value=0, max=10395)

Output()

IntProgress(value=0, max=40)

Output()

IntProgress(value=0, max=5567)

Output()

IntProgress(value=0, max=68277)

Output()

IntProgress(value=0, max=365)

Output()

IntProgress(value=0, max=12204)

Output()

IntProgress(value=0, max=274)

Output()

IntProgress(value=0, max=13147)

Output()

IntProgress(value=0, max=32273)

Output()

IntProgress(value=0, max=16174)

Output()

IntProgress(value=0, max=12943)

Output()

IntProgress(value=0, max=636)

Output()

IntProgress(value=0, max=34)

Output()

IntProgress(value=0, max=10)

Output()

IntProgress(value=0, max=5)

Output()

IntProgress(value=0, max=1)

Output()

IntProgress(value=0, max=154)

Output()

IntProgress(value=0, max=5)

Output()

IntProgress(value=0, max=151)

Output()

IntProgress(value=0, max=23734)

Output()

IntProgress(value=0, max=489)

Output()

IntProgress(value=0, max=851)

Output()

IntProgress(value=0, max=897)

Output()

IntProgress(value=0, max=11729)

Output()

IntProgress(value=0, max=7997)

Output()

IntProgress(value=0, max=11461)

Output()

IntProgress(value=0, max=246)

Output()

IntProgress(value=0, max=17550)

Output()

IntProgress(value=0, max=10313)

Output()

IntProgress(value=0, max=11083)

Output()

IntProgress(value=0, max=78)

Output()

IntProgress(value=0, max=11193)

Output()

IntProgress(value=0, max=596)

Output()

IntProgress(value=0, max=10829)

Output()

IntProgress(value=0, max=867)

Output()

IntProgress(value=0, max=1553)

Output()

IntProgress(value=0, max=12)

Output()

IntProgress(value=0, max=8188)

Output()

IntProgress(value=0, max=68)

Output()

IntProgress(value=0, max=79)

Output()

IntProgress(value=0, max=11)

Output()

IntProgress(value=0, max=51)

Output()

IntProgress(value=0, max=79)

Output()

IntProgress(value=0, max=39548)

Output()

IntProgress(value=0, max=24)

Output()

IntProgress(value=0, max=22)

Output()

IntProgress(value=0, max=563160)

Output()

IntProgress(value=0, max=1152)

Output()

IntProgress(value=0, max=1915)

Output()

IntProgress(value=0, max=487)

Output()

IntProgress(value=0, max=1568)

Output()

{'help': 'https://onderwijsdata.duo.nl/api/3/action/help_show?name=datastore_search',
 'error': {'__type': 'Not Found Error',
           'message': 'Not found: Resource '
                      '"29902285-3328-4cdc-80fa-34216dbaf7cf" was not found.'},
 'success': False}


IntProgress(value=0, max=1167)

Output()

IntProgress(value=0, max=1548)

Output()

IntProgress(value=0, max=2026)

Output()

IntProgress(value=0, max=924)

Output()

IntProgress(value=0, max=16839)

Output()

IntProgress(value=0, max=365)

Output()

IntProgress(value=0, max=12674)

Output()

IntProgress(value=0, max=10292)

Output()

IntProgress(value=0, max=1195)

Output()

IntProgress(value=0, max=805)

Output()

IntProgress(value=0, max=33794)

Output()

IntProgress(value=0, max=7259)

Output()

IntProgress(value=0, max=96)

Output()

IntProgress(value=0, max=98190)

Output()

IntProgress(value=0, max=134640)

Output()

In [3]:
mem(mem.onderwijslocatiegebruiken).select(limitby=(0,10)).as_dict()


NameError: name 'mem' is not defined

In [None]:

gid = 'd49219cc-2f36-4c2d-8007-b385ba44ec8d'
js = httpx.get(f'https://onderwijsdata.duo.nl/api/3/action/datastore_search?resource_id={gid}&limit=10').json()

In [None]:
# duodb.leerlingen_vo_per_vestiging.truncate()

In [None]:
duodb.commit()