# 0. Setup

In [None]:
import os

from edwh.core.backend.support import Environment
from edwh.core.data_model import OrganisationPriority
import datetime
#!pip install ujson  > /dev/null
# import ujson
!pip install tqdm pandas
from tqdm import tqdm
# Register tqdm's pandas integration (progress_apply & co.); the bars are labelled 'Jupyter'.
tqdm.pandas(desc='Jupyter')

In [None]:
# One shared edwh environment for the whole notebook. `db` is the handle that
# later cells query; `backend` provides `update_effectivedated` for writes.
env = Environment(long_running=True)
db, backend = env.db, env.backend

In [None]:
import os
# The relative 'data/...' paths used further down are resolved against this directory.
os.chdir('/home/jovyan/work')

In [None]:
import sqlite3
import gc

def dict_factory(cursor, row):
    """Row factory that turns a raw sqlite3 row into a ``{column_name: value}`` dict.

    Based on https://stackoverflow.com/questions/3300464/how-can-i-get-dict-from-sqlite-query

    :param cursor: cursor whose ``description`` has one entry per result column;
                   the first item of each entry is the column name.
    :param row: the fetched row, indexable by column position.
    :return: dict mapping each column name to the value at the same position
             (for duplicate column names the last one wins).
    """
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }

# Open data/all_with_gid.db and have every fetched row returned as a dict (via dict_factory).
con = sqlite3.connect("data/all_with_gid.db")
con.row_factory = dict_factory
# A single cursor, shared by the query_all / query_one helpers below.
cur = con.cursor()

def query_all(sql, params=()) -> list[dict]:
    """Execute ``sql`` on the shared all_with_gid.db cursor and return all rows.

    :param sql: the SQL statement to run.
    :param params: optional values bound to the ``?`` / ``:name`` placeholders in
                   ``sql``. Prefer this over formatting values into the SQL text.
    :return: every result row; rows are dicts because the connection's
             ``row_factory`` is ``dict_factory``.
    """
    cur.execute(sql, params)
    return cur.fetchall()

def query_one(sql, params=()) -> "dict | None":
    """Execute ``sql`` on the shared all_with_gid.db cursor and return the first row.

    :param sql: the SQL statement to run.
    :param params: optional values bound to the ``?`` / ``:name`` placeholders in
                   ``sql``. Prefer this over formatting values into the SQL text.
    :return: the first result row (a dict, because the connection's
             ``row_factory`` is ``dict_factory``) or ``None`` when the query
             yields no rows.
    """
    cur.execute(sql, params)
    return cur.fetchone()

# 1. brin + vestigingsnummer

In [None]:
# organisation_effdted_now contains the latest data with the highest prio
# (same fields as db.organisation).
table = db.organisation_effdted_now
# `== None` is deliberate: these comparisons build query conditions (IS NULL),
# they are not evaluated as Python booleans.
missing_info_query = (table.brin == None) | (table.vestigingscode == None)
# gid -> row, for every organisation lacking a brin and/or a vestigingscode
latest_rows_missing_brin = db(missing_info_query).select().as_dict(key='gid')

In [None]:
# brin info: every `scholen` row with a non-empty "BRIN NUMMER", indexed by org_gid
brin_rows = query_all(
    """select * from scholen where "BRIN NUMMER" is not null and "BRIN NUMMER" != '';"""
)
brinfo = {record['org_gid']: record for record in brin_rows}
# The index is all that is needed from here on; release the list to free some memory.
del brin_rows
gc.collect()
# Ending the cell on a statement keeps gc.collect()'s return value out of the cell output.
pass

In [None]:
from edwh.core.data_model import OrganisationPriority
from pprint import pprint

# gid -> which part ("brin" or "vestiging") could not be determined
incomplete = {}

updated_count = 0
# Start from a clean transaction; everything below is committed in one go.
db.rollback()
for gid in tqdm(latest_rows_missing_brin):
    info = brinfo.get(gid)
    if not info:
        # No `scholen` row with a BRIN NUMMER for this gid -> nothing to fill in.
        incomplete[gid] = "brin"
        continue

    brin = info['BRIN NUMMER']
    # VESTIGINGSNUMMER can be NULL; treat that as "no vestiging" instead of
    # crashing on the string operations below.
    vestiging = info['VESTIGINGSNUMMER'] or ''
    # VESTIGINGSNUMMER is usually brin + vestiging, e.g. 06YL00 = 06YL + 00.
    # Only strip the brin when it really is the prefix (not just a substring).
    vestiging = vestiging.removeprefix(brin)

    if not vestiging:
        incomplete[gid] = "vestiging"
        continue

    # Explicit raises instead of `assert`, so the checks also run under `python -O`.
    if len(brin) != 4:
        raise ValueError(f"Incorrect brin: {brin} (gid: {gid})")
    if len(vestiging) != 2:
        raise ValueError(
            f"Incorrect vestiging: {vestiging} (gid: {gid}, brin: {brin}, VN: {info['VESTIGINGSNUMMER']})"
        )

    # todo: sometimes there is a brin but no vestigingsnummer;
    # if the brin is unique in that case: do fill in the brin and check what the
    # current data says (for the vestigingsnummer)
    backend.update_effectivedated(
        db.organisation,
        db.organisation.gid == gid,
        values={
            'brin': brin,
            'vestigingscode': vestiging,
        },
        prio=OrganisationPriority.DUO,
        last_saved_by="Locatiegegevens aan Organisatie.ipynb op basis van all_with_gid.db",
    )

    updated_count += 1

db.commit()
print(f"Updated {updated_count} out of {len(latest_rows_missing_brin)} items")

In [None]:
from collections import defaultdict
# brin/vestiging are NOT collected in ALL_UPDATES, because everything else depends on them!
# (they are written to the database directly, before the other fields are looked up)
ALL_UPDATES = defaultdict(dict) # gid -> {field name: new value}

# 2. LeerlingNummers

In [None]:
import collections
import json

# key: (brin, vestigingscode as int); value: total number of students
leerling_info = collections.defaultdict(int)
# key: brin; value: the vestigingscodes seen for that brin (raw value, one entry per JSON row)
vestigingen = collections.defaultdict(list)

# JSON is UTF-8 by specification; be explicit instead of relying on the locale's default encoding.
with open('data/leerlingen.json', encoding='utf-8') as f:
    for row in json.load(f):
        # A location can occur in several rows; sum their student counts.
        leerling_info[(row['brin'], int(row['vestigingscode']))] += row['leerling_aantal']
        vestigingen[row['brin']].append(row['vestigingscode'])

In [None]:
table = db.organisation_effdted_now
# Organisations that do have a brin + vestigingscode, but whose student_count
# is empty or zero. (`== None` / `!= None` build IS NULL / IS NOT NULL conditions.)
query = (
    ((table.student_count == None) | (table.student_count == 0))
    & (table.brin != None)
    & (table.vestigingscode != None)
)

# gid -> {gid, brin, vestigingscode}
rows = db(query).select(table.gid, table.brin, table.vestigingscode).as_dict(key='gid')

In [None]:
# gid -> reason why no student_count could be queued
missing = {}
updated_count = 0
for gid, org in tqdm(rows.items()):
    brin, vestiging = org['brin'], org['vestigingscode']
    if brin is None:
        missing[gid] = 'brin'
    elif vestiging is None:
        missing[gid] = 'vestiging'
    else:
        # leerling_info is keyed on (brin, vestigingscode as int)
        student_count = leerling_info.get((brin, int(vestiging)))
        if student_count:
            # Only queued here; the database write happens in the final
            # "push all changes" cell, which applies ALL_UPDATES.
            ALL_UPDATES[gid]['student_count'] = student_count
            updated_count += 1
        else:
            # unknown location (None) or a total of 0 students
            missing[gid] = f'student_count: {student_count}'

db.commit()

print(f"updated {updated_count} out of {len(rows)} items.")

# 3. Overige Data
## 3.1: geo (lonlat)

In [None]:
table = db.organisation_effdted_now
# Organisations without coordinates (`== None` builds an IS NULL condition).
query = table.lonlat == None

# Only the gid is selected; the geo value itself is looked up in all_with_gid.db.
rows = db(query).select(table.gid).as_list()

In [None]:
# lonlat: the `scholen` rows that have a geo value, indexed by org_gid
lonlat_rows = query_all("""select org_gid, geo from scholen where geo is not null;""")
lonlat_info = {record['org_gid']: record for record in lonlat_rows}
# The index is all that is needed from here on; release the list to free some memory.
del lonlat_rows
gc.collect()
# Ending the cell on a statement keeps gc.collect()'s return value out of the cell output.
pass

In [None]:
updated_count = 0
# gid -> reason why no lonlat could be queued
missing = {}
for row in tqdm(rows):
    gid = row['gid']
    # No entry for this gid, or an empty geo value, both count as missing.
    lonlat = lonlat_info.get(gid, {}).get('geo')
    if lonlat:
        # Only queued here; the database write happens in the final
        # "push all changes" cell, which applies ALL_UPDATES.
        ALL_UPDATES[gid]['lonlat'] = lonlat
        updated_count += 1
    else:
        missing[gid] = 'geo info'

db.commit()

print(f"updated {updated_count} out of {len(rows)} items.")

## 3.2: sector + education type, education_level(s)

In [None]:
# Second SQLite source: data/leerlingen.sqlite, also returning rows as dicts (via dict_factory).
con_leerlingen = sqlite3.connect("data/leerlingen.sqlite")
con_leerlingen.row_factory = dict_factory
# A single cursor, shared by the query_*_leerlingen helpers below.
cur_leerlingen = con_leerlingen.cursor()

def query_all_leerlingen(sql, params=()) -> list[dict]:
    """Execute ``sql`` on the shared leerlingen.sqlite cursor and return all rows.

    :param sql: the SQL statement to run.
    :param params: optional values bound to the ``?`` / ``:name`` placeholders in
                   ``sql``. Prefer this over formatting values into the SQL text.
    :return: every result row; rows are dicts because the connection's
             ``row_factory`` is ``dict_factory``.
    """
    cur_leerlingen.execute(sql, params)
    return cur_leerlingen.fetchall()

def query_one_leerlingen(sql, params=()) -> "dict | None":
    """Execute ``sql`` on the shared leerlingen.sqlite cursor and return the first row.

    :param sql: the SQL statement to run.
    :param params: optional values bound to the ``?`` / ``:name`` placeholders in
                   ``sql``. Prefer this over formatting values into the SQL text.
    :return: the first result row (a dict, because the connection's
             ``row_factory`` is ``dict_factory``) or ``None`` when the query
             yields no rows.
    """
    cur_leerlingen.execute(sql, params)
    return cur_leerlingen.fetchone()

In [None]:
# ed_level_rows = query_all("""select org_gid, ONDERWIJSSTRUCTUUR, VESTIGINGSNAAM from scholen where geo is not null;""")
# ed_level_info = {_['org_gid']: (_['ONDERWIJSSTRUCTUUR'], _["VESTIGINGSNAAM"]) for _ in ed_level_rows}
# # clean up memory a bit:
# del ed_level_rows
# gc.collect()
# pass

In [None]:
db.rollback()
table = db.organisation_effdted_now
# Organisations with a brin + vestigingscode that lack a sector and/or an
# education_type. A missing education_level alone does not select a row
# (that condition is disabled), but the column is fetched so it can be
# filled in alongside the sector.
query = (
    ((table.sector == None) | (table.education_type == None))
    & (table.brin != None)
    & (table.vestigingscode != None)
)

# gid -> row with the currently stored values
rows = db(query).select(
    table.gid,
    table.brin,
    table.vestigingscode,
    table.sector,
    table.education_type,
    table.education_level,
).as_dict(key='gid')

In [None]:
from collections import defaultdict

# leerlingen.sqlite contains onderwijs_type & sector(s):
# (brin, vestigingscode as int) -> list of (onderwijs_niveau, onderwijs_type),
# one entry per row of leerling_aantallen
sectors = defaultdict(list)
info = query_all_leerlingen("""select * from leerling_aantallen;""")

for row in info:
    sectors[(row['brin'], int(row['vestigingscode']))].append(
        (row['onderwijs_niveau'], row['onderwijs_type'])
    )

# the raw rows are no longer needed once they are grouped
del info

# all_with_gid.db contains 'onderwijsstructuur' -> niveau(s)

In [None]:
# def parse_education_level(onderwijsstructuur: str, vestigingsnaam: str) -> set[str]:
#     # convert ONDERWIJSSTRUCTUUR to value allowed by ALLOWED_LEVELS
#     levels = set()
#     if not onderwijsstructuur:
#         # todo: probably basisschool?
#         return levels
#     parts = onderwijsstructuur.lower().split('/')
#
#     if 'vwo' in parts:
#         levels.add('vwo')
#     if 'havo' in parts:
#         levels.add('havo')
#
#     print(parts, vestigingsnaam)
#
#     return levels
#
# parse_education_level()

In [None]:
from edwh.core.data_model import EDUCATION_SECTOR, EDUCATION_TYPE, EDUCATION_LEVEL

ALLOWED_SECTORS = set(EDUCATION_SECTOR.keys())
ALLOWED_TYPES = set(EDUCATION_TYPE.keys())
ALLOWED_LEVELS = set(EDUCATION_LEVEL.keys())

# field -> gids where the field is empty and could NOT be derived
missing = {
    'sector': [],
    'type': [],
}

# field -> gids where the field was empty and has been derived
found = {
    'sector': [],
    'type': [],
}

for gid, existing_data in tqdm(rows.items()):
    found_any = False
    sector_key = (existing_data['brin'], int(existing_data['vestigingscode']))
    # list of (onderwijs_niveau, onderwijs_type) tuples, or None for an unknown location
    sector_info = sectors.get(sector_key)

    if not existing_data['sector']:
        if sector_info:
            # `sectors` has one entry per source row, so the same sector can
            # occur many times: keep each one once, in first-seen order.
            offered_sectors = list(dict.fromkeys(aanbod[0] for aanbod in sector_info))
            for sector in offered_sectors:
                # explicit raise instead of `assert`, so the check also runs under `python -O`
                if sector not in ALLOWED_SECTORS:
                    raise ValueError(f"Invalid {sector=}")
            existing_data['sector'] = offered_sectors

            if not existing_data['education_level'] and "po" in offered_sectors:
                # fill it in based on sector info
                existing_data['education_level'] = ["bo"]

            found['sector'].append(gid)
            found_any = True
        else:
            # Only rows that really lack a sector are reported as missing;
            # rows that already have one are left alone.
            missing['sector'].append(gid)

    if not existing_data['education_type']:
        if sector_info:
            # Only one type per location: use the first entry and stop there.
            education_type = sector_info[0][1]
            if education_type not in ALLOWED_TYPES:
                raise ValueError(f"Invalid type={education_type!r}")
            existing_data['education_type'] = education_type

            found['type'].append(gid)
            found_any = True
        else:
            missing['type'].append(gid)

    if found_any:
        # Only queued here; the database write happens in the final
        # "push all changes" cell, which applies ALL_UPDATES.
        if existing_data['education_type']:
            ALL_UPDATES[gid]['education_type'] = existing_data['education_type']
        if existing_data['education_level']:
            ALL_UPDATES[gid]['education_level'] = existing_data['education_level']
        if existing_data['sector']:
            ALL_UPDATES[gid]['sector'] = existing_data['sector']

db.commit()

print("Missing:")
print({level: len(missing[level]) for level in missing})
print("Found:")
print({level: len(found[level]) for level in found})

# Niveaus

In [None]:
from pydal import DAL, Field
import tabulate, re

# Open data/vo.db through pydal; migrate=False so pydal does not try to create or alter tables.
vodb = DAL('sqlite://vo.db', folder='data', migrate=False)


# Compiled once at definition time: clean() runs for every row SQLite feeds it.
_LEERJAAR_RANGE = re.compile(r'[0-9]-[0-9]')
_DIGIT = re.compile(r'[0-9]')
# The dot is escaped so that only the literal abbreviation "uitbest." matches.
_UITBESTEED_VAVO = re.compile(r'uitbest\. aan VAVO')


def clean(onderwijstype):
    """Reduce an 'onderwijstype + leerjaar' label to the bare onderwijstype.

    Removes the 'lj' marker, year ranges such as ``3-4``, any remaining digits
    and the literal suffix 'uitbest. aan VAVO'. Every label that mentions
    'Praktijkonderwijs' collapses to exactly that word.

    :param onderwijstype: the raw label (a string).
    :return: the cleaned label with surrounding whitespace stripped.
    """
    geschoond_type = _LEERJAAR_RANGE.sub('', onderwijstype.replace('lj', ''))
    geschoond_type = _DIGIT.sub('', geschoond_type)
    geschoond_type = _UITBESTEED_VAVO.sub('', geschoond_type)
    if 'Praktijkonderwijs' in geschoond_type:
        return 'Praktijkonderwijs'
    return geschoond_type.strip()


# Register clean() on the underlying SQLite connection as a 1-argument SQL
# function, so the query below can call it.
vodb._adapter.connection.create_function("clean", 1, clean)

# One row per distinct (brin nummer, vestigingsnummer, cleaned onderwijstype):
# grouping on all three selected expressions collapses the duplicates.
sql = '''
select  [brin nummer],
        [vestigingsnummer],
        clean("ONDERWIJSTYPE VO EN LEER- OF VERBLIJFSJAAR") as geschoond_type
  from tsv1
  group by [brin nummer], [vestigingsnummer], clean("ONDERWIJSTYPE VO EN LEER- OF VERBLIJFSJAAR")
'''
# rows come back as (brin nummer, vestigingsnummer, geschoond_type) in select order
output = vodb.executesql(sql)

In [None]:
# (brin nummer, vestigingsnummer) -> the cleaned onderwijstypes found in vo.db
niveaus_per_locatie = defaultdict(list)

# NOTE(review): the vestigingsnummer is stored exactly as vo.db returns it, while
# the lookup further down uses int(vestigingscode). If vo.db holds it as text
# (e.g. '00' or '06YL00') nothing will ever match. TODO confirm the column type.
for brin, locatie, niveau in output:
    niveaus_per_locatie[brin, locatie].append(niveau)

In [None]:
db.rollback()
table = db.organisation_effdted_now
# Organisations with a brin + vestigingscode but without an education_level.
# (`== None` / `!= None` build IS NULL / IS NOT NULL conditions.)
query = (
    (table.education_level == None)
    & (table.brin != None)
    & (table.vestigingscode != None)
)

# gid -> {gid, brin, vestigingscode}
rows = db(query).select(table.gid, table.brin, table.vestigingscode).as_dict(key='gid')

In [None]:
# plain counters in this cell (not the per-gid collections used in earlier cells)
missing = 0
found = 0

for gid, row in tqdm(rows.items()):
    niveaus = niveaus_per_locatie.get((row['brin'], int(row['vestigingscode'])))
    if not niveaus:
        missing += 1
        continue

    found += 1
    # Replaces any education_level queued earlier for this gid. The list object
    # is the one stored in niveaus_per_locatie (not a copy).
    ALL_UPDATES[gid]['education_level'] = niveaus

print(f'{missing=} - {found=} out of {len(rows)}')

# Push All Changes at once:

In [None]:
# Sanity check before writing: the number of organisations with queued changes
# versus the total number of rows in organisation_effdted_now.
print(
    'will update:',
    len(ALL_UPDATES),
    'out of:',
    db(db.organisation_effdted_now).count())

In [None]:
# Start from a clean transaction, write the queued changes of every gid as one
# DUO-priority update per organisation, and commit once at the very end.
db.rollback()
for gid, changes in tqdm(ALL_UPDATES.items()):
    # `changes` is the {field: new value} dict collected in the cells above
    backend.update_effectivedated(
            db.organisation,
            db.organisation.gid == gid,
            values=changes,
            prio=OrganisationPriority.DUO,
            last_saved_by="Locatiegegevens aan Organisatie.ipynb op basis van leerlingen.sqlite/leerlingen.json and all_with_gid.db"
        )

db.commit()