---

### for me

 - [YAML Cheatsheet](https://quickref.me/yaml.html)
 - [YAML Viewer](https://jsonformatter.org/yaml-viewer)

# Notes

 - provide links and URIs wherever possible
 - _extended_ YAML with inline links in Markdown style [text](https://www.example.org)
 - 

# Feedback

 - the more you annotate _what_ the information in the guides _is_ (types, meta-info), the more information we can extract and potentially link together with the Datahub
 - try to keep structures _across_ guides of different levels _and_ pieces of information as uniform as possible
   -> makes info _predictable_ for others and allows more of it to be linked into the Datahub
 - 

# Problems

 - which URIs?
   -> definitely the Datahub ones _at some point_ but those aren't ready yet
  - which thesaurus/thesauri?

# Thoughts, Ideas

 - link directly to other search guides (across levels)
 - link directly to example objects that the current guide pertains to (is exemplary of)
   -> this serves the "points of access"/"portals" motivation of the research guides

---

In [None]:
import os

os.path.split("hello/world/whatever")

# YAML to Markdown Parsing - custom-built

In [2]:
from glob import glob
import yaml

from ResearchAids import ResearchAid

BASE_DIR = "../published"
eng = glob(f"{BASE_DIR}/*/English/*.yml")
dutch = glob(f"{BASE_DIR}/*/Dutch/*.yml")
# top = glob(f"{BASE_DIR}/TopLevel/*.yml")

yaml_files = sorted(dutch + eng)

# Smoke test: try to build a ResearchAid from every published guide and report
# which files fail to parse, and why.
for filename in yaml_files:
    # Read as UTF-8 explicitly instead of relying on the platform default;
    # the guides contain non-ASCII text.
    with open(filename, encoding="utf-8") as handle:
        yml = yaml.safe_load(handle)

    try:
        r = ResearchAid(yml, raise_parsing_error=True)
    except KeyError as e:
        # Report each failure once; only a KeyError mentioning "remarks"
        # aborts the run, all others are reported and skipped.
        print(filename, e)
        if "remarks" in str(e):
            raise
    except AttributeError as e:
        print(filename, e)
    except TypeError as e:
        print(filename, e)
        # Show the section that is inspected when a TypeError occurs.
        print(yml["Sources"])

In [None]:
from ResearchAids import *


Level2.parse_relevant_data("hello")

In [None]:
for y in yamls:
    print(y.keys())

In [None]:
# print(YAML2MD(yamls[1])())
# yamls[3]["Relevant data"]#["Tags"]
print([s.keys() for s in yamls[2]["Sources"]['Secondary sources']][0], "\n---")
print([s.keys() for s in yamls[3]["Sources"]['Secondary sources']][0])



In [None]:
# print(Level2(yamls[2])())

print(ResearchAid(yamls[0], raise_parsing_error=True)())

In [None]:
with open("../published/niveau3/English/NZG_20240508.yml") as handle:
    yml = yaml.safe_load(handle)

[d.keys() for d in yml["Sources"]['Secondary sources']]
yml["Sources"]['Secondary sources']

---

In [None]:
import re
from glob import glob
import yaml


# Markdown link whose target is a GeoNames URL (sws. or www. host).
# Group 1 is the link text, group 2 the numeric GeoNames id.
# The dots in the host name are escaped so that only the real GeoNames hosts
# match; compiled once here instead of on every call.
_GEONAMES_MD_LINK_RE = re.compile(
    r"\[(.*)\]\(https?:\/\/(?:sws|www)\.geonames\.org\/([0-9]+)\/?.*\)"
)


def correct_IRI(url):
    """Normalise a Markdown GeoNames link to its canonical IRI form.

    Canonical IRIs look like:
      - https://sws.geonames.org/6255149/
      - http://vocab.getty.edu/aat/300266789
      - http://www.wikidata.org/entity/Q219477
    Only the GeoNames form is rewritten here.

    Args:
        url: A string, expected to be a Markdown link ``[text](target)`` or
            arbitrary text.

    Returns:
        ``[text](https://sws.geonames.org/<id>/)`` when ``url`` starts with a
        Markdown link to sws./www.geonames.org; otherwise ``url`` unchanged.
        Because the match is anchored at the start of the string and the
        result is rebuilt from the two groups, anything after the link's
        closing parenthesis is not part of the returned value.

    Side effects:
        Prints a line for every rewritten link, and a warning for strings
        that mention "http" or "www" within their first 20 characters but
        were not recognised.
    """
    match = _GEONAMES_MD_LINK_RE.match(url)
    if match:
        link_text, geonames_id = match.groups()
        print(f"parsed {url}")
        return f"[{link_text}](https://sws.geonames.org/{geonames_id}/)"

    if ("http" in url[:20]) or ("www" in url[:20]):
        print(f" {url}  didn't parse! is it correct?")

    return url



complex_types = (list, dict)
def iter_urls(yml):
    """Walk a parsed YAML structure and pass every string through ``correct_IRI``.

    Strings are replaced by the result of ``correct_IRI``; lists and dicts are
    rebuilt recursively (dict keys are processed as well as values); any other
    value is returned as-is.

    Args:
        yml: A string, list, dict or any other value found in a loaded YAML
            document.

    Returns:
        A structure of the same shape with all strings processed.
    """
    if isinstance(yml, str):
        return correct_IRI(yml)

    if isinstance(yml, list):
        return [iter_urls(item) for item in yml]

    if isinstance(yml, dict):
        converted = {}
        for key, value in yml.items():
            # Key first, then value: same visiting order as the data itself.
            new_key = iter_urls(key)
            converted[new_key] = iter_urls(value)
        return converted

    return yml

In [None]:
BASE_DIR = "../published"
eng = glob(f"{BASE_DIR}/*/English/*.yml")
dutch = glob(f"{BASE_DIR}/*/Dutch/*.yml")
# top = glob(f"{BASE_DIR}/TopLevel/*.yml")

yaml_files = sorted(dutch + eng)

# Run the IRI check over every published guide; the interesting output is
# what correct_IRI prints, the converted structure itself is discarded.
for filename in yaml_files:
    print(filename)
    # Read as UTF-8 explicitly instead of relying on the platform default;
    # the guides contain non-ASCII text.
    with open(filename, encoding="utf-8") as handle:
        yml = yaml.safe_load(handle)
        iter_urls(yml)
    print("\n-------------------\n")

In [None]:
with open("../published/niveau0/Dutch/TopLevel_20240606.yml") as handle:
    yml = yaml.safe_load(handle)
    print(yml == iter_urls(yml))


---

# parsing MD to DOCX

In [None]:
import yaml
from ResearchAids import ResearchAid

In [None]:
aid = "../published/niveau3/English/WMLeiden_20240508.yml"
md_file = "../EXPORTS/MD/niveau3/English/WMLeiden.md"

# Load the exported Markdown and the YAML source of the same guide.
# Both are read as UTF-8 explicitly instead of relying on the platform default.
with open(md_file, encoding="utf-8") as handle:
    md = handle.read()

with open(aid, encoding="utf-8") as handle:
    yml = yaml.safe_load(handle)


# print(md)

In [None]:
ra = ResearchAid(yml)
if ra._parsed:
    # Calling the aid renders it; the result is written out as text below.
    md_content = ra()

    # Write as UTF-8 explicitly so non-ASCII characters in the guide do not
    # depend on (or fail under) the platform default encoding.
    with open("test.md", "w", encoding="utf-8") as handle:
        handle.write(md_content)

In [None]:
import re

# Markdown image embed: "!" followed by "[alt](target)". The quantifiers are
# non-greedy so that several images on one line are matched one by one
# instead of as a single span from the first "![" to the last ")".
img_regex = re.compile(r"!(\[.+?\]\(.+?\))")

# Turn every image embed into a plain link by dropping the leading "!".
md2 = img_regex.sub(r"\1", md_content)


In [None]:
print(md2)

In [None]:
from datetime import datetime



---
# related aids table

In [None]:
# Load every guide listed in yaml_files (defined in an earlier cell) into
# memory, in the same order as yaml_files.
ymls = []
for filename in yaml_files:
    with open(filename) as handle:
        yml = yaml.safe_load(handle)
        ymls.append(yml)

# Pick one guide to experiment with; index 10 is an arbitrary sample.
curaid = ymls[10]
# NOTE(review): the key is spelled "RelatedAides" here — confirm it matches
# the spelling used in the YAML files.
relaids = curaid["RelatedAides"]

In [None]:
def parse_related(ls):
    """Group related-aid entries by the level digit found in their link.

    Each element of ``ls`` is a mapping whose first item is
    ``title -> {"link": ..., "rel_type": ...}``. The level is the last
    character of the link's first "/"-separated segment, read as an integer
    (e.g. ``"niveau2/..."`` gives 2).

    Args:
        ls: Iterable of such mappings.

    Returns:
        A list of four lists (levels 0-3); each holds ``(title, link)``
        tuples in input order.

    Raises:
        KeyError: If an entry lacks ``"link"`` or ``"rel_type"``.
        IndexError: If a mapping is empty or the level is outside 0-3.
        ValueError: If the level character is not a digit.
    """
    by_level = [[], [], [], []]
    for entry in ls:
        title, details = list(entry.items())[0]
        target = details["link"]
        # Looked up but not used further; a missing key still raises KeyError.
        _rel_type = details["rel_type"]

        first_segment = target.split("/")[0]
        by_level[int(first_segment[-1])].append((title, target))
    return by_level

def empty_row(n_cols, fill="     "):
    """Return ``n_cols`` pipe characters separated by ``fill``.

    With the default filler this is a Markdown table row of ``n_cols - 1``
    blank cells; ``n_cols`` of 0 gives an empty string.
    """
    pipes = "|" * n_cols
    return fill.join(pipes)

def md_row(cells, total_cols):
    """Render ``cells`` as one Markdown table row, padded with blank cells.

    Blank cells are added on both sides so the values sit in the middle of
    the row; when the number of missing cells is odd, the extra blank cell
    goes on the right.

    Args:
        cells: Sized iterable of already-formatted cell values.
        total_cols: Number of columns the row should span.

    Returns:
        The row as a string starting and ending with "|".
    """
    missing = total_cols - len(cells)
    pad = max(missing // 2, 0) + 1

    left = empty_row(pad) or "|"
    middle = "".join(f"  {val}  |" for val in cells)
    # Drop the leading "|" because the preceding part already ends with one.
    right = empty_row(pad + missing % 2)[1:]
    return left + middle + right

def md_link(tup):
    """Format a ``(text, link)`` pair as Markdown.

    Returns ``[text](link)``, or the text in italics (``_text_``) when the
    link is ``None``.
    """
    text, link = tup
    return f"_{text}_" if link is None else f"[{text}]({link})"

def related_aids_table(own_title, own_level, related_aids_ls):
    """Build a Markdown table with one row of related aids per level.

    The related aids are grouped by level with ``parse_related``; the current
    aid is added to its own level without a link, so ``md_link`` renders it in
    italics. Every level becomes one table row via ``md_row``.

    Args:
        own_title: Title of the current aid.
        own_level: Level index of the current aid.
        related_aids_ls: Related-aid entries as accepted by ``parse_related``.

    Returns:
        The table as a string: an empty header row, a ``:---:`` alignment row
        and one row per level, each terminated by a newline.
    """
    grouped = parse_related(related_aids_ls)
    own_group = grouped[own_level]
    # NOTE(review): each group is sorted before rendering, so this insert
    # position does not decide where the entry ends up in the row.
    own_group.insert(len(own_group) // 2, (own_title, None))

    n_cols = max(len(group) for group in grouped)

    lines = [empty_row(n_cols + 1), empty_row(n_cols + 1, fill=":---:")]
    for group in grouped:
        links = [md_link(item) for item in sorted(group)]
        lines.append(md_row(links, n_cols))
    return "".join(line + "\n" for line in lines)

In [None]:
print(related_aids_table(curaid["Title"], int(curaid["Level"]), relaids))

In [None]:
relaids

In [None]:
"" or "|"

In [1]:
"asdafsdf"["asd"]

  "asdafsdf"["asd"]


TypeError: string indices must be integers, not 'str'

---
# automatically add `copyright_metadata`

In [1]:
from tqdm import tqdm
import yaml
from glob import glob
import os

from datetime import datetime

os.chdir("..")



BASE_DIR = "./published"

# Guard against running from the wrong working directory.
if not os.path.isdir(BASE_DIR):
    print("not sure where I am, exiting!")
    exit()

eng = glob(f"{BASE_DIR}/*/English/*.yml")
dutch = glob(f"{BASE_DIR}/*/Dutch/*.yml")
# top = glob(f"{BASE_DIR}/TopLevel/*.yml")

yaml_files = dutch + eng

# Metadata block attached to every guide; the date is today's date.
license_dict = dict(
  license="https://creativecommons.org/licenses/by-sa/4.0/deed.en",
  copyright_holder="NIOD-KNAW",
  date=datetime.today().strftime("%Y-%m-%d")
)


# NOTE(review): this loop only modifies the loaded data in memory; nothing in
# this cell writes the result back to the files.
for f in tqdm(yaml_files):
    # print(f"processing {f}...")
    # Read as UTF-8 explicitly instead of relying on the platform default;
    # the guides contain non-ASCII text.
    with open(f, encoding="utf-8") as handle:
        yaml_content = yaml.safe_load(handle)

    if not yaml_content:
        # Empty file: report it and move on instead of failing on the
        # item assignment below.
        print("---", f, yaml_content)
        continue

    yaml_content["copyright_metadata"] = license_dict

    print(type(yaml_content), yaml_content["copyright_metadata"])

  9%|█████▋                                                             | 7/82 [00:00<00:02, 30.22it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 20%|████████████▉                                                     | 16/82 [00:00<00:02, 31.34it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 29%|███████████████████▎                                              | 24/82 [00:00<00:01, 32.41it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 40%|██████████████████████████▌                                       | 33/82 [00:00<00:01, 35.91it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 52%|██████████████████████████████████▌                               | 43/82 [00:01<00:00, 41.29it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 66%|███████████████████████████████████████████▍                      | 54/82 [00:01<00:00, 42.82it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 79%|████████████████████████████████████████████████████▎             | 65/82 [00:01<00:00, 42.81it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

 91%|████████████████████████████████████████████████████████████▎     | 75/82 [00:01<00:00, 38.29it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'lic

100%|██████████████████████████████████████████████████████████████████| 82/82 [00:02<00:00, 38.41it/s]

<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}
<class 'dict'> {'license': 'https://creativecommons.org/licenses/by-sa/4.0/deed.en', 'copyright_holder': 'NIOD-KNAW', 'date': '2025-10-01'}





In [2]:
yaml_content

{'Breakdown': {'Locations': [{'Aruba, Bonaire and Curaçao': {'link': 'niveau2/English/ArubaBonaireCuracao_20250619.yml',
     'rel_type': 'see also'}},
   {'Saba, Sint Eustatius and Sint Maarten': {'link': 'niveau2/English/SabaStEustatiusStMaarten_202501619.yml',
     'rel_type': 'see also'}}],
  'Provenance research into colonial collections': [{'Select and delineate': {'link': 'niveau1/English/SelectAndDelineate_20240425.yml',
     'rel_type': 'see also'}},
   {'Doing research': {'link': 'niveau1/English/DoingResearch_20240425.yml',
     'rel_type': 'see also'}},
   {'Sources': {'link': 'niveau1/English/Sources_20240501.yml',
     'rel_type': 'see also'}},
   {'Reporting': {'link': 'niveau1/English/Reporting_20240501.yml',
     'rel_type': 'see also'}},
   {'The Dutch restitution policy': {'link': 'niveau1/English/RestitutionPolicy_20250123.yml',
     'rel_type': 'see also'}}],
  'Topics': [{'Persons': {'Subtopics': [{'C.G.C. Reinwardt': {'link': 'niveau3/Dutch/Reinwardt_20241217.yml

In [95]:
# Experiment: serialise the last loaded guide back to YAML text.
# NOTE(review): `handle` is opened for writing but never written to in this
# cell — the dump goes into `x` because the stream argument is None.
# Confirm whether the file was meant to receive the output.
with open("test_coptright_automation.yml", "w") as handle:
    orig = yaml_content
    # no_breaks = orig.replace("\n", "\\n")
    # A very large width keeps long scalars from being wrapped by the dumper.
    x = yaml.safe_dump(orig, None, width=40096)


In [20]:
# yaml.dumps(yaml_content["Content"])

# print(yaml_content["Content"]["content"])

In [96]:
# print(x.decode("utf-8"))
print()
print(x)


"Breakdown":
  "Locations":
  - "Aruba, Bonaire and Cura\xE7ao":
      "link": |-
        niveau2/English/ArubaBonaireCuracao_20250619.yml
      "rel_type": |-
        see also
  - "Saba, Sint Eustatius and Sint Maarten":
      "link": |-
        niveau2/English/SabaStEustatiusStMaarten_202501619.yml
      "rel_type": |-
        see also
  "Provenance research into colonial collections":
  - "Select and delineate":
      "link": |-
        niveau1/English/SelectAndDelineate_20240425.yml
      "rel_type": |-
        see also
  - "Doing research":
      "link": |-
        niveau1/English/DoingResearch_20240425.yml
      "rel_type": |-
        see also
  - "Sources":
      "link": |-
        niveau1/English/Sources_20240501.yml
      "rel_type": |-
        see also
  - "Reporting":
      "link": |-
        niveau1/English/Reporting_20240501.yml
      "rel_type": |-
        see also
  - "The Dutch restitution policy":
      "link": |-
        niveau1/English/RestitutionPolicy_20250123.yml

In [74]:
print(yaml.dump("""hallo  
welt""", None, allow_unicode=True))# encoding="utf-8")

"hallo  \nwelt"



In [77]:
yaml_content["Content"]["content"]

yaml_content

{'Breakdown': {'Locations': [{'Aruba, Bonaire and Curaçao': {'link': 'niveau2/English/ArubaBonaireCuracao_20250619.yml',
     'rel_type': 'see also'}},
   {'Saba, Sint Eustatius and Sint Maarten': {'link': 'niveau2/English/SabaStEustatiusStMaarten_202501619.yml',
     'rel_type': 'see also'}}],
  'Provenance research into colonial collections': [{'Select and delineate': {'link': 'niveau1/English/SelectAndDelineate_20240425.yml',
     'rel_type': 'see also'}},
   {'Doing research': {'link': 'niveau1/English/DoingResearch_20240425.yml',
     'rel_type': 'see also'}},
   {'Sources': {'link': 'niveau1/English/Sources_20240501.yml',
     'rel_type': 'see also'}},
   {'Reporting': {'link': 'niveau1/English/Reporting_20240501.yml',
     'rel_type': 'see also'}},
   {'The Dutch restitution policy': {'link': 'niveau1/English/RestitutionPolicy_20250123.yml',
     'rel_type': 'see also'}}],
  'Topics': [{'Persons': {'Subtopics': [{'C.G.C. Reinwardt': {'link': 'niveau3/Dutch/Reinwardt_20241217.yml

---
## DURING REFACTORING OF YAML_TO_EXPORT

In [18]:
import os
from glob import glob
import re

In [29]:
os.chdir("/home/valentin/Desktop/NIOD/research-guides-dev/")
cur_dir = "published"
eng = glob(f"{cur_dir}/*/English/*.yml")


def parse_filename(orig_path, has_path=False):
    """Extract the guide name from a ``<name>_<digits>.yml`` file name.

    Args:
        orig_path: File name (or path) to parse.
        has_path: When true, everything up to the last "/" is skipped so only
            the bare name is returned; when false, any leading directories
            stay part of the returned name.

    Returns:
        The part before the final ``_<digits>.yml``.

    Raises:
        ValueError: If ``orig_path`` does not contain that pattern.
    """
    if has_path:
        path_part = r'.+\/'
    else:
        path_part = ''

    m = re.search(fr'{path_part}(.*)_[0-9]+\.yml', orig_path)
    if m is None:
        raise ValueError(f"{orig_path} couldn't be parsed!")
    return m.group(1)



def parse_filepath(fp):
    """Split a guide path into its last four components.

    The path is split on ``os.path.sep``; the last four parts are taken as
    root folder, level, language and file name, and the file name is reduced
    to the guide name with ``parse_filename``.

    Returns:
        ``(published, level, lang, name)``.

    Raises:
        ValueError: If the path has fewer than four components or the file
            name cannot be parsed.
    """
    parts = fp.split(os.path.sep)
    published, level, lang, fname = parts[-4:]
    name = parse_filename(fname)
    return published, level, lang, name


def get_export_path(orig_path, export_folder, extension):
    """Build the export location for a guide.

    Args:
        orig_path: Path of the source YAML file, as accepted by
            ``parse_filepath``.
        export_folder: Sub-folder of ``EXPORTS`` for the target format.
        extension: Target file extension, with or without a leading dot.

    Returns:
        ``EXPORTS/<export_folder>/<root>/<level>/<lang>/<name><.ext>``.
    """
    published, level, lang, name = parse_filepath(orig_path)

    if extension.startswith("."):
        ext = extension
    else:
        ext = "." + extension

    target = os.path.join("EXPORTS", export_folder, published, level, lang, name)
    return target + ext

get_export_path(eng[0], "MD", ".md")

'EXPORTS/MD/published/niveau1/English/Reporting.md'

In [30]:
import yaml

In [34]:
with open("invalid_yaml.yml") as handle:
    yaml_content = yaml.safe_load(handle)


ScannerError: mapping values are not allowed here
  in "invalid_yaml.yml", line 12, column 5

In [39]:
os.listdir("./EXPORTS")

['published', 'review']

---

In [2]:
with open("../EXPORTS/MD/published/niveau2/English/Ghana.md") as handle:
    md_content = handle.read()

In [4]:
print(md_content)


# Ghana


## Abstract

Dutch museum collections contain ethnographic objects, art objects and natural history specimens originating from what is now Ghana. Most of these objects were collected in the period up to 1872, when the Netherlands was present as a colonial power in what was then known as the Gold Coast.

### Description

From the last decade of the sixteenth century onwards, sailors from the Netherlands traded in the West African coast, following in the footsteps of the Portuguese. In order to protect Dutch traders there, the States General decided in 1612 to build a fort near the town of Moree in the Asebu chiefdom in the Gold Coast, for which an agreement was concluded with the local ruler. In 1637 Elmina Castle was captured from the Portuguese by the Dutch West India Company (WIC), with the Portuguese eventually being driven out of the Gold Coast altogether in 1642. In the mid-seventeenth century the WIC faced competition from Swedish, Brandenburg-Prussian, Danish and Engl

---
### LINK PARSING

In [1]:
with open("../EXPORTS/MD/published/niveau1/Dutch/Sources.md") as handle:
    md = handle.read()

In [22]:
print(md[:10])


# Bronnen


In [23]:
import regex as re

data = md

# Markdown link "[text](target)":
#   group 1 - link text (no square brackets inside)
#   group 2 - the target including its surrounding parentheses
#   group 3 - the target without the outer parentheses
# "(?2)" re-applies group 2 inside itself, so targets containing balanced
# parentheses are matched as a whole. This recursion syntax needs the
# third-party `regex` module (imported above as `re`).
pattern = re.compile(r'\[([^][]+)\](\(((?:[^()]+|(?2))+)\))')

# List every link with its position in the text.
for match in pattern.finditer(data):
    print(match.start(), match.end())
    # x is group 2 (target with parentheses); only groups 1 and 3 are printed.
    description, x, url = match.groups()
    print(f"{description}: {url}")
 

1503 1616
Woorden doen ertoe: https://www.tropenmuseum.nl/sites/default/files/2018-09/WordsMatter_DEF_Totale_PDF_NL_0.pdf
3337 3378
archieven.nl: https://www.archieven.nl/
3653 3847
Koninklijk Instituut voor Taal-, Land- en Volkenkunde (KITLV): https://app.colonialcollections.nl/nl/research-aids/https%3A%2F%2Fn2t%252Enet%2Fark%3A%2F27023%2F62191a1bbed9b315db786f2037417b4f
3899 3984
Universitaire Bibliotheken Leiden: https://digitalcollections.universiteitleiden.nl
4090 4142
Nationaal Archief: https://www.nationaalarchief.nl
5392 5468
diverse zoekhulpen: https://www.nationaalarchief.nl/onderzoeken/zoekhulpen
5526 5655
De koloniale staat 1854-1942: https://www.nationaalarchief.nl/onderzoeken/archief/2.14.97/invnr/10ED/file/Koloniale%20staat.pdf
6161 6201
www.delpher.nl: https://www.delpher.nl
6544 6583
Delpher: https://youtu.be/PfXY9aQC7F4
7089 7122
Delpher: https://www.delpher.nl
7125 7223
Digital Collections van de Universiteit Leiden: https://digitalcollections.universiteitleiden.nl
7

In [20]:
import os
# def parse_relative_path(p):
#     *pref, level, lang, fname = p.split(os.path.sep)

# https://research-aids.github.io/published/niveau3/Dutch/Reinwardt.html
# https://research-aids.github.io/review/niveau3/Dutch/niveau2/Dutch/CivilServants_20240320.yml
# niveau2/Dutch/niveau3/Dutch/Wereldtentoonstelling1883_202550304.yml
BASE_URL = "https://research-aids.github.io/"

def check_exists(relative_path):
    """Return the public URL of a guide, looking in "published" then "review".

    ".yml" in ``relative_path`` is replaced by ".html" and the resulting path
    is looked up under the local ``published`` and ``review`` folders
    (relative to the current working directory).

    Args:
        relative_path: Guide path relative to the root folder, e.g.
            ``niveau3/Dutch/Reinwardt.yml``.

    Returns:
        ``BASE_URL + "<folder>/" + <path with .html>`` for the first folder
        in which the file exists.

    Raises:
        ValueError: If the file exists in neither folder.
    """
    relative_path = relative_path.replace(".yml", ".html")
    for folder in ("published", "review"):
        if os.path.exists(os.path.join(folder, relative_path)):
            # The folder and the path need a "/" between them; plain string
            # concatenation produced ".../publishedniveau3/...".
            return f"{BASE_URL}{folder}/{relative_path}"
    raise ValueError(f"{relative_path} DOESN'T SEEM TO EXIST")


def check_is_published(relative_path, md_dir="."):
    """Tell whether a referenced guide is exported under "published" or "review".

    ".yml" in ``relative_path`` is replaced by ".md" and the resulting path
    is looked up below ``md_dir``.

    Args:
        relative_path: Guide path relative to the status folder.
        md_dir: Directory containing the ``published``/``review``/``archive``
            folders.

    Returns:
        ``"published"`` or ``"review"``, whichever folder contains the file
        (``published`` is checked first).

    Raises:
        ValueError: If the file is only found under ``archive``, or not found
            at all.
    """
    relative_path = relative_path.replace(".yml", ".md")

    for status in ("published", "review"):
        if os.path.exists(os.path.join(md_dir, status, relative_path)):
            return status

    if os.path.exists(os.path.join(md_dir, "archive", relative_path)):
        raise ValueError(f"referenced Research Aid '{relative_path}' is in the archive folder! (And doesn't get exported.)")
    raise ValueError(f"referenced Research Aid '{relative_path}' doesn't seem to exist!")


def parse_filename(orig_path, has_path=False):
    """Return the guide name from a ``<name>_<digits>.yml`` file name.

    With ``has_path`` true, everything up to the last "/" is skipped first;
    otherwise leading directories remain part of the returned name.

    Raises:
        ValueError: If ``orig_path`` does not contain the pattern.
    """
    path_part = r'.+\/' if has_path else ''
    found = re.search(fr'{path_part}(.*)_[0-9]+\.yml', orig_path)
    if not found:
        raise ValueError(f"{orig_path} couldn't be parsed!")
    return found.group(1)


def relative_path_to_URL(relative_path, md_dir="."):
    """Turn a relative guide path into its public HTML URL.

    The status folder comes from ``check_is_published``; the date suffix and
    ".yml" extension are removed by ``parse_filename``.

    Args:
        relative_path: Guide path such as
            ``niveau1/Dutch/SelectAndDelineate_20240425.yml``.
        md_dir: Directory passed on to ``check_is_published``.

    Returns:
        ``BASE_URL`` + status folder + guide path without suffix + ".html".
    """
    status = check_is_published(relative_path, md_dir)
    page = os.path.join(status, parse_filename(relative_path))
    return BASE_URL + page + ".html"


# check_exists(url)

url, os.path.join("published", url)

('niveau1/Dutch/SelectAndDelineate_20240425.yml',
 'published/niveau1/Dutch/SelectAndDelineate_20240425.yml')

In [21]:
is_published = "published"
BASE_URL + os.path.join(is_published, parse_filename(url)) + ".html"


'https://research-aids.github.io/published/niveau1/Dutch/SelectAndDelineate.html'

In [39]:
# Replace the link target (group 3 of `match`) in `data` with the public URL
# built from `url`. `match`, `url` and `is_published` come from earlier cells.
d2 = data[:match.start(3)] + (BASE_URL + os.path.join(is_published, parse_filename(url)) + ".html") +\
data[match.end(3):]

# Show the text around the replaced target to check the result by eye.
d2[match.start(3)-10:match.end(3)+50]

'agleggen](https://research-aids.github.io/published/niveau1/Dutch/SelectAndDelineate.html)  \n\n\n\n_last edi'