In [32]:
from noid_wrapper import noid_client
# Build the NOID client from the config file at this path; later cells
# reference `client` as a notebook-level global.
client = noid_client.NoidClient("/home/srappel/noid_wrapper/config.yaml")

In [33]:
from pathlib import Path
import json

def process_metadata_files(base_dir) -> list:
    """Recursively collect metadata records that carry an identifier.

    Walks ``base_dir`` and all of its subdirectories, parses every ``*.json``
    file found, and keeps the records whose ``dct_identifier_sm`` value is
    truthy.

    Args:
        base_dir: Directory (``str`` or ``Path``) to search.

    Returns:
        A list of ``(identifier_value, record)`` tuples, where
        ``identifier_value`` is whatever the record stores under
        ``dct_identifier_sm`` and ``record`` is the parsed JSON object.
        Files are visited in sorted path order so the result is reproducible
        from one run to the next.

    Raises:
        json.JSONDecodeError: If a matched file does not contain valid JSON.
    """
    documents = []

    for json_file in sorted(Path(base_dir).rglob("*.json")):
        # The glob also matches directories that happen to end in ".json".
        if not json_file.is_file():
            continue

        # JSON is UTF-8 by specification; do not rely on the locale default.
        with json_file.open("r", encoding="utf-8") as file:
            data = json.load(file)

        # Only a JSON object can hold the identifier field; a top-level list
        # or scalar has no .get() and is not a metadata record.
        if not isinstance(data, dict):
            continue

        ark_id = data.get("dct_identifier_sm")
        if ark_id:
            documents.append((ark_id, data))

    return documents


In [None]:
# Load every identified metadata record under the metadata directory and list
# the identifier values found, numbered from 1, as a quick sanity check.
target_dir = Path("/home/srappel/noid_wrapper/metadata")
target_documents = process_metadata_files(target_dir)

for i, document in enumerate(target_documents, start=1):
    print(i, document[0])
    


In [None]:
# Bind the catalog location, title and permalink to each harvested identifier.
# Identifiers that fail validation are collected in `rejects` and skipped.
rejects = []
for identifiers, record in target_documents:
    noid_id = identifiers[0].removeprefix("ark:/")

    if client.validate(noid_id):
        # The catalog URL uses "-" wherever the identifier has "/".
        catalog_slug = noid_id.replace("/", "-")
        bind_params = {
            "where": f"https://geodiscovery.uwm.edu/catalog/ark:-{catalog_slug}",
            "title": record.get("dct_title_s", "Untitled"),
            "permalink": f"https://digilib.uwm.edu/ark:/{noid_id}",
        }

        # Issue the "hold set" command for the identifier, then bind.
        client._run_noid_command("hold", "set", noid_id)
        client.bind_multiple(noid_id, bind_params)
    else:
        rejects.append(noid_id)

    

        


In [36]:
# Maps bound element names to metadata field names. Note that "where" and
# "download" both point at the same references field.
param_map_agsl_aardvark = dict(
    where="dct_references_s",
    title="dct_title_s",
    download="dct_references_s",
    identifier="dct_identifier_sm",
    ogm_aardvark_id="id",
    access="dct_accessRights_s",
)

In [39]:
def bind_directory(dir, param_map, hold='set'):
    """Bind metadata-derived elements to the identifier of every record under ``dir``.

    Each record returned by ``process_metadata_files(dir)`` is passed to
    ``client.bind_multiple`` with the elements ``identifier``,
    ``ogm_aardvark_id``, ``title``, ``access``, ``download`` and ``where``.

    Args:
        dir: Directory to scan recursively for JSON metadata files.
        param_map: Mapping from element name to metadata field name. The keys
            ``"ogm_aardvark_id"``, ``"title"`` and ``"access"`` are required.
        hold: Accepted for backward compatibility; currently not applied.
            NOTE(review): presumably meant to set or release a hold before
            binding -- confirm the intended behaviour before wiring it in.

    Raises:
        AssertionError: If an identifier is not a string or fails
            ``client.validate``.
        json.JSONDecodeError: If a record's ``dct_references_s`` value is not
            valid JSON.
    """
    documents = process_metadata_files(Path(dir))
    assert isinstance(documents, list)

    # Iterate the records loaded from ``dir``, not a notebook-level global.
    for identifiers, metadata in documents:
        identifier = identifiers[0]
        assert isinstance(identifier, str), f"identifier is not a string: {identifier!r}"
        noid_id = identifier.removeprefix("ark:/")
        assert client.validate(noid_id), f"identifier failed validation: {noid_id}"

        # A missing or empty references field is treated as an empty JSON
        # object so the "Null" fallbacks below apply instead of a parse error.
        references = json.loads(metadata.get("dct_references_s") or "{}")

        # Build a fresh dict per record so nothing carries over between records.
        bind_params = {
            "identifier": identifier,
            # Fall back to the identifier with "/" replaced by "-" when the
            # record has no id field of its own.
            "ogm_aardvark_id": metadata.get(
                param_map["ogm_aardvark_id"], identifier.replace("/", "-")
            ),
            "title": metadata.get(param_map["title"], "Untitled"),
            "access": metadata.get(param_map["access"], "Public"),
            "download": references.get("http://schema.org/downloadUrl", "Null"),
            "where": references.get("http://schema.org/url", "Null"),
        }

        client.bind_multiple(noid_id, bind_params)



In [None]:
# Invoke bind_directory on the metadata directory with the AGSL Aardvark
# parameter map (the `hold` argument is left at its default).
bind_directory("/home/srappel/noid_wrapper/metadata", param_map_agsl_aardvark)