This code will download a listing of OAI-PMH repositories, and then retrieve the full collection list from each repository.

See the following wiki page for more information - [Historical Document Repositories](https://github.com/seanclauson/Roots/wiki/Historical-Document-Repositories).

In [None]:
!pip install xmltodict

In [85]:
import urllib.request 
import json
import xmltodict
import ssl
import datetime

In [101]:
# Destination path of the generated JSON file (all OAI repos with their collections).
output_file_name = "json_output.json"

# Location of the JSON document that lists every OAI repository to process.
repos_url = "https://raw.githubusercontent.com/seanclauson/Roots/master/oai-repo-listing.json"

In [100]:
def get_repositories_list(list_url):
    """
    Fetch the JSON document at ``list_url`` and return it parsed.

    The response body is decoded with the default codec (UTF-8) and handed
    to the JSON parser; whatever top-level value the document holds is
    returned unchanged.
    """
    with urllib.request.urlopen(list_url) as response:
        payload = response.read()

    text = payload.decode()
    return json.loads(text)


# print(type(get_repositories_list(repos_url)))

In [96]:
# Grab XML of collections from OAI-PMH endpoint
def get_repository_sets(oai_url):
    """
    Given a repository's OAI-PMH ListSets URL, return its sets as a list.

    Args:
        oai_url: Complete ListSets request URL for the repository.

    Returns:
        A list with one dict per set, each holding the keys "spec", "name"
        and "description". "description" is the JSON-escaped
        ``dc:description`` value (without the enclosing delimiters), or ""
        when the set has no usable description. An empty ``ListSets``
        element yields an empty list.

    Raises:
        urllib.error.URLError: If the request itself fails.
        KeyError: If the response has no ``OAI-PMH``/``ListSets`` element
            (e.g. an OAI-PMH error response), or a set lacks ``setSpec``
            or ``setName``.
    """
    # Ignore SSL errors.
    # NOTE(review): certificate and hostname verification are switched off
    # here, so the endpoint's identity is not checked. Kept as in the
    # original; confirm this is still wanted.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(oai_url, context=ctx) as response:
        data = xmltodict.parse(response.read())

    # NOTE(review): only this single response is read; a resumptionToken in
    # the reply is not followed. Confirm no repository paginates ListSets.
    list_sets = data["OAI-PMH"]["ListSets"] or {}
    raw_sets = list_sets.get("set") or []

    # xmltodict yields a dict (not a list) when an element occurs only once,
    # so a repository with a single set must be wrapped before iterating.
    if not isinstance(raw_sets, list):
        raw_sets = [raw_sets]

    repo_sets = []

    for oai_set in raw_sets:
        this_set = {
            "spec": oai_set["setSpec"],
            "name": oai_set["setName"],
        }

        # The description is optional and may sit under a missing or empty
        # parent element; both cases fall back to an empty string.
        try:
            description = oai_set["setDescription"]["oai_dc:dc"]["dc:description"]
        except (KeyError, TypeError):
            description = None

        if description is None:
            # An empty <dc:description/> parses to None; dumping that would
            # store the sliced text of "null" instead of an empty string.
            this_set["description"] = ""
        else:
            # Escape via JSON and drop the first/last delimiter character.
            this_set["description"] = json.dumps(description)[1:-1]

        repo_sets.append(this_set)

    return repo_sets
           

In [102]:
def get_all_repos_sets(url):
    """
    Build one record per OAI repository listed at ``url``.

    Each record carries the repository id ("oai-repo-id"), its browse URL
    template with the home URL filled in ("browse-url-template"), and the
    sets fetched from its ListSets endpoint ("sets"). The records are
    returned as a list, in the order the repositories are listed.
    """
    placeholder = "<url-home>"
    collected = []

    for entry in get_repositories_list(url):
        record = {
            "oai-repo-id": entry["id"],
            "browse-url-template": entry["url-template-set-landing"].replace(
                placeholder, entry["url-home"]
            ),
        }

        # The verb is appended with no separator, so the base template
        # presumably already ends in "?" or "&" - verify against the listing.
        oai_base = entry["url-template-oai-base"].replace(placeholder, entry["url-home"])
        list_sets_url = oai_base + "verb=ListSets"

        record["sets"] = get_repository_sets(list_sets_url)
        collected.append(record)

    return collected


In [None]:
# Stamp the run with the current local time (second precision), then
# gather the sets of every listed OAI repository.
final_data = {"run-time": datetime.datetime.now().replace(microsecond=0).isoformat()}
final_data["oai-repos"] = get_all_repos_sets(repos_url)

# Write the gathered repository/collection data to disk as JSON.
with open(output_file_name, "w") as file:
    json.dump(final_data, file)