In [None]:
from collections import defaultdict
import concurrent.futures
import csv
import glob
import gzip
from itertools import permutations, combinations, starmap, repeat
import json
from math import ceil
import operator
import os
from pprint import pprint
import re
import string
from urllib.parse import quote_plus

from tqdm.notebook import tqdm
from toolz import concat, identity, keyfilter, partition_all, assoc, dissoc, unique, merge
from pymongo import MongoClient, InsertOne, UpdateOne
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import SKOS, RDFS, RDF
import requests

from heliokos.infra.core import Harmonization

def pick(whitelist, d):
    """Return the items of mapping ``d`` whose key is contained in ``whitelist``."""
    def is_wanted(key):
        return key in whitelist

    return keyfilter(is_wanted, d)

In [None]:
def part_fn(part, doc_fn):
    """Apply ``doc_fn`` to every JSON document in a gzipped JSON-lines file.

    ``part`` is the path of a gzip-compressed text file with one JSON document
    per line. Returns the list of ``doc_fn`` results, in file order.
    """
    with gzip.open(part, "rt") as lines:
        return [doc_fn(json.loads(line)) for line in lines]

def process_parts(parts, doc_fn, max_workers=32):
    """Run ``part_fn(part, doc_fn)`` for every part in a process pool and pool the results.

    Parameters
    ----------
    parts : sequence
        Paths handed one at a time to ``part_fn``.
    doc_fn : callable
        Applied to each parsed document. It is submitted to worker processes,
        so it has to be picklable.
    max_workers : int
        Size of the process pool (default 32, the value previously hard-coded).

    Returns
    -------
    list
        All per-part results concatenated in completion order, which is not
        necessarily the order of ``parts``.

    A part whose worker raises is reported with ``print`` and skipped
    (best-effort); its documents are simply absent from the result.
    """
    rv = []
    # The context manager closes the progress bar even if the pool itself raises.
    with tqdm(total=len(parts)) as pbar:
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            future_to_part = {executor.submit(part_fn, part, doc_fn): part for part in parts}
            for future in concurrent.futures.as_completed(future_to_part):
                part = future_to_part[future]
                try:
                    rv.extend(future.result())
                except Exception as exc:
                    print(f"{part} generated an exception: {exc}")
                finally:
                    # Count failed parts too, so the bar reaches its total.
                    pbar.update(1)
    return rv

In [None]:
client = MongoClient()
mdb = client.heliokos

# store openalex concepts in mongodb

In [None]:
# https://docs.openalex.org/download-all-data/openalex-snapshot
oax_concept_parts = glob.glob("/openalex-snapshot/data/concepts/**/*.gz")

def it(doc):
    """Identity ``doc_fn``: return the parsed document unchanged."""
    return doc

oax_concepts = process_parts(oax_concept_parts, it)

In [None]:
len(oax_concepts)

In [None]:
rv = mdb.oax_concepts.insert_many(oax_concepts)

In [None]:
len(rv.inserted_ids)

In [None]:
mdb.oax_concepts.create_index("id", unique=True)

# Rig up MongoDB as a serviceable graph database

Using: Coupal, Daniel, Pascal Desmarets, and Steve Hoberman. MongoDB Data Modeling and Schema Design. First. Align > Refine > Design. Sedona: Technics Publications, 2023.

- single collection pattern (relatedTo -> attribute pattern: outgoing.o, incoming.s)
- inheritance pattern (id, type, author.orcid, ads_work.bibcode, ...)
- attribute pattern (outgoing: [{p,o}], incoming: [{s,p}])
- graph pattern (incoming / outgoing) / tree pattern (parent / ancestors / children)

# Get unique DOIs from ADS "snapshot"

above took 1.5 minutes on laptop, 3.5 minutes on beefy server. Hmmm.

In [None]:
print(f"{mdb.ads_works_doiurls.estimated_document_count():,}")

In [None]:
print(f"{mdb.ads_works_only_doiurls.estimated_document_count():,}")

In [None]:
ads_unique_dois = {d["_id"] for d in mdb.ads_works_only_doiurls.find()}

In [None]:
None in ads_unique_dois

# Get slim OpenAlex work for every ADS DOI

What do I need from an OpenAlex work?

- doi
- title
- publication_date
- type
- concepts.id, concepts.score
- referenced_works

In [None]:
def work_essentials(doc):
    """Reduce an OpenAlex work document to the fields kept by this notebook.

    Keeps doi, title, publication_date, type and referenced_works (when
    present), trims each concept to its id and score, and stores the work's
    ``id`` under ``_id``.
    """
    kept_fields = ["doi", "title", "publication_date", "type", "referenced_works"]
    slim = pick(kept_fields, doc)
    slim["concepts"] = [pick(["id", "score"], concept) for concept in doc["concepts"]]
    slim["_id"] = doc["id"]
    return slim

In [None]:
oax_works = glob.glob("/openalex-snapshot/data/works/**/*.gz")

def openalex_work_doi_in_ads(doc):
    """Return the slimmed work if its DOI is in ``ads_unique_dois``, otherwise None."""
    if doc.get("doi") in ads_unique_dois:
        return work_essentials(doc)
    return None

openalex_works_with_doi_in_ads = process_parts(oax_works, openalex_work_doi_in_ads)
openalex_works_with_doi_in_ads = list(filter(None, openalex_works_with_doi_in_ads))
len(openalex_works_with_doi_in_ads)

above took 17 minutes on server using 32 workers.

In [None]:
chunksize = 1_000

# ceil() is the exact chunk count; `len // chunksize + 1` reports one chunk too
# many whenever the length is an exact multiple of chunksize.
n_chunks = ceil(len(openalex_works_with_doi_in_ads) / chunksize)

for i, chunk in enumerate(tqdm(partition_all(chunksize, openalex_works_with_doi_in_ads), total=n_chunks)):
    # Deliberately not named `requests`: that would rebind the imported
    # `requests` module and break get_tokens(), which calls requests.post().
    insert_ops = [InsertOne(d) for d in chunk]
    result = mdb.oax_works_with_doi_in_ads.bulk_write(insert_ops, ordered=False)
    assert result.inserted_count == len(insert_ops), f"chunk {i} failure"

above took 7 minutes. The MongoDB collection is 18G in memory, 7G on disk.

# UAT as SKOS

In [None]:
g_uat = Graph()
g_uat.parse(location="https://raw.githubusercontent.com/astrothesaurus/UAT/v.5.0.0/UAT.rdf", format="xml")

In [None]:
print("saving to disk...")
g_uat.serialize("uat.ttl")
print("done")

In [None]:
sorted(t[0].n3(g_uat.namespace_manager) for t in g_uat.query("SELECT DISTINCT ?p WHERE {?s ?p ?o }", initNs={"skos": SKOS}))

# OpenAlex concepts as skos:ConceptScheme

In [None]:
def concept_essentials(doc):
    """Reduce an OpenAlex concept document to the fields kept by this notebook.

    Keeps display_name, description and level (when present), trims each
    ancestor to its id and level, and stores the concept's ``id`` under ``_id``.
    """
    slim = pick(["display_name", "description", "level"], doc)
    slim["ancestors"] = [pick(["id", "level"], ancestor) for ancestor in doc["ancestors"]]
    slim["_id"] = doc["id"]
    return slim

In [None]:
oax_concept_parts = glob.glob("/openalex-snapshot/data/concepts/**/*.gz")

oax_concepts = process_parts(oax_concept_parts, concept_essentials)

above took 30 seconds.

In [None]:
chunksize = 1_000

# ceil() is the exact chunk count; `len // chunksize + 1` reports one chunk too
# many whenever the length is an exact multiple of chunksize.
n_chunks = ceil(len(oax_concepts) / chunksize)

for i, chunk in enumerate(tqdm(partition_all(chunksize, oax_concepts), total=n_chunks)):
    # Deliberately not named `requests`: that would rebind the imported
    # `requests` module and break get_tokens(), which calls requests.post().
    insert_ops = [InsertOne(d) for d in chunk]
    result = mdb.oax_concepts_slim.bulk_write(insert_ops, ordered=False)
    assert result.inserted_count == len(insert_ops), f"chunk {i} failure"

10x reduction in collection size in memory, from 380M to 34M.

In [None]:
docs = list(mdb.oax_concepts_slim.find())

In [None]:
g_oax = Graph()
scheme_id = URIRef("https://heliokos.example/OpenAlex")
# Triples describing the concept scheme itself.
for scheme_p, scheme_o in [
    (RDF.type, SKOS.ConceptScheme),
    (SKOS.prefLabel, Literal("OpenAlex Concept Scheme")),
    (RDFS.seeAlso, URIRef("https://docs.openalex.org/api-entities/concepts")),
]:
    g_oax.add((scheme_id, scheme_p, scheme_o))

for d in tqdm(docs):
    id_ = URIRef(d["_id"])
    g_oax.add((id_, RDF.type, SKOS.Concept))
    g_oax.add((id_, SKOS.inScheme, scheme_id))
    g_oax.add((id_, SKOS.prefLabel, Literal(d["display_name"])))
    description = d.get("description")
    if description:
        g_oax.add((id_, SKOS.definition, Literal(description)))
    level = d["level"]
    if level == 0:
        # Level-0 concepts are the scheme's top concepts.
        g_oax.add((scheme_id, SKOS.hasTopConcept, id_))
    # Only ancestors exactly one level up are linked as skos:narrower parents.
    parent_ids = [a["id"] for a in d["ancestors"] if a["level"] == level - 1]
    for parent_id in parent_ids:
        g_oax.add((URIRef(parent_id), SKOS.narrower, id_))

print("saving to disk...")
g_oax.serialize("openalex.ttl")
print("done")

In [None]:
g_oax, g_uat = Graph(), Graph()
g_oax.parse("src/heliokos/infra/static/openalex.ttl")
g_uat.parse("src/heliokos/domain/uat.ttl")

In [None]:
len(g_uat), len(g_oax)

In [None]:
len([s for s in g_uat.subjects(RDF.type, SKOS.Concept, unique=True)])

In [None]:
len([s for s in g_oax.subjects(RDF.type, SKOS.Concept, unique=True)])

# Pruning skos:ConceptScheme to Cover Corpus Only

In [None]:
# get all concepts that tag corpus
concepts_in_corpus = [d["_id"] for d in mdb.oax_works_with_doi_in_ads.aggregate([
    {"$project": {"_id": 0, "concepts.id": 1}},
    {"$unwind": {"path": "$concepts"}},
    {"$group": {"_id": "$concepts.id"}},
    ], allowDiskUse=True)]

In [None]:
# Count the distinct concepts returned by the aggregation in the previous cell.
# The collection `oax_concepts_in_oax_works_with_doi_in_ads` that was read here
# is not written anywhere in this notebook, so on a fresh database its
# estimated count would be 0.
n_in_ads, n_total = len(concepts_in_corpus), mdb.oax_concepts.estimated_document_count()
print(f"{n_total=:,}, {n_in_ads=:,}")
f"{(n_in_ads/n_total):.0%}"

In [None]:
# get all concepts that tag corpus
# - (got it above)
# make set of all these concepts and their ancestors
# - automatic with oax
# get complement of above set
# - yuck, still 98% (>60k) concepts present. so not going to bother with next step.
# for each member of this complement m,
#   remove all (m, None, None) and (None, None, m) triples from the scheme.


# Assisted Harmonization of UAT to OpenAlex
UAT as retrieval scheme and OpenAlex as tagging scheme.

Approach: from Accidental Taxonomist 3rd ed. (2022), pp 348-9.

A series of comparison passes:

In [None]:
# primary/retrieval taxonomy: uat
# merging/tagging taxonomy: oax
from heliokos.infra.core import ConceptScheme

hzn = Harmonization(
    tagging_scheme=ConceptScheme.from_graph(g_oax),
    retrieval_scheme=ConceptScheme.from_graph(g_uat)
)

In [None]:
all("openalex.org" in str(c) for c in hzn.tagging_scheme.g.subjects(RDF.type, SKOS.Concept))

In [None]:
all("astrothesaurus.org" in str(c) for c in hzn.retrieval_scheme.g.subjects(RDF.type, SKOS.Concept))

1. exact match of merging_preflabel with primary_preflabel

In [None]:
def is_en_us_label(label):
    """True when ``label.language`` is unset (None) or is "en" / "en-US"."""
    return label.language in (None, "en", "en-US")

In [None]:
t_merging = hzn.tagging_scheme.g
t_primary = hzn.retrieval_scheme.g

In [None]:
# One SPARQL query serves both schemes: each concept with each of its prefLabels.
preflabel_query = """
SELECT ?c ?preflabel WHERE {
?c a skos:Concept .
?c skos:prefLabel ?preflabel .
}"""

# label text -> concept; a later concept with the same label text replaces an earlier one.
merging_preflabel_concepts = {
    str(label): concept
    for concept, label in t_merging.query(preflabel_query)
    if is_en_us_label(label)
}
primary_preflabel_concepts = {
    str(label): concept
    for concept, label in t_primary.query(preflabel_query)
    if is_en_us_label(label)
}

# Pass 1: merging prefLabel text is identical to a primary prefLabel text.
exact_match_preflabel_preflabel = {
    concept: primary_preflabel_concepts[label]
    for label, concept in merging_preflabel_concepts.items()
    if label in primary_preflabel_concepts
}
len(exact_match_preflabel_preflabel)

2. exact match of merging_preflabel with a primary_altlabel

In [None]:
primary_altlabel_query = """
SELECT ?c ?altlabel WHERE {
?c a skos:Concept .
?c skos:altLabel ?altlabel .
}"""

# altLabel text -> primary concept.
primary_altlabel_concepts = {
    str(label): concept
    for concept, label in t_primary.query(primary_altlabel_query)
    if is_en_us_label(label)
}

# Pass 2: merging prefLabel text is identical to a primary altLabel text.
exact_match_preflabel_altlabel = {
    concept: primary_altlabel_concepts[label]
    for label, concept in merging_preflabel_concepts.items()
    if label in primary_altlabel_concepts
}
len(exact_match_preflabel_altlabel)

3. exact match of merging_altlabel with primary_preflabel. merging_preflabel should be added as a primary_altlabel.

In [None]:
merging_altlabel_query = """
SELECT ?c ?altlabel WHERE {
?c a skos:Concept .
?c skos:altLabel ?altlabel .
}"""

# altLabel text -> merging concept.
merging_altlabel_concepts = {
    str(label): concept
    for concept, label in t_merging.query(merging_altlabel_query)
    if is_en_us_label(label)
}

# Pass 3: merging altLabel text is identical to a primary prefLabel text.
# Unsurprisingly 0, as oax concepts don't have altlabels.
exact_match_altlabel_preflabel = {
    concept: primary_preflabel_concepts[label]
    for label, concept in merging_altlabel_concepts.items()
    if label in primary_preflabel_concepts
}
len(exact_match_altlabel_preflabel)

4. exact match of merging_altlabel with primary_altlabel. merging_preflabel should be added as a primary_altlabel.

In [None]:
# Pass 4: merging altLabel text is identical to a primary altLabel text.
# Unsurprisingly 0, as oax concepts don't have altlabels.
exact_match_altlabel_altlabel = {
    concept: primary_altlabel_concepts[label]
    for label, concept in merging_altlabel_concepts.items()
    if label in primary_altlabel_concepts
}
len(exact_match_altlabel_altlabel)

5. close (not exact) match of merging_preflabel with primary_preflabel or primary_altlabel.

In [None]:
# use elasticsearch analyzer to get set of tokens for each label.

base_url = os.getenv("ELASTIC_HOST")
auth = ("elastic", os.getenv("ELASTIC_PASSWORD"))

def get_tokens(text=None, timeout=30):
    """Tokenize ``text`` with the Elasticsearch ``_analyze`` endpoint.

    Posts to ``base_url + "/_analyze"`` using the ``standard`` tokenizer and
    the ``snowball`` filter, authenticating with the module-level ``auth``.

    Parameters
    ----------
    text : str
        Text to analyze. Required despite the ``None`` default.
    timeout : float
        Seconds to wait for the server before ``requests`` raises; without a
        timeout a stalled connection would block its worker thread indefinitely.

    Returns
    -------
    The ``"tokens"`` entry of the JSON response.

    Raises
    ------
    ValueError
        If ``text`` is None.
    RuntimeError
        If ``base_url`` is empty, i.e. ``ELASTIC_HOST`` was not set.
    requests.HTTPError
        If the server answers with an error status.
    """
    if text is None:
        raise ValueError("no text supplied")
    if not base_url:
        # Fail with a clear message instead of a TypeError from `None + str`.
        raise RuntimeError("ELASTIC_HOST is not set; cannot reach the Elasticsearch _analyze endpoint")

    rv = requests.post(
        base_url + "/_analyze",
        json={
            "tokenizer": "standard",
            "filter": ["snowball"],
            "text": text,
        },
        auth=auth,
        timeout=timeout,
    )
    rv.raise_for_status()
    return rv.json()["tokens"]

def get_label_tokens_threadpool(label_concepts, max_workers=32):
    """Fetch analyzer tokens for every label in ``label_concepts`` with a thread pool.

    Done serially this was estimated at ≈ 7 hours for ≈ 65k concepts, hence the pool.

    Parameters
    ----------
    label_concepts : iterable of str with ``len()``
        Labels to tokenize; iterating a dict yields its keys.
    max_workers : int
        Number of concurrent ``get_tokens`` calls (default 32, the value
        previously hard-coded).

    Returns
    -------
    dict
        label -> tokens as returned by ``get_tokens``. A label whose request
        raised is reported with ``print`` and left out of the result, so
        callers should check for missing labels before indexing into it.
    """
    rv = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_label = {
            executor.submit(get_tokens, label): label
            for label in label_concepts
        }
        for future in tqdm(concurrent.futures.as_completed(future_to_label), total=len(label_concepts)):
            label = future_to_label[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (label, exc))
            else:
                rv[label] = data
    return rv

In [None]:
def insert_concept_label_tokens(label_concepts, label_tokens, type_="preflabel"):
    """Insert one ``concept_label_tokens`` document per label, in chunks of 1,000.

    Parameters
    ----------
    label_concepts : dict
        label text -> concept identifier (stored as ``str(concept_id)``).
    label_tokens : dict
        label text -> tokens for that label. Must contain every key of
        ``label_concepts``; a missing label raises ``KeyError``.
    type_ : str
        Stored in each document's ``type`` field.

    Raises
    ------
    AssertionError
        If a chunk inserts fewer documents than it contains.
    """
    reqs = [
        InsertOne({
            "concept": str(concept_id),
            "label": label,
            "type": type_,
            "tokens": label_tokens[label],
        })
        for label, concept_id in label_concepts.items()
    ]

    chunksize = 1_000
    # ceil() is the exact chunk count; `len // chunksize + 1` overcounts on exact multiples.
    n_chunks = ceil(len(reqs) / chunksize)
    for i, chunk in enumerate(tqdm(partition_all(chunksize, reqs), total=n_chunks)):
        # partition_all yields tuples; the original also passed bulk_write a list.
        result = mdb.concept_label_tokens.bulk_write(list(chunk), ordered=False)
        assert result.inserted_count == len(chunk), f"chunk {i} failure"

In [None]:
mdb.drop_collection("concept_label_tokens")

In [None]:
#primary_preflabel_tokens = get_label_tokens_threadpool(primary_preflabel_concepts)

In [None]:
insert_concept_label_tokens(primary_preflabel_concepts, primary_preflabel_tokens, "preflabel")

In [None]:
#merging_preflabel_tokens = get_label_tokens_threadpool(merging_preflabel_concepts)

above took ≈ 13 minutes with 32 threadpool workers.

In [None]:
insert_concept_label_tokens(merging_preflabel_concepts, merging_preflabel_tokens, "preflabel")

In [None]:
#primary_altlabel_tokens = get_label_tokens_threadpool(primary_altlabel_concepts)

In [None]:
insert_concept_label_tokens(primary_altlabel_concepts, primary_altlabel_tokens, "altlabel")

In [None]:
len(mdb.concept_label_tokens.distinct("concept", {"concept": {"$regex": "openalex"}}))

In [None]:
len(merging_preflabel_concepts.values())

In [None]:
len(mdb.concept_label_tokens.distinct("concept", {"concept": {"$regex": "astrothesaurus"}}))

In [None]:
mdb.concept_label_tokens.create_index({"concept": 1, "type": 1, "label": 1}, unique=True)

Whew. Okay, finally: 5. close (not exact) match of merging_preflabel with primary_preflabel or primary_altlabel.

- first pass: merging_preflabel set-of-tokens is same as primary_preflabel set-of-tokens
- second pass: merging_preflabel set-of-tokens is same as primary_altlabel set-of-tokens

optional, only if both merging set-of-tokens and primary set-of-tokens have >= 4 members.

- third pass: merging_preflabel set-of-tokens has >75% intersection with primary_preflabel set-of-tokens
- fourth pass: merging_preflabel set-of-tokens has >75% intersection with primary_altlabel set-of-tokens

In [None]:
merging_preflabel_filter = {"concept": {"$regex": "^https://openalex.org/"}, "type": "preflabel"}
n_mpl = mdb.concept_label_tokens.count_documents(merging_preflabel_filter)

In [None]:
primary_preflabel_filter = {"concept": {"$regex": "^http://astrothesaurus.org/"}, "type": "preflabel"}
n_ppl = mdb.concept_label_tokens.count_documents(primary_preflabel_filter)

In [None]:
primary_altlabel_filter = {"concept": {"$regex": "^http://astrothesaurus.org/"}, "type": "altlabel"}
n_pal = mdb.concept_label_tokens.count_documents(primary_altlabel_filter)

In [None]:
def lower_unless_isupper(s):
    """Return ``s`` lowercased, unless it is all upper-case (a likely acronym), which is kept as-is."""
    if s.isupper():
        return s
    return s.lower()

# Index the primary scheme's labels by normalized token tuple so that matching
# a merging label is a dict lookup (a double for-loop was estimated at 1.5 hrs).
ppl_tokens_concepts = {
    tuple(lower_unless_isupper(tok["token"]) for tok in label_doc["tokens"]): label_doc["concept"]
    for label_doc in mdb.concept_label_tokens.find(primary_preflabel_filter)
}
pal_tokens_concepts = {
    tuple(lower_unless_isupper(tok["token"]) for tok in label_doc["tokens"]): label_doc["concept"]
    for label_doc in mdb.concept_label_tokens.find(primary_altlabel_filter)
}

In [None]:
def _lookup_primary(token_tuples):
    """Return the primary concept for the first tuple that hits an index, or None.

    Each tuple is checked against the prefLabel index first, then the altLabel index.
    """
    for tokens in token_tuples:
        if tokens in ppl_tokens_concepts:
            return ppl_tokens_concepts[tokens]
        if tokens in pal_tokens_concepts:
            return pal_tokens_concepts[tokens]
    return None


mpl_matches = {}

for mpl in tqdm(mdb.concept_label_tokens.find(merging_preflabel_filter), total=n_mpl):
    mpl_tokens = tuple(lower_unless_isupper(i["token"]) for i in mpl["tokens"])
    n_tokens = len(mpl_tokens)

    # Passes run from most to least precise and stop at the first hit. The
    # earlier for/else had no `break`, so its `else` always ran and a
    # near-match could overwrite an exact unordered match.
    match = _lookup_primary([mpl_tokens])  # ordered match
    if match is None and n_tokens <= 8:
        # unordered match; capped at 8 tokens (8! = 40,320 permutations)
        match = _lookup_primary(permutations(mpl_tokens))
        if match is None and n_tokens in range(4, 7):
            # unordered near-match: all tokens but one
            match = _lookup_primary(permutations(mpl_tokens, r=n_tokens - 1))

    if match is not None:
        mpl_matches[mpl["concept"]] = match

In [None]:
len(mpl_matches)

woohoo!

Now need to generate table interface to y/n the candidate matches.

In [None]:
(len(exact_match_preflabel_preflabel) + 
 len(exact_match_preflabel_altlabel) +
 len(exact_match_altlabel_preflabel) + 
 len(exact_match_altlabel_altlabel) +
 len(mpl_matches))

In [None]:
for c_merging, c_primary in exact_match_preflabel_preflabel.items():
    print(c_merging)
    break

In [None]:
# TODO get a mapping done here, in the notebook, so you can go through the whole
#   process in-notebook. And then of course create mapping UI.

In [None]:
str(t_merging.value(URIRef("https://openalex.org/C184779094"), SKOS.definition))

In [None]:
def mapping_expansion(mapping):
    """Turn candidate mappings into human-reviewable table rows.

    ``mapping`` maps a merging concept id to ``(primary concept id, nocheck)``.
    Each row carries both ids, the merging concept's prefLabel with its
    definition in parentheses, the primary concept's prefLabel with its
    altLabels in parentheses, and an ``action`` pre-filled with "k" when
    ``nocheck`` is truthy (empty otherwise). Labels are read from the
    module-level ``t_merging`` and ``t_primary`` graphs.
    """
    rows = []
    for c_merging, (c_primary, nocheck) in mapping.items():
        merging_ref = URIRef(c_merging)
        primary_ref = URIRef(c_primary)
        merging_label = str(t_merging.value(merging_ref, SKOS.prefLabel))
        merging_definition = str(t_merging.value(merging_ref, SKOS.definition))
        primary_label = str(t_primary.value(primary_ref, SKOS.prefLabel))
        primary_altlabels = ', '.join(str(o) for o in t_primary.objects(primary_ref, SKOS.altLabel))
        rows.append({
            "c_merging": str(merging_ref),
            "merging": f"{merging_label} " + f"({merging_definition})",
            "action": "k" if nocheck else "",
            "c_primary": str(primary_ref),
            "primary": f"{primary_label} " + f"({primary_altlabels})",
        })
    return rows

In [None]:
candidates = {}
# Sources are merged from lowest to highest precedence: a later source
# overwrites the entry an earlier one made for the same merging concept.
# nocheck=True marks exact prefLabel-based matches that need no manual review.
sources_by_increasing_precedence = [
    (mpl_matches, False),
    (exact_match_altlabel_altlabel, False),
    (exact_match_altlabel_preflabel, False),
    (exact_match_preflabel_altlabel, True),
    (exact_match_preflabel_preflabel, True),
]
for source, nocheck in sources_by_increasing_precedence:
    candidates.update({str(mc): (str(pc), nocheck) for mc, pc in source.items()})

In [None]:
# Rows to review first (empty action) sort ahead of pre-approved "k" rows,
# then alphabetically by the merging label.
rows = sorted(mapping_expansion(candidates), key=lambda r: (r["action"], r["merging"].lower()))

# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" get an extra "\r" on every row.
with open("mapping_table.csv", "w", newline="") as csvfile:
    fieldnames = ["c_merging", "merging", "action", "primary", "c_primary"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

Evaluated manually, resulted in `oax_uat_mapping_table.csv`.

`n`: not acceptable; `b`: broader (ok for upward posting); `k`: ok, equivalent.

(see p353 of Hedden 2022)

Now, to produce the mapping as RDF:

In [None]:
def printable(s):
    """Return ``s`` with every character outside ``string.printable`` removed."""
    return "".join(ch for ch in s if ch in string.printable)

g_map_oax_uat = Graph()

with open('oax_uat_mapping_table.csv') as f:
    reader = csv.DictReader(f)
    for d in reader:
        d = {printable(k): v for k, v in d.items()}
        predicate = None
        match d["action"]:
            case "k":
                predicate = SKOS.exactMatch
            case "b":
                predicate = SKOS.broadMatch
        if predicate:
            g_map_oax_uat.add((URIRef(d["c_merging"]), predicate, URIRef(d["c_primary"])))

In [None]:
len(list(g_map_oax_uat.subject_objects(SKOS.exactMatch)))

In [None]:
len(list(g_map_oax_uat.subject_objects(SKOS.broadMatch)))

# Query-Ground-Truth Candidate

from https://smd-cms.nasa.gov/wp-content/uploads/2023/11/gapanalysisreport-full-final.pdf (https://doi.org/10.1016/j.asr.2023.06.046):

review on "space weather effects of solar flares": https://doi.org/10.1016/j.jastp.2011.04.004
- 33 references in ADS (36 in OAX).

Let's do it.

In [None]:
# TODO gather these references and put in a candidate data structure
#   for a query-ground-truth candidate

review_article = mdb.oax_works_with_doi_in_ads.find_one({"doi": "https://doi.org/10.1016/j.jastp.2011.04.004"})

In [None]:
review_references =  list(mdb.oax_works_with_doi_in_ads.find({"_id": {"$in": review_article["referenced_works"]}}))

In [None]:
len(review_article["referenced_works"]), len(review_references)

huh. Well, P10 is suddenly very interesting...

In [None]:
query_ground_truth_candidate = {
    "q": "space weather effects of solar flares",
    "relevant": [d["_id"] for d in review_references],
    "type": "binary"
}

In [None]:
query_ground_truth_candidate

# Harmonization-Application Strategies

Defining a default strategy here.

search-box text --> weighted hits on retrieval (primary) scheme --> structured query against corpus using tagging (merging) scheme.

In [None]:
q = "space weather effects of solar flares"

## weighted hits on retrieval (primary) scheme

In [None]:
# Normalize exactly as the keys of ppl_tokens_concepts / pal_tokens_concepts
# were normalized; otherwise a query with capitalized words can never match.
q_tokens = [lower_unless_isupper(i["token"]) for i in get_tokens(q)]
q_tokens

In [None]:
def subslices(seq):
    """Return an iterator over all contiguous non-empty subslices of a sequence.

    subslices('ABCD') --> A AB ABC ABCD B BC BCD C CD D
    """
    # Each (start, stop) pair with start < stop picks one contiguous slice.
    bounds = combinations(range(len(seq) + 1), 2)
    return (seq[start:stop] for start, stop in bounds)

In [None]:
q_tokens_ngrams = [tuple(ss) for ss in subslices(q_tokens)]
q_tokens_ngrams

In [None]:
len(q_tokens_ngrams)

In [None]:
primary_scheme_concept_score = defaultdict(int)

# The loop variable is `ngram`, not `q_tokens`: reusing that name replaced the
# query's token list with the last n-gram, so re-running cells gave different results.
for ngram in tqdm(q_tokens_ngrams):
    if ngram in ppl_tokens_concepts:
        # ordered match with preflabel
        primary_scheme_concept_score[ppl_tokens_concepts[ngram]] += 1e6
    elif ngram in pal_tokens_concepts:
        # ordered match with altlabel
        primary_scheme_concept_score[pal_tokens_concepts[ngram]] += 1e6
    elif len(ngram) <= 8:
        unordered_hit = False
        for p in permutations(ngram):
            if p in ppl_tokens_concepts:
                # unordered match with preflabel
                primary_scheme_concept_score[ppl_tokens_concepts[p]] += 1e5
                unordered_hit = True
            if p in pal_tokens_concepts:
                # unordered match with altlabel
                primary_scheme_concept_score[pal_tokens_concepts[p]] += 1e5
                unordered_hit = True
        # This was a for/else with no `break`, so the near-match pass always
        # ran; it is meant as the fallback when no unordered match exists.
        if not unordered_hit and len(ngram) in range(4, 7):
            for p in permutations(ngram, r=(len(ngram) - 1)):
                if p in ppl_tokens_concepts:
                    # unordered near-match (all tokens but one) with preflabel
                    primary_scheme_concept_score[ppl_tokens_concepts[p]] += 1e4
                elif p in pal_tokens_concepts:
                    # unordered near-match (all tokens but one) with altlabel
                    primary_scheme_concept_score[pal_tokens_concepts[p]] += 1e4

In [None]:
primary_scheme_concept_score

## structured query against corpus using tagging (merging) scheme

In [None]:
def _score_retrieval_concepts(ngrams):
    """Score retrieval-scheme concepts whose label tokens match any of ``ngrams``.

    Looks each n-gram up in the module-level ``ppl_tokens_concepts`` (prefLabel)
    and ``pal_tokens_concepts`` (altLabel) indexes and adds 1e6 for an ordered
    match, 1e5 for an unordered match and 1e4 for an unordered near-match.
    """
    scores = defaultdict(int)
    for ngram in tqdm(ngrams):
        if ngram in ppl_tokens_concepts:
            # ordered match with preflabel
            scores[ppl_tokens_concepts[ngram]] += 1e6
        elif ngram in pal_tokens_concepts:
            # ordered match with altlabel
            scores[pal_tokens_concepts[ngram]] += 1e6
        elif len(ngram) <= 8:
            unordered_hit = False
            for p in permutations(ngram):
                if p in ppl_tokens_concepts:
                    # unordered match with preflabel
                    scores[ppl_tokens_concepts[p]] += 1e5
                    unordered_hit = True
                if p in pal_tokens_concepts:
                    # unordered match with altlabel
                    scores[pal_tokens_concepts[p]] += 1e5
                    unordered_hit = True
            # This was a for/else with no `break`, so the near-match pass always
            # ran; it is meant as the fallback when no unordered match exists.
            if not unordered_hit and len(ngram) in range(4, 7):
                for p in permutations(ngram, r=(len(ngram) - 1)):
                    if p in ppl_tokens_concepts:
                        # unordered near-match (all tokens but one) with preflabel
                        scores[ppl_tokens_concepts[p]] += 1e4
                    elif p in pal_tokens_concepts:
                        # unordered near-match (all tokens but one) with altlabel
                        scores[pal_tokens_concepts[p]] += 1e4
    return scores


def _tagging_concepts_for(g_harmonization, retrieval_concept):
    """Return the tagging-scheme concepts mapped onto ``retrieval_concept``.

    Subjects of skos:exactMatch are preferred; skos:broadMatch subjects are
    used only when there is no exact match. Sorted for a reproducible order.
    """
    target = URIRef(retrieval_concept)
    for predicate in (SKOS.exactMatch, SKOS.broadMatch):
        mapped = sorted(str(s) for s in g_harmonization.subjects(predicate, target))
        if mapped:
            return mapped
    return []


def harmonization_application_strategy(
    query: str, cs_retrieval : Graph, cs_tagging: Graph, g_harmonization: Graph, score_threshold=1e4
):
    """Translate free-text ``query`` into corpus filters via the harmonization.

    The query is tokenized with ``get_tokens``; every contiguous n-gram of its
    tokens is scored against the retrieval scheme's label indexes; concepts
    scoring strictly above ``score_threshold`` are kept and mapped to
    tagging-scheme concepts through ``g_harmonization``.

    Parameters
    ----------
    query : search-box text.
    cs_retrieval, cs_tagging : accepted for interface symmetry; not read here.
    g_harmonization : graph of (tagging concept, skos:exactMatch or
        skos:broadMatch, retrieval concept) triples.
    score_threshold : a concept is kept only if its score exceeds this value.

    Returns
    -------
    (retrieval_scheme_query, tagging_scheme_query)
        Two dicts of the form ``{"filter": {"concepts.id": {"$in": [...]}}}``.
    """
    # Normalize query tokens the same way the label-index keys were normalized.
    q_tokens = [lower_unless_isupper(i["token"]) for i in get_tokens(query)]
    q_tokens_ngrams = [tuple(ss) for ss in subslices(q_tokens)]

    retrieval_scheme_concept_score = _score_retrieval_concepts(q_tokens_ngrams)  # aka primary_scheme_concept_score

    retrieval_scheme_concepts = [
        c for c, score in retrieval_scheme_concept_score.items()
        if score > score_threshold
    ]
    retrieval_scheme_query = {
        "filter": {"concepts.id": {"$in": retrieval_scheme_concepts}},
    }

    # Previously `str(g.value(...)) or str(g.value(...))`: str(None) is the
    # truthy string "None", so the broadMatch fallback never ran and "None"
    # ended up in the query. Graph.value also returns just one subject, while
    # several tagging concepts may map to the same retrieval concept.
    tagging_scheme_concepts = list(dict.fromkeys(
        tagging_concept
        for c in retrieval_scheme_concepts
        for tagging_concept in _tagging_concepts_for(g_harmonization, c)
    ))
    tagging_scheme_query = {"filter": {"concepts.id": {"$in": tagging_scheme_concepts}}}
    return retrieval_scheme_query, tagging_scheme_query

In [None]:
rsq, tsq = harmonization_application_strategy(
    query=q,
    cs_retrieval=g_uat,
    cs_tagging=g_oax,
    g_harmonization=g_map_oax_uat,
)
rsq, tsq

# Performance-measuring jobs

- PN (e.g. P10, P25, P50)
- RN (e.g. R1000, R500, R100)
- MRR?
- NDCG?

In [None]:
corpus = {d["_id"] for d in mdb.oax_works_with_doi_in_ads.find({}, ["_id"])}

In [None]:
len(corpus)

In [None]:
query_ground_truth_candidate

In [None]:
corpus_index = mdb.oax_works_with_doi_in_ads

In [None]:
tsq["limit"] = 100
results = list(corpus_index.find(**tsq))

In [None]:
len(results)