In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pprint import pprint

from terminusdb_client import WOQLQuery as WQ

from ads_query_eval.config import get_terminus_client, get_terminus_config
from ads_query_eval.app.bootstrap import QUERIES

# Client for the TerminusDB instance that the rest of this notebook treats as "local".
client = get_terminus_client()

Checking for 200 response from http://terminus:6363/api/info...


In [None]:
from terminusdb_client import WOQLClient

# Second client, pointed at the deployed TerminusDB server. It connects as "admin"
# using the database id and admin password taken from the project config.
config = get_terminus_config()
remote_client = WOQLClient(server_url="https://ads-query-eval-terminus.polyneme.xyz")
remote_client.connect(db=config["dbid"], user="admin", key=config["admin_pass"])

# troubleshoot no-abstract and no-highlighting returns

In [6]:
from ads_query_eval.frame import s3
from ads_query_eval.config import get_s3_client

s3_client = get_s3_client()
# Load the "items_top25__<s3_key>" JSON object for one retrieval.
# NOTE(review): `retrieval` is not assigned in any cell visible in this notebook, so
# this cell presumably relied on leftover kernel state — confirm and define it
# explicitly before re-running.
items_content = s3.get_json(
        client=s3_client,
        key="items_top25__" + retrieval.s3_key,
    )

In [23]:
import requests

# Download one "items_all__..." JSON export directly from the CDN.
# The timeout keeps a stalled connection from hanging the kernel indefinitely, and
# raise_for_status() fails here on a non-2xx response instead of letting an error
# body reach `rv.json()` in the next cell.
rv = requests.get(
    "https://polyneme.nyc3.cdn.digitaloceanspaces.com/ads-query-eval/items_all__useful%28topn%28200,similar%281961PhRvL...6...47D%29%29%29.2022-08-24.json.gz",
    timeout=60,
)
rv.raise_for_status()

In [24]:
# Decode the response body as JSON and display how many top-level entries it has.
data = rv.json()
len(data)

1000

In [27]:
# Does the first entry carry an "abstract" field at all?
"abstract" in data[0]

False

# scratch

In [None]:
# Commented-out call that would replace the remote "Operation" schema document with
# the local definition (it reads `local_schema`, which is assigned in a later cell).
#remote_client.replace_document(local_schema["Operation"], graph_type="schema", commit_msg="add optional status_reason")
# Snapshot of the class definitions currently known to the remote database.
remote_schema = remote_client.get_existing_classes()

In [None]:
# Snapshot of the class definitions currently known to the local database.
local_schema = client.get_existing_classes()

In [None]:
from pprint import pprint

# Report every class whose definition differs between the local and remote schemas.
# Walk the union of class names and read both sides with `.get()`, so a class that
# exists on only one side is reported (shown as `None` on the other) instead of
# raising KeyError (local-only) or being silently skipped (remote-only).
for cls in sorted(set(local_schema) | set(remote_schema)):
    local_defn = local_schema.get(cls)
    remote_defn = remote_schema.get(cls)
    if local_defn != remote_defn:
        print(cls)
        print("local:")
        pprint(local_defn)
        print("upstream:")
        pprint(remote_defn)
        print()

In [None]:
# document types that need new @id values and/or updated references:
# Retrieval @id:lexical(s3_key)
# RetrievedItem @id:hash(ads_bibcode,retrieval) retrieval:Retrieval
# Retrieval items:list[RetrievedItem]
# Evaluation retrieval:Retrieval
# ItemOfEvaluation @id:hash(evaluation,retrieved_item) retrieved_item:RetrievedItem

1. copy local Retrieval docs, remove items, then insert no-items docs into remote
1. insert local RetrievedItem docs into remote
1. replace remote Retrieval docs with local docs (now that RetrievedItem docs exist)
1. correct retrieval references of remote Evaluation docs
1. create updated ItemOfEvaluation docs and insert into remote

### copy local Retrieval docs, remove items, then insert no-items docs into remote

In [None]:
# Every local Retrieval document, with its `items` list blanked out.
docs = [
    {**retrieval_doc, "items": []}
    for retrieval_doc in client.get_documents_by_type("Retrieval")
]

`docs` is now all local Retrieval docs, with `.items` set to `[]`

In [None]:
# Push the current contents of `docs` to the remote database in a single commit.
remote_client.insert_document(docs, commit_msg="insert corrected retrievals")

I eventually want to delete the above, but perhaps there are evaluations tied to them?

### insert local RetrievedItem docs into remote

In [None]:


# Materialize every local RetrievedItem document into a list.
docs = list(client.get_documents_by_type("RetrievedItem"))

In [None]:
from tqdm.notebook import tqdm
from toolz import partition_all

# Insert `docs` into the remote database in fixed-size batches, one commit per batch.
partition_size = 1000
# Ceiling division: number of batches needed to cover `docs`. Computing it here
# (instead of hard-coding it in the message) keeps the commit messages correct for
# any number of documents and lets tqdm show real progress.
n_partitions = (len(docs) + partition_size - 1) // partition_size
# Count from 1 so the final commit message reads "partition N of N".
for i, p in tqdm(enumerate(partition_all(partition_size, docs), start=1), total=n_partitions):
    remote_client.insert_document(list(p), commit_msg=f"insert corrected retrieved items (partition {i} of {n_partitions})")

### replace remote Retrieval docs with local docs (now that RetrievedItem docs exist)

In [None]:
# Overwrite each remote Retrieval with the full local document, one commit per document.
local_retrieval_docs = list(client.get_documents_by_type("Retrieval"))
for retrieval_doc in tqdm(local_retrieval_docs):
    remote_client.replace_document(
        retrieval_doc,
        commit_msg="insert corrected retrieval with items",
    )

The full corrected docs have been pushed. What about the "incorrect" retrievals upstream?

In [None]:
# Ids of all local Retrieval documents.
ids_ldocs = {ldoc["@id"] for ldoc in client.get_documents_by_type("Retrieval")}
# Remote Retrieval documents whose id has no local counterpart.
rdocs_old = [
    rdoc
    for rdoc in remote_client.get_documents_by_type("Retrieval")
    if rdoc["@id"] not in ids_ldocs
]
ids_rdocs_old = {rdoc["@id"] for rdoc in rdocs_old}
print(len(rdocs_old))
ids_rdocs_old

### correct retrieval references of remote Evaluation docs

In [None]:
# Placeholder; the next cell assigns the actual old-id -> new-id mapping.
retrieval_ids = {}

# Display, for every remote Evaluation, the retrieval it references ("v:r") and that
# retrieval's `done_at` value.
(WQ()
 .triple("v:e", "type", "@schema:Evaluation")
 .triple("v:e", "retrieval", "v:r")
 .triple("v:r", "done_at", "v:done_at")
 .execute(remote_client)["bindings"]
)

In [None]:
# Hand-written mapping from a date-prefixed Retrieval id (key) to the Retrieval id
# ending in ".json.gz" (value) for the same query string.
retrieval_ids = {
    'Retrieval/2022-08-24_full%3A%22coronal%20mass%20ejection%22': 'Retrieval/full%3A%22coronal%20mass%20ejection%22.2022-08-24.json.gz',
    'Retrieval/2022-08-24_full%3A%22geomagnetically%20induced%20current%22': 'Retrieval/full%3A%22geomagnetically%20induced%20current%22.2022-08-24.json.gz',
}

In [None]:
# Remote Evaluation documents whose `retrieval` is a key of `retrieval_ids`,
# rewritten to reference the mapped Retrieval id instead.
docs = [
    {**evaluation, "retrieval": retrieval_ids[evaluation["retrieval"]]}
    for evaluation in remote_client.get_documents_by_type("Evaluation")
    if evaluation["retrieval"] in retrieval_ids
]

remote_client.replace_document(docs, commit_msg="new retrieval refs")

What remote retrievals have evaluations?

In [None]:
def bindings(client, woql_query):
    """Execute `woql_query` against `client` and return the result's "bindings" entry."""
    result = woql_query.execute(client)
    return result["bindings"]

In [None]:
# Which retrievals are referenced by at least one remote Evaluation?
evaluated_retrievals_query = (
    WQ()
    .select("v:retrieval")
    .triple("v:eval", "type", "@schema:Evaluation")
    .triple("v:eval", "retrieval", "v:retrieval")
)
bindings(remote_client, evaluated_retrievals_query)
    

### create updated ItemOfEvaluation docs and insert into remote

document types that need new @id values and/or updated references:

- [x] Retrieval @id:lexical(s3_key)
- [x] RetrievedItem @id:hash(ads_bibcode,retrieval) retrieval:Retrieval
- [x] Retrieval items:list[RetrievedItem]
- [x] Evaluation retrieval:Retrieval
- [ ] ItemOfEvaluation @id:hash(evaluation,retrieved_item) retrieved_item:RetrievedItem


What evaluations do I care about? The ones I want to ensure correct ItemOfEvaluation docs for?

I care about Evaluation docs with a retrieval ID in `ids_ldocs`

In [None]:
# Remote Evaluations whose retrieval is one of the local Retrieval ids.
local_retrieval_iris = [WQ().iri(retrieval_id) for retrieval_id in ids_ldocs]
evals_of_interest_query = (
    WQ()
    .select("v:eval")
    .triple("v:eval", "type", "@schema:Evaluation")
    .triple("v:eval", "retrieval", "v:retrieval")
    .member("v:retrieval", local_retrieval_iris)
)
bindings(remote_client, evals_of_interest_query)

Kill all other evaluations

In [None]:
# Remote Evaluations whose retrieval is NOT one of the local Retrieval ids.
local_retrieval_iris = [WQ().iri(retrieval_id) for retrieval_id in ids_ldocs]
other_evals_query = (
    WQ()
    .select("v:eval")
    .triple("v:eval", "type", "@schema:Evaluation")
    .triple("v:eval", "retrieval", "v:retrieval")
    .woql_not(WQ().member("v:retrieval", local_retrieval_iris))
)
other_evals = bindings(remote_client, other_evals_query)
other_evals

In [None]:
# Delete the Evaluations collected in `other_evals`. A commit message is supplied,
# as for the other remote writes in this notebook, so the remote commit log records
# why these documents were removed.
remote_client.delete_document(
    [d["eval"] for d in other_evals],
    commit_msg="delete evaluations whose retrieval has no local counterpart",
)

Now, do I care about Retrieval docs unreferenced by an Evaluation doc? Yes. Some retrievals have not been evaluated!

An ItemOfEvaluation doc has a `retrieved_item` reference. The referenced RetrievedItem will have an `ads_bibcode` and a `retrieval`.

For each reference to a RetrievedItem, I want to return a reference to an equivalent RetrievedItem --
that is, a RetrievedItem with a matching ads_bibcode and with a retrieval for a matching query.

In [None]:
from ads_query_eval.lib.util import pick

# NOTE(review): unfinished scratch — the loop only rebinds `id_retrieved_item` on each
# pass, and nothing here uses the `pick` import made at the top of this cell.
for doc in remote_client.get_documents_by_type("ItemOfEvaluation"):
    id_retrieved_item = doc["retrieved_item"]

How many ItemOfEvaluation docs interest me?

In [None]:
# Count the ItemOfEvaluation documents that are attached to an Evaluation.
ioe_count_query = (
    WQ()
    .count("v:count")
    .select("v:ioe")
    .triple("v:eval", "type", "@schema:Evaluation")
    .triple("v:ioe", "type", "@schema:ItemOfEvaluation")
    .triple("v:ioe", "evaluation", "v:eval")
)
bindings(remote_client, ioe_count_query)

All of them. Right.

In [None]:
# For each remote ItemOfEvaluation, find an "equivalent" RetrievedItem: a different
# item with the same ads_bibcode, belonging to a different Retrieval that has the same
# query and the same YYYY-MM-DD-shaped substring in its id.
stuffs = bindings(remote_client, WQ().select("v:ioe", "v:new_ioe", "v:retrieval", "v:retrieved_item", "v:equiv_retrieved_item", "v:equiv_retrieval").woql_and(
    # The existing ItemOfEvaluation, the item it references, and that item's
    # bibcode and retrieval.
    WQ().triple("v:ioe", "type", "@schema:ItemOfEvaluation"),
    WQ().triple("v:ioe", "retrieved_item", "v:retrieved_item"),
    WQ().triple("v:retrieved_item", "ads_bibcode", "v:ads_bibcode"),
    WQ().triple("v:retrieved_item", "retrieval", "v:retrieval"),
    # Candidate equivalent: same bibcode, but a different item in a different retrieval.
    WQ().triple("v:equiv_retrieved_item", "ads_bibcode", "v:ads_bibcode"),
    WQ().woql_not(WQ().eq("v:equiv_retrieved_item", "v:retrieved_item")),
    WQ().triple("v:equiv_retrieved_item", "retrieval", "v:equiv_retrieval"),
    WQ().woql_not(WQ().eq("v:retrieval", "v:equiv_retrieval")),
    # Both retrieval ids, cast to strings, must yield the same date-pattern match.
    WQ().cast("v:retrieval", "xsd:string", "v:retrieval_id_as_str"),
    WQ().regexp(r"\d{4}-\d{2}-\d{2}", "v:retrieval_id_as_str", "v:retrieval_id_date_str"),
    WQ().cast("v:equiv_retrieval", "xsd:string", "v:equiv_retrieval_id_as_str"),
    WQ().regexp(r"\d{4}-\d{2}-\d{2}", "v:equiv_retrieval_id_as_str", "v:equiv_retrieval_id_date_str"),
    WQ().eq("v:retrieval_id_date_str", "v:equiv_retrieval_id_date_str"),
    # ...and both retrievals must share the same `query` value.
    WQ().triple("v:retrieval", "query", "v:query").triple("v:equiv_retrieval", "query",  "v:query"),
    WQ().triple("v:ioe", "evaluation", "v:evaluation"),
    # Presumably derives the id a re-pointed ItemOfEvaluation would get from
    # (evaluation, equivalent item) — TODO confirm; `new_ioe` is not read by any
    # later cell visible here.
    WQ().unique("ItemOfEvaluation/", ["v:evaluation", "v:equiv_retrieved_item"], "v:new_ioe"),
    
))
# Check the direction of every match: the currently referenced retrieval id must not
# end with ".json.gz", and the equivalent one must.
for s in stuffs:
    assert not s["retrieval"].endswith(".json.gz"), s
    assert s["equiv_retrieval"].endswith(".json.gz"), s

In [None]:
# Number of (ItemOfEvaluation, equivalent item) matches returned by the query.
len(stuffs)

Wazzaaaaaaap!!

In [None]:
# For every match, fetch the existing ItemOfEvaluation, point it at the equivalent
# retrieved item, and strip its "@id" (remembering the old id separately).
new_ioe_docs, ids_old_ioe_docs = [], []
for match in stuffs:
    ioe_doc = remote_client.get_document(match["ioe"])
    ioe_doc["retrieved_item"] = match["equiv_retrieved_item"]
    old_ioe_id = ioe_doc.pop("@id")
    ids_old_ioe_docs.append(old_ioe_id)
    new_ioe_docs.append(ioe_doc)

In [None]:
# Both lists are appended to once per match, so the two lengths should agree.
len(new_ioe_docs), len(ids_old_ioe_docs)

In [None]:
# Insert the re-pointed ItemOfEvaluation documents (their "@id" was stripped when
# they were built). A commit message is supplied, as for the other remote writes in
# this notebook, so the remote commit log explains the change.
remote_client.insert_document(
    new_ioe_docs,
    commit_msg="insert ItemOfEvaluation docs re-pointed at corrected retrieved items",
)

In [None]:
# Remove the ItemOfEvaluation documents whose ids were collected in
# `ids_old_ioe_docs`. A commit message is supplied, as for the other remote writes in
# this notebook, so the remote commit log explains the change.
remote_client.delete_document(
    ids_old_ioe_docs,
    commit_msg="delete superseded ItemOfEvaluation docs",
)

Okay, now, RetrievedItem docs that reference Retrieval docs with IDs that don't end with ".json.gz" (*takes breath*) can be deleted

# delete stale Retrieval and RetrievedItem docs

In [None]:
# Collect every (retrieved_item, retrieval) pair whose retrieval id, cast to a string,
# does NOT match the `\.json\.gz$` pattern.
bs = bindings(remote_client, WQ().woql_and(
    WQ().triple("v:retrieved_item", "retrieval", "v:retrieval"),
    WQ().cast("v:retrieval", "xsd:string", "v:retrieval_id_as_str"),
    WQ().woql_not(WQ().regexp(r"\.json\.gz$", "v:retrieval_id_as_str", "v:good_retrieval")),
))

In [None]:
# Number of bindings returned, i.e. retrieved items attached to a stale retrieval.
len(bs)

In [None]:
# Peek at one binding to see its shape.
bs[0]

In [None]:
# Ids of the retrieved items attached to stale retrievals.
# NOTE(review): `ri_ids` is not read by any later cell visible in this notebook.
ri_ids = [b["retrieved_item"] for b in bs]

In [None]:
# Ask the remote server to optimize the database, then its _meta and _commits graphs.
for optimize_path in (
    "admin/ads-query-eval",
    "admin/ads-query-eval/_meta",
    "admin/ads-query-eval/local/_commits",
):
    remote_client.optimize(optimize_path)

Ah, delete the Retrieval docs first...

In [None]:
# Distinct retrieval ids seen in `bs`. They are sorted because plain set iteration
# order for strings can differ between kernel runs, which would make the later
# `ids_stale_retrievals[0]` pick and the deletion order non-reproducible.
ids_stale_retrievals = sorted({b["retrieval"] for b in bs})

In [None]:
# How many distinct stale retrievals need to be removed.
len(ids_stale_retrievals)

In [None]:
# id_ = ids_stale_retrievals[-1]
# stale_doc = remote_client.get_document(id_)
# stale_doc["items"] = []

# items_bindings = bindings(remote_client, WQ().woql_and(
#     WQ().path(WQ().iri(id_), "items,rdf:rest*,rdf:first", "v:item"),
#     WQ().triple("v:item", "retrieval", "v:retrieval"),
# ))
# remote_client.replace_document(stale_doc, commit_msg=f"purge items from {id_}")
# remote_client.delete_document([d["item"] for d in items_bindings], commit_msg=f"delete items for {id_}")

In [None]:
# WQ().woql_and(
#     WQ().triple("v:retrieved_item", "retrieval", WQ().iri(id_)),
#     WQ().delete_document("v:retrieved_item"),
#     WQ().delete_document(WQ().iri(id_)),
# ).execute(remote_client, commit_msg=f"delete stale Retrieval {id_} and its RetrievalItem docs")

In [None]:
# Pick one stale retrieval id to try the query on before looping over all of them.
id_ = ids_stale_retrievals[0]

In [None]:
# Dry run for a single stale retrieval: count the distinct ids the query binds to
# "v:id" — the retrieval itself (v:id2) plus every document whose `retrieval` points
# at it (v:id1). The delete step is left commented out here.
len(WQ().select("v:id").woql_and(
    WQ().eq("v:id2", WQ().iri(id_)),
    (WQ().distinct("v:id")
     .triple("v:id1", "retrieval", "v:id2")
     .woql_or(
         WQ().eq("v:id", "v:id1"),
         WQ().eq("v:id", "v:id2"),
     )
    ),
    #WQ().delete_document("v:id"),
).execute(remote_client)["bindings"])

In [None]:
# For each stale retrieval, run one query (one commit) that binds "v:id" to the
# retrieval itself (v:id2) and to every document whose `retrieval` points at it
# (v:id1), then deletes each bound document.
for id_ in tqdm(ids_stale_retrievals):
    WQ().select("v:id").woql_and(
        WQ().eq("v:id2", WQ().iri(id_)),
        (WQ().distinct("v:id")
         .triple("v:id1", "retrieval", "v:id2")
         .woql_or(
             WQ().eq("v:id", "v:id1"),
             WQ().eq("v:id", "v:id2"),
         )
        ),
        WQ().delete_document("v:id"),
    ).execute(remote_client, commit_msg=f"delete stale Retrieval {id_} and its RetrievalItem docs")

# util

In [None]:
# Print the name of every class in the local schema.
for class_name in client.get_existing_classes():
    print(class_name)

In [None]:
import requests

# Short alias for the client whose connection and credentials the helpers below reuse.
c = remote_client

def tget(url):
    """GET `url` using the headers and auth of the client `c`.

    Calls the client's connection check first, then returns the `requests` response.
    """
    c._check_connection()
    response = requests.get(url, headers=c._default_headers, auth=c._auth())
    return response

def tget_ttl(dbid, which=None):
    """Print, then return, the turtle dump of one graph of database `dbid`.

    Args:
        dbid: database id, interpolated into the triples API URL.
        which: graph to dump; must be "schema" or "instance". The `None` default
            is rejected, so callers always have to choose one explicitly.

    Returns:
        The `requests` response, after its JSON-decoded body has been printed
        line by line.

    Raises:
        ValueError: if `which` is not "schema" or "instance".
        requests.HTTPError: if the server answers with an error status.
    """
    # Explicit check rather than `assert`: it still runs under `python -O` and
    # tells the caller which value was rejected.
    if which not in {"schema", "instance"}:
        raise ValueError(f'which must be "schema" or "instance", got {which!r}')
    rv = tget(f"https://ads-query-eval-terminus.polyneme.xyz/api/triples/admin/{dbid}/local/branch/main/{which}?format=turtle")
    # Surface HTTP failures here instead of an opaque error while decoding the body.
    rv.raise_for_status()
    for line in rv.json().split('\n'):
        print(line)
    return rv

#tget_ttl(c.db, which="schema")
#tget_ttl(c.db, which="instance")