In [None]:
import os
from urllib.parse import urlencode

import requests

# Search endpoint that every request in this notebook is sent to.
url = "https://ui.adsabs.harvard.edu/v1/search/query"

# Static content-negotiation headers shared by all requests.
headers = {"Accept": "application/json", "Content-type": "application/json"}
# The bearer token is read from the ADS_API_TOKEN environment variable when
# this cell runs. If the variable is unset, os.getenv returns None and the
# header value becomes the literal string "Bearer None".
headers["Authorization"] = "Bearer " + str(os.getenv("ADS_API_TOKEN"))

def params_for(q=""):
    """Build the query-string parameters for one search request.

    Args:
        q: Query string stored under the "q" key (defaults to "").

    Returns:
        A new dict of string-valued parameters: the requested field list
        ("fl"), the highlighting options ("hl", "hl.*"), the sort order
        "score desc", and paging defaults of 25 rows starting at offset 0.
        Callers may overwrite entries such as "rows" and "start".
    """
    requested_fields = (
        "identifier", "[citations]", "reference", "abstract", "author",
        "book_author", "orcid_pub", "orcid_user", "orcid_other", "bibcode",
        "citation_count", "comment", "doi", "id", "keyword", "page",
        "property", "pub", "pub_raw", "pubdate", "pubnote", "read_count",
        "title", "volume", "links_data", "esources", "data",
        "citation_count_norm", "email", "doctype",
    )
    params = {
        "__clearBigQuery": "true",
        "fl": ",".join(requested_fields),
        "q": q,
    }
    # Highlighting options.
    params.update({
        "hl": "true",
        "hl.fl": "title,abstract,body,ack,*",
        "hl.maxAnalyzedChars": "150000",
        "hl.requireFieldMatch": "true",
        "hl.usePhraseHighlighter": "true",
    })
    # Paging and ordering defaults.
    params.update({"rows": "25", "sort": "score desc", "start": "0"})
    return params

def fetch_first_page(q, timeout=60):
    """Fetch the first page of results for ``q`` using the default parameters.

    Args:
        q: Query string handed to ``params_for``.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the response status is not 200. The message holds
            the status code followed by the response body.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    encoded_query = urlencode(params_for(q))
    response = requests.get(
        f"{url}?{encoded_query}", headers=headers, timeout=timeout
    )
    if response.status_code != 200:
        # Include the status code: the body alone is often empty or generic.
        raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
    return response.json()
    

def fetch_first_n(q, n=1000, page_size=200, timeout=60):
    """Fetch up to the first ``n`` results for ``q``, one page per request.

    Pages are requested until ``n`` documents have been collected or the
    server returns a short page (fewer documents than were asked for).

    Args:
        q: Query string handed to ``params_for``.
        n: Maximum number of documents to collect across all pages.
        page_size: Largest number of rows requested per call (default 200).
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        A list with the decoded JSON body of every page, in request order.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
        RuntimeError: If a response status is not 200. The message holds the
            status code followed by the response body.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    if page_size < 1:
        # A non-positive page size could never make progress.
        raise ValueError("page_size must be at least 1")

    responses = []
    n_fetched_total = 0
    start = 0
    while True:
        # Ask only for what is still missing so the total never exceeds n
        # (previously every page was full-sized, e.g. n=250 fetched 400).
        rows = min(page_size, n - n_fetched_total)
        params = params_for(q)
        params["rows"] = str(rows)
        params["start"] = str(start)
        response = requests.get(
            f"{url}?{urlencode(params)}", headers=headers, timeout=timeout
        )
        if response.status_code != 200:
            raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
        rv = response.json()
        responses.append(rv)
        n_fetched = len(rv["response"]["docs"])
        n_fetched_total += n_fetched
        # Stop once enough documents are collected or the result set ran out.
        if n_fetched_total >= n or n_fetched < rows:
            break
        start += rows
        print(f"q: {q} start: {start}...")
    return responses

In [None]:
# Search queries evaluated in this notebook. Each string is sent verbatim as
# the "q" parameter, and the same strings are the keys of
# `query_topic_reviews`, so any edit here must be mirrored there.
queries = (
    'full:"coronal mass ejection"',
    'full:"solar wind"',
    # NOTE(review): the underscore inside this phrase looks unintentional
    # (every other phrase uses spaces) — confirm before changing it.
    'full:"ionospheric_conductivity"',
    'full:"space weather"',
    'full:"geomagnetically induced current"',
    'full:("solar wind" AND magnetosphere AND coupling)',
    'full:(magnetosphere AND ionosphere AND coupling)',
    'full:("interplanetary magnetic field" AND reconnection)',
    'full:"substorm"',
    'full:"particle acceleration"',
    # Disabled alternatives using second-order operators; they have no entry
    # in `query_topic_reviews`.
    #'similar(bibcode:2015AdSpR..55.2745S)',
    #'useful(topn(200,similar(1958ApJ...128..664P)))',
    #'useful(topn(200,similar(1961PhRvL...6...47D)))',
    #'trending(full:"space weather")'
)

In [None]:
# Accumulator for the raw page responses returned by the search requests.
responses = list()

In [None]:
from tqdm import tqdm

# Start from an empty list on every run so that re-executing this cell
# replaces the collected pages instead of appending duplicates of them.
responses = []
for q in tqdm(queries):
    # fetch_first_n returns one JSON body per page; keep them all in one flat list.
    responses.extend(fetch_first_n(q, n=1000))

In [None]:
# Number of raw page responses collected (one entry per request, not per document).
len(responses)

In [None]:
import gzip
import json
    
# Persist the raw page responses as gzip-compressed, UTF-8 encoded JSON.
with gzip.open('query_responses.json.gz', mode='wt', encoding='utf-8') as gz_out:
    gz_out.write(json.dumps(responses))

In [None]:
# Group the returned documents by the query that produced them, attaching each
# document's highlighting snippets (when present) to a copy of the document.
query_analysis = {}

for page in responses:
    page_highlights = page['highlighting']
    page_query = page['responseHeader']['params']['q']
    annotated_docs = []
    for doc in page['response']['docs']:
        annotated = dict(doc)  # shallow copy; the raw response stays untouched
        snippets = page_highlights.get(doc['id'])
        if snippets:
            annotated['highlighting'] = snippets
        annotated_docs.append(annotated)
    # Several pages can share a query, so extend the existing entry if any.
    entry = query_analysis.setdefault(page_query, {"returned": []})
    entry["returned"].extend(annotated_docs)

ADS display template: https://github.com/adsabs/bumblebee/blob/752b9146a404de2cfefebf55cb0cc983907f7519/src/js/widgets/list_of_things/templates/item-template.html

The template relies on custom JS code that preprocesses the API's JSON response, e.g. to produce a `formattedDate` field and a `links` field, the latter derived from several fields of the raw API response.

In [None]:
# Maps each search query (keys are the strings in `queries`) to a list of
# `bibcode:` queries. Each of these is fetched individually further down, and
# the `reference` list of the first returned record is used as the set of
# relevant documents for that search query.
# Presumably these are review articles on the query's topic (per the variable
# name) — confirm the selection criteria with the author.
query_topic_reviews = {
    'full:"coronal mass ejection"': ['bibcode:2017LRSP...14....5K', 'bibcode:2012LRSP....9....3W'],
    'full:"solar wind"': ['bibcode:2021LRSP...18....3V'],
    'full:"ionospheric_conductivity"': ['bibcode:1993JATP...55.1493B', 'bibcode:2008AnGeo..26.3913A', 'bibcode:2012GMS...197..143M', 'bibcode:1956NCim....4S1385C'],
    'full:"space weather"': ['bibcode:2006LRSP....3....2S', 'bibcode:2021LRSP...18....4T', 'bibcode:2007LRSP....4....1P', 'bibcode:2015AdSpR..55.2745S', 'bibcode:2022FrASS...8..253B'],
    'full:"geomagnetically induced current"': ['bibcode:2017SpWea..15..828P', 'bibcode:2017SpWea..15..258B'],
    'full:("solar wind" AND magnetosphere AND coupling)': ['bibcode:2021LRSP...18....3V', 'bibcode:2007LRSP....4....1P', 'bibcode:2022FrASS...908629D'],
    'full:(magnetosphere AND ionosphere AND coupling)': ['bibcode:2007LRSP....4....1P', 'bibcode:2008SSRv..139..235W'],
    'full:("interplanetary magnetic field" AND reconnection)': ['bibcode:2012SSRv..172..187G', 'bibcode:2011SSRv..160...95F', 'bibcode:2021LRSP...18....3V'],
    'full:"substorm"': ['bibcode:2015SSRv..190....1K'],
    'full:"particle acceleration"': ['bibcode:2012SSRv..173..433F', 'bibcode:2012SSRv..173..103M'],
}

In [None]:
# Fetch the record of every topic review listed for each analysed query.
for q, a in tqdm(query_analysis.items(), total=len(query_analysis)):
    print(q, len(a["returned"]))
    # Rebuild the list on every run: appending to a list left over from an
    # earlier execution of this cell would store each record twice.
    topic_review_info = []
    for q_bibcode in query_topic_reviews[q]:
        print(f"fetching {q_bibcode} for {q}")
        topic_review_info.append(fetch_first_page(q_bibcode))
    # Assign only after all lookups for this query succeeded.
    a["topic_review_info"] = topic_review_info

In [None]:
# Collect, per query, the union of the reference lists of its topic reviews.
for q, a in query_analysis.items():
    relevant = set()
    for response in a["topic_review_info"]:
        docs = response['response']['docs']
        # A lookup that matched no record, or whose first record carries no
        # 'reference' field, contributes nothing instead of raising
        # IndexError / KeyError and aborting the whole cell.
        references = (docs[0].get('reference') or []) if docs else []
        if not references:
            print(f"warning: a topic review for {q} yielded no references")
        relevant.update(references)
    # Sorted list: JSON-serialisable and stable across runs (set order is not).
    a["relevant_bibcodes"] = sorted(relevant)

In [None]:
# Flag every returned document that is cited by one of the query's topic reviews.
for q, a in query_analysis.items():
    # Build the lookup set once per query rather than once per document.
    relevant_bibcodes = set(a["relevant_bibcodes"])
    for doc in a['returned']:
        doc_bibcodes = {doc['bibcode']}
        # non-bibcodes don't affect the overlap check below; a document
        # without an 'identifier' field is matched on its bibcode alone.
        doc_bibcodes.update(doc.get('identifier') or ())
        doc["_relevant_as_topic_review_ref"] = not doc_bibcodes.isdisjoint(relevant_bibcodes)

In [None]:
# Per-query relevance summary plus the Mean Reciprocal Rank over all queries.
ranks = []
for q, a in query_analysis.items():
    relevant = [doc["_relevant_as_topic_review_ref"] for doc in a['returned']]
    # A query that returned no documents would otherwise divide by zero.
    relevant_fraction = sum(relevant) / len(relevant) if relevant else 0.0
    print(f"query: '{q}'")
    # NOTE(review): this value is (relevant returned) / (all returned), i.e. a
    # precision-style fraction, although the label reads "R@1000" — confirm
    # which metric is intended.
    print(f"--  R@1000: {relevant_fraction:.2%}")
    # 0-based index of the first relevant document, or None if there is none.
    rank = next((i for i, r in enumerate(relevant) if r), None)
    print(f"--  first relevant result at position: {rank if rank is not None else '>1000'}")
    print()
    ranks.append(rank)
print()
# A query without any relevant result contributes a reciprocal rank of 0;
# previously the None rank raised TypeError in 1/(r+1).
reciprocal_ranks = [0.0 if r is None else 1 / (r + 1) for r in ranks]
mean_reciprocal_rank = (
    sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0
)
print("Mean Reciprocal Rank (MRR):", mean_reciprocal_rank)

In [None]:
import gzip
import json
    
# Persist the per-query analysis as gzip-compressed, UTF-8 encoded JSON.
with gzip.open('query_analysis.json.gz', mode='wt', encoding='utf-8') as gz_out:
    gz_out.write(json.dumps(query_analysis))

In [None]:
import gzip
import json

# Read the saved analysis back from its gzip-compressed JSON file.
with gzip.open('query_analysis.json.gz', mode='rt', encoding='utf-8') as gz_in:
    qa_loaded = json.loads(gz_in.read())

# Placeholder loop over the loaded per-query entries; it performs no work yet.
for q, a in qa_loaded.items():
    pass