In [None]:
from docuverse.engines import SearchResult
from docuverse.engines import SearchEngine
from docuverse.engines.search_engine_config_params import DocUVerseConfig
from docuverse.utils import get_param
import pickle

from docuverse.utils.elastic.elastic_ingestion import normalize_text

In [None]:
# Build the search engine from the experiment's YAML configuration file.
config = "experiments/zdocs/milvus_dense.granite-125m.test.yaml"
docuverse_config = DocUVerseConfig(config)
engine = SearchEngine(docuverse_config)

In [None]:
# Load the search output through the engine; later cells index it per question.
results = engine.read_output()

In [None]:
# Keep the first result around for interactive inspection.
res = results[0]

In [None]:
# Size of the first result (`res` is `results[0]`).
len(res)

In [None]:
# Per result: URLs of the returned documents, the question's gold URLs, and the question text.
urls = [
    [get_param(doc, 'metadata.url') for doc in hits]
    for hits in results
]
qurls = [entry.question.get('metadata')['gold-urls'] for entry in results]
qtext = [entry.question.text for entry in results]

In [None]:
# Load the corpus; the indexing cell below reads each item's 'id' and 'metadata'.
data = engine.read_data()

In [None]:
import re
# A dotted number with one to three components, immediately followed by a literal '?'.
VERSION_PATTERN = re.compile(r'(\d+(\.\d+)?(\.\d+)?)\?')


def normalize_url(url):
    """Return `url` in its canonical form.

    Two rewrites are applied, in this order:
      1. every version number directly followed by '?' becomes 'latest?';
      2. the '#...' fragment, if any, is removed.

    Args:
        url: the URL string to normalize.

    Returns:
        The normalized URL string.
    """
    versionless = VERSION_PATTERN.sub('latest?', url)
    without_fragment = re.sub(r'#.*', '', versionless)
    return without_fragment

In [None]:
# Lookup tables filled by the indexing loop in this cell:
#   url2id           : URL (raw or normalized) -> {document id: count}
#   id2pos           : document id -> position of the document in `data`
#   reverse_norm_map : normalized URL -> {raw URL: count}
url2id, id2pos, reverse_norm_map = {}, {}, {}
def add_key(ddict, key1, key2):
    """Increment the counter stored at `ddict[key1][key2]`, creating it if needed.

    Args:
        ddict: two-level dict of the form {key1: {key2: count}}; updated in place.
        key1: outer key.
        key2: inner key whose count is incremented (starting at 1).
    """
    counts = ddict.setdefault(key1, {})
    counts[key2] = counts.get(key2, 0) + 1

# Index the corpus: register every document id under both its raw and its
# normalized URL, remember which raw URLs collapse onto each normalized URL,
# and record the position of every document in `data`.
for pos, d in enumerate(data):
    url = d['metadata']['url']
    add_key(url2id, url, d['id'])
    norm_url = normalize_url(url)
    # The normalized URL is also stored on the document itself (mutates `data`).
    d['metadata']['norm_url'] = norm_url
    # When `url` is already normalized this hits the same key a second time, so
    # that count is doubled; the cells shown here only consume the keys.
    add_key(url2id, norm_url, d['id'])
    # Use the shared counter helper so that every raw URL starts at 1. The
    # previous hand-written branch started a new raw URL under an existing
    # normalized URL at 2.
    add_key(reverse_norm_map, norm_url, url)

    id2pos[d['id']] = pos

In [None]:
# Sanity check: all documents registered under one normalized URL are expected
# to sit at consecutive positions in `data`. Report every URL where they do not.
num_errors = 0
for url in reverse_norm_map:
    # Despite the name, `ids` holds positions in `data` (looked up via id2pos).
    ids = [id2pos[doc_id] for doc_id in url2id[url]]
    # Compare against first, first+1, ... instead of an index loop, which used
    # `id` as loop variable and shadowed the builtin for the rest of the notebook.
    if ids and ids != list(range(ids[0], ids[0] + len(ids))):
        print(f"Error with url {url}, id list {ids}, urls: {list(reverse_norm_map[url].keys())}")
        num_errors += 1
print(f"Number of errors: {num_errors}")

In [None]:
from rouge_score.rouge_scorer import RougeScorer
# Shared scorer used by compute_rouge_matches (ROUGE-1 and ROUGE-L, with stemming).
rouge_scorer = RougeScorer(
    ['rouge1', 'rougeL'],
    use_stemmer=True,
)
def compute_rouge_matches(result, thr=0.9):
    """Find the answers in `result` that cover the gold answer well enough.

    Each answer's text is scored against the question's gold answer
    (`result.question.get('metadata.answer')`) with the module-level
    `rouge_scorer`; only the ROUGE-1 recall is used.

    Args:
        result: iterable and indexable collection of answers; each answer exposes
            `.text` and `["id"]`, and `result.question` holds the question.
        thr: minimum ROUGE-1 recall an answer needs to be kept.

    Returns:
        A list of `{'id': ..., 'score': ...}` dicts for the answers with
        recall >= `thr`, ordered by decreasing recall (ties keep their
        original order).
    """
    gold = result.question.get('metadata.answer')
    recalls = [
        rouge_scorer.score(gold, answer.text)['rouge1'].recall
        for answer in result
    ]
    # sorted() is stable, also with reverse=True, so equal recalls stay in input order.
    ranked = sorted(enumerate(recalls), key=lambda pair: pair[1], reverse=True)
    return [
        {'id': result[idx]["id"], 'score': recall}
        for idx, recall in ranked
        if recall >= thr
    ]

In [None]:
# Matches for the first result with an explicit recall threshold.
compute_rouge_matches(res, thr=0.9)

In [None]:
# Matches for the fourth result, using the default threshold (thr=0.9).
compute_rouge_matches(results[3])

In [None]:
import json

# Load the benchmark questions: one JSON object per line. The file is opened
# in a `with` block so the handle is closed, and blank lines are skipped
# instead of crashing json.loads.
with open("../benchmark/zdocs/questions_all.jsonl") as questions_file:
    questions = [json.loads(line) for line in questions_file if line.strip()]

In [None]:
# Peek at the gold URLs of the first question.
questions[0]['metadata']['gold-urls']

In [None]:
from tqdm.notebook import tqdm
from docuverse.utils import get_orig_docid
from docuverse.utils import parallel_process
import copy
# Positions of the questions for which no relevant document could be found.
not_found = []
# Work on a deep copy so that `questions` itself stays untouched.
outq = copy.deepcopy(questions)

def append(ll, vals):
    """Append the values of `vals` to `ll` as ints, skipping ones already present.

    Each value is converted with `int()` *before* the membership test.
    Testing the raw value (e.g. the string "5") against a list of ints never
    matches, which let duplicates through.

    Args:
        ll: list of ints, extended in place.
        vals: iterable of values accepted by `int()`.
    """
    for v in vals:
        value = int(v)
        if value not in ll:
            ll.append(value)

def process_query(data):
    """Attach relevance information to one question, in place.

    Relevant documents are collected from two sources: the documents indexed
    under the question's (normalized) gold URLs, and the retrieved answers
    whose ROUGE-1 recall against the gold answer passes the default threshold
    of `compute_rouge_matches`.

    Args:
        data: a `(question, result)` pair. `question` is a dict whose
            `metadata['gold-urls']` is a ';'-separated string of URLs;
            `result` is the matching search result.

    Side effects on `question`:
        metadata['gold-urls']      -> list of the individual gold URLs
        metadata['norm-gold-urls'] -> the same URLs after `normalize_url`
        metadata['rouge_scores']   -> scores of the ROUGE matches (may be empty)
        question['relevant']       -> collected relevant document ids

    Returns:
        The list stored in `question['relevant']`. It is empty when nothing was
        found, which lets the caller track misses. The function itself no
        longer reads the caller's loop index from the global namespace.
    """
    q, result = data[0], data[1]
    metadata = q['metadata']
    gold_urls = metadata['gold-urls'].split(";")
    norm_gold_urls = [normalize_url(gold_url) for gold_url in gold_urls]

    relevant = []
    for norm_gold_url in norm_gold_urls:
        # Unknown URLs simply contribute no documents.
        append(relevant, [get_orig_docid(doc_id) for doc_id in url2id.get(norm_gold_url, ())])
    metadata['norm-gold-urls'] = norm_gold_urls
    metadata['gold-urls'] = gold_urls

    matches = compute_rouge_matches(result)
    append(relevant, [get_orig_docid(m['id']) for m in matches])
    metadata['rouge_scores'] = [m['score'] for m in matches]

    q['relevant'] = relevant
    return relevant


# NOTE(review): zip() pairs questions and results by position and silently stops
# at the shorter one — confirm both sequences are aligned and of equal length.
for i, question_and_result in tqdm(enumerate(zip(outq, results)), total=len(outq)):
    if not process_query(question_and_result):
        not_found.append(i)

In [None]:
# Questions that ended up with more than one gold URL.
[
    question
    for question in outq
    if len(question['metadata']['gold-urls']) > 1
]

In [None]:
# Dump the annotated questions as JSON lines (one question per line).
with open("../benchmark/zdocs/questions_all_fixed_new.jsonl", "w") as f:
    for q in outq:
        print(json.dumps(q), file=f)

In [None]:
# Save the gold URLs of every answerable question (non-empty answer) for which
# no relevant document was found.
# process_query stores the relevant ids at q['relevant'] (not under 'metadata')
# and turns 'gold-urls' into a list, so read that key and re-join the URLs with
# ';' before writing. The file is closed by the `with` block.
# NOTE(review): this path has no '../' prefix, unlike the questions files used
# in other cells — confirm the intended working directory.
with open("benchmark/zdocs/missing_urls.new.txt", "w") as missing_urls_file:
    for missing_q in outq:
        if missing_q.get('relevant') or missing_q['metadata']['answer'] == "":
            continue
        gold = missing_q['metadata']['gold-urls']
        gold_line = ";".join(gold) if isinstance(gold, list) else gold
        missing_urls_file.write(gold_line + "\n")

In [None]:
# Fix the questions' urls: remove the '#...' tags.
# NOTE(review): the output name matches the file written by the jsonl dump cell
# (there with a '../' prefix) — confirm that overwriting it is intended.
import json
qfile = "benchmark/zdocs/questions_all_fixed.jsonl"
qfile_fixed = "benchmark/zdocs/questions_all_fixed_new.jsonl"
with open(qfile) as src:
    qs = src.readlines()
with open(qfile_fixed, "w") as out:
    for line in qs:
        q = json.loads(line)
        url = q['metadata']['gold-urls']
        if '#' in url:
            # Strip the tag from each ';'-separated URL on its own, so that URLs
            # listed after the first '#' are not lost, and write the result back
            # to the real 'gold-urls' key (it used to go to a stray 'gold_urls').
            q['metadata']['gold-urls'] = ";".join(
                single_url.split('#', 1)[0] for single_url in url.split(';')
            )
        out.write(json.dumps(q)+"\n")

Fix the questions to have the 'latest' tag (according to Jaydeep, this was agreed with the Z team).

In [None]:
# Reload the questions through the engine (replaces the list parsed from the jsonl file).
questions = engine.read_questions()

In [None]:
# Spot-check one question: is its gold URL indexed (raw and normalized), and
# which retrieved document matches its gold answer best?
qid = 0
url = questions[qid]['metadata']['gold-urls']
norm_url = normalize_url(url)
# The first line reports the raw URL; it used to test the normalized one twice.
print(f"Url in url2id: {url in url2id}")
print(f"Normalized url in url2id: {norm_url in url2id}")
found = compute_rouge_matches(results[qid], 0.9)
if found:
    # Matches are ordered by decreasing score, so the first one is the best.
    found_id = found[0]['id']
    print(found_id)
    print(f"Id for results[{qid}]: {found_id}, \nurl:{data[id2pos[found_id]]['metadata']['url']}")
else:
    # No answer reached the threshold; avoid an IndexError on found[0].
    print(f"No ROUGE match found for results[{qid}]")