In [2]:
import re
import json
import aiohttp
import asyncio
import numpy as np
import pandas as pd
from tqdm import tqdm

# Prepare datasets

In [3]:
def get_qald_questions(split='test'):
    questions = []

    with open(f'../data/qald/qald_{split}.json', 'r') as f:
        data = json.load(f)['questions']
        
    for item in data:
        for q in item['question']:
            if q['language'] == 'en':
                question = q['string']
                
        questions.append({
            'question': question,
            'gold_entities': [a.split('wd:')[1] for a in re.findall(r'\b(wd:[Q]\d+)\b', item['query']['sparql'])]
        })
        
    return questions

def get_lcquad_questions(split='test'):
    questions = []
    
    with open(f'../data/lcquad/lcquad_2_{split}.json', 'r') as f:
        data = json.load(f)

    for item in data:
        questions.append({
            'question': item['en_question'],
            'gold_entities': [a.split('wd:')[1] for a in re.findall(r'\b(wd:[Q]\d+)\b', item['query'])]
        })
    
    return questions


def get_pat_questions(split='test'):
    with open(f'../data/pat/custom_iid_pat_{split}.json', 'r') as f:
        data = json.load(f)
    
    questions = []
    
    for item in data:
        questions.append({
            'question': item['question'],
            'gold_entities': [item['subject']['subject']]
        })
    
    return questions

# Prepare Entity Linkers

In [4]:
from refined.inference.processor import Refined
import spacy

SPACY_MODEL = spacy.load("en_core_web_md")
SPACY_MODEL.add_pipe("entityLinker", last=True)

def get_spacy_entities(text, nlp=SPACY_MODEL):
    doc = nlp(text)
    
    return [{
        'id': f'Q{ent.get_id()}',
        'url': ent.get_url(),
        'label': ent.get_label(),
        'description': ent.get_description()
    } for ent in doc._.linkedEntities]


REFINED_MODEL = Refined.from_pretrained(model_name='wikipedia_model_with_numbers', entity_set="wikidata")

def get_refined_entities(text, refined=REFINED_MODEL):
    return [{
        'id': span.__dict__['predicted_entity'].wikidata_entity_id,
        'label': span.__dict__['predicted_entity'].wikipedia_entity_title,
        'text': span.__dict__['text'],
        'span': tuple([span.__dict__['start'], span.__dict__['ln']]),
        'score': span.__dict__['entity_linking_model_confidence_score']
        } for span in refined.process_text(text) if span.predicted_entity is not None]

async def get_falcon_entities(text):
    url = 'https://labs.tib.eu/falcon/falcon2/api?mode=long'
    payload = {"text": text}
    headers = {"Content-Type": "application/json"}
    
    retries = 20
    delay = 1  # seconds

    for attempt in range(retries):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(url, headers=headers, data=json.dumps(payload)) as response:
                    if response.status == 200:
                        response_data = await response.json()

                        entities = [
                            entity["URI"].split("/")[-1] for entity in response_data.get("entities_wikidata", [])
                        ]
                        relations = [
                            relation["URI"].split("/")[-1] for relation in response_data.get("relations_wikidata", [])
                        ]

                        return entities
                    else:
                        print(f"Unexpected status code: {response.status}")
        except Exception as e:
            continue
        
        if attempt < retries - 1:
            print(f"Retrying in {delay + attempt} seconds...")
            await asyncio.sleep(delay + attempt)

    print("Failed to retrieve data after multiple attempts.")
    return []

  from .autonotebook import tqdm as notebook_tqdm
  checkpoint = torch.load(io.BytesIO(f.read()), map_location="cpu")


# Prepare metrics

In [5]:
def calculate_metrics(pred, gold):
    em = set(gold) == set(pred)

    true_positives = set(gold) & set(pred) 

    precision = len(true_positives) / len(pred) if pred else 0.0

    # Recall: Proportion of gold entities that are correctly predicted
    recall = len(true_positives) / len(gold) if gold else 0.0

    # F1-Score: Harmonic mean of Precision and Recall
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return {'em': em, 'f1': f1, 'recall': recall}

# Pipeline

In [6]:
linkers = {
    'falcon': lambda question: get_falcon_entities(question),
    'refined': lambda question: [ent['id'] for ent in get_refined_entities(question)],
    'spacy': lambda question: [ent['id'] for ent in get_spacy_entities(question)]
}

datasets = {
    'qald': get_qald_questions,
    'lcquad': get_lcquad_questions,
    'pat': get_pat_questions
}

In [6]:
async def pipeline(name, split):
    dataset = datasets[name](split)
    
    linking_metrics = {linker: {} for linker in linkers.keys()}
    
    for item in tqdm(dataset):
        gold = item['gold_entities']
        
        for linker_name, linker in linkers.items():
            if linker_name == 'falcon':
                item[linker_name] = await linker(item['question'])
            else:
                item[linker_name] = linker(item['question'])
    
            if gold != []:
                for metric, value in calculate_metrics(item[linker_name], gold).items():
                    linking_metrics[linker_name][metric] = linking_metrics[linker_name].get(metric, []) + [value]
    
    for linker in linkers.keys():
        for metric in linking_metrics[linker].keys():
            linking_metrics[linker][metric] = round(np.mean(linking_metrics[linker][metric]), 2)
    
    with open(f'../data/{name}/{name}_{split}_entities.json', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    
    return linking_metrics

In [12]:
# results = {}

for name in list(datasets.keys())[2:]:
    if name != 'lcquad':
        print(name)
        linking_metrics = await pipeline(name, 'test')
        results[name] = pd.DataFrame(linking_metrics)

pat


100%|███████████████████████████████████████| 1233/1233 [47:48<00:00,  2.33s/it]


In [93]:
results['qald']

Unnamed: 0,falcon,refined,spacy
em,0.2,0.48,0.16
f1,0.33,0.65,0.49
recall,0.34,0.63,0.61


In [10]:
results['lcquad']

Unnamed: 0,falcon,refined,spacy
em,0.26,0.36,0.08
f1,0.43,0.57,0.46
recall,0.46,0.59,0.66


In [13]:
results['pat']

Unnamed: 0,falcon,refined,spacy
em,0.11,0.93,0.0
f1,0.35,0.93,0.4
recall,0.48,0.94,0.79
