# Combinación de modelos para elaborar la estrategia 3 de la memoria

Importar librerías

In [1]:
from transformers import pipeline
from refined.inference.processor import Refined
import tqdm as notebook_tqdm
import torch
from typing import List
from typing import Dict
from typing import Set
from typing import Iterable
from refined.data_types.doc_types import Doc
from refined.data_types.modelling_types import BatchedElementsTns
from refined.utilities.preprocessing_utils import convert_doc_to_tensors
from refined.data_types.base_types import Span
from refined.utilities.preprocessing_utils import pad
from refined.data_types.modelling_types import ModelReturn
from collections import defaultdict
from refined.utilities.general_utils import round_list
from refined.data_types.base_types import Entity
import re
import pandas as pd
import json
from langchain.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

  from .autonotebook import tqdm as notebook_tqdm
2024-08-13 19:07:50.614033: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-13 19:07:50.616984: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-08-13 19:07:50.616995: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Definoms clase central

In [12]:
class KGBuilder:
    # Models that build the data
    # Entity Linking Model
    refined = Refined.from_pretrained(model_name='wikipedia_model_with_numbers', entity_set='wikidata',use_precomputed_descriptions=True)
    # Relation Extraction Model
    triplet_extractor = pipeline('text2text-generation',model='Babelscape/rebel-large',tokenizer='Babelscape/rebel-large',device='cuda')
    # LLM for relation classification
    #llm = ChatOllama(model="llama3:8b")
    template = """
        Given a relationship between two entities, answer only with 'True' of 'False.
        - 'True' in case the relationship corresponds to reality.
        - 'False' in case that the relationship is wrong. 
        Do not take into consideration the previous classifications. Do not answer randomly, just base the classification in your universe knowledge.
        The input relationship that you have to analize is the following:
        {dict}
    """
    #prompt = ChatPromptTemplate.from_template(template=template)

    #chain = prompt | llm | StrOutputParser()
    
    def __find_all(substring, string):
        return [m.start() for m in re.finditer(re.escape(substring), string)]
    
    def filter_spans(spans:List[Span])->List[Span]:
        spans = [span for span in spans if span.entity_linking_model_confidence_score is not  None and span.entity_linking_model_confidence_score >= 0.75 and span.predicted_entity.wikidata_entity_id is not None]
        i = 0
        while i < len(spans):
            j = i + 1
            while j < len(spans):
                if spans[j].text == spans[i].text and spans[j].predicted_entity.wikidata_entity_id == spans[i].predicted_entity.wikidata_entity_id:
                    del spans[j]

                j += 1
                        
            
            i += 1

        
        return spans
    
    def filter_relationships(triplets:List[Dict])->List[Dict]:
        relations = set()
        for i in triplets:
            relations.add(str(i))

        triplets = []
        for i in relations:
            try:
                triplets.append(json.loads(i.replace("'",'"')))
            except Exception as e:
                pass


        return triplets
    
    def entity_linking(text:str)->List[Span]:
        spans_el = KGBuilder.refined.process_text(text)
        spans_el = KGBuilder.filter_spans(spans_el)
        return spans_el
      
    def relation_extraction(text:str)->List[Dict]:
        triplets = KGBuilder.extract_triplets(KGBuilder.triplet_extractor.tokenizer.batch_decode([KGBuilder.triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])[0])
        triplets = KGBuilder.filter_relationships(triplets)

        return triplets
     
    def link_spans_with_relationships(triplets:List[Dict],spans:List[Span]):
        i = 0
        entities_with_relation = set()
        while i < len(triplets):
            for span in spans:
                if triplets[i]['head'] == span.text:
                    triplets[i]['head'] = span
                    entities_with_relation.add(span)
                
                if triplets[i]['tail'] == span.text:
                    triplets[i]['tail'] = span
                    entities_with_relation.add(span)

            if not isinstance(triplets[i]['head'],Span) or not isinstance(triplets[i]['tail'],Span) or triplets[i]['head'].predicted_entity.wikidata_entity_id == triplets[i]['tail'].predicted_entity.wikidata_entity_id:
                del triplets[i]
                i = i - 1
            

            i += 1

        entities_set = set()
        for span in spans:
            entities_set.add(span)

        entities_with_no_relation = entities_set.difference(entities_with_relation)

        return (triplets,entities_with_no_relation)
    
    def extract_triplets_from_spans_with_no_relationship(text:str,spans_no_relationship:List[str])->List[Dict]:
        model = KGBuilder.triplet_extractor.model
        tokenizer = KGBuilder.triplet_extractor.tokenizer
        new_triplets = []
        gen_kwargs = {
            "max_length": 1024,
            "length_penalty": 1,
            "num_beams": 3,
        }

        model_inputs = tokenizer(text, max_length=1024, padding=True, truncation=True, return_tensors = 'pt')

        for span in  spans_no_relationship:
            if span.predicted_entity is not None:
                output = f"""<s><triplet> {span.text} <subj>"""
                model_outputs = tokenizer(output, max_length=1024, padding=True, truncation=True, return_tensors = 'pt', add_special_tokens=False)
                generated_tokens = model.generate(
                    model_inputs["input_ids"].to(model.device),
                    decoder_input_ids=model_outputs["input_ids"].to(model.device),
                    attention_mask=model_inputs["attention_mask"].to(model.device),
                    bad_words_ids=tokenizer(["<triplet>"], add_special_tokens=False).input_ids, # don't generate <triplet>
                    **gen_kwargs,
                )

                decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

                for _,sentence in enumerate(decoded_preds):
                    new_triplets += KGBuilder.extract_triplets(sentence)

        
        return KGBuilder.filter_relationships(new_triplets)

    def verify_relationships(triplets:List[Dict])->List[Dict]:

        accurrate_triplets = []

        for triplet in  triplets:

            head_entity = triplet['head'].predicted_entity.wikipedia_entity_title if triplet['head'].predicted_entity.wikipedia_entity_title is not None else triplet['head'].text
            tail_entity = triplet['tail'].predicted_entity.wikipedia_entity_title if triplet['tail'].predicted_entity.wikipedia_entity_title is not None else triplet['tail'].text
            dictionary = head_entity+"("+triplet['head'].predicted_entity.wikidata_entity_id+")-"+triplet['type']+"->"+tail_entity +"("+triplet['tail'].predicted_entity.wikidata_entity_id + ")"
            result = KGBuilder.chain.invoke({"dict": dictionary})
            if result == "True":
                accurrate_triplets.append(triplet)
        
        print("Relationships before verification: ",len(triplets))
        print("Relationships after verification: ",len(accurrate_triplets))

        return accurrate_triplets

    def build_graph(self,text:str):
        triplets = KGBuilder.relation_extraction(text)
        spans_el_base = KGBuilder.entity_linking(text)
        triplets_base = triplets.copy()
        spans_el_base_copy = spans_el_base.copy()

        doc,tns = KGBuilder.preprocess_doc_el(text)
        spans_from_re = KGBuilder.extract_entities_from_relation_extraction(triplets,doc.doc_id,text)
        spans_re = KGBuilder.process_doc_el(doc,tns,spans_from_re)
        spans_re_copy = spans_re.copy()
        spans_re = KGBuilder.filter_spans(spans_re)
        

        spans = KGBuilder.filter_spans(spans_el_base + spans_re)
        triplets,spans_no_relationship = KGBuilder.link_spans_with_relationships(triplets,spans)

        new_triplets = KGBuilder.extract_triplets_from_spans_with_no_relationship(text,spans_no_relationship)
        triplets_base_er = new_triplets.copy()
        new_triplets = KGBuilder.link_spans_with_relationships(new_triplets,spans)[0]
        #triplets = KGBuilder.verify_relationships(triplets+new_triplets)
        triplets = triplets + new_triplets
        
        return [spans,triplets,spans_el_base_copy,spans_re_copy,triplets_base,triplets_base_er]
    
    def generate_embedding(token_id_values,attention_mask_values,token_type_values):
        output = KGBuilder.refined.model.transformer(
                        input_ids=token_id_values,
                        attention_mask=attention_mask_values,
                        token_type_ids=token_type_values,
                        position_ids=None,
                        head_mask=None,
                        inputs_embeds=None,
            )

        contextualised_embeddings = output.last_hidden_state
        
        return contextualised_embeddings
    
    def filter_mask_and_sums_from_spans(batch_elements,spans,device):
        ## TO-DO
        for batch_elem in batch_elements:
            index_start_batch = batch_elem.tokens[0].start
            index_end_batch = batch_elem.tokens[len(batch_elem.tokens)-1].end
            spans_in_batch = []

            for span in spans:
                if span.start>= index_start_batch and span.start + len(span.text)-1 <= index_end_batch:
                    spans_in_batch.append(span)

            batch_elem.add_spans(spans_in_batch)

        acc_sums_lst = [
                    [0] + list(map(lambda token: token.acc_sum, elem.tokens)) + [0]
                    for elem in batch_elements
                ]

        max_seq = max([len(batch_elem.tokens) + 2 for batch_elem in batch_elements])
        acc_sums = torch.tensor(
            pad(acc_sums_lst, seq_len=max_seq, pad_value=0), device=device, dtype=torch.long
        )

        b_entity_mask_lst = [elem.entity_mask for elem in batch_elements]
        b_entity_mask = torch.tensor(
            pad(b_entity_mask_lst, seq_len=-1, pad_value=0), device=device, dtype=torch.long
        )

        return acc_sums,b_entity_mask
    
    def filter_candidates(batch_elements,device):
        pem_values: List[List[float]] = []
        candidate_qcodes: List[str] = []
        candidate_qcodes_ints: List[List[int]] = []
        for batch_elem in batch_elements:
            for span in batch_elem.spans:
                pem_values.append(
                    [pem_value for _, pem_value in span.candidate_entities]
                )  # TODO unpad and pad here
                candidate_qcodes.extend(
                    [qcode for qcode, _ in span.candidate_entities]
                )  # should pad here
                # temporary hack (use negative IDs for additional entities IDs to avoid
                # collisions with Wikdata IDs
                candidate_qcodes_ints.append(
                    [int(qcode.replace("Q", "")) if 'Q' in qcode else int(qcode.replace("A", '-')) for qcode, _ in
                        span.candidate_entities]
                )

        num_cands = KGBuilder.refined.preprocessor.max_candidates
        num_ents = len([span for batch_elm in batch_elements for span in batch_elm.spans])
        cand_class_idx = KGBuilder.refined.preprocessor.get_classes_idx_for_qcode_batch(
                    candidate_qcodes, shape=(num_ents, num_cands, -1)
        )

        b_cand_desc_emb = None
        b_cand_desc = None

        b_cand_desc_emb = KGBuilder.refined.preprocessor.get_descriptions_emb_for_qcode_batch(
                        candidate_qcodes, shape=(num_ents, num_cands, -1)
                    ).to(device)
        b_cand_desc = None


        b_candidate_classes = torch.zeros(
                    size=(num_ents, num_cands, KGBuilder.refined.preprocessor.num_classes+1), dtype=torch.float32, device=device
        )
        first_idx = (
            torch.arange(num_ents, device=device)
                .unsqueeze(1)
                .unsqueeze(1)
                .expand(cand_class_idx.size())
        )
        snd_idx = torch.arange(num_cands, device=device).unsqueeze(1)
        b_candidate_classes[first_idx, snd_idx, cand_class_idx] = 1
        b_pem_values = torch.tensor(pem_values, device=device, dtype=torch.float32)
        b_candidate_qcode_values = torch.tensor(
            candidate_qcodes_ints, device=device, dtype=torch.long
        )

        return (b_candidate_qcode_values,b_pem_values,b_candidate_classes,b_cand_desc,b_cand_desc_emb)
    
    def call_model(batch,contextualised_embeddings,acc_sums,b_entity_mask,cand_desc,cand_desc_emb,candidate_pem_values,candidate_classes,device,spans,cand_ids)-> ModelReturn:
        mention_embeddings = KGBuilder.refined.model._get_mention_embeddings(
            sequence_output=contextualised_embeddings,
            token_acc_sums=acc_sums,
            entity_mask=b_entity_mask,
        )

        candidate_entity_targets = batch.candidate_target_values


        class_targets = KGBuilder.refined.model._expand_class_targets(
                    batch.class_target_values, index_tensor=batch.entity_index_mask_values
        )

        description_loss, candidate_description_scores = KGBuilder.refined.model.ed_2(
        candidate_desc=cand_desc,
        mention_embeddings=mention_embeddings,
        candidate_entity_targets=candidate_entity_targets,
        candidate_desc_emb=cand_desc_emb,
        )

        # forward pass of entity typing layer (using predetermined spans if provided else span identified by md layer)
        et_loss, et_activations = KGBuilder.refined.model.entity_typing(
        mention_embeddings=mention_embeddings, span_classes=class_targets
        )

        # forward pass of entity disambiguation layer
        ed_loss, ed_activations = KGBuilder.refined.model.entity_disambiguation(
        class_activations=et_activations.detach() if KGBuilder.refined.model.detach_ed_layer else et_activations,
        candidate_entity_targets=candidate_entity_targets,
        candidate_pem_values=candidate_pem_values,
        candidate_classes=candidate_classes,
        candidate_description_scores=candidate_description_scores.detach(),  # detach or not
        current_device=device,
        )
        
        return ModelReturn(
            None,
            None,
            et_loss,
            et_activations,
            ed_loss,
            ed_activations,
            spans,
            None,
            cand_ids,
            description_loss,
            candidate_description_scores,
        )

    def process_doc_el(doc:Doc,tns:Iterable[BatchedElementsTns],spans:List[Span]):
        for batch_idx,batch in enumerate(tns):
            ## MAYBE THIS CAUSE ERROR
            batch_elements = batch.batch_elements

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            token_id_values = batch.token_id_values.to(device)
            attention_mask_values = batch.attention_mask_values.to(device)
            token_type_values = batch.token_type_values.to(device)

            contextualised_embeddings = KGBuilder.generate_embedding(token_id_values,attention_mask_values,token_type_values)

            person_coreference = dict()

            person_coreference = KGBuilder.refined.preprocessor.add_candidates_to_spans(
                spans,person_coreference=person_coreference
            )
            acc_sums,b_entity_mask = KGBuilder.filter_mask_and_sums_from_spans(batch_elements,spans,device)

            (cand_ids,
            candidate_pem_values,
            candidate_classes,
            cand_desc,
            cand_desc_emb ) = KGBuilder.filter_candidates(batch_elements,device)

            output = KGBuilder.call_model(batch,contextualised_embeddings,acc_sums,b_entity_mask,cand_desc,cand_desc_emb,candidate_pem_values,candidate_classes,device,spans,cand_ids)


            spans = output.entity_spans

            cand_ids = torch.cat(
                        [output.cand_ids, torch.ones((output.cand_ids.size(0), 1), device=device, dtype=torch.long) * -1], 1
                    )

            ed_targets_predictions = output.ed_activations.argmax(dim=1)
            ed_targets_softmax = output.ed_activations.softmax(dim=1)


            description_scores = output.candidate_description_scores.detach().cpu().numpy()

            predicted_entity_ids = (
                cand_ids[torch.arange(cand_ids.size(0)), ed_targets_predictions].cpu().numpy().tolist()
            )
            predicted_entity_confidence = round_list(
                ed_targets_softmax[torch.arange(ed_targets_softmax.size(0)), ed_targets_predictions]
                    .cpu().detach()
                    .numpy()
                    .tolist(),
                4,
            )


            span_to_classes = defaultdict(list)
            span_indices, pred_class_indices = torch.nonzero(
                output.et_activations > 0.5, as_tuple=True
            )
            for span_idx, pred_class_idx, conf in zip(
                    span_indices.cpu().numpy().tolist(),
                    pred_class_indices.cpu().numpy().tolist(),
                    round_list(
                        output.et_activations[(span_indices, pred_class_indices)].cpu().detach().numpy().tolist(), 4
                    ),
            ):
                if pred_class_idx == 0:
                    continue  # skip padding class label
                class_id = KGBuilder.refined.preprocessor.index_to_class.get(pred_class_idx, "Q0")
                class_label = KGBuilder.refined.preprocessor.class_to_label.get(class_id, "no_label")
                span_to_classes[span_idx].append((class_id, class_label, conf))

            sorted_entity_ids_scores, old_indices = ed_targets_softmax.sort(descending=True)
            sorted_entity_ids_scores = sorted_entity_ids_scores.cpu().detach().numpy().tolist()
            sorted_entity_ids = KGBuilder.refined.sort_tensor(cand_ids, old_indices).cpu().numpy().tolist()


            for span_idx, span in enumerate(spans):
                wikidata_id = f'Q{str(predicted_entity_ids[span_idx])}'
                span.predicted_entity = Entity(
                    wikidata_entity_id=wikidata_id,
                    wikipedia_entity_title=KGBuilder.refined.preprocessor.qcode_to_wiki.get(wikidata_id)
                    if KGBuilder.refined.preprocessor.qcode_to_wiki is not None else None
                )
                span.entity_linking_model_confidence_score = predicted_entity_confidence[span_idx]
                span.top_k_predicted_entities = [
                    (Entity(wikidata_entity_id=f'Q{entity_id}',
                            wikipedia_entity_title=KGBuilder.refined.preprocessor.qcode_to_wiki.get(wikidata_id)
                            if KGBuilder.refined.preprocessor.qcode_to_wiki is not None else None
                            ),
                        round(score, 4))
                    for entity_id, score in
                    zip(sorted_entity_ids[span_idx], sorted_entity_ids_scores[span_idx])
                    if entity_id != 0
                ]

                span.candidate_entities = [
                    (qcode, round(conf, 4))
                    for qcode, conf in filter(lambda x: not x[0] == "Q0", span.candidate_entities)
                ]
                span.description_scores = round_list(
                    description_scores[span_idx].tolist(), 4
                )  # matches candidate order
                span.predicted_entity_types = span_to_classes[span_idx]
        
        
        return spans

    def preprocess_doc_el(text:str):
        doc = Doc.from_text(text,
                    preprocessor=KGBuilder.refined.preprocessor)
        tns: Iterable[BatchedElementsTns] = convert_doc_to_tensors(
                    doc,
                    KGBuilder.refined.preprocessor,
                    collate=True,
                    max_batch_size=16,
                    sort_by_tokens=False,
                    max_seq=KGBuilder.refined.max_seq,
            )
        
        return (doc,tns) 
        
    def extract_entities_from_relation_extraction(triplets,doc_id,text:str)->List[Span]:
        entity_mentions = set()
        for triplet in triplets:
            entity_mentions.add(triplet['head'])
            entity_mentions.add(triplet['tail'])
        
        spans: List[Span] = []

        for ent in entity_mentions:
            indexes_start = KGBuilder.__find_all(ent,text)
            for index_start in indexes_start:
                span = Span(
                    start = index_start,
                    ln = len(ent),
                    text = ent,
                    coarse_type=None,
                    coarse_mention_type=None,
                    doc_id=doc_id
                )

                spans.append(span)
        
        
        return spans

    def extract_triplets(text:str)->List[Dict]:
        triplets = []
        relation, subject, relation, object_ = '', '', '', ''
        text = text.strip()
        current = 'x'
        for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
            if token == "<triplet>":
                current = 't'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                    relation = ''
                subject = ''
            elif token == "<subj>":
                current = 's'
                if relation != '':
                    triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                object_ = ''
            elif token == "<obj>":
                current = 'o'
                relation = ''
            else:
                if current == 't':
                    subject += ' ' + token
                elif current == 's':
                    object_ += ' ' + token
                elif current == 'o':
                    relation += ' ' + token
        if subject != '' and relation != '' and object_ != '':
            triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
        return triplets


In [13]:
def hash_of_span(self)-> int:
    
    text = self.text if self.text is not None else "None"
    entity_id = "ENTITY NONE"
    wikipedia_entity_title = "ENTITY NONE"
    
    if self.predicted_entity is not  None:    
        entity_id = self.predicted_entity.wikidata_entity_id if self.predicted_entity.wikidata_entity_id is not None else "None"
        wikipedia_entity_title = self.predicted_entity.wikipedia_entity_title if self.predicted_entity.wikipedia_entity_title is not None else "None"
    
    
    return hash(text + " " + entity_id + " "+ wikipedia_entity_title)

In [14]:
Span.__hash__ = hash_of_span

In [15]:
builder = KGBuilder()

In [16]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er  = builder.build_graph("""Star Wars: The Force Awakens (also known as Star Wars: Episode VII – The Force Awakens) is a 2015 American epic space opera film co-produced, co-written, and directed by J. J. Abrams. The sequel to Return of the Jedi (1983), it is the seventh film in the "Skywalker Saga". Set thirty years after Return of the Jedi, The Force Awakens follows Rey, Finn, Poe Dameron, and Han Solo's search for Luke Skywalker and their fight in the Resistance, led by General Leia Organa and veterans of the Rebel Alliance, against Kylo Ren and the First Order, a successor to the Galactic Empire. The ensemble cast includes Harrison Ford, Mark Hamill, Carrie Fisher, Adam Driver, Daisy Ridley, John Boyega, Oscar Isaac, Lupita Nyong'o, Andy Serkis, Domhnall Gleeson, Anthony Daniels, Peter Mayhew and Max von Sydow.

Work on a seventh entry in the "Skywalker Saga" commenced after the Walt Disney Company's acquisition of Lucasfilm in 2012. The film is the first Star Wars film to not extensively involve franchise creator George Lucas, who only served as a creative consultant in the early stages of production. The Force Awakens was produced by Abrams, his longtime collaborator Bryan Burk, and Lucasfilm president Kathleen Kennedy. Abrams and Lawrence Kasdan, co-writer of the original trilogy films The Empire Strikes Back (1980) and Return of the Jedi, rewrote an initial script by Michael Arndt. John Williams, composer for the previous episodic films, returned to compose the score. Principal photography began in April 2014 and concluded the following November. Filming took place on sets at Pinewood Studios in England, and on location mainly in Abu Dhabi, Iceland, and Ireland. On a budget of $447 million, it is the most expensive film ever made.

The Force Awakens premiered in Hollywood, Los Angeles, on December 14, 2015, and was released in the United States on December 18. It was positively received by critics, who found it an action-packed film with the mix of new and familiar actors capturing the nostalgia of the original trilogy and giving the franchise new energy.[4] The film grossed $2.07 billion worldwide, breaking various box office records and becoming the highest-grossing film in the United States and Canada, the highest-grossing film of 2015, and the third-highest-grossing film at the time of its release. It was nominated for five awards at the 88th Academy Awards and received numerous other accolades. The film was followed by The Last Jedi (2017) and The Rise of Skywalker (2019), rounding out the Star Wars sequel trilogy.""")

In [18]:
triplets_base

[{'head': ['The Force Awakens', Entity(wikidata_entity_id=Q6074, wikipedia_entity_title=Star Wars: The Force Awakens), 'WORK_OF_ART'],
  'type': 'follows',
  'tail': ['The Empire Strikes Back', Entity(wikidata_entity_id=Q181795, wikipedia_entity_title=The Empire Strikes Back), 'WORK_OF_ART']},
 {'head': ['Star Wars', Entity(wikidata_entity_id=Q462, wikipedia_entity_title=Star Wars), None],
  'type': 'has part',
  'tail': ['The Empire Strikes Back', Entity(wikidata_entity_id=Q181795, wikipedia_entity_title=The Empire Strikes Back), 'WORK_OF_ART']},
 {'head': ['The Empire Strikes Back', Entity(wikidata_entity_id=Q181795, wikipedia_entity_title=The Empire Strikes Back), 'WORK_OF_ART'],
  'type': 'screenwriter',
  'tail': ['Michael Arndt', Entity(wikidata_entity_id=Q705156, wikipedia_entity_title=Michael Arndt), 'PERSON']},
 {'head': ['The Force Awakens', Entity(wikidata_entity_id=Q6074, wikipedia_entity_title=Star Wars: The Force Awakens), 'WORK_OF_ART'],
  'type': 'followed by',
  'tail'

In [6]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph("Toyota Motor Corporation, founded in 1937 by Kiichiro Toyoda, is one of the world's leading automotive manufacturers. The company originated from the Toyoda Automatic Loom Works, which diversified into automobile production under Kiichiro's vision. Toyota's first passenger car, the Model AA, was produced in 1936. Post-World War II, the company faced financial difficulties but rebounded with innovative manufacturing techniques, including Just-In-Time production, which revolutionized the industry. The introduction of the Corolla in 1966 cemented Toyota's reputation for reliability and affordability. In the 21st century, Toyota became a pioneer in hybrid technology with the launch of the Prius in 1997, leading the global shift towards sustainable automotive solutions. Today, Toyota continues to innovate with advancements in electric vehicles, hydrogen fuel cells, and autonomous driving technologies, maintaining its position as a global automotive leader.")

In [7]:
spans

[['Toyota Motor Corporation', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
 ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON'],
 ['Toyoda Automatic Loom Works', Entity(wikidata_entity_id=Q1476105, wikipedia_entity_title=Toyota Industries), 'ORG'],
 ['Kiichiro', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON'],
 ['Toyota', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
 ['Just-In-Time', Entity(wikidata_entity_id=Q380772, wikipedia_entity_title=Lean manufacturing), None],
 ['Prius', Entity(wikidata_entity_id=Q213115, wikipedia_entity_title=Toyota Prius), None],
 ['hydrogen fuel cells', Entity(wikidata_entity_id=Q180253, wikipedia_entity_title=Fuel cell), None],
 ['autonomous driving', Entity(wikidata_entity_id=Q741490, wikipedia_entity_title=Self-driving car), None],
 ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q53268, wikipedia_entity_titl

In [7]:
triplets

[{'head': ['Toyota', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
  'type': 'founded by',
  'tail': ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON']},
 {'head': ['Prius', Entity(wikidata_entity_id=Q213115, wikipedia_entity_title=Toyota Prius), None],
  'type': 'manufacturer',
  'tail': ['Toyota Motor Corporation', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG']},
 {'head': ['Toyota Motor Corporation', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
  'type': 'founded by',
  'tail': ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON']},
 {'head': ['Toyoda Automatic Loom Works', Entity(wikidata_entity_id=Q1476105, wikipedia_entity_title=Toyota Industries), 'ORG'],
  'type': 'founded by',
  'tail': ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON']

In [8]:
triplets_base_er

[{'head': ['Prius', Entity(wikidata_entity_id=Q213115, wikipedia_entity_title=Toyota Prius), None],
  'type': 'manufacturer',
  'tail': ['Toyota Motor Corporation', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG']},
 {'head': ['Kiichiro', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON'],
  'type': 'work period (start)',
  'tail': '1937'},
 {'head': ['Toyota Motor Corporation', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
  'type': 'founded by',
  'tail': ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON']},
 {'head': ['Toyoda Automatic Loom Works', Entity(wikidata_entity_id=Q1476105, wikipedia_entity_title=Toyota Industries), 'ORG'],
  'type': 'founded by',
  'tail': ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON']},
 {'head': ['Toyota', Entity(wikidata_entity_id=Q53268, wikipedia_entity_

In [10]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph("""
The movie revolves around a civil war taking place "in a galaxy far far away." The Rebels are fighting against the nefarious Darth Vader and his Imperial forces from the Galactic Empire, a tyrannical army intent on destroying civilizations across the universe. Princess Leia is head of the Rebel's Alliance and manages to steal Imperial blueprints and details about a deadly weapon and space station called the Death Star. When she is captured by Imperial forces, she puts the plans in the memory of a droid, R2-D2, who escapes with another droid, C-3PO to the planet of Tatooine.

Jawa traders find the droids and sell them off to farmers Owen and Beru Lars. Owen and Beru have a nephew, Luke Skywalker, who is in charge of cleaning the droids and readying them for work on the farm. While cleaning the droids, Luke accidentally opens Princess Leia's message, which states that she needs help from Obi-Wan Kenobi. Luke and R2-D2 find a man named Ben Kenobi who reveals himself to be Obi-Wan, a former Jedi Knight whose duty was to maintain peace in the galaxy by using special powers from something called the Force. The Jedis were destroyed by the Empire and he has gone into hiding. Obi-Wan also reveals that Darth Vader killed Luke's father, and gives the young farm boy his father's lightsaber, a powerful, sword-like weapon.

Leia's message asks Obi-Wan to travel to Alderaan with the plans to give to her father. Luke decides to go with him, and they set out to Mos Eisley, a spaceport town, in search of a pilot who can bring them to Alderaan. They employ Han Solo, a sardonic smuggler, and his furry companion, Chewbacca, to take them to Alderaan using his ship, the Millennium Falcon.

When they arrive at Alderaan they discover that it has been completely destroyed by Grand Moff Tarkin and Darth Vader using the obliterating powers of the Death Star. Luke, Chewbacca and Han Solo sneak onto the Death Star, where they rescue Princess Leia and get on the Millennium Falcon. On the Death Star, Obi-Wan is able to disable the tractor beam, but in a duel with Darth Vader, he is killed.

""")

Relationships before verification:  26
Relationships after verification:  20


In [13]:
spans

[['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
 ['Darth Vader', Entity(wikidata_entity_id=Q12206942, wikipedia_entity_title=Darth Vader), None],
 ['Imperial', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars)), None],
 ['Galactic Empire', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars)), None],
 ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON'],
 ['Rebel', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
 ['Alliance', Entity(wikidata_entity_id=Q4732270, wikipedia_entity_title=Alliance (Firefly)), None],
 ['Death Star', Entity(wikidata_entity_id=Q19907, wikipedia_entity_title=Death Star), None],
 ['R2-D2', Entity(wikidata_entity_id=Q51788, wikipedia_entity_title=R2-D2), 'PERSON'],
 ['C-3PO', Entity(wikidata_entity_id=Q51787, wikipedia_entity_title=C-3PO), None],
 ['Tatooine', Entity(wikida

In [11]:
triplets_base_er

[{'head': ['Alliance', Entity(wikidata_entity_id=Q4732270, wikipedia_entity_title=Alliance (Firefly)), None],
  'type': 'opposite of',
  'tail': ['Galactic Empire', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars)), None]},
 {'head': ['Jedis', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None],
  'type': 'has part',
  'tail': 'Owen'},
 {'head': ['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'chairperson',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['R2-D2', Entity(wikidata_entity_id=Q51788, wikipedia_entity_title=R2-D2), 'PERSON'],
  'type': 'instance of',
  'tail': 'droid'},
 {'head': ['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'chairperson',
  'tail': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON']

In [10]:
triplets

[{'head': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON'],
  'type': 'owner of',
  'tail': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None]},
 {'head': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON'],
  'type': 'member of',
  'tail': ['Jedi Knight', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None]},
 {'head': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None],
  'type': 'owned by',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'chairperson',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['Rebel', Entity(wikidata_entity_id=Q52316, w

In [11]:
triplets_base_er

[{'head': ['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'chairperson',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['Rebel', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'opposite of',
  'tail': ['Galactic Empire', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars)), None]},
 {'head': ['Luke Skywalker', Entity(wikidata_entity_id=Q51746, wikipedia_entity_title=Luke Skywalker), 'PERSON'],
  'type': 'member of',
  'tail': ['Jedi Knight', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None]},
 {'head': ['Darth Vader', Entity(wikidata_entity_id=Q12206942, wikipedia_entity_title=Darth Vader), None],
  'type': 'member of',
  'tail': ['Jedi Knight', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None]},
 {'head': ['Galactic Empire', Entity(wikidata_entity_id=

In [12]:
spans

[['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
 ['Darth Vader', Entity(wikidata_entity_id=Q12206942, wikipedia_entity_title=Darth Vader), None],
 ['Imperial', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars)), None],
 ['Galactic Empire', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars)), None],
 ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON'],
 ['Rebel', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
 ['Alliance', Entity(wikidata_entity_id=Q4732270, wikipedia_entity_title=Alliance (Firefly)), None],
 ['Death Star', Entity(wikidata_entity_id=Q19907, wikipedia_entity_title=Death Star), None],
 ['R2-D2', Entity(wikidata_entity_id=Q51788, wikipedia_entity_title=R2-D2), 'PERSON'],
 ['C-3PO', Entity(wikidata_entity_id=Q51787, wikipedia_entity_title=C-3PO), None],
 ['Tatooine', Entity(wikida

In [12]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph(
"""
The Lord of the Rings, fantasy novel by J.R.R. Tolkien initially published in three parts as The Fellowship of the Ring (1954), The Two Towers (1955), and The Return of the King (1955). The novel, set in the Third Age of Middle-earth, formed a sequel to Tolkien’s The Hobbit (1937) and was succeeded by his posthumous The Silmarillion (1977). The Lord of the Rings is the saga of a group of sometimes reluctant heroes who set forth to save their world from consummate evil. Its many worlds and creatures were drawn from Tolkien’s extensive knowledge of philology and folklore.

At 33, the age of adulthood among hobbits, Frodo Baggins receives a magic Ring of Invisibility from his uncle Bilbo. Frodo, a Christlike figure, learns that the ring has the power to control the entire world and, he discovers, to corrupt its owner. A fellowship of hobbits, elves, dwarfs, and men is formed to destroy the ring by casting it into the volcanic fires of the Crack of Doom, where it was forged. They are opposed on their harrowing mission by the evil Sauron and his Black Riders.
""")

Relationships with no verification:  31
Relationships with verification:  30


In [13]:
triplets

[{'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
  'type': 'has part',
  'tail': ['The Two Towers', Entity(wikidata_entity_id=Q332388, wikipedia_entity_title=The Two Towers), 'WORK_OF_ART']},
 {'head': ['The Fellowship of the Ring', Entity(wikidata_entity_id=Q208002, wikipedia_entity_title=The Fellowship of the Ring), 'WORK_OF_ART'],
  'type': 'part of the series',
  'tail': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART']},
 {'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
  'type': 'has part',
  'tail': ['The Return of the King', Entity(wikidata_entity_id=Q332581, wikipedia_entity_title=The Return of the King), 'WORK_OF_ART']},
 {'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK

In [14]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph(
"""
Apple is a technology company. It founded by Steve Jobs and Steve Wozniak. The headquarters is in Cupertino California.
"""
)

Relationships with no verification:  5
Relationships with verification:  4


In [25]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph(
    "Real Madrid, one of the most iconic football clubs in the world, was founded in 1902 in Madrid, Spain. Known for their distinctive all-white kits, Los Blancos have a rich history marked by numerous domestic and international titles. They have won a record 14 UEFA Champions League titles, showcasing their dominance in European football. Legendary players like Alfredo Di Stéfano, Cristiano Ronaldo, and Zinedine Zidane have donned the Real Madrid jersey, contributing to their global fanbase. The club's home ground, Santiago Bernabéu Stadium, is a fortress renowned for its electric atmosphere. Real Madrid continues to be a symbol of excellence and ambition in football."
)

Relationships with no verification:  10
Relationships with verification:  10


In [26]:
spans

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['Alfredo Di Stéfano', Entity(wikidata_entity_id=Q164546, wikipedia_entity_title=Alfredo Di Stéfano), 'PERSON'],
 ['Cristiano Ronaldo', Entity(wikidata_entity_id=Q11571, wikipedia_entity_title=Cristiano Ronaldo), 'PERSON'],
 ['Zinedine Zidane', Entity(wikidata_entity_id=Q1835, wikipedia_entity_title=Zinedine Zidane), 'PERSON'],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']]

In [27]:
triplets

[{'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'headquarters location',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'home venue',
  'tail': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']},
 {'head': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC'],
  'type': 'occupant',
  'tail': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']},
 {'head': ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'location of formation',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG'

In [17]:
spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph(
"Formula 1: Drive to Survive is a television documentary series produced in a collaboration between Netflix and Formula One, to give a behind-the-scenes look at the drivers and races of the Formula One World Championship. The series has received one season per year since 2019, and a total of six seasons have been released, with the latest season aired on 23 February 2024. A crossover event with Full Swing broadcast on Netflix in November 2023 called The Netflix Cup."
)

Relationships with no verification:  2
Relationships with verification:  1


In [23]:
triplets

[{'head': ['Formula 1: Drive to Survive', Entity(wikidata_entity_id=Q62002541, wikipedia_entity_title=Formula 1: Drive to Survive), 'WORK_OF_ART'],
  'type': 'original broadcaster',
  'tail': ['Netflix', Entity(wikidata_entity_id=Q907311, wikipedia_entity_title=Netflix), 'FAC']}]

In [14]:
triplets[4]

{'head': ['California', Entity(wikidata_entity_id=Q99, wikipedia_entity_title=California), 'GPE'],
 'type': 'capital',
 'tail': ['Cupertino', Entity(wikidata_entity_id=Q189471, wikipedia_entity_title=Cupertino, California), 'ORG']}

In [22]:
spans_re[4].entity_linking_model_confidence_score

0.5086

In [17]:
triplets_base

[{'head': ['The Return of the King', Entity(wikidata_entity_id=Q332581, wikipedia_entity_title=The Return of the King), 'WORK_OF_ART'],
  'type': 'part of the series',
  'tail': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART']},
 {'head': ['The Two Towers', Entity(wikidata_entity_id=Q332388, wikipedia_entity_title=The Two Towers), 'WORK_OF_ART'],
  'type': 'part of the series',
  'tail': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART']},
 {'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
  'type': 'has part',
  'tail': ['The Two Towers', Entity(wikidata_entity_id=Q332388, wikipedia_entity_title=The Two Towers), 'WORK_OF_ART']},
 {'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
  '

In [16]:
spans

[['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
 ['fantasy', Entity(wikidata_entity_id=Q132311, wikipedia_entity_title=Fantasy), None],
 ['J.R.R. Tolkien', Entity(wikidata_entity_id=Q892, wikipedia_entity_title=J. R. R. Tolkien), 'PERSON'],
 ['The Fellowship of the Ring', Entity(wikidata_entity_id=Q208002, wikipedia_entity_title=The Fellowship of the Ring), 'WORK_OF_ART'],
 ['The Two Towers', Entity(wikidata_entity_id=Q332388, wikipedia_entity_title=The Two Towers), 'WORK_OF_ART'],
 ['The Return of the King', Entity(wikidata_entity_id=Q332581, wikipedia_entity_title=The Return of the King), 'WORK_OF_ART'],
 ['Tolkien', Entity(wikidata_entity_id=Q892, wikipedia_entity_title=J. R. R. Tolkien), 'PERSON'],
 ['The Silmarillion', Entity(wikidata_entity_id=Q79762, wikipedia_entity_title=The Silmarillion), 'WORK_OF_ART'],
 ['philology', Entity(wikidata_entity_id=Q40634, wikipedia_entity_title=Philology), None],
 ['Frod

In [15]:
triplets

[{'head': ['The Return of the King', Entity(wikidata_entity_id=Q332581, wikipedia_entity_title=The Return of the King), 'WORK_OF_ART'],
  'type': 'part of the series',
  'tail': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART']},
 {'head': ['The Two Towers', Entity(wikidata_entity_id=Q332388, wikipedia_entity_title=The Two Towers), 'WORK_OF_ART'],
  'type': 'part of the series',
  'tail': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART']},
 {'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
  'type': 'has part',
  'tail': ['The Two Towers', Entity(wikidata_entity_id=Q332388, wikipedia_entity_title=The Two Towers), 'WORK_OF_ART']},
 {'head': ['The Lord of the Rings', Entity(wikidata_entity_id=Q15228, wikipedia_entity_title=The Lord of the Rings), 'WORK_OF_ART'],
  '

In [35]:
spans_re[3].entity_linking_model_confidence_score

0.4742

In [37]:
triplets

[{'head': ['Tatooine', Entity(wikidata_entity_id=Q723764, wikipedia_entity_title=Tatooine), None],
  'type': 'present in work',
  'tail': ['Star Wars', Entity(wikidata_entity_id=Q462, wikipedia_entity_title=Star Wars), None]},
 {'head': ['Endor', Entity(wikidata_entity_id=Q12180673, wikipedia_entity_title=Endor (Star Wars)), None],
  'type': 'present in work',
  'tail': ['Star Wars', Entity(wikidata_entity_id=Q462, wikipedia_entity_title=Star Wars), None]},
 {'head': ['Sith', Entity(wikidata_entity_id=Q51771, wikipedia_entity_title=Sith), None],
  'type': 'has part',
  'tail': ['Darth Vader', Entity(wikidata_entity_id=Q12206942, wikipedia_entity_title=Darth Vader), None]},
 {'head': ['Jedi Knights', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None],
  'type': 'present in work',
  'tail': ['Star Wars', Entity(wikidata_entity_id=Q462, wikipedia_entity_title=Star Wars), None]},
 {'head': ['Empire', Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empir

In [32]:
spans_re

[['Jaguar', Entity(wikidata_entity_id=Q30055, wikipedia_entity_title=Jaguar Cars), None],
 ['jaguar', Entity(wikidata_entity_id=Q35694, wikipedia_entity_title=Jaguar), None]]

In [34]:
triplets

[]

In [31]:
spans_re

[['jaguar', Entity(wikidata_entity_id=Q35694, wikipedia_entity_title=Jaguar), None]]

In [32]:
spans_el_base

[['jaguar', Entity(wikidata_entity_id=Q35694, wikipedia_entity_title=Jaguar), None]]

In [29]:
spans[2].predicted_entity.wikidata_entity_id is None

1.0

In [21]:
triplets

[{'head': ['aguar', Entity not linked to a knowledge base, None],
  'type': 'country of origin',
  'tail': ['United States', Entity(wikidata_entity_id=Q30, wikipedia_entity_title=United States), 'GPE']}]

In [11]:
spans

[['Toyota Motor Corporation', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
 ['1937', Entity(parsed_string=[timepoint: ["1937"]]), 'DATE'],
 ['Kiichiro Toyoda', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON'],
 ['one', None, 'CARDINAL'],
 ['Toyoda Automatic Loom Works', Entity(wikidata_entity_id=Q1476105, wikipedia_entity_title=Toyota Industries), 'ORG'],
 ['Kiichiro', Entity(wikidata_entity_id=Q358556, wikipedia_entity_title=Kiichiro Toyoda), 'PERSON'],
 ['Toyota', Entity(wikidata_entity_id=Q53268, wikipedia_entity_title=Toyota), 'ORG'],
 ['first', None, 'ORDINAL'],
 ['1936', Entity(parsed_string=[timepoint: ["1936"]]), 'DATE'],
 ['Just-In-Time', Entity(wikidata_entity_id=Q380772, wikipedia_entity_title=Lean manufacturing), None],
 ['1966', Entity(parsed_string=[timepoint: ["1966"]]), 'DATE'],
 ['the 21st century', None, 'DATE'],
 ['Prius', Entity(wikidata_entity_id=Q213115, wikipedia_entity_title=Toyota Prius), None],


In [20]:
for span in spans_no_relationship:
    print(span.text," ",span.entity_linking_model_confidence_score)

Spain   0.9131
Xavi Hernandez   0.9939
one   None
UEFA Champions League   0.9542
Johan Cruyff   0.9974
Barcelona   0.8146
Lionel Messi   0.9941
Europe   0.977
Barça   0.8094
blaugrana   0.9912


In [27]:
spans_no_relationship

{['Barcelona', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), None],
 ['Barça', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), 'ORG'],
 ['Europe', Entity(wikidata_entity_id=Q46, wikipedia_entity_title=Europe), 'GPE'],
 ['Johan Cruyff', Entity(wikidata_entity_id=Q17163, wikipedia_entity_title=Johan Cruyff), 'PERSON'],
 ['Lionel Messi', Entity(wikidata_entity_id=Q615, wikipedia_entity_title=Lionel Messi), 'PERSON'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['Xavi Hernandez', Entity(wikidata_entity_id=Q17500, wikipedia_entity_title=Xavi), 'PERSON'],
 ['blaugrana', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), None],
 ['one', None, 'CARDINAL']}

In [27]:
model = KGBuilder.triplet_extractor.model
tokenizer = KGBuilder.triplet_extractor.tokenizer
new_triplets = []
text = "Marie Curie was a pioneering physicist and chemist. She discovered radioactivity and won Nobel Prizes in Physics and Chemistry."
gen_kwargs = {
    "max_length": 1024,
    "length_penalty": 1,
    "num_beams": 3,
}

model_inputs = tokenizer(text, max_length=1024, padding=True, truncation=True, return_tensors = 'pt')

for span in  spans_no_relationship:
    if span.predicted_entity is not None:
        output = f"""<s><triplet> {span.text} <subj>"""
        model_outputs = tokenizer(output, max_length=1024, padding=True, truncation=True, return_tensors = 'pt', add_special_tokens=False)
        generated_tokens = model.generate(
            model_inputs["input_ids"].to(model.device),
            decoder_input_ids=model_outputs["input_ids"].to(model.device),
            attention_mask=model_inputs["attention_mask"].to(model.device),
            bad_words_ids=tokenizer(["<triplet>"], add_special_tokens=False).input_ids, # don't generate <triplet>
            **gen_kwargs,
        )

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

        for _,sentence in enumerate(decoded_preds):
            new_triplets += KGBuilder.extract_triplets(sentence)

In [22]:
new_triplets

[{'head': 'Spain', 'type': 'sport', 'tail': 'football'},
 {'head': 'Xavi Hernandez',
  'type': 'member of sports team',
  'tail': 'FC Barcelona'},
 {'head': 'Xavi Hernandez',
  'type': 'member of sports team',
  'tail': 'FC Barcelona'},
 {'head': 'UEFA Champions League', 'type': 'sport', 'tail': 'football'},
 {'head': 'Johan Cruyff',
  'type': 'member of sports team',
  'tail': 'FC Barcelona'},
 {'head': 'Johan Cruyff',
  'type': 'member of sports team',
  'tail': 'FC Barcelona'},
 {'head': 'Barcelona', 'type': 'country', 'tail': 'Spain'},
 {'head': 'Lionel Messi',
  'type': 'member of sports team',
  'tail': 'FC Barcelona'},
 {'head': 'Lionel Messi',
  'type': 'member of sports team',
  'tail': 'FC Barcelona'},
 {'head': 'Europe', 'type': 'sport', 'tail': 'football'},
 {'head': 'Barça', 'type': 'inception', 'tail': '1899'},
 {'head': 'Barça', 'type': 'headquarters location', 'tail': 'Barcelona'},
 {'head': 'Barça', 'type': 'home venue', 'tail': 'Camp Nou'},
 {'head': 'Barça', 'type': 

In [30]:
KGBuilder.link_spans_with_relationships(KGBuilder.filter_relationships(new_triplets),spans)

([{'head': ['Barça', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), 'ORG'],
   'type': 'headquarters location',
   'tail': ['Barcelona', Entity(wikidata_entity_id=Q1492, wikipedia_entity_title=Barcelona), 'ORG']},
  {'head': ['blaugrana', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), None],
   'type': 'used by',
   'tail': ['FC Barcelona', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), 'ORG']},
  {'head': ['Johan Cruyff', Entity(wikidata_entity_id=Q17163, wikipedia_entity_title=Johan Cruyff), 'PERSON'],
   'type': 'member of sports team',
   'tail': ['FC Barcelona', Entity(wikidata_entity_id=Q7156, wikipedia_entity_title=FC Barcelona), 'ORG']},
  {'head': ['Barcelona', Entity(wikidata_entity_id=Q1492, wikipedia_entity_title=Barcelona), 'ORG'],
   'type': 'country',
   'tail': ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE']},
  {'head': ['Barça', Entity(wikidata_entity_id=Q7156, wikip

In [18]:
new_triplets

[{'head': 'Cristiano Ronaldo',
  'type': 'member of sports team',
  'tail': 'Real Madrid'},
 {'head': 'Cristiano Ronaldo',
  'type': 'member of sports team',
  'tail': 'Real Madrid'},
 {'head': 'Zinedine Zidane',
  'type': 'member of sports team',
  'tail': 'Real Madrid'},
 {'head': 'Zinedine Zidane',
  'type': 'member of sports team',
  'tail': 'Real Madrid'},
 {'head': 'Los Blancos', 'type': 'inception', 'tail': '1902'},
 {'head': 'Los Blancos', 'type': 'location of formation', 'tail': 'Madrid'},
 {'head': 'Los Blancos',
  'type': 'home venue',
  'tail': 'Santiago Bernabéu Stadium'},
 {'head': 'UEFA Champions League', 'type': 'winner', 'tail': 'Real Madrid'},
 {'head': 'UEFA Champions League', 'type': 'winner', 'tail': 'Real Madrid'},
 {'head': 'Spain', 'type': 'capital', 'tail': 'Madrid'},
 {'head': 'Alfredo Di Stéfano',
  'type': 'member of sports team',
  'tail': 'Real Madrid'},
 {'head': 'Alfredo Di Stéfano',
  'type': 'member of sports team',
  'tail': 'Real Madrid'}]

In [21]:
prueba = spans[2]

In [19]:
triplets_with_spans = triplets

In [91]:
i = 0
while i < len(triplets):
    for span in spans:
        if triplets[i]['head'] == span.text:
            triplets[i]['head'] = span
        
        if triplets[i]['tail'] == span.text:
            triplets[i]['tail'] = span

    if not isinstance(triplets[i]['head'],Span) or not isinstance(triplets[i]['tail'],Span):
        del triplets[i]
        i = i - 1
    

    i += 1

In [14]:
triplets

[{'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'headquarters location',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'home venue',
  'tail': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']},
 {'head': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC'],
  'type': 'occupant',
  'tail': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'inception',
  'tail': ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE']}]

In [15]:
triplets_with_spans

[{'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'headquarters location',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'home venue',
  'tail': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']},
 {'head': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC'],
  'type': 'occupant',
  'tail': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'inception',
  'tail': ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE']}]

In [33]:
hash(span.text + " " + span.predicted_entity.wikidata_entity_id + " "+ span.predicted_entity.wikipedia_entity_title)

TypeError: can only concatenate str (not "NoneType") to str

In [79]:
hash(span)

8459790261135955224

In [78]:
entities_with_relation.add(span)

In [48]:
hash(span)

8640970520782498198

In [97]:
triplets_with_spans = triplets.copy()

In [90]:
triplets

[{'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'headquarters location',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'home venue',
  'tail': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']},
 {'head': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC'],
  'type': 'occupant',
  'tail': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'inception',
  'tail': ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE']}]

In [64]:
spans

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['one', None, 'CARDINAL'],
 ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE'],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['14', None, 'CARDINAL'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['Alfredo Di Stéfano', Entity(wikidata_entity_id=Q164546, wikipedia_entity_title=Alfredo Di Stéfano), 'PERSON'],
 ['Cristiano Ronaldo', Entity(wikidata_entity_id=Q11571, wikipedia_entity_title=Cristiano Ronaldo), 'PERSON'],
 ['Zinedine Zidane', Entity(wikidata_entity_id=Q1835, wikipedia_entity_title=Zinedine Zidane), 'PERSON'],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wiki

In [84]:
span.predicted_entity.

Entity(parsed_string=[timepoint: ["1902"]])

In [98]:
entities_with_relation = set()
for triplet in triplets_with_spans:
    for span in spans:
        if triplet['head'] == span.text:
            triplet['head'] = span
            entities_with_relation.add(span)
        
        if triplet['tail'] == span.text:
            triplet['tail'] = span
            entities_with_relation.add(span)


['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']
['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']
['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']
['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']


In [100]:
entities_set = set()
for span in spans:
    if span.predicted_entity is not None:
        entities_set.add(span)

In [101]:
entities_set.difference(entities_with_relation)

{['Alfredo Di Stéfano', Entity(wikidata_entity_id=Q164546, wikipedia_entity_title=Alfredo Di Stéfano), 'PERSON'],
 ['Cristiano Ronaldo', Entity(wikidata_entity_id=Q11571, wikipedia_entity_title=Cristiano Ronaldo), 'PERSON'],
 ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['Zinedine Zidane', Entity(wikidata_entity_id=Q1835, wikipedia_entity_title=Zinedine Zidane), 'PERSON']}

In [96]:
entities_with_relation

set()

In [46]:
triplets_with_spans

[{'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'headquarters location',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'home venue',
  'tail': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']},
 {'head': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC'],
  'type': 'occupant',
  'tail': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'inception',
  'tail': ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE']}]

In [45]:
entities_with_relation

set()

In [11]:
triplets_with_spans

[{'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'headquarters location',
  'tail': ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'home venue',
  'tail': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC']},
 {'head': ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), 'FAC'],
  'type': 'occupant',
  'tail': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG']},
 {'head': ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
  'type': 'inception',
  'tail': ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE']}]

In [43]:
import json
relations = set()
for i in triplets:
    relations.add(str(i))

triplets = []
for i in relations:
    triplets.append(json.loads(i.replace("'",'"')))

In [44]:
triplets

[{'head': 'Real Madrid', 'type': 'inception', 'tail': '1902'},
 {'head': 'Santiago Bernabéu Stadium',
  'type': 'occupant',
  'tail': 'Real Madrid'},
 {'head': 'Real Madrid', 'type': 'headquarters location', 'tail': 'Madrid'},
 {'head': 'Real Madrid',
  'type': 'home venue',
  'tail': 'Santiago Bernabéu Stadium'}]

: 

In [30]:
relations

{"{'head': 'Real Madrid', 'type': 'headquarters location', 'tail': 'Madrid'}",
 "{'head': 'Real Madrid', 'type': 'home venue', 'tail': 'Santiago Bernabéu Stadium'}",
 "{'head': 'Real Madrid', 'type': 'inception', 'tail': '1902'}",
 "{'head': 'Santiago Bernabéu Stadium', 'type': 'occupant', 'tail': 'Real Madrid'}"}

In [11]:
spans_re

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['one', None, 'CARDINAL'],
 ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE'],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['14', None, 'CARDINAL'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['Alfredo Di Stéfano', Entity(wikidata_entity_id=Q164546, wikipedia_entity_title=Alfredo Di Stéfano), 'PERSON'],
 ['Cristiano Ronaldo', Entity(wikidata_entity_id=Q11571, wikipedia_entity_title=Cristiano Ronaldo), 'PERSON'],
 ['Zinedine Zidane', Entity(wikidata_entity_id=Q1835, wikipedia_entity_title=Zinedine Zidane), 'PERSON'],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wiki

In [11]:
spans_el_base

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['one', None, 'CARDINAL'],
 ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE'],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['14', None, 'CARDINAL'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['Alfredo Di Stéfano', Entity(wikidata_entity_id=Q164546, wikipedia_entity_title=Alfredo Di Stéfano), 'PERSON'],
 ['Cristiano Ronaldo', Entity(wikidata_entity_id=Q11571, wikipedia_entity_title=Cristiano Ronaldo), 'PERSON'],
 ['Zinedine Zidane', Entity(wikidata_entity_id=Q1835, wikipedia_entity_title=Zinedine Zidane), 'PERSON'],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wiki

In [5]:
spans_re

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None],
 ['1902', Entity not linked to a knowledge base, None],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), None]]

In [29]:
entity_candidates = []
for span in spans_re:
    if span.entity_linking_model_confidence_score > 0.8:
        for 
        entity_candidates.append(span)

In [38]:
prueba = spans_re[0]


In [41]:
prueba.predicted_entity.wikidata_entity_id

'Q8682'

In [6]:
i = 0
while i < len(spans_re):
    j = i + 1
    while j < len(spans_re):
        print(i, " ", j)
        if spans_re[j].text == spans_re[i].text and spans_re[j].predicted_entity.wikidata_entity_id == spans_re[i].predicted_entity.wikidata_entity_id:
            print(spans_re)
            del spans_re[j]

        j += 1
                
    
    i += 1

0   1
0   2
0   3
0   4
1   2
1   3
[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None], ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None], ['1902', Entity not linked to a knowledge base, None], ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None], ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), None]]
2   3


In [7]:
spans_re

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None],
 ['1902', Entity not linked to a knowledge base, None],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), None]]

In [56]:
for span in spans_re:
    print(span.entity_linking_model_confidence_score)

0.9565
0.4561
0.2561
0.9994


In [57]:
spans_re

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None],
 ['1902', Entity not linked to a knowledge base, None],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), None]]

In [30]:
entity_candidates

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), None],
 ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None],
 ['Santiago Bernabéu Stadium', Entity(wikidata_entity_id=Q164027, wikipedia_entity_title=Santiago Bernabéu Stadium), None],
 ['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), None]]

In [10]:
def intersect_spans(spans_el_base,spans_re):
    df_spans_el = pd.DataFrame({
        'entity': [span.text for span in spans_el_base],
        'word': [span.start for span in spans_el_base],
    })

    df_spans_re = pd.DataFrame({
        'entity': [span.text for span in spans_re],
        'word': [span.start for span in spans_re],
    })


    return df_spans_el,df_spans_re


In [11]:
df_spans_el,df_spans_re = intersect_spans(spans_el_base,spans_re)

In [12]:
df_spans_el

Unnamed: 0,entity,word
0,Real Madrid,0
1,one,13
2,1902,80
3,Madrid,88
4,Spain,96
5,Los Blancos,147
6,14,256
7,UEFA Champions League,259
8,European,319
9,Alfredo Di Stéfano,361


In [17]:
df_spans_re

Unnamed: 0,entity,word
0,Real Madrid,0
1,Madrid,5
2,1902,80
3,Madrid,88
4,Real Madrid,436
5,Madrid,441
6,Santiago Bernabéu Stadium,518
7,Real Madrid,597
8,Madrid,602


In [18]:
df_spans_re.join(df_spans_el.set_index(['entity', 'word']), on=['entity', 'word'], how='inner', lsuffix='_re', rsuffix='_el')

Unnamed: 0,entity,word
0,Real Madrid,0
2,1902,80
3,Madrid,88
4,Real Madrid,436
6,Santiago Bernabéu Stadium,518
7,Real Madrid,597


In [14]:
df_spans_el.join(df_spans_re.set_index(['entity', 'word']), on=['entity', 'word'], how='inner', lsuffix='_el', rsuffix='_re')

Unnamed: 0,entity,word
0,Real Madrid,0
2,1902,80
3,Madrid,88
12,Real Madrid,436
13,Santiago Bernabéu Stadium,518
14,Real Madrid,597


In [7]:
spans_el_base

[['Real Madrid', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['one', None, 'CARDINAL'],
 ['1902', Entity(parsed_string=[timepoint: ["1902"]]), 'DATE'],
 ['Madrid', Entity(wikidata_entity_id=Q2807, wikipedia_entity_title=Madrid), 'ORG'],
 ['Spain', Entity(wikidata_entity_id=Q29, wikipedia_entity_title=Spain), 'GPE'],
 ['Los Blancos', Entity(wikidata_entity_id=Q8682, wikipedia_entity_title=Real Madrid CF), 'ORG'],
 ['14', None, 'CARDINAL'],
 ['UEFA Champions League', Entity(wikidata_entity_id=Q18756, wikipedia_entity_title=UEFA Champions League), 'EVENT'],
 ['European', Entity(wikidata_entity_id=Q35572, wikipedia_entity_title=UEFA), 'ORG'],
 ['Alfredo Di Stéfano', Entity(wikidata_entity_id=Q164546, wikipedia_entity_title=Alfredo Di Stéfano), 'PERSON'],
 ['Cristiano Ronaldo', Entity(wikidata_entity_id=Q11571, wikipedia_entity_title=Cristiano Ronaldo), 'PERSON'],
 ['Zinedine Zidane', Entity(wikidata_entity_id=Q1835, wikipedia_entity_title=Zinedine Zida

In [3]:
! pip install -U langchain_community

[0m

In [24]:
str(triplets[0])

"{'head': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON'], 'type': 'owner of', 'tail': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None]}"

In [20]:
from langchain.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


llm = ChatOllama(model="llama3:8b")



template = """
    Given a relationship between two entities, answer only with 'True' of 'False.
    - 'True' in case the relationship corresponds to reality.
    - 'False' in case that the relationship is wrong. 
    Do not take into consideration the previous classifications. Do not answer randomly, just base the classification in your universe knowledge.
    The input relationship that you have to analize is the following:
    {dict}
"""
prompt = ChatPromptTemplate.from_template(template=template)

chain = prompt | llm | StrOutputParser()



In [50]:
dictionary = """
Entity(wikidata_entity_id=Q4732270, wikipedia_entity_title=Alliance (Firefly)), None] - 'opposite of'-  Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars))
"""
chain.invoke({"dict": dictionary})

'False'

In [51]:
spans[2].predicted_entity

Entity(wikidata_entity_id=Q52347, wikipedia_entity_title=Galactic Empire (Star Wars))

In [51]:
result

True

In [77]:
chain

ChatPromptTemplate(input_variables=['dict'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['dict'], template="\n    Given a relationship between two entities, answer only with 'True' in case the relationship  is strictly valid or 'False' in case that the relationship is wrong. Do not take\n    into consideration the previous classifications.\n    The input relationship that you have to analize is the following:\n    {dict}\n"))])
| ChatOllama(model='llama3:8b')
| StrOutputParser()

In [22]:
spans[0].predicted_entity.wikidata_entity_id

'Apple Inc.'

In [28]:
accurrate_triplets = []

for triplet in  triplets:

    head_entity = triplet['head'].predicted_entity.wikipedia_entity_title if triplet['head'].predicted_entity.wikipedia_entity_title is not None else triplet['head'].text
    tail_entity = triplet['tail'].predicted_entity.wikipedia_entity_title if triplet['tail'].predicted_entity.wikipedia_entity_title is not None else triplet['tail'].text
    dictionary = head_entity+"("+triplet['head'].predicted_entity.wikidata_entity_id+")-"+triplet['type']+"->"+tail_entity +"("+triplet['tail'].predicted_entity.wikidata_entity_id + ")"
    print(dictionary)
    result = chain.invoke({"dict": dictionary})
    print(result)
    if result == "True":
        accurrate_triplets.append(triplet)

Princess Leia(Q51797)-member of->Jedi(Q51724)
True
Han Solo(Q51802)-owner of->Millennium Falcon(Q19901)
True
Millennium Falcon(Q19901)-owned by->Han Solo(Q51802)
True
Rebel Alliance(Q52316)-chairperson->Princess Leia(Q51797)
True
Princess Leia(Q51797)-relative->Luke Skywalker(Q51746)
True
Alliance (Firefly)(Q4732270)-opposite of->Galactic Empire (Star Wars)(Q52347)
True
Jedi(Q51724)-has part->List of Star Wars characters(Q51805)
True
Galactic Empire (Star Wars)(Q52347)-head of state->Han Solo(Q51802)
False
Jawa(Q2994460)-location->Tatooine(Q723764)
True
Mos Eisley(Q946074)-located on terrain feature->Tatooine(Q723764)
True
Death Star(Q19907)-owned by->Galactic Empire (Star Wars)(Q52347)
True
Grand Moff Tarkin(Q51800)-member of->Galactic Empire (Star Wars)(Q52347)
True
Obi-Wan Kenobi(Q51740)-member of->Jedi(Q51724)
True
Death Star(Q19907)-owned by->Galactic Empire (Star Wars)(Q52347)
True
Obi-Wan Kenobi(Q51740)-member of->Jedi(Q51724)
True
Obi-Wan Kenobi(Q51740)-member of->Jedi(Q51724)


In [27]:
triplet

{'head': ['Jawa', Entity(wikidata_entity_id=Q2994460), None],
 'type': 'location',
 'tail': ['Tatooine', Entity(wikidata_entity_id=Q723764, wikipedia_entity_title=Tatooine), None]}

In [53]:
accurrate_triplets

[{'head': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON'],
  'type': 'member of',
  'tail': ['Jedi Knight', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None]},
 {'head': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None],
  'type': 'owned by',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON'],
  'type': 'owner of',
  'tail': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None]},
 {'head': ['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'chairperson',
  'tail': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON']},
 {'head': ['Mos Eisley', Entity(wikidata_enti

In [38]:
triplets_base

[{'head': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON'],
  'type': 'member of',
  'tail': ['Jedi Knight', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None]},
 {'head': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None],
  'type': 'owned by',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON'],
  'type': 'owner of',
  'tail': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None]}]

In [39]:
spans[6]

['Alliance', Entity(wikidata_entity_id=Q4732270, wikipedia_entity_title=Alliance (Firefly)), None]

In [16]:
accurrate_triplets

[{'head': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON'],
  'type': 'member of',
  'tail': ['Jedi Knight', Entity(wikidata_entity_id=Q51724, wikipedia_entity_title=Jedi), None]},
 {'head': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None],
  'type': 'owned by',
  'tail': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON']},
 {'head': ['Han Solo', Entity(wikidata_entity_id=Q51802, wikipedia_entity_title=Han Solo), 'PERSON'],
  'type': 'owner of',
  'tail': ['Millennium Falcon', Entity(wikidata_entity_id=Q19901, wikipedia_entity_title=Millennium Falcon), None]},
 {'head': ['Rebels', Entity(wikidata_entity_id=Q52316, wikipedia_entity_title=Rebel Alliance), None],
  'type': 'chairperson',
  'tail': ['Princess Leia', Entity(wikidata_entity_id=Q51797, wikipedia_entity_title=Princess Leia), 'PERSON']},
 {'head': ['Mos Eisley', Entity(wikidata_enti