# Evaluación estrategias

En este notebook se encuentra el código utilizado para evaluar las diferentes estrategias basadas en diferentes formas de combinación detalladas en la memoria del trabajo.

Importar librerías necesarias.

In [1]:
from transformers import pipeline
from refined.inference.processor import Refined
import tqdm as notebook_tqdm
import torch
from typing import List
from typing import Dict
from typing import Set
from typing import Iterable
from refined.data_types.doc_types import Doc
from refined.data_types.modelling_types import BatchedElementsTns
from refined.utilities.preprocessing_utils import convert_doc_to_tensors
from refined.data_types.base_types import Span
from refined.utilities.preprocessing_utils import pad
from refined.data_types.modelling_types import ModelReturn
from collections import defaultdict
from refined.utilities.general_utils import round_list
from refined.data_types.base_types import Entity
import re
import pandas as pd
import json
from langchain.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import jsonlines

  from .autonotebook import tqdm as notebook_tqdm
2024-08-03 12:34:52.599469: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-03 12:34:52.603411: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-08-03 12:34:52.603420: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Carga de modelos y métodos auxiliares

In [3]:
refined = Refined.from_pretrained(model_name='wikipedia_model_with_numbers',
                                  entity_set="wikidata")
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large',device='cuda')

In [2]:
# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets



In [2]:
def extract_entities(text)->dict:
    entities = refined.process_text(text)
    entities_dicc = {}
    
    for ent in entities:
        entities_dicc[ent.text] = ent

    return entities_dicc

def get_entity_from_dicc(dictionary:dict,term:str):
    entity = dictionary.get(term)
    if entity is None:
        for key in dictionary:
            if key in term or term in key:
                entity = dictionary.get(key)
                break
    
    return entity


def get_entity_id(entity):
    if entity.predicted_entity is None:
        name = None
    else: 
        name =  entity.predicted_entity.wikidata_entity_id  

    return name

## Definición estrategias

Definción de la estrategia 1.

In [14]:
def method_1(text):
    extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
    extracted_triplets = extract_triplets(extracted_text[0])
    relations_count = 0
    entities_dicc = extract_entities(text)
    relationship_list = []
    entities = [ get_entity_id(ent)  for ent in  entities_dicc.values()]
    entities =  [x  for x in filter(lambda x: x is not None, entities)]

    for triplet in extracted_triplets:
        ent_head = get_entity_from_dicc(entities_dicc,triplet['head'])
        ent_tail = get_entity_from_dicc(entities_dicc,triplet['tail'])
        if ent_head is not None and ent_tail is not None:
            relation_dicc = {
                "subject": {
                    "uri":get_entity_id(ent_head)
                },
                "predicate":{
                    "surfaceform": triplet['type'],
                },
                "object":{
                    "uri": get_entity_id(ent_tail)
                }
            }
            relationship_list.append(relation_dicc)

    return relationship_list,entities

Definición de la estrategia 2.

In [48]:
def method_2(text):
    
    extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
    extracted_triplets = extract_triplets(extracted_text[0])
    entities = set()
    relationships_list = []
    for triplet in extracted_triplets:
        aux = f"""{triplet['head']} - {triplet['type']} -> {triplet['tail']}"""
        entity_linking = refined.process_text(aux)
        relation = ""
        count = 0
        subject = None
        obj = None
        predicate = None
        nentities = len(entity_linking)
        for ent in entity_linking:
            count += 1
            if ent.text in triplet['head']:
                subject = get_entity_id(ent)
                if subject is None:
                    break
                entities.add(subject)
                #relation = f"""{ent.predicted_entity.wikidata_entity_id}-"""
            elif ent.text in triplet['tail']:
                obj = get_entity_id(ent)
                if obj is None:
                    break
                entities.add(obj)
                predicate = triplet['type']
                #relation += f"""{triplet['type']}->{ent.predicted_entity.wikidata_entity_id}"""
                relation_dicc = {
                    "subject":{
                        "uri":subject
                    },
                    "predicate":{
                        "surfaceform":predicate
                    },
                    "object":{
                        "uri":obj
                    }
                }

                relationships_list.append(relation_dicc)

                subject = None
                obj = None
                predicate = None


    return relationships_list, list(entities)




Definición de la estrategia 3

In [3]:
from kgbuilder import KGBuilder
from refined.data_types.base_types import Span
def hash_of_span(self)-> int:
    
    text = self.text if self.text is not None else "None"
    entity_id = "ENTITY NONE"
    wikipedia_entity_title = "ENTITY NONE"
    
    if self.predicted_entity is not  None:    
        entity_id = self.predicted_entity.wikidata_entity_id if self.predicted_entity.wikidata_entity_id is not None else "None"
        wikipedia_entity_title = self.predicted_entity.wikipedia_entity_title if self.predicted_entity.wikipedia_entity_title is not None else "None"
    
    
    return hash(text + " " + entity_id + " "+ wikipedia_entity_title)

Span.__hash__ = hash_of_span
#builder = KGBuilder()
#spans,triplets,spans_el_base,spans_re,triplets_base,triplets_base_er = builder.build_graph("Toyota Motor Corporation, founded in 1937 by Kiichiro Toyoda, is one of the world's leading automotive manufacturers. The company originated from the Toyoda Automatic Loom Works, which diversified into automobile production under Kiichiro's vision. Toyota's first passenger car, the Model AA, was produced in 1936. Post-World War II, the company faced financial difficulties but rebounded with innovative manufacturing techniques, including Just-In-Time production, which revolutionized the industry. The introduction of the Corolla in 1966 cemented Toyota's reputation for reliability and affordability. In the 21st century, Toyota became a pioneer in hybrid technology with the launch of the Prius in 1997, leading the global shift towards sustainable automotive solutions. Today, Toyota continues to innovate with advancements in electric vehicles, hydrogen fuel cells, and autonomous driving technologies, maintaining its position as a global automotive leader.")
builder = KGBuilder()
def method_3(text):
    spans,triplets,_,_,_,_ = builder.build_graph(text)
    entities = set()
    relations_set = set()
    relationships_list = []
    for span in spans:
        id = get_entity_id(span)
        if id is not None:
            entities.add(id)
    
    for triplet in triplets:
        subject = get_entity_id(triplet['head'])
        predicate = triplet['type']
        obj = get_entity_id(triplet['tail'])
        if subject is not None and obj is not None:
            relation_text = subject+predicate+obj
            if relation_text not in relations_set:
                relations_set.add(relation_text)
                relation_dicc = {
                            "subject":{
                                "uri":subject
                            },
                            "predicate":{
                                "surfaceform":predicate
                            },
                            "object":{
                                "uri":obj
                            }
                        }
                relationships_list.append(relation_dicc)

    
    return relationships_list,list(entities)

In [8]:
method_3("Coburg Peak (, ) is the rocky peak rising to 783 m in Erul Heights on Trinity Peninsula in Graham Land, Antarctica. It is surmounting Cugnot Ice Piedmont to the northeast.\n\nThe peak is named after the Bulgarian royal house of Coburg (Saxe-Coburg-Gotha), 1887–1946.")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


([{'subject': {'uri': 'Q618370'},
   'predicate': {'surfaceform': 'continent'},
   'object': {'uri': 'Q51'}},
  {'subject': {'uri': 'Q2038835'},
   'predicate': {'surfaceform': 'continent'},
   'object': {'uri': 'Q51'}},
  {'subject': {'uri': 'Q2038835'},
   'predicate': {'surfaceform': 'part of'},
   'object': {'uri': 'Q618370'}},
  {'subject': {'uri': 'Q5395951'},
   'predicate': {'surfaceform': 'located on terrain feature'},
   'object': {'uri': 'Q2038835'}},
  {'subject': {'uri': 'Q5192667'},
   'predicate': {'surfaceform': 'located on terrain feature'},
   'object': {'uri': 'Q2038835'}},
  {'subject': {'uri': 'Q5139027'},
   'predicate': {'surfaceform': 'mountain range'},
   'object': {'uri': 'Q5395951'}}],
 ['Q4996295',
  'Q5192667',
  'Q2038835',
  'Q618370',
  'Q5395951',
  'Q51',
  'Q5139027'])

In [49]:
method_2("Coburg Peak (, ) is the rocky peak rising to 783 m in Erul Heights on Trinity Peninsula in Graham Land, Antarctica. It is surmounting Cugnot Ice Piedmont to the northeast.\n\nThe peak is named after the Bulgarian royal house of Coburg (Saxe-Coburg-Gotha), 1887–1946.")

([{'subject': {'uri': 'Q2038835'},
   'predicate': {'surfaceform': 'part of'},
   'object': {'uri': 'Q618370'}},
  {'subject': {'uri': 'Q2038835'},
   'predicate': {'surfaceform': 'continent'},
   'object': {'uri': 'Q51'}},
  {'subject': {'uri': 'Q618370'},
   'predicate': {'surfaceform': 'continent'},
   'object': {'uri': 'Q51'}}],
 ['Q51', 'Q618370', 'Q2038835'])

In [35]:
method_1("Coburg Peak (, ) is the rocky peak rising to 783 m in Erul Heights on Trinity Peninsula in Graham Land, Antarctica. It is surmounting Cugnot Ice Piedmont to the northeast.\n\nThe peak is named after the Bulgarian royal house of Coburg (Saxe-Coburg-Gotha), 1887–1946.")

([{'subject': {'uri': 'Q2038835'},
   'predicate': {'surfaceform': 'part of'},
   'object': {'uri': 'Q618370'}},
  {'subject': {'uri': 'Q2038835'},
   'predicate': {'surfaceform': 'continent'},
   'object': {'uri': 'Q51'}},
  {'subject': {'uri': 'Q618370'},
   'predicate': {'surfaceform': 'continent'},
   'object': {'uri': 'Q51'}}],
 ['Q5139027',
  'Q5395951',
  'Q2038835',
  'Q618370',
  'Q51',
  'Q5192667',
  'Q219',
  'Q4996295'])

## Método en el que se define la evaluación

In [6]:
def evaluate(entities_test,relationships_test,entities,relationships):
    n_entities = len(entities_test)
    matched_entities = 0
    unmatched_entities = 0
    for ent in entities:
        if ent in entities_test:
            matched_entities += 1
        else:
            unmatched_entities += 1

    n_relationships = len(relationships_test)
    matched_relationships = 0
    unmatched_relationships = 0
    for rel in relationships:
        matched = False
        for test in relationships_test:
            if rel["subject"]["uri"] == test["subject"]["uri"] and rel["object"]["uri"] == test["object"]["uri"] and rel["predicate"]["surfaceform"] ==  test["predicate"]["surfaceform"]:
                matched = True
                break
        
        if matched:
            matched_relationships += 1
        else:
            unmatched_entities += 1
                


    return n_entities,matched_entities,unmatched_entities,n_relationships,matched_relationships,unmatched_relationships

In [9]:
def evaluate_method(method):
    file_path = 'en_test.jsonl'

    # Reading and processing the JSONL file
    n_docs = 300
    i = 0

    n_entities_list  = []
    matched_entities_list  = []
    unmatched_entities_list = []
    n_relationships_list = []
    matched_relationships_list  = []
    unmatched_relationships_list = []
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            try:
                relationships,entities = method(obj['text'])
                entities_test = [i["uri"] for i in obj['entities']]
                relationshps_test = [i for i in obj['triples']]
                n_entities,matched_entities,unmatched_entities,n_relationships,matched_relationships,unmatched_relationships = evaluate(entities_test,relationshps_test,entities,relationships)
            except Exception as e:
               n_entities,matched_entities,unmatched_entities,n_relationships,matched_relationships,unmatched_relationships = None,None,None,None,None,None
            
            n_entities_list.append(n_entities)
            matched_entities_list.append(matched_entities)
            unmatched_entities_list.append(unmatched_entities)
            n_relationships_list.append(n_relationships)
            matched_relationships_list.append(matched_relationships)
            unmatched_relationships_list.append(unmatched_relationships)
            i+= 1
            print(i)
            # You can now process obj as needed
            if i > n_docs:
                break

    data = {
        'n_entities': n_entities_list,
        'matched_entities': matched_entities_list,
        'unmatched_entities': unmatched_entities_list,
        'n_relationships': n_relationships_list,
        'matched_relationships': matched_relationships_list,
        'unmatched_relationships': unmatched_relationships_list
    }

    stats = pd.DataFrame(data)
    return stats


In [None]:
stats_m1 = evaluate_method(method_1)

In [51]:
stats_m2 = evaluate_method(method_2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [11]:
stats_m3 = evaluate_method(method_3)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [54]:
stats_m1.to_csv('./method1.csv',index=False)
stats_m2.to_csv('./method2.csv',index=False)
stats_m3.to_csv('./method3.csv',index=False)

## Evaluación resultados obtenidos

In [24]:
import pandas as pd
stats_m1 = pd.read_csv('./method1.csv')
stats_m2 = pd.read_csv('./method2.csv')
stats_m3 = pd.read_csv('./method3.csv')


In [25]:
stats_m2 = stats_m2[stats_m3['n_entities'].isna() == False]
stats_m1 = stats_m1[stats_m3['n_entities'].isna() == False]
stats_m3.dropna(inplace=True)

In [30]:
def extract_metrics(df):
    df['accuracy_entities'] = df['matched_entities']/stats_m1["n_entities"]
    df['accuracy_relationships'] = df['matched_relationships']/df['n_relationships']
    return df


In [31]:
stats_m1 =extract_metrics(stats_m1)
stats_m2 =extract_metrics(stats_m2)
stats_m3 =extract_metrics(stats_m3)

In [45]:
stats_m3

Unnamed: 0,n_entities,matched_entities,unmatched_entities,n_relationships,matched_relationships,unmatched_relationships,accuracy_entities,accuracy_relationships
0,10.0,7.0,2.0,3.0,3.0,0.0,0.700000,1.000000
1,6.0,5.0,5.0,2.0,2.0,0.0,0.833333,1.000000
2,21.0,13.0,16.0,5.0,0.0,0.0,0.619048,0.000000
3,7.0,5.0,6.0,4.0,3.0,0.0,0.714286,0.750000
4,31.0,9.0,0.0,1.0,0.0,0.0,0.290323,0.000000
...,...,...,...,...,...,...,...,...
293,7.0,6.0,3.0,7.0,5.0,0.0,0.857143,0.714286
295,5.0,2.0,0.0,2.0,1.0,0.0,0.400000,0.500000
297,4.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000
298,4.0,3.0,1.0,4.0,3.0,0.0,0.750000,0.750000


In [32]:
print("Tasa de acierto promedio de enlazada de entidades por oración para el método 1 ", stats_m1['accuracy_entities'].mean())
print("Tasa de acierto promedio de enlazada de entidades por oración para el método 2 ", stats_m2['accuracy_entities'].mean())
print("Tasa de acierto promedio de enlazada de entidades por oración para el método 3 ", stats_m3['accuracy_entities'].mean())

Tasa de acierto promedio de enlazada de entidades por oración para el método 1  0.6160586773512117
Tasa de acierto promedio de enlazada de entidades por oración para el método 2  0.27030954250240674
Tasa de acierto promedio de enlazada de entidades por oración para el método 3  0.5900300958509763


In [33]:
print("Tasa de acierto promedio de relaciones por oración para el método 1 ", stats_m1['accuracy_relationships'].mean())
print("Tasa de acierto promedio de relaciones por oración para el método 2 ", stats_m2['accuracy_relationships'].mean())
print("Tasa de acierto promedio de relaciones por oración para el método 3 ", stats_m3['accuracy_relationships'].mean())

Tasa de acierto promedio de relaciones por oración para el método 1  0.49592291206215255
Tasa de acierto promedio de relaciones por oración para el método 2  0.33931585292344785
Tasa de acierto promedio de relaciones por oración para el método 3  0.580664389525149


In [37]:
print('Promedio de entidades que no coinciden: ',stats_m1['unmatched_entities'].mean())
print('Promedio de entidades que no coinciden: ',stats_m2['unmatched_entities'].mean())
print('Promedio de entidades que no coinciden: ',stats_m3['unmatched_entities'].mean())

Promedio de entidades que no coinciden:  3.490566037735849
Promedio de entidades que no coinciden:  1.271698113207547
Promedio de entidades que no coinciden:  5.158490566037736


In [35]:
print('Promedio de relaciones que no coinciden: ',stats_m1['unmatched_relationships'].mean())
print('Promedio de relaciones que no coinciden: ',stats_m2['unmatched_relationships'].mean())
print('Promedio de relaciones que no coinciden: ',stats_m3['unmatched_relationships'].mean())

Promedio de relaciones que no coinciden:  0.0
Promedio de relaciones que no coinciden:  0.0
Promedio de relaciones que no coinciden:  0.0
