In [None]:
# llama finetuned by gpt 500 samples is very bad but 2000 samples are good.
# treat gpt annotation as gold standard and compare llama with gpt 2000 samples.
# precision    0.539507
# recall       0.686333
# f1           0.60

# load input text sentences
from pandas import read_json, Series
CTD_RE_V1 = read_json('../label_studio/export/CTD_RE_v1.json').set_index('id')
sentences = Series(data = [row['text'] for row in CTD_RE_V1.data], index=CTD_RE_V1.index)

In [None]:
# load gpt annotations
from json import load
def format_gpt_relation(relation):
    return ((relation['subject_entity']['entity_name'], relation['subject_entity']['entity_type']),
                            relation['relation_phrase'],
                            (relation['object_entity']['entity_name'], relation['object_entity']['entity_type']))

def get_gpt_annotation(task_id, model_output_path):
    with open(model_output_path + '/task' + str(task_id) + '_gpt_annotation.json') as json_file:
        gpt_output = load(json_file)
        gpt_annotations = [format_gpt_relation(relation) for relation in gpt_output['relations']]
        json_file.close()
    return list(set(gpt_annotations))

# load llama_annotations
def format_matched_string(s):
    while s.startswith(("'", '"', '(')):
        s = s[1:]
    while s.endswith(("'", '"', ')')):
        s = s[:-1]
    return s

from re import findall
def get_llama_annotation(task_id, model_output_path):
    with open(model_output_path + str(task_id) + '.txt',"r") as f:
        llama_output = f.read()
        f.close()
    output_start_id = llama_output.find('### Extracted relations:')
    # find pattern: (({some_text}, {some_text}), {some_text}, ({some_text}, {some_text}))
    #triple_pattern = r"\(\(([^,]+),\s*([^,]+)\),\s*([^,]+),\s*\(([^,]+),\s*([^,]+)\)\)"
    # find pattern: non greedy ({some_text}, {some_text}), {some_text}, ({some_text}, {some_text})
    triple_pattern =  r"\(([^,]+?),\s*([^,]+?)\),\s*([^,]+?),\s*\(([^,]+?),\s*([^,]+?)\)"
    # TODO: problem with stripping ")" at the end of the string: TGF-beta(1
    matches = findall(triple_pattern, llama_output[output_start_id:])
    matches = [[format_matched_string(s) for s in match] for match in matches]
    return list(set([((m[0], m[1]), m[2], (m[3], m[4])) for m in matches]))


In [None]:
# load test sample ids
from csv import reader
with open("test_output_2000/sampled_test_ids.csv", "r") as file:
    sampled_test_ids = list(map(int, list(reader(file, delimiter=","))[0]))
    file.close()

In [None]:
"""  
task_id = sampled_test_ids[14]
print('--- task ' + str(task_id) + ' ---')
print(sentences[task_id])
print('--- gpt annotation: ---')
print(get_gpt_annotation(task_id, 'test_output_2000/gpt/gpt_annotation/'))
print('--- llama annotation: ---')
print(get_llama_annotation(task_id, 'test_output_2000/gpt/llama3-8b-CTD_RE_V1-finetune-r_8_la_32-checkpoint-260/'))

Here, I think llama annotation is better than gpt annotation.
--- task 22326 ---
Obvious decrease of TGF-beta(1) was found in troglitazone(15 micromol/L) treated group compared with group stimulated with 30 mmol/L D-glucose (P<0.05).
--- gpt annotation: ---
[(('troglitazone', 'Chemical'), 'decreases', ('TGF-beta(1)', 'Gene/Protein'))]
--- llama annotation: ---
[(('troglitazone', 'Chemical'), 'decreases', ('TGF-beta(1', 'Gene/Protein')), (('D-glucose', 'Chemical'), 'increases', ('TGF-beta(1', 'Gene/Protein'))]
"""

In [None]:
from pandas import DataFrame
import numpy as np
def evaluate(gpt_model_output_path, llama_model_output_path, task_ids):
    llama_annotations = []
    gpt_annotations = []
    intersects = []
    for task_id in task_ids:
        llama_annotation = get_llama_annotation(task_id, llama_model_output_path)
        gpt_annotation = get_gpt_annotation(task_id, gpt_model_output_path)
        llama_annotations.append(llama_annotation)
        gpt_annotations.append(gpt_annotation)
        intersects.append([r for r in gpt_annotation if r in llama_annotation])
    
    # true positive: number of relations that are in both gpt annotations and llama annotions
    tp = np.array([len(intersects[i]) for i in range(100)])
    # false positive: number of relations that are in llama annotations but not in gpt annotations
    fp = np.array([len(llama_annotations[i]) - len(intersects[i]) for i in range(100)])
    # false negative: number of relations that are in gpt annotations but not in llama annotations
    fn = np.array([len(gpt_annotations[i]) - len(intersects[i]) for i in range(100)])

    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*precision*recall/(precision+recall)

    
    result = DataFrame({'task_id':task_ids,
                        'gpt_annotations': gpt_annotations,
                        'llama_annotations':llama_annotations,
                        'precision': precision,
                        'recall': recall,
                        'f1': f1})
    return result

In [1]:
p = 0.539507
r = 0.686333
f1 = 2 * p * r / (p + r)
print(f1)

0.6041268971986556
