In [None]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd

In [None]:
# Build the shared NLP pipeline from the "en_core_sci_scibert" model.
nlp = spacy.load("en_core_sci_scibert")
# Attach the entity linker by its registered string name (spaCy v3 style),
# pointed at the UMLS knowledge base.
# NOTE(review): scispaCy's docs say `resolve_abbreviations` only takes effect
# when an abbreviation detector pipe was added beforehand; none is added in
# this cell, so the flag is presumably a no-op here -- confirm intent.
nlp.add_pipe(
    "scispacy_linker",
    config=dict(resolve_abbreviations=True, linker_name="umls"),
)

In [None]:
def get_f1(row, model, pipeline=None):
  """Concept-overlap F1 between a model summary and its reference text.

  Both texts are run through an entity-linking pipeline, every linked entity
  contributes the CUI of its first candidate, and the F1 of the two resulting
  CUI *sets* is returned (duplicates within a text do not count twice).

  Args:
    row: Mapping or pandas Series holding the reference text under
      'section_text' and the model output under the key given by `model`.
    model: Column name of the summary to score.
    pipeline: Optional callable turning a string into a doc whose `.ents`
      expose `._.umls_ents`. Defaults to the notebook-level `nlp`.

  Returns:
    float: F1 in [0, 1]. 0.0 when the two texts share no CUI (including when
    neither text yields any CUI, which mirrors sklearn's zero-division
    default). NaN when either text is not a string (e.g. an empty CSV cell
    that pandas loaded as NaN), so that one missing summary does not abort
    the whole `DataFrame.apply`.
  """
  annotate = nlp if pipeline is None else pipeline

  def extract_cuis(text):
    """Collect the first-candidate CUI of every linked entity in `text`."""
    # Entities the linker could not resolve carry an empty candidate list.
    # NOTE(review): `umls_ents` is the older scispaCy attribute name; newer
    # docs use `kb_ents` -- confirm the installed version still fills it.
    return {ent._.umls_ents[0][0] for ent in annotate(text).ents if ent._.umls_ents}

  gt = row['section_text']
  pred = row[model]
  if not isinstance(gt, str) or not isinstance(pred, str):
    return float('nan')

  model_cuis = extract_cuis(pred)
  ground_truth_cuis = extract_cuis(gt)

  # Set form of binary F1 over the union of CUIs:
  #   TP = |pred & gt|, FP = |pred - gt|, FN = |gt - pred|
  #   F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (|pred| + |gt|)
  shared = len(model_cuis & ground_truth_cuis)
  if shared == 0:
    return 0.0
  return 2 * shared / (len(model_cuis) + len(ground_truth_cuis))

In [None]:
# Load the summaries table; later cells read its 'section_text' column and
# one column per model name.
df = pd.read_csv(filepath_or_buffer='gpt_best_prompts_summs.csv')

In [None]:
# Columns of `df` that hold one model's summary each.
model_lst = [
  "text-ada-001",
  "text-babbage-001",
  "text-curie-001",
  "text-davinci-003",
  "gpt-3.5-turbo",
  "gpt-4",
]
# Score each model row by row; the result is stored in a "<model>_f1" column.
for model in model_lst:
  df[model + "_f1"] = df.apply(get_f1, axis=1, model=model)

In [None]:
# Write the frame, including the added "<model>_f1" columns, to disk.
# The row index is written as well (pandas default), as before.
df.to_csv(path_or_buf='best_prompt_summs_med_overlap.csv')