# Full pipeline evaluation

This notebook evaluates the entire pipeline. It compares the pipeline's predictions against the ground truth author and affiliation data, and against predictions made using other strategies.

Ground truth authors and affiliations were cataloged by hand using SHROOM, and are downloaded as Cocina from SDR by the `preprints:download` task (see README.md).

In [1]:
# set up project root path for imports
import sys
import os
import pathlib
# The parent of the current working directory is treated as the project root.
# abspath() resolves the relative ".." against os.getcwd() and normalizes it.
root = os.path.abspath(os.pardir)
PROJECT_ROOT = pathlib.Path(root)

# put <project root>/scripts on the import path so the project-local
# imports that follow can be resolved
sys.path.insert(1, os.fspath(PROJECT_ROOT.joinpath('scripts')))
from api import analyze_pdf
from utils import get_cocina_affiliations

# Load the models
import spacy
# NER pipeline: the pretrained transformer model with its "parser" component
# switched off. `select_pipes(disable=...)` is the spaCy v3 name for the
# deprecated `disable_pipes(...)`, which emits a DeprecationWarning.
ner = spacy.load("en_core_web_trf")
ner.select_pipes(disable=["parser"])
# Text categorizer: the best checkpoint stored under training/textcat.
textcat = spacy.load(PROJECT_ROOT / 'training' / 'textcat' / 'model-best')

# convenience function for fetching preprint text
def get_preprint_text(preprint_id):
    """Return the stored plain text for a preprint.

    Reads ``assets/preprints/txt/<preprint_id>.txt`` under ``PROJECT_ROOT``
    as UTF-8.

    Args:
        preprint_id: Identifier used as the text file's base name.

    Returns:
        The file contents as a string, or ``""`` (after printing a notice)
        when the file does not exist.
    """
    text_path = PROJECT_ROOT.joinpath("assets", "preprints", "txt", f"{preprint_id}.txt")
    try:
        with text_path.open(mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        # a missing file is reported but not fatal; callers get an empty string
        print(f"Preprint text not found for {preprint_id}")
        return ""
    return contents

# convenience function for fetching gold affiliations from cocina
import json
def get_gold_affiliations(preprint_id):
    """Return the gold affiliations extracted from a preprint's Cocina JSON.

    Reads ``assets/preprints/json/<preprint_id>.json`` under ``PROJECT_ROOT``
    as UTF-8, parses it as JSON and passes the parsed object to
    ``get_cocina_affiliations``.

    Args:
        preprint_id: Identifier used as the JSON file's base name.

    Returns:
        Whatever ``get_cocina_affiliations`` returns for the parsed document,
        or ``""`` (after printing a notice) when a ``FileNotFoundError`` is
        raised.
    """
    cocina_path = PROJECT_ROOT.joinpath("assets", "preprints", "json", f"{preprint_id}.json")
    try:
        with cocina_path.open(mode="r", encoding="utf-8") as handle:
            cocina = json.load(handle)
        # kept inside the try so a FileNotFoundError raised here is handled
        # the same way as a missing JSON file
        return get_cocina_affiliations(cocina)
    except FileNotFoundError:
        print(f"Cocina data not found for {preprint_id}")
        return ""


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  model.load_state_dict(torch.load(filelike, map_location=device))


In [2]:
# set up data table with columns for gold and predicted affiliations
import pandas as pd
# Load the preprint listing and add empty columns that will hold the full
# text, the pipeline's predicted affiliations and the gold affiliations.
preprints = pd.read_csv(PROJECT_ROOT / 'assets' / 'preprints.csv')
preprints['text'] = ''
preprints['pred'] = ''
preprints['gold'] = ''

# add the full text and gold affiliations to the data table
for i, row in preprints.iterrows():
    # the preprint ID is the last path segment of the OpenAlex URL
    openalex_url = row['OpenAlex ID']
    preprint_id = openalex_url.split('/')[-1]
    preprint_text = get_preprint_text(preprint_id)
    # NOTE(review): not used inside this loop; once the loop finishes this
    # name holds only the LAST row's PDF path.
    preprint_file = PROJECT_ROOT / "assets" / "preprints" / "pdf" / f"{preprint_id}.pdf"
    preprints.at[i, 'text'] = preprint_text
    preprints.at[i, 'gold'] = get_gold_affiliations(preprint_id)

# limit to only rows where we have gold affiliations
# (get_gold_affiliations returns '' when the Cocina file is missing).
# Take an explicit copy so that later writes go to an independent frame
# rather than to a slice of the unfiltered table.
preprints = preprints[preprints['gold'] != ''].copy()

In [3]:
# add predicted affiliations for each preprint
for i, row in preprints.iterrows():    
  with preprint_file.open(mode="rb") as f:
      preprints.at[i, 'pred'] = await analyze_pdf(f, ner, textcat)

# display HTML
from IPython.display import display
display(preprints)

IndexError: pop from empty list