# Full pipeline evaluation

This notebook is used to evaluate the entire pipeline. It compares the predictions of the pipeline with the ground truth author and affiliation data, along with predictions made using other strategies.

Ground truth authors and affiliations were cataloged by hand using SHROOM, and are downloaded as Cocina from SDR by the `preprints:download` task (see README.md).

In [None]:
# set up project root path for imports
import sys
import os
import pathlib
# the project root is taken to be the parent of the current working directory,
# so the notebook must be launched from a directory directly under the root
root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
PROJECT_ROOT = pathlib.Path(root)

# make scripts in scripts/ importable and import the analysis pipeline
sys.path.insert(1, str(PROJECT_ROOT / 'scripts'))
from utils import get_cocina_affiliations

# Load the models
import spacy
# Disable the parser at load time: the component is still loaded but is not run.
# This replaces `ner.disable_pipes("parser")`, which is deprecated in spaCy v3.
ner = spacy.load("en_core_web_trf", disable=["parser"])
textcat = spacy.load(PROJECT_ROOT / 'training' / 'textcat' / 'model-best')

# convenience function for fetching preprint text
def get_preprint_text(preprint_id):
    """Read the plain-text version of a preprint from assets/preprints/txt/.

    Args:
        preprint_id: identifier used as the file stem, i.e. the file read is
            ``<preprint_id>.txt``.

    Returns:
        The file contents decoded as UTF-8, or an empty string (after printing
        a notice) when the file does not exist.
    """
    txt_dir = PROJECT_ROOT / "assets" / "preprints" / "txt"
    try:
        text = (txt_dir / f"{preprint_id}.txt").read_text(encoding='utf-8')
    except FileNotFoundError:
        # a missing file is reported but not fatal; callers get an empty string
        print(f"Preprint text not found for {preprint_id}")
        text = ""
    return text

# convenience function for fetching gold affiliations from cocina
import json
def get_gold_affiliations(preprint_id):
    """Load a preprint's Cocina JSON and extract its gold affiliations.

    Args:
        preprint_id: identifier used as the file stem, i.e. the file read is
            ``assets/preprints/json/<preprint_id>.json``.

    Returns:
        Whatever ``get_cocina_affiliations`` returns for the parsed Cocina
        document, or an empty string (after printing a notice) when a
        FileNotFoundError is raised.
    """
    cocina_path = PROJECT_ROOT / "assets" / "preprints" / "json" / f"{preprint_id}.json"
    try:
        cocina = json.loads(cocina_path.read_text(encoding='utf-8'))
        # kept inside the try so the set of handled errors matches the file read
        gold = get_cocina_affiliations(cocina)
    except FileNotFoundError:
        print(f"Cocina data not found for {preprint_id}")
        gold = ""
    return gold
    
# convenience function for loading pre-saved predictions from results/
results_path = PROJECT_ROOT / 'results'
def load_predictions():
    """Load every saved prediction file (``*.json``) found in ``results_path``.

    Returns:
        A dict mapping each file's stem (the preprint ID) to the parsed JSON
        contents. Files that cannot be decoded or parsed are reported with a
        printed message and left out of the result.
    """
    predictions = {}
    # sorted() makes the load order (and the order of any error messages) deterministic
    for prediction_file in sorted(results_path.glob("*.json")):
        # read as UTF-8 explicitly, like the other file helpers in this notebook,
        # instead of relying on the platform's default encoding
        with prediction_file.open(mode="r", encoding="utf-8") as f:
            try:
                predictions[prediction_file.stem] = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # skip unreadable files rather than aborting the whole load
                print(f"Error loading {prediction_file}")
    return predictions


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


ImportError: cannot import name 'analyze_pdf' from 'api' (/Users/budak/Developer/preprint_affiliation_parsing/scripts/api.py)

In [None]:
# set up data table with columns for gold and predicted affiliations
import pandas as pd
preprints = pd.read_csv(PROJECT_ROOT / 'assets' / 'preprints.csv')
# create both columns up front so the per-row writes below never have to
# add a column implicitly
preprints['gold'] = ''
preprints['text'] = ''

# add the full text and gold affiliations to the data table
for i, row in preprints.iterrows():
    # the preprint ID is the last path segment of the OpenAlex URL
    preprint_id = row['OpenAlex ID'].split('/')[-1]
    preprint_text = get_preprint_text(preprint_id)
    preprints.at[i, 'gold'] = get_gold_affiliations(preprint_id)
    preprints.at[i, 'text'] = preprint_text

# keep only the columns we need
preprints = preprints[['OpenAlex ID', 'DRUID', 'text', 'gold']]

# limit to only rows where we have gold affiliations ('' marks a missing Cocina file);
# .copy() gives an independent frame so later cells can add columns without
# triggering pandas' SettingWithCopyWarning
preprints = preprints[preprints['gold'] != ''].copy()

In [None]:
from utils import get_affiliation_dict, analyze_pdf_text
from tqdm.notebook import tqdm

# set this and run cell to force re-running predictions
FORCE_RERUN = False

# add a column for predictions
preprints['pred'] = ''

# if we don't have any saved predictions (or a re-run is forced), run prediction
# for every preprint and save one JSON file per preprint in results/
predictions = load_predictions()
if not predictions or FORCE_RERUN:
    if predictions:
        print("FORCE_RERUN is set, re-running prediction for all preprints")
    else:
        print("No predictions found, running prediction for all preprints")
    # results/ may not exist yet (e.g. on a fresh checkout); create it before writing
    results_path.mkdir(parents=True, exist_ok=True)
    for i, row in tqdm(preprints.iterrows(), total=len(preprints), desc="Predicting"):
        preprint_id = row['OpenAlex ID'].split('/')[-1]
        # use the shared helper so a missing text file is reported instead of
        # aborting the whole run with FileNotFoundError
        pdf_text = get_preprint_text(preprint_id)
        # default to "no affiliations"; only replaced when analysis succeeds
        affiliations = {}
        if pdf_text:
            try:
                result = analyze_pdf_text(pdf_text, textcat, ner)
                affiliations = get_affiliation_dict(result)
            except ValueError as e:
                print(f"Error analyzing {preprint_id}: {e}")
        else:
            print(f"Skipping analysis for {preprint_id}: no text available")
        with (results_path / f"{preprint_id}.json").open(mode="w", encoding="utf-8") as f:
            json.dump(affiliations, f)
    # reload from disk so the in-memory predictions match what was saved
    predictions = load_predictions()
else:
    print("Using saved predictions")

# set predictions for each preprint in the data table
for i, row in preprints.iterrows():
    preprint_id = row['OpenAlex ID'].split('/')[-1]
    if preprint_id in predictions:
        preprints.at[i, 'pred'] = predictions[preprint_id]

ModuleNotFoundError: No module named 'utils'