# Affiliation extraction evaluation

This notebook is used to evaluate the affiliation text extraction step in the pipeline.

We import the function used to extract affiliation text, run it on each of our test preprints, and compare the result to hand-curated text files containing the "ideal" affiliation text (i.e. including the authors, their affiliations, and as little extraneous text as possible). We evaluate the similarity of the extracted text to the ideal text on several metrics.



In [1]:
# Project root = parent of the directory the notebook is run from.
import sys
import os
import pathlib
PROJECT_ROOT = pathlib.Path.cwd().parent
root = str(PROJECT_ROOT)  # plain-string form of the same location

# Put scripts/ on the module search path (position 1, right after the first
# entry) so the affiliation extraction function can be imported from it.
scripts_dir = PROJECT_ROOT / 'scripts'
sys.path.insert(1, str(scripts_dir))
from utils import get_affiliation_text

# Load the trained text-categorizer pipeline and fix the score threshold that
# is passed to get_affiliation_text alongside it.
import spacy
model_dir = PROJECT_ROOT.joinpath('training', 'textcat', 'model-best')
textcat = spacy.load(model_dir)
threshold = 0.75

# convenience function for fetching preprint text
def get_preprint_text(preprint_id):
    """Return the text stored for a preprint.

    Reads ``assets/preprints/txt/<preprint_id>.txt`` under ``PROJECT_ROOT``
    as UTF-8. If that file does not exist, a notice is printed and the empty
    string is returned instead of raising.
    """
    txt_path = PROJECT_ROOT.joinpath("assets", "preprints", "txt", f"{preprint_id}.txt")
    try:
        contents = txt_path.read_text(encoding='utf-8')
    except FileNotFoundError:
        # Missing file is tolerated: report it and fall back to an empty text.
        print(f"Preprint text not found for {preprint_id}")
        contents = ""
    return contents

# convenience function for fetching gold affiliation text
def get_gold_affiliation_text(preprint_id):
    """Return the hand-curated ("gold") affiliation text for a preprint.

    Reads ``datasets/curated/<preprint_id>.txt`` under ``PROJECT_ROOT`` as
    UTF-8. If that file does not exist, a notice is printed and the empty
    string is returned instead of raising.
    """
    gold_path = PROJECT_ROOT.joinpath("datasets", "curated", f"{preprint_id}.txt")
    try:
        contents = gold_path.read_text(encoding='utf-8')
    except FileNotFoundError:
        # Not every preprint has a curated file; callers get "" for those.
        print(f"Gold affiliation text not found for {preprint_id}")
        contents = ""
    return contents


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# set up data table with columns for full, extracted, and ideal affiliation text
import pandas as pd
preprints = pd.read_csv(PROJECT_ROOT / 'assets' / 'preprints.csv')

# Gather the full text, predicted affiliation text, and ideal affiliation text
# for every preprint. Results are collected in plain lists (row order
# preserved) and attached as columns afterwards, rather than writing cell by
# cell into the frame while iterating over it with iterrows(), which the
# pandas documentation advises against.
texts, preds, golds = [], [], []
for openalex_url in preprints['OpenAlex ID']:
    # the preprint ID is the last path segment of the OpenAlex URL
    preprint_id = openalex_url.split('/')[-1]
    preprint_text = get_preprint_text(preprint_id)
    texts.append(preprint_text)
    preds.append(get_affiliation_text(preprint_text, textcat, threshold))
    golds.append(get_gold_affiliation_text(preprint_id))

preprints['text'] = texts
preprints['pred'] = preds
preprints['gold'] = golds

In [None]:
# Narrow the table in one step: keep only the columns needed for evaluation
# and only the rows that actually have gold affiliation text.
from IPython.display import display

wanted_columns = ['OpenAlex ID', 'DRUID', 'text', 'pred', 'gold']
has_gold = preprints['gold'].ne("")
preprints = preprints.loc[has_gold, wanted_columns]

# render the resulting table as HTML
display(preprints)


Unnamed: 0,OpenAlex ID,DRUID,text,pred,gold
2,https://openalex.org/W2988715931,druid:by432qh9476,Deep Hough Voting for 3D Object Detection in P...,Charles R. Qi 1 Or Litany 1 Kaiming He 1 Leoni...,Charles R. Qi 1 Or Litany 1 Kaiming He 1 Leoni...
6,https://openalex.org/W2901173781,druid:cv788rk8775,Page 1 of 36 AIDS Research and Human Retroviru...,"Benjamin Chimukangara, 1, 2, 3 Ayesha BM Khars...","Benjamin Chimukangara, 1, 2, 3 Ayesha BM Khars..."
7,https://openalex.org/W3013783484,druid:df625mx6241,not certified by peer review) is the author/fu...,"Keri A. McKiernan 1, Anna K. Koster 1, 2, Merr...","Keri A. McKiernan 1 , Anna K. Koster 1, 2 , Me..."
25,https://openalex.org/W3000588783,druid:jf164cr9577,UCSF UC San Francisco Previously Published Wor...,University of California,"Jason D. Roberts, MD, MAS 1, *, † , S. Yukiko ..."
45,https://openalex.org/W2890142544,druid:pf494rb6827,KAT6A Syndrome: genotype-phenotype correlation...,"Authors Kennedy, Joanna; Goudie, David; Blair,...","Authors Joanna Kennedy, MBBS 1, 2 , David Goud..."
47,https://openalex.org/W2976790204,druid:pg336cw7885,SHARP UPPER BOUNDS FOR FRACTIONAL MOMENTS OF T...,"WINSTON HEAP, MAKSYM RADZIWI�L�L, AND K. SOUND...","MAKSYM RADZIWI�L�L, AND K. SOUNDARARAJAN\n"
48,https://openalex.org/W2941345678,druid:pg684bh6859,The Professional Journal of the Earthquake Eng...,"Vitor Silva 1, Sinan Akkar 2, Jack Baker 3, Pa...","Vitor Silva 1 , Sinan Akkar 2 , Jack Baker 3 ,..."
51,https://openalex.org/W3028990183,druid:pw298qp9279,Responding to COVID-19 Through Surveys of Publ...,"Schuster, Christian  Lauren Weitzman  Kim Sa...","Schuster, Christian  Lauren Weitzman  Kim Sa..."
54,https://openalex.org/W2914653242,druid:qj258wc5060,Journal Title XX(X):1–18\nc ⃝The Author(s) 201...,"D. Kim 1, S. Jorgensen 2, J. Lee 3, J. Ahn 3, ...","D. Kim 1 , S. Jorgensen 2 , J. Lee 3 , J. Ahn ..."
57,https://openalex.org/W2977841367,druid:qs929mg6711,Workload-indexed blood pressure response is su...,"Kristofer Hedman, Nicholas Cauwenberghs, Jeffr...","Kristofer Hedman a, b (MD, PhD), Nicholas Cauw..."
