In [1]:
import re
import json
import csv
import random
from pathlib import Path
import pandas as pd
import json
from tqdm import tqdm
import leafmap
from datasets import load_dataset
from openai import OpenAI

## Load metadata dataset

In [3]:
# Load the train split of the arXiv metadata from the local ./data directory
# (verification_mode='no_checks'), then convert it to a pandas DataFrame.
dataset = load_dataset(
    "arxiv_dataset",
    split='train',
    data_dir='./data',
    verification_mode='no_checks',
)
df = dataset.to_pandas()
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,update_date
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,2008-11-26
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...",2008-12-13
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,2008-01-13
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,2007-05-23
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,2013-10-15


## Filter to PDF collection and append first page of text

In [4]:
# Collect the PDFs and the text files of the local collection in a stable
# (sorted) order.
pdf_files = sorted(Path("./data/pdf").glob("*.pdf"))
text_files = sorted(Path("./data/text").glob("*.txt"))

# File names look like "<arxiv id>v<version>.<ext>", e.g. "2310.00014v1.pdf".
# The extension is a non-capturing alternation anchored at the end of the
# name; "[pdf|txt]" would be a character class that accepts any single one of
# the characters p, d, f, |, t, x.
id_pattern = re.compile(r"(.+)v\d+\.(?:pdf|txt)$")

# Files whose names do not follow the pattern are skipped instead of failing
# with an AttributeError on a None match.
ids = [
    match.group(1)
    for match in (id_pattern.match(pdf.name) for pdf in pdf_files)
    if match is not None
]

# Keep only the metadata rows whose id has a PDF in the local collection.
# .copy() yields an independent frame, so columns added later do not touch df.
df_sample = (
    df
    .loc[df['id'].isin(ids)]
    .copy()
)

def get_pdf_path(id, files=None):
    """Return the file name of the PDF that belongs to an arXiv id.

    The id is compared for equality with each file stem after the trailing
    version suffix ("v1", "v2", ...) has been removed. A plain substring test
    (``id in stem``) would let a shorter id match a different paper, e.g.
    "310.0001" is contained in "2310.00014v1".

    Args:
        id: arXiv identifier without version suffix, e.g. "2310.00014".
        files: optional iterable of ``Path`` objects to search; defaults to
            the module-level ``pdf_files`` list.

    Returns:
        The ``name`` of the first matching file, or ``None`` when no file
        matches.
    """
    candidates = pdf_files if files is None else files
    for pdf in candidates:
        if re.sub(r"v\d+$", "", pdf.stem) == id:
            return pdf.name
    return None

def get_text(id, files=None):
    """Return the contents of the text file that belongs to an arXiv id.

    The id is compared for equality with each file stem after the trailing
    version suffix ("v1", "v2", ...) has been removed, so a shorter id can no
    longer pick up the text of a different paper through a substring match.

    Args:
        id: arXiv identifier without version suffix, e.g. "2310.00014".
        files: optional iterable of ``Path`` objects to search; defaults to
            the module-level ``text_files`` list.

    Returns:
        The text of the first matching file (read with ``Path.read_text()``
        and its default encoding), or ``None`` when no file matches.
    """
    candidates = text_files if files is None else files
    for text in candidates:
        if re.sub(r"v\d+$", "", text.stem) == id:
            return text.read_text()
    return None

# Attach the PDF file name and the text to every paper. Series.map calls the
# lookup once per id and returns a Series even when df_sample has no rows,
# whereas DataFrame.apply(..., axis=1) hands back a DataFrame for an empty
# frame, which cannot be assigned to a single column.
df_sample['src_pdf'] = df_sample['id'].map(get_pdf_path)
df_sample['text'] = df_sample['id'].map(get_text)

# Drop papers without a PDF; reassignment instead of inplace=True keeps the
# step explicit.
df_sample = df_sample.dropna(subset=['src_pdf'])

# Persist the sample as CSV (every field quoted) and as parquet.
df_sample.to_csv("./data/arxiv_metadata.csv", index=False, quoting=csv.QUOTE_ALL)
df_sample.to_parquet("./data/arxiv_metadata.parquet", index=False)
df_sample.head()


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,update_date,src_pdf,text
1922271,2310.00014,Yong Ren,"Yong Ren, Tao Wang, Jiangyan Yi, Le Xu, Jianhu...",Fewer-token Neural Speech Codec with Time-inva...,Submitted to ICASSP 2024,,,,cs.SD eess.AS,http://arxiv.org/licenses/nonexclusive-distrib...,Language model based text-to-speech (TTS) mo...,2023-10-03,2310.00014v1.pdf,FEWER-TOKEN NEURAL SPEECH CODEC WITH TIME-INVA...
1922288,2310.00031,Markus Marks,"Neehar Kondapaneni, Markus Marks, Manuel Knott...",Text-image Alignment for Diffusion-based Perce...,Project page: https://www.vision.caltech.edu/t...,,,,cs.CV,http://arxiv.org/licenses/nonexclusive-distrib...,Diffusion models are generative models with ...,2023-10-06,2310.00031v1.pdf,Text-image Alignment for Diffusion-based Perce...
1922289,2310.00032,Qinghua Xu,"Qinghua Xu, Tao Yue, Shaukat Ali and Maite Arr...","Pretrain, Prompt, and Transfer: Evolving Digit...",,,,,cs.SE,http://creativecommons.org/licenses/by/4.0/,"Cyber-Physical Systems (CPSs), e.g., elevato...",2023-10-06,2310.00032v2.pdf,"PRETRAIN, PROMPT, AND TRANSFER: EVOLVING DIGIT..."
1922291,2310.00034,Yuzhang Shang,"Yuzhang Shang, Zhihang Yuan, Qiang Wu, Zhen Dong",PB-LLM: Partially Binarized Large Language Models,Frist work using network binarization for larg...,,,,cs.LG cs.AI cs.CL,http://arxiv.org/licenses/nonexclusive-distrib...,"This paper explores network binarization, a ...",2023-10-03,2310.00034v1.pdf,PB-LLM: PARTIALLY BINARIZED LARGE LANGUAGE\nMO...
1922292,2310.00035,Xi Wang,"Xi Wang, Laurence Aitchison, Maja Rudolph",LoRA ensembles for large language model fine-t...,Update the title in the PDF file,,,,cs.LG cs.AI,http://creativecommons.org/licenses/by/4.0/,Finetuned LLMs often exhibit poor uncertaint...,2023-10-06,2310.00035v1.pdf,Preprint. Under review\nENSEMBLE OF LOW-RANK A...


In [5]:
# Read the persisted sample back from parquet and show its row count.
# NOTE: this rebinds `df`, which earlier held the full metadata table.
df = pd.read_parquet("./data/arxiv_metadata.parquet")
df.shape[0]

1782

## Run text through OpenAI

In [2]:
import os
from dotenv import load_dotenv # pip install python-dotenv

# load the .env file containing your API key
load_dotenv()

# Only the first four characters are kept. A missing key is reported with an
# actionable message; slicing the None returned by os.getenv would otherwise
# raise an opaque TypeError.
key_prefix = os.getenv('OPENAI_API_KEY', '')[:4]
if not key_prefix:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it in the environment"
    )

# print (obfuscated) API key
print(f"OPENAI_API_KEY: {key_prefix}...")

OPENAI_API_KEY: sk-3...


In [None]:

# Instructions sent ahead of every paper's text. The example object below is
# kept bracket-balanced so the model is not shown a malformed template.
user_prompt_instructions = """
Extract the title and authors and affiliations from the first page of a scientific paper. 

Use the following step-by-step instructions to respond to user inputs.

Extract the title and authors from the first page of a scientific paper. The paper text snippet will be delimited by triple quotes. Geolocate each author affiliation with latitude and longitude.

The output should have the following format:

{ "title": "The paper's title",
  "authors": [
    {
      "name": "Yong Ren",
      "email": null,
      "affiliations": [ "list of indices" ]
    }
  ],
 "affiliations": [ {"index": "the index", "name": "The affiliation name", "longitude": "the longitude", "latitude": "the latitude" } ]
}

"""

In [None]:
client = OpenAI()

def analyze_text(text, model="gpt-4-1106-preview", max_tokens=2048):
    """Ask the chat model for title, authors and affiliations of one paper.

    The request combines ``user_prompt_instructions`` with ``text`` wrapped in
    triple quotes and asks for a JSON object reply (temperature 0, fixed seed).

    Args:
        text: text of the paper's first page.
        model: chat model name; the default is the model used originally.
        max_tokens: upper bound on the number of generated tokens.

    Returns:
        The reply parsed with ``json.loads``.

    Raises:
        ValueError: when the reply is empty or was cut off at the token limit
            (a truncated JSON document cannot be parsed reliably), or when the
            reply is not valid JSON (``json.JSONDecodeError`` is a subclass).
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert research librarian. You are precise and can analyze the structure of papers very well. You return information in json format."
            },
            {
                "role": "user",
                "content": user_prompt_instructions + '\n\n"""' + text + '\n\n"""'
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        seed=42,
    )
    choice = response.choices[0]
    # In JSON mode a reply that stops because of the token limit may be an
    # incomplete document; report that explicitly instead of a parse error.
    if choice.finish_reason == "length":
        raise ValueError(f"model output truncated at max_tokens={max_tokens}")
    content = choice.message.content
    if content is None:
        raise ValueError("model returned no content")
    return json.loads(content)

In [None]:
def shuffled_text_files(text_dir, seed):
    """Return the ``*.txt`` files of ``text_dir`` in a seed-determined order.

    The listing is sorted before it is shuffled: the order in which a
    directory is listed depends on the filesystem, so seeding the shuffle of
    an unsorted list would not give the same result on every machine.

    Args:
        text_dir: directory that holds the text files.
        seed: value passed to ``random.seed`` (this reseeds the global
            ``random`` generator).

    Returns:
        A list of ``Path`` objects in shuffled order.
    """
    files = sorted(Path(text_dir).glob("*.txt"))
    random.seed(seed)
    random.shuffle(files)
    return files

# text_files holds the full shuffled listing; the sample is its first 100 entries.
text_files = shuffled_text_files("./data/text", seed=42)
text_files_sample = text_files[:100]

In [None]:
# Run every sampled file through analyze_text and store the parsed reply under
# the file's name. A failure is printed together with the file name and the
# loop moves on to the next file.
data = {}
for sample_path in text_files_sample:
    file_name = sample_path.name
    try:
        parsed = analyze_text(sample_path.read_text())
        print(parsed)
        data[file_name] = parsed
    except Exception as err:
        print(file_name, err)

In [None]:
# Column order of the flattened output; passing it to the DataFrame keeps the
# columns present even when no row could be produced.
OUTPUT_COLUMNS = ["id", "title", "author", "email", "affiliation", "longitude", "latitude"]


def _index_key(value):
    """Normalize an affiliation index so that 1, 1.0 and "1" compare equal.

    Values that cannot be converted to an integer (e.g. letter indices) are
    compared by their stripped string form.
    """
    try:
        return str(int(value))
    except (TypeError, ValueError):
        return str(value).strip()


def flatten_result(paper_id, result):
    """Turn one parsed model reply into flat (author, affiliation) rows.

    Args:
        paper_id: arXiv id the reply belongs to.
        result: dict parsed from the model's JSON reply.

    Returns:
        A list with one dict per author/affiliation-index pair whose index is
        present in ``result['affiliations']``. An author whose indices match
        no affiliation contributes no rows.

    Raises:
        AssertionError: when a required key is missing from the reply.
    """
    assert "title" in result, "title not found"
    assert "authors" in result, "authors not found"
    assert "affiliations" in result, "affiliations not found"
    for auth in result['authors']:
        assert "name" in auth, "name not found"
        assert "email" in auth, "email not found"
        assert "affiliations" in auth, "affiliations not found"
    for aff in result['affiliations']:
        assert "index" in aff, "index not found"
        assert "name" in aff, "name not found"
        assert "longitude" in aff, "longitude not found"
        assert "latitude" in aff, "latitude not found"

    # One lookup table per paper; if an index occurs twice the later entry wins.
    affiliation_by_index = {_index_key(aff['index']): aff for aff in result['affiliations']}

    rows = []
    for author in result['authors']:
        for index in author['affiliations']:
            aff = affiliation_by_index.get(_index_key(index))
            if aff is None:
                continue
            # A fresh dict per row, so no two rows share state.
            rows.append({
                'id': paper_id,
                'title': result['title'],
                'author': author['name'],
                'email': author['email'],
                'affiliation': aff['name'],
                'longitude': aff['longitude'],
                'latitude': aff['latitude'],
            })
    return rows


df_data = []
id_pattern = re.compile(r"(.+)v\d+\.txt")

for k, v in data.items():
    try:
        match = id_pattern.match(k)
        assert match is not None, "file name does not match <id>v<version>.txt"
        # Rows are added only after the whole paper was flattened, so a paper
        # that fails midway leaves no partial rows behind.
        df_data.extend(flatten_result(match.group(1), v))
    except Exception as e:
        print(k, e)

df_output = pd.DataFrame(df_data, columns=OUTPUT_COLUMNS)
df_output.to_csv("./data/arxiv_metadata_output.csv", index=False, quoting=csv.QUOTE_ALL)
df_output.head()

## Evaluation

In [None]:
def split_author_names(authors):
    """Split a metadata ``authors`` string into a set of individual names.

    Names may be separated by commas, by the word "and", or by both
    (e.g. "A, B and C" or "A, B, and C"). Empty fragments are discarded.
    """
    parts = re.split(r"\s*,\s*(?:and\s+)?|\s+and\s+", authors)
    return {part.strip() for part in parts if part.strip()}


true_positives = []
false_positives = []
true_negatives = []  # never filled by this cell; kept so the name stays defined
false_negatives = []

# df_output has one row per (author, affiliation); iterate over distinct paper
# ids so every paper is scored exactly once.
for paper_id in df_output['id'].unique():
    gold_rows = df_sample[df_sample['id'] == paper_id]
    if gold_rows.empty:
        print(paper_id, "has no gold metadata; skipped")
        continue

    gold_authors = split_author_names(gold_rows['authors'].iloc[0])
    predicted_authors = set(df_output.loc[df_output['id'] == paper_id, 'author'])

    for author in predicted_authors:
        if author in gold_authors:
            true_positives.append((paper_id, author))
        else:
            false_positives.append((paper_id, author))

    for author in gold_authors:
        if author not in predicted_authors:
            false_negatives.append((paper_id, author))

print("true_positives:", true_positives)
print("false_positives:", false_positives)
print("false_negatives:", false_negatives)

# Report nan instead of raising ZeroDivisionError when nothing was predicted
# or nothing was expected.
predicted_total = len(true_positives) + len(false_positives)
gold_total = len(true_positives) + len(false_negatives)
print("precision:", len(true_positives) / predicted_total if predicted_total else float("nan"))
print("recall:", len(true_positives) / gold_total if gold_total else float("nan"))

## Mapping

In [3]:
# Reload the flattened output from disk. The id column is read as text;
# otherwise pandas parses arXiv ids as floats and "2310.00010" becomes 2310.0001.
df_output = pd.read_csv("./data/arxiv_metadata_output.csv", dtype={"id": str})

# Bound to affiliation_map rather than `map`, so the built-in map() stays
# usable in later cells.
affiliation_map = leafmap.Map()
affiliation_map.add_xy_data(df_output, x="longitude", y="latitude", layer_name="Author affiliations")
affiliation_map

Map(center=[20, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_text…

In [4]:
import html

from ipywidgets import HTML
from ipyleaflet import Map, Marker, Popup

center = (42.0451, -87.6877)
m = Map(center=center, zoom=2, close_popup_on_click=False)


for _, row in df_output.iterrows():
    # Rows whose coordinates are missing or not numeric cannot be placed on
    # the map; skip them instead of creating an invalid marker.
    latitude = pd.to_numeric(row['latitude'], errors='coerce')
    longitude = pd.to_numeric(row['longitude'], errors='coerce')
    if pd.isna(latitude) or pd.isna(longitude):
        continue

    marker = Marker(location=(float(latitude), float(longitude)))
    message = HTML()
    # Author and affiliation are model output derived from PDF text; escape
    # them so they are shown as text and never interpreted as markup.
    author_text = html.escape(str(row['author']))
    affiliation_text = html.escape(str(row['affiliation']))
    message.value = f"{author_text} <b>{affiliation_text}</b>"
    message.placeholder = "Author"
    message.description = "Author"
    marker.popup = message
    m.add_layer(marker)

m


Map(center=[42.0451, -87.6877], close_popup_on_click=False, controls=(ZoomControl(options=['position', 'zoom_i…