In [12]:
import re
import json
import random
from pathlib import Path
import pandas as pd
import json
from tqdm import tqdm
import leafmap
from datasets import load_dataset
from openai import OpenAI

In [21]:
# datafile = Path("../data/arxiv-metadata-oai-snapshot.json")
# lines = []
# with open(datafile) as f:
#     lines = [line for line in f]

# data = []
# for i in tqdm(range(len(lines))):
#     line = lines[i]
#     data.append(json.loads(line))

# data_v1_2023 = [d for d in data if '2023' in d['versions'][0]['created']]

## Load and filter metadata dataset

In [6]:
dataset = load_dataset("arxiv_dataset", data_dir='.', split='train', verification_mode='no_checks')
ddf = dataset.to_pandas()

In [7]:
id_pattern = re.compile(r"(.+)v\d+\.pdf")
pdf_files = list(Path("../data/pdf").glob("*.pdf"))
pdf_files.sort()
ids = [
    id_pattern.match(pdf.name).group(1) for pdf in pdf_files
]

ddf_sample = ddf[ddf['id'].isin(ids)].copy()

def get_pdf_path(id):
    for pdf in pdf_files:
        if id in str(pdf.stem):
            return pdf.name
    return None

ddf_sample['pdf'] = ddf_sample.apply(lambda row: get_pdf_path(row['id']), axis=1)
ddf_sample.dropna(subset = ['pdf'], inplace=True)
ddf_sample.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,update_date,pdf
1922271,2310.00014,Yong Ren,"Yong Ren, Tao Wang, Jiangyan Yi, Le Xu, Jianhu...",Fewer-token Neural Speech Codec with Time-inva...,Submitted to ICASSP 2024,,,,cs.SD eess.AS,http://arxiv.org/licenses/nonexclusive-distrib...,Language model based text-to-speech (TTS) mo...,2023-10-03,2310.00014v1.pdf
1922288,2310.00031,Markus Marks,"Neehar Kondapaneni, Markus Marks, Manuel Knott...",Text-image Alignment for Diffusion-based Perce...,Project page: https://www.vision.caltech.edu/t...,,,,cs.CV,http://arxiv.org/licenses/nonexclusive-distrib...,Diffusion models are generative models with ...,2023-10-06,2310.00031v1.pdf
1922289,2310.00032,Qinghua Xu,"Qinghua Xu, Tao Yue, Shaukat Ali and Maite Arr...","Pretrain, Prompt, and Transfer: Evolving Digit...",,,,,cs.SE,http://creativecommons.org/licenses/by/4.0/,"Cyber-Physical Systems (CPSs), e.g., elevato...",2023-10-06,2310.00032v2.pdf
1922291,2310.00034,Yuzhang Shang,"Yuzhang Shang, Zhihang Yuan, Qiang Wu, Zhen Dong",PB-LLM: Partially Binarized Large Language Models,Frist work using network binarization for larg...,,,,cs.LG cs.AI cs.CL,http://arxiv.org/licenses/nonexclusive-distrib...,"This paper explores network binarization, a ...",2023-10-03,2310.00034v1.pdf
1922292,2310.00035,Xi Wang,"Xi Wang, Laurence Aitchison, Maja Rudolph",LoRA ensembles for large language model fine-t...,Update the title in the PDF file,,,,cs.LG cs.AI,http://creativecommons.org/licenses/by/4.0/,Finetuned LLMs often exhibit poor uncertaint...,2023-10-06,2310.00035v1.pdf


## Run text through OpenAI

In [8]:
import os
from dotenv import load_dotenv # pip install python-dotenv

# load the .env file containing your API key
load_dotenv()

# print (obfuscated) API key
print(f"OPENAI_API_KEY: {os.getenv('OPENAI_API_KEY')[:4]}...")

OPENAI_API_KEY: sk-3...


In [9]:

user_prompt_instructions = """
Extract the title and authors and affiliations from the first page of a scientific paper. 

Use the following step-by-step instructions to respond to user inputs.

Extract the title and authors from the first page of a scientific paper. The paper text will snipped will be delimited by triple quotes. Geolocate each author affiliation with latitude and longitude.

The output should have the following format:

{ "title": "The paper's title",
  "authors": [
    {
      "name": "Yong Ren",
      "email": null,
      "affiliations": [ "list of indices" ]
    }
  ],
 "affiliations": [ {"index": "the index", "name": "The affiliation name", "longitude": "the longitude", "latitude": "the latitude" } ]
 ]
}

"""

In [13]:
client = OpenAI()

def analyze_text(text):
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {
                "role": "system",
                "content": "You are an expert research librarian. You are precise and can analyze the structure of papers very well. You return information in json format."
            },
            {
            "role": "user",
            "content": user_prompt_instructions + '\n\n"""' + text + '\n\n"""'
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        seed=42,
    )
    return json.loads(response.choices[0].message.content)

In [14]:
text_files = list(Path("../data/text").glob("*.txt"))
random.seed(42)
random.shuffle(text_files)
text_files_sample = text_files[:5]

In [15]:
data = dict()
for text_file in text_files_sample:
    try:
        result = analyze_text(text_file.read_text())
        print(result)
        data[text_file.name] = result
    except Exception as e:
        print(text_file.name, e)

{'title': 'USE YOUR INSTINCT: INSTRUCTION OPTIMIZATION USING NEURAL BANDITS COUPLED WITH TRANSFORMERS', 'authors': [{'name': 'Xiaoqiang Lin', 'email': 'xiaoqiang.lin@comp.nus.edu.sg', 'affiliations': [1]}, {'name': 'Zhaoxuan Wu', 'email': 'wu.zhaoxuan@comp.nus.edu.sg', 'affiliations': [2, 3]}, {'name': 'Zhongxiang Dai', 'email': 'dzx@nus.edu.sg', 'affiliations': [1]}, {'name': 'Wenyang Hu', 'email': 'wenyang@comp.nus.edu.sg', 'affiliations': [1, 2]}, {'name': 'Yao Shu', 'email': 'shuyao95@gmail.com', 'affiliations': [4]}, {'name': 'See-Kiong Ng', 'email': 'seekiong@nus.edu.sg', 'affiliations': [1, 2]}, {'name': 'Patrick Jaillet', 'email': 'jaillet@mit.edu', 'affiliations': [5]}, {'name': 'Bryan Kian Hsiang Low', 'email': 'lowkh@comp.nus.edu.sg', 'affiliations': [1]}], 'affiliations': [{'index': 1, 'name': 'Department of Computer Science, National University of Singapore', 'longitude': '103.7764', 'latitude': '1.2966'}, {'index': 2, 'name': 'Institute of Data Science, National Universit

In [47]:
id_pattern = re.compile(r"(.+)v\d+\.txt")

df_data = []
for k, v in data.items():
    assert "authors" in v, "authors not found"
    assert "affiliations" in v, "affiliations not found"
    for auth in v['authors']:
        assert "name" in auth, "name not found"
        assert "email" in auth, "email not found"
        assert "affiliations" in auth, "affiliations not found"
    for aff in v['affiliations']:
        assert "index" in aff, "index not found"
        assert "name" in aff, "name not found"
        assert "longitude" in aff, "longitude not found"
        assert "latitude" in aff, "latitude not found"
    for author in v['authors']:
        for index in author['affiliations']:
            row = dict()
            row['id'] = id_pattern.match(k).group(1)
            row['title'] = v['title']
            row['author'] = author['name']
            row['email'] = author['email']
            for aff in v['affiliations']:
                if aff['index'] == int(index):
                    row['affiliation'] = aff['name']
                    row['longitude'] = aff['longitude']
                    row['latitude'] = aff['latitude']
                    df_data.append(row)
        
df = pd.DataFrame(df_data)
df.head()

Unnamed: 0,id,title,author,email,affiliation,longitude,latitude
0,2310.02905,USE YOUR INSTINCT: INSTRUCTION OPTIMIZATION US...,Xiaoqiang Lin,xiaoqiang.lin@comp.nus.edu.sg,"Department of Computer Science, National Unive...",103.7764,1.2966
1,2310.02905,USE YOUR INSTINCT: INSTRUCTION OPTIMIZATION US...,Zhaoxuan Wu,wu.zhaoxuan@comp.nus.edu.sg,"Institute of Data Science, National University...",103.7758,1.2942
2,2310.02905,USE YOUR INSTINCT: INSTRUCTION OPTIMIZATION US...,Zhaoxuan Wu,wu.zhaoxuan@comp.nus.edu.sg,Integrative Sciences and Engineering Programme...,103.7738,1.2995
3,2310.02905,USE YOUR INSTINCT: INSTRUCTION OPTIMIZATION US...,Zhongxiang Dai,dzx@nus.edu.sg,"Department of Computer Science, National Unive...",103.7764,1.2966
4,2310.02905,USE YOUR INSTINCT: INSTRUCTION OPTIMIZATION US...,Wenyang Hu,wenyang@comp.nus.edu.sg,"Department of Computer Science, National Unive...",103.7764,1.2966


## Evaluation

In [57]:
true_positives = []
false_positives = []
true_negatives = []
false_negatives = []

for id in df['id']:
    print(id)
    gold_title = list(ddf_sample[ddf_sample['id'] == id]['title'])[0]

    gold_authors = list(ddf_sample[ddf_sample['id'] == id]['authors'])[0]
    gold_authors = {a.strip() for a in gold_authors.split(',')}

    predicted = df[df['id'] == id]
    predicted_title = list(predicted['title'])[0]
    predicted_authors = set(predicted['author'])
    
    for author in predicted_authors:
        if author in gold_authors:
            true_positives.append((id, author))
        else:
            false_positives.append((id, author))
    
    for author in gold_authors:
        if author not in predicted_authors:
            false_negatives.append((id, author))

print("true_positives:", true_positives)
print("false_positives:", false_positives)
print("false_negatives:", false_negatives)
print("precision:", len(true_positives) / (len(true_positives) + len(false_positives)))
print("recall:", len(true_positives) / (len(true_positives) + len(false_negatives)))

2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.02905
2310.13226
2310.13226
2310.13226
2310.13226
2310.04353
2310.04353
2310.04353
2310.05161
2310.05161
2310.04815
2310.04815
2310.04815
2310.04815
2310.04815
2310.04815
2310.04815
true_positives: [('2310.02905', 'Wenyang Hu'), ('2310.02905', 'Yao Shu'), ('2310.02905', 'Bryan Kian Hsiang Low'), ('2310.02905', 'Zhongxiang Dai'), ('2310.02905', 'Patrick Jaillet'), ('2310.02905', 'See-Kiong Ng'), ('2310.02905', 'Zhaoxuan Wu'), ('2310.02905', 'Xiaoqiang Lin'), ('2310.02905', 'Wenyang Hu'), ('2310.02905', 'Yao Shu'), ('2310.02905', 'Bryan Kian Hsiang Low'), ('2310.02905', 'Zhongxiang Dai'), ('2310.02905', 'Patrick Jaillet'), ('2310.02905', 'See-Kiong Ng'), ('2310.02905', 'Zhaoxuan Wu'), ('2310.02905', 'Xiaoqiang Lin'), ('2310.02905', 'Wenyang Hu'), ('2310.02905', 'Yao Shu'), ('2310.02905', 'Bryan Kian Hsiang Low'), ('2310.02905', 'Zhongxiang Dai'), ('2310.02905', 'Patrick Jail

## Mapping

In [18]:
map = leafmap.Map()
map.add_xy_data(df, x="longitude", y="latitude", layer_name="World Cities")
map

Map(center=[20, 0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_text…

In [1]:
from ipywidgets import HTML
from ipyleaflet import Map, Marker, Popup

center = (42.0451, -87.6877)
m = Map(center=center, zoom=9, close_popup_on_click=False)
marker1 = Marker(location=(42.0451, -87.6877))
marker2 = Marker(location=(42.0451, 87.6877))

m.add_layer(marker1)
m.add_layer(marker2)


message = HTML()
message.value = "Hello <b>World</b>"
message.placeholder = "Some HTML"
message.description = "Some HTML"


# Popup associated to a layer
marker1.popup = message
marker2.popup = message

m


Map(center=[42.0451, -87.6877], close_popup_on_click=False, controls=(ZoomControl(options=['position', 'zoom_i…