In [2]:
import re
import json
import csv
import random
from pathlib import Path
import pandas as pd
import json
import math
from tqdm import tqdm
import openai
from typing import Dict

In [3]:
# Input table of arXiv papers; later cells read its `text` and `id` columns.
path = Path('./data/arxiv_metadata.parquet')
# Stop here if the file is missing instead of failing inside read_parquet.
assert path.exists()
df = pd.read_parquet(path)
print(f"Loaded {len(df)} rows")
# Last expression of the cell: renders a preview of the first rows.
df.head()

Loaded 1782 rows


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,update_date,src_pdf,text
0,2310.00014,Yong Ren,"Yong Ren, Tao Wang, Jiangyan Yi, Le Xu, Jianhu...",Fewer-token Neural Speech Codec with Time-inva...,Submitted to ICASSP 2024,,,,cs.SD eess.AS,http://arxiv.org/licenses/nonexclusive-distrib...,Language model based text-to-speech (TTS) mo...,2023-10-03,2310.00014v1.pdf,FEWER-TOKEN NEURAL SPEECH CODEC WITH TIME-INVA...
1,2310.00031,Markus Marks,"Neehar Kondapaneni, Markus Marks, Manuel Knott...",Text-image Alignment for Diffusion-based Perce...,Project page: https://www.vision.caltech.edu/t...,,,,cs.CV,http://arxiv.org/licenses/nonexclusive-distrib...,Diffusion models are generative models with ...,2023-10-06,2310.00031v1.pdf,Text-image Alignment for Diffusion-based Perce...
2,2310.00032,Qinghua Xu,"Qinghua Xu, Tao Yue, Shaukat Ali and Maite Arr...","Pretrain, Prompt, and Transfer: Evolving Digit...",,,,,cs.SE,http://creativecommons.org/licenses/by/4.0/,"Cyber-Physical Systems (CPSs), e.g., elevato...",2023-10-06,2310.00032v2.pdf,"PRETRAIN, PROMPT, AND TRANSFER: EVOLVING DIGIT..."
3,2310.00034,Yuzhang Shang,"Yuzhang Shang, Zhihang Yuan, Qiang Wu, Zhen Dong",PB-LLM: Partially Binarized Large Language Models,Frist work using network binarization for larg...,,,,cs.LG cs.AI cs.CL,http://arxiv.org/licenses/nonexclusive-distrib...,"This paper explores network binarization, a ...",2023-10-03,2310.00034v1.pdf,PB-LLM: PARTIALLY BINARIZED LARGE LANGUAGE\nMO...
4,2310.00035,Xi Wang,"Xi Wang, Laurence Aitchison, Maja Rudolph",LoRA ensembles for large language model fine-t...,Update the title in the PDF file,,,,cs.LG cs.AI,http://creativecommons.org/licenses/by/4.0/,Finetuned LLMs often exhibit poor uncertaint...,2023-10-06,2310.00035v1.pdf,Preprint. Under review\nENSEMBLE OF LOW-RANK A...


## Run text through OpenAI

In [4]:
import os
from dotenv import load_dotenv

# load the .env file containing your API key
load_dotenv()

# Fail with a clear message when the key is missing: slicing the `None`
# returned by os.getenv would otherwise raise an opaque TypeError.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it to your .env file or export it")

# print (obfuscated) API key
print(f"OPENAI_API_KEY: {openai_api_key[:4]}...")

OPENAI_API_KEY: sk-3...


In [5]:

# Instructions prepended to every paper snippet sent to the model.
# The embedded example must itself be valid JSON: it is the template the model
# is asked to imitate, and the response is parsed with json.loads.
user_prompt_instructions = """
Extract the title, authors, and affiliations from the first page of a scientific paper.

Use the following step-by-step instructions to respond to user inputs.

Extract the title and authors from the first page of a scientific paper. The paper text snippet will be delimited by triple quotes. Geolocate each author affiliation with latitude and longitude.

The output should have the following format:

{ "title": "The paper's title",
  "authors": [
    {
      "name": "Yong Ren",
      "email": null,
      "affiliations": [ "list of indices" ]
    }
  ],
  "affiliations": [ {"index": "the index", "name": "The affiliation name", "longitude": "the longitude", "latitude": "the latitude" } ]
}

"""

In [6]:
def validate_response_data(data: Dict):
    """Check that a parsed model response contains every expected key.

    Only key presence is checked; values are not inspected. Checks run in a
    fixed order: top-level ``title`` and ``authors``, then each author's
    ``name``/``email``/``affiliations``, then top-level ``affiliations``, then
    each affiliation's ``index``/``name``/``longitude``/``latitude``.

    Args:
        data: The decoded JSON object returned by the model.

    Raises:
        AssertionError: With the message ``"<key> not found"`` for the first
            missing key encountered.
    """
    for key in ("title", "authors"):
        assert key in data, f"{key} not found"
    for auth in data['authors']:
        for key in ("name", "email", "affiliations"):
            assert key in auth, f"{key} not found"
    assert "affiliations" in data, "affiliations not found"
    for aff in data['affiliations']:
        for key in ("index", "name", "longitude", "latitude"):
            assert key in aff, f"{key} not found"


def analyze_text(client, text, model="gpt-4-1106-preview"):
    """Ask the chat model to extract title, authors and affiliations from `text`.

    The prompt is `user_prompt_instructions` followed by `text` wrapped in
    triple quotes. The request asks for a JSON object and uses a fixed
    temperature (0) and seed (42).

    The whole round trip (prompt building, API call, JSON decoding, validation)
    is best-effort: any exception is printed and its message is returned
    instead of being raised, so one failing paper does not abort a batch run.

    Args:
        client: Object exposing `chat.completions.create(...)` (the notebook
            passes an `openai.Client`).
        text: Paper text to analyze.
        model: Chat model name; defaults to the model originally hard-coded here.

    Returns:
        On success, the validated response re-serialized with `json.dumps`.
        On failure, `str(exception)`; callers can tell the two apart because
        the failure string is whatever the exception message happens to be.
    """
    try:
        # Built inside the try so a non-string `text` (e.g. a missing value)
        # is reported per row instead of raising out of the caller's loop.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert research librarian. You are precise and can analyze the structure of papers very well. You return information in json format."
                },
                {
                    "role": "user",
                    "content": user_prompt_instructions + '\n\n"""' + text + '\n\n"""'
                }
            ],
            response_format={"type": "json_object"},
            temperature=0,
            max_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            seed=42,
        )
        data = json.loads(response.choices[0].message.content)
        print(data)
        validate_response_data(data)
        return json.dumps(data)
    except Exception as e:
        print(e)
        return str(e)


In [7]:
# Draw a reproducible 100-paper sample and send each paper's text to the model,
# storing whatever analyze_text returns next to the paper it came from.
client = openai.Client()
df_sample = df.sample(n=100, random_state=42)
df_sample['extracted_info'] = [
    analyze_text(client, paper_text) for paper_text in df_sample['text']
]

{'title': 'A cry for help: Early detection of brain injury in newborns', 'authors': [{'name': 'Charles C. Onu', 'email': None, 'affiliations': [1, 2, 3]}, {'name': 'Samantha Latremouille', 'email': None, 'affiliations': [1]}, {'name': 'Arsenii Gorin', 'email': None, 'affiliations': [1]}, {'name': 'Junhao Wang', 'email': None, 'affiliations': [1]}, {'name': 'Uchenna Ekwochi', 'email': None, 'affiliations': [4]}, {'name': 'Peter O. Ubuane', 'email': None, 'affiliations': [5]}, {'name': 'Omolara A. Kehinde', 'email': None, 'affiliations': [5, 6]}, {'name': 'Muhammad A. Salisu', 'email': None, 'affiliations': [5, 6]}, {'name': 'Datonye Briggs', 'email': None, 'affiliations': [7]}, {'name': 'Yoshua Bengio', 'email': None, 'affiliations': [2, 8]}, {'name': 'Doina Precup', 'email': None, 'affiliations': [2, 3, 9]}], 'affiliations': [{'index': 1, 'name': 'Ubenwa Health, Montréal, Canada', 'longitude': '-73.5673', 'latitude': '45.5017'}, {'index': 2, 'name': 'Mila-Québec AI Institute, Montréal,

In [8]:
def _index_key(value):
    """Normalize an affiliation index so that 1, 1.0 and "1" compare equal.

    Values that cannot be converted to an integer (e.g. a letter label) are
    compared by their stripped string form instead of raising.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return str(value).strip()


# One output row per (paper, author, affiliation) triple.
extracted_data = []
for row in df_sample.itertuples():
    try:
        data = json.loads(row.extracted_info)
        print(data)
    except (TypeError, ValueError):
        # extracted_info holds a plain error message (not JSON) when the
        # extraction failed; json.JSONDecodeError is a ValueError subclass.
        print(f"Error loading data for {row.id}")
        continue

    # Index this paper's affiliations once instead of rescanning per author.
    # Both sides of the lookup are normalized because the response may carry
    # indices as integers or as strings.
    affiliations_by_index = {}
    for aff in data['affiliations']:
        affiliations_by_index.setdefault(_index_key(aff['index']), aff)

    for author in data['authors']:
        # `or []` tolerates an author whose affiliations value is null.
        for index in author['affiliations'] or []:
            aff = affiliations_by_index.get(_index_key(index))
            if aff is None:
                # Author references an index with no matching affiliation entry.
                continue
            extracted_data.append({
                'id': row.id,
                'author': author['name'],
                'affiliation': aff['name'],
                'longitude': aff['longitude'],
                'latitude': aff['latitude'],
            })

df_extracted = pd.DataFrame(extracted_data)
df_extracted.to_csv('./data/extracted_data.csv', index=False)
df_extracted.to_parquet('./data/extracted_data.parquet')
df_extracted.head()

{'title': 'A cry for help: Early detection of brain injury in newborns', 'authors': [{'name': 'Charles C. Onu', 'email': None, 'affiliations': [1, 2, 3]}, {'name': 'Samantha Latremouille', 'email': None, 'affiliations': [1]}, {'name': 'Arsenii Gorin', 'email': None, 'affiliations': [1]}, {'name': 'Junhao Wang', 'email': None, 'affiliations': [1]}, {'name': 'Uchenna Ekwochi', 'email': None, 'affiliations': [4]}, {'name': 'Peter O. Ubuane', 'email': None, 'affiliations': [5]}, {'name': 'Omolara A. Kehinde', 'email': None, 'affiliations': [5, 6]}, {'name': 'Muhammad A. Salisu', 'email': None, 'affiliations': [5, 6]}, {'name': 'Datonye Briggs', 'email': None, 'affiliations': [7]}, {'name': 'Yoshua Bengio', 'email': None, 'affiliations': [2, 8]}, {'name': 'Doina Precup', 'email': None, 'affiliations': [2, 3, 9]}], 'affiliations': [{'index': 1, 'name': 'Ubenwa Health, Montréal, Canada', 'longitude': '-73.5673', 'latitude': '45.5017'}, {'index': 2, 'name': 'Mila-Québec AI Institute, Montréal,

Unnamed: 0,id,author,affiliation,longitude,latitude
0,2310.08338,Charles C. Onu,"Ubenwa Health, Montréal, Canada",-73.5673,45.5017
1,2310.08338,Charles C. Onu,"Mila-Québec AI Institute, Montréal, Canada",-73.5673,45.5017
2,2310.08338,Charles C. Onu,"McGill University, Montréal, Canada",-73.5827,45.5048
3,2310.08338,Samantha Latremouille,"Ubenwa Health, Montréal, Canada",-73.5673,45.5017
4,2310.08338,Arsenii Gorin,"Ubenwa Health, Montréal, Canada",-73.5673,45.5017
