In [2]:
!pip install langchain_google_vertexai pymupdf langchain-core

Collecting langchain_google_vertexai
  Downloading langchain_google_vertexai-2.0.3-py3-none-any.whl.metadata (3.8 kB)
Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.8-py3-none-any.whl.metadata (6.3 kB)
Collecting google-cloud-storage<3.0.0,>=2.17.0 (from langchain_google_vertexai)
  Downloading google_cloud_storage-2.18.2-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting httpx<0.28.0,>=0.27.0 (from langchain_google_vertexai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_google_vertexai)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic<3,>=2 (from langchain_google_vertexai)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting

In [4]:
from google.auth import default, transport

from langchain_google_vertexai import ChatVertexAI

# The `requests` transport is a submodule that `google.auth.transport` does not
# load by itself; import it explicitly instead of relying on another package
# having imported it as a side effect.
from google.auth.transport.requests import Request

# Vertex AI endpoint serving the model, and the project/region it lives in.
ENDPOINT_ID = "3135878630672957440"
PROJECT_ID = "768608702519"
LOCATION = "us-central1"

# Resolve the default credentials and refresh them.
# NOTE(review): the refreshed `credentials` object is not passed to ChatVertexAI
# below; confirm whether the client is expected to resolve credentials itself.
credentials, _ = default()
auth_request = Request()
credentials.refresh(auth_request)

# Chat model bound to the endpoint above; `llm` is read as a global by create_chain().
llm = ChatVertexAI(
    model="llama-3-2-1b-instruct",
    full_model_name=f"projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}",
    temperature=0,
    max_tokens=None,
    max_retries=6,
    stop=None,
    # other params...
)

# Smoke test: one round trip to the endpoint, printing the returned message object.
print(llm.invoke("Who was the first president of the United States?"))


content='The first president of the United States was George Washington. He served two terms in office from April 30, 1789, to March 4, 1797.' additional_kwargs={} response_metadata={'is_blocked': False, 'safety_ratings': [], 'usage_metadata': {'prompt_token_count': 20, 'candidates_token_count': 36, 'total_token_count': 56, 'cached_content_token_count': 0}, 'finish_reason': 'STOP'} id='run-8e66f20f-e61c-44ed-880b-c0ad23e657dd-0' usage_metadata={'input_tokens': 20, 'output_tokens': 36, 'total_tokens': 56}


In [14]:
import json
from typing import List, Optional

import pymupdf
from google.cloud import storage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import Runnable
from pydantic import BaseModel, Field

# Name of the Cloud Storage bucket this notebook reads PDFs from and writes results to.
BUCKET_NAME = "cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4"
# Storage client created with no explicit arguments, and a handle to the bucket above.
# `bucket` is read as a module-level global by file_content().
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Define your desired data structure.
# Author: one author of a paper, split into the two name parts the prompt asks for.
# The Field descriptions are part of the JSON schema handed to the model.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be copied into the generated schema and change the prompt; confirm.
class Author(BaseModel):
    first: str = Field(description="first name and any middle name or initials")
    last: str = Field(description="last name")

# ArticleMetadata: the fields to extract from one paper. create_chain() passes
# this model to JsonOutputParser, whose format instructions embed its JSON schema
# in the prompt.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be copied into the generated schema and change the prompt; confirm.
class ArticleMetadata(BaseModel):
    title: str = Field(description="title of the article")
    authors: List[Author] = Field(..., description="List of authors.")
    keywords: List[str] = Field(..., description="List of keywords.")
    # Under pydantic v2, Optional[...] without a default is still a *required*
    # field. The prompt tells the model to omit the abstract when the paper has
    # none, so give it an explicit default to keep schema and prompt consistent.
    abstract: Optional[str] = Field(default=None, description="the abstract of the article")

def file_content(file: str, max_pages: int = 3) -> str:
    """Return the text of the leading pages of a PDF stored in the bucket.

    Args:
        file: Object name of the PDF inside the module-level ``bucket``.
        max_pages: Maximum number of pages to read from the start of the
            document. Defaults to 3.

    Returns:
        The extracted text of those pages, concatenated in page order. Fewer
        pages are read when the document is shorter than ``max_pages``.
    """
    blob = storage.Blob(file, bucket)
    pdf_bytes = blob.download_as_bytes()

    # Open from memory; the context manager closes the document even when
    # text extraction raises.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_limit = min(max_pages, doc.page_count)
        return "".join(
            doc.load_page(page_number).get_text()
            for page_number in range(page_limit)
        )

def create_chain() -> "Runnable":
    """Build the prompt -> model -> JSON parser pipeline for metadata extraction.

    The prompt has one input variable, ``file_content`` (the paper text); the
    parser's format instructions, derived from ``ArticleMetadata``, are bound
    into the prompt up front. The module-level ``llm`` is used as the model.

    Returns:
        The composed chain ``prompt | llm | parser`` (a runnable, not a string).
        Invoke it with ``{"file_content": <paper text>}``.
    """
    template = """Here is an academic paper: <paper>{file_content}</paper>


    Please extract the following data from the paper.
    The title of the article.
    The list of authors split into first and last name. The first name may include any initials or middle names as well. Use "first" and "last" as the JSON keys for the name.
    The abstract if it is explicitly provided. Do not try to generate an abstract unless it is present.
    The list of keywords that are explicitly provided on the paper. Do not try to generate keywords unless they are present. If there are no keywords represent this as an empty list.

    {format_instructions}
    """
    parser = JsonOutputParser(pydantic_object=ArticleMetadata)
    prompt = PromptTemplate(
      input_variables=["file_content"],
      template=template,
      # Filled in once here so callers only have to supply the paper text.
      partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    return prompt | llm | parser

# Run the extraction chain on one PDF stored in Cloud Storage.
def process_document(file: str) -> str:
    """Extract metadata from one PDF and return it as a JSON string.

    Args:
        file: Object name of the PDF, as accepted by ``file_content``.

    Returns:
        ``json.dumps`` of whatever the chain built by ``create_chain`` returns
        for the paper text.
    """
    extraction_chain = create_chain()
    paper_text = file_content(file)
    extracted = extraction_chain.invoke({"file_content": paper_text})
    return json.dumps(extracted)

def process_row(filename, alex_id, bucket):
    """Extract metadata for one PDF and save the JSON result in the bucket.

    Args:
        filename: Object name of the PDF; printed, then passed to
            ``process_document``.
        alex_id: Identifier used to name the result object
            (``<alex_id>.json`` under the results prefix).
        bucket: Bucket object the result is written to.
    """
    print(filename)  # progress log: one line per document
    metadata_json = process_document(filename)
    destination = bucket.blob(f'preprints/results-llama-3-2-1b-instruct/{alex_id}.json')
    with destination.open("w") as out:
      out.write(metadata_json)

In [12]:
# Try the pipeline on a single preprint and print the JSON string it returns.
# NOTE(review): this cell's execution count (12) is lower than that of the cell
# defining process_document (14), so the output saved below it may come from an
# earlier version of the code — re-run after a kernel restart to confirm.
print(process_document('preprints/chemrxiv.org/W4304614191.pdf'))

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"Author": {"properties": {"first": {"description": "first name and any middle name or initials", "title": "First", "type": "string"}, "last": {"description": "last name", "title": "Last", "type": "string"}}, "required": ["first", "last"], "title": "Author", "type": "object"}}, "properties": {"title": {"description": "title of the article", "title": "Title", "type": "string"}, "authors": {"description": "List of authors.", "items": {"$ref": "#/$defs/Author"}, "title": "Authors", "type": "array"}, "keywords": {"description": "List of 

In [15]:
import csv
# Location of the CSV manifest inside the bucket.
DIR = 'preprints/'
FILENAME = f'{DIR}records-48.csv'

blob = bucket.blob(FILENAME)
# newline="" leaves newline handling to the csv module, as its documentation
# requires for files passed to csv.reader.
with blob.open("r", newline="") as csvfile:
  reader = csv.reader(csvfile)
  next(reader, None) # skip headers (no error if the file is empty)
  for row in reader:
    if not row:
      continue # csv.reader yields [] for blank lines
    # Column 1 is stripped of the bucket's URL prefix to get the object name;
    # column 0 is stripped of the OpenAlex URL prefix to get the bare id.
    filename = row[1].removeprefix(f"https://storage.cloud.google.com/{BUCKET_NAME}/")
    alex_id = row[0].removeprefix("https://openalex.org/")
    process_row(filename, alex_id, bucket)


preprints/arxiv.org/W2988715931.pdf
preprints/arxiv.org/W3202546816.pdf
preprints/osf.io/W4212832069.pdf
preprints/discovery.ucl.ac.uk/W2901173781.pdf
preprints/www.biorxiv.org/W3013783484.pdf
preprints/eartharxiv.org/W4226140866.pdf
preprints/www.biorxiv.org/W4399283731.pdf
preprints/arxiv.org/W3126527592.pdf
preprints/escholarship.org/W3093048371.pdf
preprints/escholarship.org/W4205859241.pdf
preprints/www.medrxiv.org/W4322154575.pdf
preprints/www.researchsquare.com/W4297341531.pdf
preprints/www.medrxiv.org/W4393156923.pdf
preprints/escholarship.org/W3000588783.pdf
preprints/www.medrxiv.org/W3091235616.pdf
preprints/preprints.apsanet.org/W4296103990.pdf
preprints/ora.ox.ac.uk/W3170856023.pdf
preprints/arxiv.org/W3164284701.pdf
preprints/arxiv.org/W3183339884.pdf
preprints/www.researchsquare.com/W4393028888.pdf
preprints/www.researchsquare.com/W4394813096.pdf
preprints/chemrxiv.org/W4304614191.pdf
preprints/inria.hal.science/W4386128506.pdf
preprints/osf.io/W4376255784.pdf
preprints/a