In [None]:
%pip install langchain-google-vertexai pymupdf langchain langchain-core

Collecting langchain-google-vertexai
  Downloading langchain_google_vertexai-2.0.1-py3-none-any.whl.metadata (3.8 kB)
Collecting google-cloud-storage<3.0.0,>=2.17.0 (from langchain-google-vertexai)
  Downloading google_cloud_storage-2.18.2-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-google-vertexai)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Downloading langchain_google_vertexai-2.0.1-py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.9/86.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_cloud_storage-2.18.2-py2.py3-none-any.whl (130 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.5/130.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Installing collected packages: httpx-sse, google-cloud-storage, langchain-google-vertexai
  Attempting uninstall: google-cloud-storage
    Found 

In [79]:
# `from google.auth import transport` does not load the `requests` submodule by
# itself, so `transport.requests` below would only resolve if some other package
# had already imported it. Import it explicitly so this cell works on a fresh kernel.
import google.auth.transport.requests
from google.auth import default, transport

# Resolve Application Default Credentials; the second element of the returned
# tuple is not needed here.
credentials, _ = default()
# Refresh the credentials through an HTTP transport request object.
auth_request = transport.requests.Request()
credentials.refresh(auth_request)

In [80]:
from langchain_google_vertexai import VertexAI, HarmBlockThreshold, HarmCategory
from langchain_google_vertexai import VertexAIModelGarden
from langchain_google_vertexai.model_garden_maas.llama import VertexModelGardenLlama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional

from langchain.globals import set_verbose, set_debug

# Every listed harm category is mapped to BLOCK_NONE. In this cell the mapping is
# only consumed by the commented-out Gemini configuration further down.
safety_settings = dict.fromkeys(
    (
        HarmCategory.HARM_CATEGORY_UNSPECIFIED,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    ),
    HarmBlockThreshold.BLOCK_NONE,
)

# Alternative model: Gemini through the VertexAI wrapper.
# llm = VertexAI(
#     model_name="gemini-1.5-flash",
#     verbose=True,
#     safety_settings=safety_settings,
# )

# Alternative model: a self-deployed Model Garden endpoint.
# llm = VertexAIModelGarden(project=PROJECT_ID, endpoint_id=ENDPOINT_ID)

# Active model: Llama 3 405B served as a Model Garden model-as-a-service.
llm = VertexModelGardenLlama(model="meta/llama3-405b-instruct-maas")


In [81]:
import pymupdf
import json
from google.cloud import storage


# Name of the Cloud Storage bucket used by this notebook.
BUCKET_NAME = "cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4"
# Module-level client and bucket handle; `file_content` below downloads PDFs
# through `bucket`.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Define your desired data structure.
class Author(BaseModel):
    # One author of a paper, split into the two name parts the prompt asks for.
    # The `description` strings are field metadata (runtime text), not comments.
    first: str = Field(description="first name and any middle name or initials")
    last: str = Field(description="last name")

class ArticleMetadata(BaseModel):
    # Metadata to extract from one academic paper. A `#` comment is used rather
    # than a docstring because pydantic copies a model docstring into the
    # generated JSON schema.
    title: str = Field(description="title of the article")
    authors: List[Author] = Field(..., description="List of authors.")
    keywords: List[str] = Field(..., description="List of keywords.")
    # `Optional[str]` only allows None as a value; without an explicit default the
    # field is still *required* under pydantic v2. The prompt says the abstract is
    # only returned when the paper provides one, so it must be allowed to be absent.
    abstract: Optional[str] = Field(default=None, description="the abstract of the article")

def file_content(file, max_pages: int = 3) -> str:
    """Return the text of the leading pages of a PDF stored in the bucket.

    Args:
        file: Object name of the PDF inside the module-level ``bucket``.
        max_pages: Stop index of the page range to read, i.e. how many leading
            pages are extracted. Defaults to 3, the previously hard-coded value.

    Returns:
        The text of the selected pages, concatenated in page order.
    """
    blob = storage.Blob(file, bucket)
    pdf_bytes = blob.download_as_bytes()

    # Open the PDF from memory. Using the document as a context manager closes it
    # when the block exits, including when text extraction raises; previously the
    # document was never closed.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
        # `pages(start, stop, step)` takes range()-style arguments. Joining once
        # avoids repeated string concatenation and no longer shadows this
        # function's own name with a local variable.
        return "".join(page.get_text() for page in doc.pages(0, max_pages, 1))

def create_chain():
    """Build the extraction pipeline: prompt -> LLM -> JSON parser.

    The prompt takes one input variable, ``file_content`` (the paper text), and is
    pre-filled with the parser's format instructions derived from
    ``ArticleMetadata``.

    Returns:
        The runnable produced by ``prompt | llm | parser``. It is a chain object,
        not a string, so the former ``-> str`` annotation has been dropped.
    """
    # `{format_instructions}` was previously supplied as a partial variable but
    # never referenced in the template, so the schema instructions were silently
    # left out of the prompt. The placeholder below puts them in.
    template = """Here is an academic paper: <paper>{file_content}</paper>

    Please extract the following data from the paper.
    The title of the article.
    The list of authors split into first and last name. The first name may include any initials or middle names as well. Use "first" and "last" as the JSON keys for the name.
    The abstract if it is explicitly provided. Do not try to generate an abstract unless it is present.
    The list of keywords that are explicitly provided on the paper. Do not try to generate keywords unless they are present. If there are no keywords represent this as an empty list.
    Do not provide any comments in the JSON because they are not permitted in the spec.

    {format_instructions}
    """
    parser = JsonOutputParser(pydantic_object=ArticleMetadata)
    prompt = PromptTemplate(
        input_variables=["file_content"],
        template=template,
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    return prompt | llm | parser

# Run one Cloud Storage PDF through the extraction chain.
def process_document(file: str) -> str:
    """Extract metadata from a single PDF and return it as a JSON string.

    Args:
        file: Name of the PDF, passed unchanged to ``file_content``.

    Returns:
        ``json.dumps`` of the value produced by invoking the chain from
        ``create_chain`` on the PDF text.
    """
    extraction_chain = create_chain()
    paper_text = file_content(file)
    metadata = extraction_chain.invoke({"file_content": paper_text})
    return json.dumps(metadata)

def process_row(filename, alex_id, bucket):
    """Process one PDF and store its extracted metadata in the bucket.

    Args:
        filename: Name of the PDF, forwarded to ``process_document``.
        alex_id: Identifier used as the base name of the result object.
        bucket: Bucket object the result is written to.
    """
    # Print the file name first so progress is visible while the batch runs.
    print(filename)
    result_json = process_document(filename)
    destination = f'preprints/results-llama-3/{alex_id}.json'
    with bucket.blob(destination).open("w") as out:
        out.write(result_json)

In [73]:
# Quick single-paper test to run before starting the whole batch.
# Further checks that can be uncommented while debugging:
# llm.invoke("What day comes after Friday?")
# print(json.dumps(ArticleMetadata.model_json_schema(), indent=2))
# set_debug(False)
print(process_document('preprints/chemrxiv.org/W4304614191.pdf'))

{"title": "\u03b2-Lactamases evolve against antibiotics by acquiring large active-site electric fields", "authors": [{"first": "Zhe", "last": "Ji"}, {"first": "Steven G.", "last": "Boxer"}], "abstract": "A compound bound covalently to an enzyme active site can act either as a substrate if the covalent linkage is readily broken up by the enzyme or as an inhibitor if the bond dissociates slowly. We tracked the reactivity of such bonds associated with the rise of the resistance to penicillin G (PenG) in protein evolution from penicillin-binding proteins (PBPs) to TEM \u03b2-lactamases, and with the development of avibactam (Avb) to overcome the resistance. We found that the ester linkage in PBP\u2013PenG is resistant to hydrolysis mainly due to the small electric fields present in the protein active site. Conversely, the same linkage in the descendant TEM\u2013PenG experiences large electric fields which stabilize the more charge-separated transition state and thus lower the free energy b

In [82]:
import csv

# Location of the records CSV inside the bucket.
DIR = 'preprints/'
FILENAME = f'{DIR}records.csv'

# Column 0 of each record is read as an OpenAlex URL and column 1 as a Cloud
# Storage URL; both are reduced to bare identifiers by stripping the URL prefix.
blob = bucket.blob(FILENAME)
with blob.open() as csvfile:
    reader = csv.reader(csvfile)
    # Skip the header row. The default keeps an empty file from raising
    # StopIteration here.
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; indexing it
            # below would raise IndexError, so skip it.
            continue
        filename = row[1].removeprefix(f"https://storage.cloud.google.com/{BUCKET_NAME}/")
        alex_id = row[0].removeprefix("https://openalex.org/")
        process_row(filename, alex_id, bucket)


preprints/arxiv.org/W2988715931.pdf
preprints/arxiv.org/W3202546816.pdf
preprints/osf.io/W4212832069.pdf
preprints/discovery.ucl.ac.uk/W2901173781.pdf
preprints/www.biorxiv.org/W3013783484.pdf
preprints/eartharxiv.org/W4226140866.pdf
preprints/www.biorxiv.org/W4399283731.pdf
preprints/arxiv.org/W3126527592.pdf
preprints/escholarship.org/W3093048371.pdf
preprints/escholarship.org/W4205859241.pdf
preprints/www.medrxiv.org/W4322154575.pdf
preprints/www.researchsquare.com/W4297341531.pdf
preprints/www.medrxiv.org/W4393156923.pdf
preprints/escholarship.org/W3000588783.pdf
preprints/www.medrxiv.org/W3091235616.pdf
preprints/preprints.apsanet.org/W4296103990.pdf
preprints/ora.ox.ac.uk/W3170856023.pdf
preprints/arxiv.org/W3164284701.pdf
preprints/arxiv.org/W3183339884.pdf
preprints/www.researchsquare.com/W4393028888.pdf
preprints/www.researchsquare.com/W4394813096.pdf
preprints/chemrxiv.org/W4304614191.pdf
preprints/inria.hal.science/W4386128506.pdf
preprints/osf.io/W4376255784.pdf
preprints/a