In [1]:
! pip3 install --upgrade --user google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.67.1-py2.py3-none-any.whl.metadata (32 kB)
Downloading google_cloud_aiplatform-1.67.1-py2.py3-none-any.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-aiplatform
[0mSuccessfully installed google-cloud-aiplatform-1.67.1


In [2]:
from google.cloud import storage

# Name of the Cloud Storage bucket used by this notebook: the input CSV is read
# from it and the per-document JSON results are written back to it.
BUCKET_NAME = "cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4"
# Client is created without explicit arguments, so project and credentials come
# from the surrounding environment.
storage_client = storage.Client()
# Bucket handle shared by the cells below.
bucket = storage_client.bucket(BUCKET_NAME)

In [3]:
import vertexai
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Part,
    HarmCategory,
    HarmBlockThreshold,
)
# Define project information
PROJECT_ID = "sul-ai-sandbox"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI *before* constructing any model, so the project and
# location configured here are in effect when the model object is created.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Single Gemini model used for every request in this notebook. The safety
# setting only blocks "dangerous content" when it is rated high.
# (A previous, immediately-overwritten "gemini-1.0-pro-vision" instance was
# dead code and has been removed.)
model = GenerativeModel(
    "gemini-1.5-flash",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH
    },
)

# This Generation Config sets the model to respond in JSON format.
# temperature=0.0 is used to keep the extraction output as stable as possible.
generation_config = GenerationConfig(
    temperature=0.0, response_mime_type="application/json"
)

# Default MIME type for documents sent to the model.
PDF_MIME_TYPE = "application/pdf"

def process_document(
    prompt: str,
    file_uri: str,
    mime_type: str = PDF_MIME_TYPE,
    generation_config: GenerationConfig | None = None,
) -> str:
    """Send a document referenced by URI, together with a prompt, to Gemini.

    Args:
        prompt: Instruction text sent alongside the document.
        file_uri: URI of the document (callers in this notebook pass
            ``gs://`` Cloud Storage URIs).
        mime_type: MIME type of the document; defaults to ``PDF_MIME_TYPE``.
        generation_config: Optional generation settings forwarded to the
            model call.

    Returns:
        The ``text`` attribute of the model's response.
    """
    # The document is passed by reference; its bytes are not loaded here.
    document = Part.from_uri(uri=file_uri, mime_type=mime_type)

    # Document first, prompt second, using the module-level ``model``.
    reply = model.generate_content(
        [document, prompt],
        generation_config=generation_config,
    )
    return reply.text

In [5]:
import csv

DIR = 'preprints/'
FILENAME = f'{DIR}records.csv'
# Prefix under which one JSON result per document is written.
RESULTS_DIR = f'{DIR}results-gemini-1.5-flash/'

prompt = """
Format all responses as valid JSON.
Examine this article and extract the following data:
The title of the article.
The list of authors split into first and last name. The first name may include any initials or middle names as well. Use "first" and "last" as the JSON keys for the name.
The abstract if it is explicitly provided. Do not try to generate an abstract unless it is present.
The list of keywords if they are explicitly provided. Do not try to generate keywords unless they are present.

If any of the requested data can not be found, represent it as null in the JSON.
"""

# Read the records CSV from the bucket. Column 0 holds an OpenAlex URL and
# column 1 a storage.cloud.google.com URL of the PDF; each PDF is sent to the
# model and the response is stored as <OpenAlex id>.json under RESULTS_DIR.
records_blob = bucket.blob(FILENAME)
with records_blob.open() as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip headers; the default avoids StopIteration on an empty file
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to process.
            continue

        # Strip the browser URL prefix, leaving "<bucket>/<object path>".
        filename = row[1].removeprefix("https://storage.cloud.google.com/")
        print(filename)

        # process_document builds the Part from the URI itself, so no Part is
        # constructed here.
        result = process_document(prompt, f'gs://{filename}', generation_config=generation_config)

        alex_id = row[0].removeprefix("https://openalex.org/")
        # Use a distinct name so the blob being read above is not shadowed.
        result_blob = bucket.blob(f'{RESULTS_DIR}{alex_id}.json')
        with result_blob.open("w") as f:
            f.write(result)


cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/arxiv.org/W2988715931.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/arxiv.org/W3202546816.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/osf.io/W4212832069.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/discovery.ucl.ac.uk/W2901173781.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/www.biorxiv.org/W3013783484.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/eartharxiv.org/W4226140866.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/www.biorxiv.org/W4399283731.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/arxiv.org/W3126527592.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/escholarship.org/W3093048371.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4/preprints/escholarship.org/W4205859241.pdf
cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0