### Gemini-MultiModal-usecases.ipynb File

In [None]:
'''
Pre-requisites:
    A GCP account needs to be created.
    The GCP project needs to be linked with a GCP billing/service account.
    The GCP project needs to be linked with a GCP service account key file (JSON file).
    Download the JSON file and set the env variable GOOGLE_APPLICATION_CREDENTIALS to the path of the downloaded file.
    e.g.: export GOOGLE_APPLICATION_CREDENTIALS="/home/user/gcp_project/data/json/service-account-file.json"
    For different models, refer to Model Garden in Vertex AI in the GCP console.
'''

In [None]:
%pip install google-cloud-aiplatform gitpython magika

In [None]:

# Initialize Vertex AI.
import os

import vertexai
from vertexai.generative_models import GenerativeModel

# The project ID can be overridden through the GOOGLE_CLOUD_PROJECT environment
# variable so the notebook can run against another GCP project without editing
# this cell; the original project ID is kept as the fallback (also used when
# the variable is set but empty).
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or "river-span-431711-k8"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Load the Gemini 1.5 models.
# API reference: https://cloud.google.com/vertex-ai/generative-ai/docs/reference/python/latest/vertexai.generative_models
multimodal_model = GenerativeModel("gemini-1.5-pro-001")
multimodal_model_flash = GenerativeModel("gemini-1.5-flash-001")  # using this var later

# Generate a response for a text-only prompt.
contents = ["Explain LLM"]
response = multimodal_model.generate_content(contents)
print(response)       # the raw response object
print(response.text)  # only the generated text


In [None]:
import IPython.display

# Render the model's answer as formatted Markdown rather than plain text.
IPython.display.display(IPython.display.Markdown(response.text))

### Document Summarization

Using the Gemini 1.5 Pro model, we will process a PDF document. The model analyzes the document's content, retains the information, and answers our questions about it. The PDF document URL is https://arxiv.org/pdf/2403.05530.pdf

API Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/reference/python/latest/vertexai.generative_models.Part

In [None]:
from vertexai.generative_models import Part

# The PDF is referenced by its Cloud Storage (gs://) URI and wrapped in a Part
# so it can be sent to the model alongside the text prompt.
pdf_file_uri = "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
pdf_file = Part.from_uri(pdf_file_uri, mime_type="application/pdf")

# Ask a single question about the document.
prompt = "How many tokens can the model process?"
contents = [pdf_file, prompt]

# Send the document and question together, then show the answer as Markdown.
response = multimodal_model.generate_content(contents)
IPython.display.display(IPython.display.Markdown(response.text))

In [None]:
# Ask for a summary of the same PDF part, using a role-setting prompt.
prompt = """
  You are a professional document summarization specialist.
  Please summarize the given document.
"""

contents = [pdf_file, prompt]
response = multimodal_model.generate_content(contents)

# Show the summary as formatted Markdown.
IPython.display.display(IPython.display.Markdown(response.text))

### Usecase: Image Understanding across multiple images

In [None]:
# Helper functions to load an image from a given URL
import http.client
import typing
import urllib.request
from vertexai.generative_models import Image

def get_image_bytes_from_url(image_url: str, timeout: float = 60.0) -> bytes:
    """Download the resource at ``image_url`` and return its raw bytes.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait on blocking network operations before giving
            up, so a stalled connection cannot hang the notebook kernel
            indefinitely.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened (HTTP error
            statuses are raised as its ``HTTPError`` subclass).
        ValueError: If ``image_url`` is not a valid URL (e.g. empty string).
    """
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(image_url, timeout=timeout) as response:
        return response.read()


def load_image_from_url(image_url: str) -> Image:
    """Fetch the image at ``image_url`` and wrap it as a Vertex AI ``Image``.

    The bytes are downloaded with ``get_image_bytes_from_url`` and passed to
    ``Image.from_bytes`` unchanged.
    """
    return Image.from_bytes(get_image_bytes_from_url(image_url))


In [None]:
# Download the two glasses photos that will be compared.
image_glasses1_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses1.jpg"
image_glasses1 = load_image_from_url(image_glasses1_url)

image_glasses2_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses2.jpg"
image_glasses2 = load_image_from_url(image_glasses2_url)

# Question sent to the model together with both images.
prompt = """
I have an oval face. Given my face shape, which glasses would be more suitable?

Explain how you reached this decision.
Provide your recommendation based on my face shape, and please give an explanation for each.
"""

In [None]:
# Preview the first glasses image; as the cell's last expression it is rendered as the cell output.
IPython.display.Image(image_glasses1_url, width=150)

In [None]:
# Preview the second glasses image; as the cell's last expression it is rendered as the cell output.
IPython.display.Image(image_glasses2_url, width=150)

In [None]:
# Send the question together with both glasses images in a single request.
contents = [prompt, image_glasses1, image_glasses2]
responses = multimodal_model.generate_content(contents)

# Show the recommendation as formatted Markdown.
IPython.display.display(IPython.display.Markdown(responses.text))