### Gemini-MultiModal-usecases.ipynb File

In [None]:
'''
Pre-requisites:
    A GCP account needs to be created.
    The GCP project needs to be linked with a GCP billing/service account.
    The GCP project needs to be linked with a GCP service account key file (JSON file).
    Download the JSON file and set the env variable GOOGLE_APPLICATION_CREDENTIALS to the path of the downloaded file.
    e.g.: export GOOGLE_APPLICATION_CREDENTIALS="/home/user/gcp_project/data/json/service-account-file.json"
    For the different models, refer to Model Garden in Vertex AI in the GCP console.
'''

In [None]:
%pip install google-cloud-aiplatform gitpython magika

In [None]:

# Initialize Vertex AI
import os

import vertexai
from vertexai.generative_models import GenerativeModel

# Read the project and region from the environment so the notebook can run
# against another GCP project without code edits. The fallbacks are the values
# this notebook was originally written with.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "river-span-431711-k8")
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Load the Gemini 1.5 models. (https://cloud.google.com/vertex-ai/generative-ai/docs/reference/python/latest/vertexai.generative_models)
multimodal_model = GenerativeModel("gemini-1.5-pro-001")
multimodal_model_flash = GenerativeModel("gemini-1.5-flash-001")  # used later for the codebase summary

# Generate response
contents = ["Explain LLM"]
response = multimodal_model.generate_content(contents)
print(response)
print(response.text)


In [None]:
import IPython.display

# Render the model's answer as formatted Markdown rather than plain text.
IPython.display.display(IPython.display.Markdown(response.text))

### Document Summarization

Using the Gemini 1.5 Pro model, we are going to process a PDF document. The model will analyze the document content, retain information, and provide answers to our questions. The PDF document URL is https://arxiv.org/pdf/2403.05530.pdf

API Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/reference/python/latest/vertexai.generative_models.Part

In [None]:
from vertexai.generative_models import Part

# Wrap the Cloud Storage location of the PDF in a Part so it can be sent
# together with a text prompt.
pdf_file_uri = "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
pdf_file = Part.from_uri(uri=pdf_file_uri, mime_type="application/pdf")

# Ask a question about the document and render the answer as Markdown.
prompt = "How many tokens can the model process?"
contents = [pdf_file, prompt]
response = multimodal_model.generate_content(contents)

IPython.display.display(IPython.display.Markdown(response.text))

In [None]:
# Send the same PDF part again, this time with a summarization instruction.
prompt = """
  You are a professional document summarization specialist.
  Please summarize the given document.
"""
contents = [pdf_file, prompt]
response = multimodal_model.generate_content(contents)

IPython.display.display(IPython.display.Markdown(response.text))

### Usecase: Image Understanding across multiple images

In [None]:
# Helper function to load images from the given url
import http.client
import typing
import urllib.request
from vertexai.generative_models import Image

def get_image_bytes_from_url(image_url: str) -> bytes:
    """Open ``image_url`` and return the complete response body as bytes."""
    with urllib.request.urlopen(image_url) as raw_response:
        # The cast is a static-typing hint only; it does nothing at runtime.
        http_response = typing.cast(http.client.HTTPResponse, raw_response)
        return http_response.read()


def load_image_from_url(image_url: str) -> Image:
    """Download the bytes at ``image_url`` and build an ``Image`` from them."""
    return Image.from_bytes(get_image_bytes_from_url(image_url))


In [None]:
# Public URLs of the two candidate glasses images.
image_glasses1_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses1.jpg"
image_glasses2_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses2.jpg"

# Download both images, in the same order as the URLs above.
image_glasses1, image_glasses2 = map(
    load_image_from_url, (image_glasses1_url, image_glasses2_url)
)

# Question that is sent to the model together with the two images.
prompt = """
I have an oval face. Given my face shape, which glasses would be more suitable?

Explain how you reached this decision.
Provide your recommendation based on my face shape, and please give an explanation for each.
"""

In [None]:
IPython.display.Image(image_glasses1_url, width=150)

In [None]:
IPython.display.Image(image_glasses2_url, width=150)

In [None]:
# Send the question and both candidate images in a single request.
contents = [prompt, image_glasses1, image_glasses2]
responses = multimodal_model.generate_content(contents)

IPython.display.display(IPython.display.Markdown(responses.text))

### Usecase: Generate description about a given video           

In [None]:
# Helper function to display content as video.
def display_content_as_video(content: str | Image | Part):
    """Show a ``Part`` that references a ``gs://`` file as an inline video.

    Args:
        content: The value to display. Only ``Part`` instances are handled.

    Returns:
        ``False`` when ``content`` is not a ``Part``; otherwise ``None`` after
        the video widget has been displayed.
    """
    if isinstance(content, Part):
        gcs_uri = typing.cast(Part, content).file_data.file_uri
        # Turn the gs:// URI into the matching storage.googleapis.com HTTPS URL.
        object_path = gcs_uri.removeprefix("gs://")
        player = IPython.display.Video(
            f"https://storage.googleapis.com/{object_path}", width=350
        )
        IPython.display.display(player)
        return None
    return False
    

In [None]:
video = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4",
    mime_type="video/mp4",
)
display_content_as_video(video)

In [None]:
# Questions about the video loaded in the previous cell.
prompt = """
What is shown in this video?
Where should I go to see it?
What are the top 5 places in the world that look like this?
"""
contents = [prompt, video]
responses = multimodal_model.generate_content(contents)

IPython.display.display(IPython.display.Markdown(responses.text))

In [None]:
video = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/ottawatrain3.mp4",
    mime_type="video/mp4",
)
display_content_as_video(video)

In [None]:
# Questions about the video loaded in the previous cell.
prompt = """
Which line is this?
Where does it go?
What are the stations/stops?
"""
contents = [prompt, video]
responses = multimodal_model.generate_content(contents)

IPython.display.display(IPython.display.Markdown(responses.text))

## Git codebase use case
### Given a git codebase, understand it and answer questions about it.

In [None]:
! pip install gitpython

#### Helper Functions

In [None]:
#############################################################
# Helper functions to deal with git repo and source files.
#############################################################
import os
import shutil
from pathlib import Path
import git
import magika

m = magika.Magika()

def clone_repo(repo_url, repo_dir):
    """Clone ``repo_url`` into a freshly created ``repo_dir``.

    Anything that already exists at ``repo_dir`` is removed first, so the
    clone always lands in an empty directory.
    """
    already_present = os.path.exists(repo_dir)
    if already_present:
        # Equivalent of `rm -rf <dir>`: discard the previous checkout.
        shutil.rmtree(repo_dir)
    os.makedirs(repo_dir)

    git.Repo.clone_from(repo_url, repo_dir)


def extract_code(repo_dir, skip_dirs=(".git",)):
    """Create code index and extract the content of source files from a GitHub repository.

    Args:
        repo_dir: Root directory of the checked-out repository.
        skip_dirs: Directory names that are not descended into. Defaults to
            ``(".git",)`` so Git's internal files stay out of both the index
            and the extracted text.

    Returns:
        A ``(code_index, code_text)`` tuple. ``code_index`` lists the path of
        every visited file relative to ``repo_dir``. ``code_text`` concatenates
        the contents of the files whose detected group is ``"text"`` or
        ``"code"``, each wrapped in a header line and a footer line.
    """
    code_index = []
    sections = []
    for root, dirs, files in os.walk(repo_dir):
        # Assigning through the slice prunes the walk in place; sorting makes
        # the traversal order (and therefore the output) reproducible.
        dirs[:] = sorted(d for d in dirs if d not in skip_dirs)
        for file in sorted(files):
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, repo_dir)
            code_index.append(relative_path)

            file_type = m.identify_path(Path(file_path))
            if file_type.output.group not in ("text", "code"):
                continue

            # Read the file before emitting anything, so a file that cannot be
            # read or decoded is skipped without leaving a dangling header.
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    file_text = f.read()
            except (OSError, UnicodeError):
                continue

            sections.append(
                f"----- File: {relative_path} -----\n"
                f"{file_text}"
                "\n-------------------------\n"
            )

    # Join once at the end instead of growing a string inside the loop.
    return code_index, "".join(sections)


In [None]:
# GitHub repo url
repo_url = "https://github.com/GoogleCloudPlatform/microservices-demo"

# Location to clone the above git repo.
repo_dir = "./repo"

Clone the git repo, create a file index, and extract the contents of the code/text files.

In [None]:
clone_repo(repo_url, repo_dir)

In [None]:
code_index, code_text = extract_code(repo_dir)

Helper function to generate a prompt for a code-related question

In [None]:
def get_code_prompt(question, index=None, text=None):
    """Generates a prompt to a code related question.

    Args:
        question: The question to ask about the codebase.
        index: File index to embed in the prompt. When ``None``, the
            notebook-level ``code_index`` variable is used.
        text: Concatenated file contents to embed in the prompt. When
            ``None``, the notebook-level ``code_text`` variable is used.

    Returns:
        The prompt string containing the question, the file index and the
        file contents.
    """
    # Only fall back to notebook globals when the caller did not pass the data
    # explicitly, so the function also works without that hidden state.
    if index is None:
        index = code_index
    if text is None:
        text = code_text

    prompt = f"""
    Questions: {question}

    Context:
    - The entire codebase is provided below.
    - Here is an index of all of the files in the codebase:
      \n\n{index}\n\n.
    - Then each of the files is concatenated together. You will find all of the code you need:
      \n\n{text}\n\n

    Answer:
  """

    return prompt

#### Now create a prompt question.
###### e.g.: generate a "Getting Started" guide for new developers based on this project (code base)

In [None]:
# Question that is embedded into the codebase prompt.
question = """
  Provide a getting started guide to onboard new developers to the codebase.
"""

# Build the full prompt (question + file index + file contents).
prompt = get_code_prompt(question)
contents = [prompt]

# Query the Pro model; the trailing expression renders the answer as Markdown.
response = multimodal_model.generate_content(contents)
IPython.display.Markdown(response.text)

##### Generate a Code Base Summary 

In [None]:
# Question that is embedded into the codebase prompt.
question = """
  Give me a summary of this codebase, and tell me the top 3 things that I can learn from it.
"""

# Build the full prompt (question + file index + file contents).
prompt = get_code_prompt(question)
contents = [prompt]

# Generate text using non-streaming method (this cell uses the Flash model).
response = multimodal_model_flash.generate_content(contents)
# The trailing expression renders the answer as Markdown.
IPython.display.Markdown(response.text)

#### Find the bugs in the code base (for some reason this is not working)

In [None]:
# Question that is embedded into the codebase prompt.
question = """
  Find the top 3 most severe issues in the codebase.
"""

# Build the full prompt (question + file index + file contents).
prompt = get_code_prompt(question)
contents = [prompt]

# NOTE(review): the heading above this cell says it is not working; the cause
# is not visible from this cell — TODO inspect `response` before reading `.text`.
response = multimodal_model.generate_content(contents)
# The trailing expression renders the answer as Markdown.
IPython.display.Markdown(response.text)