In [None]:
#Import the necessary packages
!pip3 install --upgrade --user google-cloud-aiplatform pymupdf

In [1]:
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

In [2]:
# Define project information

PROJECT_ID = "smithaargolisinternal"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# if not running on colab, try to get the PROJECT_ID automatically
if "google.colab" not in sys.modules:
    import subprocess

    PROJECT_ID = subprocess.check_output(
        ["gcloud", "config", "get-value", "project"], text=True
    ).strip()

print(f"Your project ID is: {PROJECT_ID}")

Your project ID is: smithaargolisinternal


In [3]:
import sys

if "google.colab" in sys.modules:
    # Initialize Vertex AI
    import vertexai

    vertexai.init(project=PROJECT_ID, location=LOCATION)

In [4]:
from IPython.display import Markdown, display
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part,
)
from vertexai.language_models import TextEmbeddingModel

In [5]:
model = GenerativeModel("gemini-pro-vision")
text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
multi_modal_embedding_endpoint= f"projects/{PROJECT_ID}/locations/us-central1/publishers/google/models/multimodalembedding@001"

In [None]:
# download documents and images used in this notebook
!gsutil -m rsync -r gs://genpact-sample .
print("Download completed")

In [7]:
# Method to open the pdf object
import fitz

def open_pdf(pdf_path:str) -> tuple[fitz.Document, int]:
    # Open the PDF file
    doc: fitz.Document = fitz.open(pdf_path)

    # Get the number of pages in the PDF file
    num_pages: int = len(doc)

    return doc, num_pages

In [8]:
# Open the pdf document
doc, num_pages = open_pdf("Genpact Reports Third Quarter 2023 Results.pdf")

print(f"Number of pages: {num_pages}")

Number of pages: 6


In [32]:
#Generate text and image embeddings using multimodal apiendpoint
from google.protobuf import struct_pb2
from google.cloud import aiplatform

def get_embedding(text:None, encoded_image_content:None):
    # Create a client to interact with the Vertex AI Prediction Service
    client = aiplatform.gapic.PredictionServiceClient(
          client_options={"api_endpoint": "us-central1-aiplatform.googleapis.com"}
    )

    # Define the parameters for the prediction request
    parameters = {"dimension": 128}
    instance = struct_pb2.Struct()
    if text:
      instance.fields["text"].string_value = text
    if image:
        instance["image"] = {"bytesBase64Encoded": encoded_image_content}

    instances = [instance]

    # Make the prediction request
    response = client.predict(
        endpoint=multi_modal_embedding_endpoint,
        instances=instances,
        parameters=parameters,
    )
    text_embedding = [v for v in response.predictions[0].get("textEmbedding", [])]
    image_embedding = [v for v in response.predictions[0].get("imageEmbedding", [])]
    text_embedding = text_embedding + image_embedding
    return text_embedding

In [None]:
# Extract the text from the document and split the document with 1024 characters
import pandas as pd
from base64 import b64encode

text = ""
df = pd.DataFrame(columns=["id", "pagetext", "embedding"])

# Initialize variables
chunk_number = 1
chunked_text_dict = {}
j = -1
embs = []
for page in doc:
    j = j+1
    #text = page.get_text()
    text: str = page.get_text().encode("ascii", "ignore").decode("utf-8", "ignore")
    image: bytes = page.get_pixmap().tobytes()

    if(image):
      encoded_content = b64encode(image).decode("utf-8")

    # Iterate over page text to create 1024 chunks
    for i in range(0, len(text), 1024):
        end_index = min(i + 1024, len(text))
        chunk = text[i:end_index]

        # Encode and decode for consistent encoding
        chunked_text_dict[chunk_number] = chunk.encode("ascii", "ignore").decode(
            "utf-8", "ignore"
        )

        # Increment chunk number
        chunk_number += 1
        embeddings = get_embedding(chunk, encoded_content)
        embs = embs + [embeddings]
        new_rows = pd.DataFrame({'id': j, 'pagetext': text, 'embedding':embs})
        df = df.append(new_rows,  ignore_index=True)

print(df.count)
print(df.head())

In [34]:
#save id and embedding as a json file
jsonl_string = df[['id', 'embedding']].to_json(orient = 'records', lines = True)
with open('questions.json', 'w') as f:
  f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 questions.json

{"id":0,"embedding":[0.162386492,-0.13732411,-0.0335402898,0.14398022,-0.122333094,-0.0377790257,0.0240865219,-0.0017052217,0.0334912129,0.210242569,0.0621364228,-0.0291163661,0.0024660607,-0.0016409442,-0.0406342819,0.0893247053,0.0547820441,-0.0632302314,-0.0100670774,-0.0338799804,-0.0430848673,-0.0820516944,-0.0881515145,-0.0669674054,0.0066038007,-0.10697303,0.145512149,0.0092992894,0.0397034436,-0.0342278443,0.0223351121,0.0772869736,0.0171164759,0.0904175565,-0.0189724378,0.0168321133,0.0861063227,0.0695455819,-0.196284965,0.0514312312,0.129099473,0.006122801,-0.004913826,0.0615970381,0.0292057078,-0.0128903249,0.0081424993,0.0611125492,0.0604749694,0.0465256609,0.0135714663,-0.0094454475,0.0139428508,-0.0803473741,0.0827211514,-0.0297323409,-0.0343248658,0.420471,-0.163813666,-0.0094759557,-0.0911347941,0.0115482658,-0.0160371661,0.0856366903,-0.0464085862,0.109886266,-0.106389679,0.0270563066,-0.0594360642,0.063778989,-0.0374573506,0.0779226869,0.0440982543,0.0217658207,-0.386

In [35]:
from datetime import datetime
UID = datetime.now().strftime("%m%d%H%M")

BUCKET_URI = f"gs://{PROJECT_ID}-gemini-rag-{UID}"
! gsutil mb -l $LOCATION -p {PROJECT_ID} {BUCKET_URI}
! gsutil cp questions.json {BUCKET_URI}

Creating gs://smithaargolisinternal-gemini-rag-12251954/...
Copying file://questions.json [Content-Type=application/json]...
/ [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      


In [None]:
# create index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
  display_name = f"gemini-rag-{UID}",
  contents_delta_uri = BUCKET_URI,
  dimensions = 128,
  approximate_neighbors_count = 20,
  distance_measure_type = "DOT_PRODUCT_DISTANCE",
)

In [None]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
  display_name = f"gemini-rag-index-endpoint-{UID}",
  public_endpoint_enabled = True,
)

In [78]:
DEPLOYED_INDEX_ID = f"gemini_rag_index_endpoint_{UID}"

In [None]:
# deploy the Index to the Index Endpoint
my_index_endpoint.deploy_index(
  index = my_index, deployed_index_id = DEPLOYED_INDEX_ID
)

In [98]:
# Now it is time to ask a question
user_query = "What is the total revenue for the full year 2023"
test_embeddings = get_embedding(user_query)
print(test_embeddings)

[0.183731273, -0.0349904299, -0.0426771566, 0.11915233, -0.0193503462, 0.0247816965, 0.0378496759, -0.0146224238, -0.0626877099, 0.207170337, 0.00691538677, -0.0751065239, -0.0663538277, -0.00507476786, 0.011584417, 0.0424236581, -0.0653219596, 0.00397406891, 0.0848871, -0.00989756268, -0.0128830122, -0.083487235, -0.0213273074, -0.0531130955, -0.0108254049, -0.140989453, 0.0146959815, 0.0401162915, 0.0803446323, -0.0576752089, 0.0455237888, 0.0702870414, -0.00101315591, 0.0256027803, -0.0547730736, 0.0532669835, 0.00965765, 0.0950957462, -0.214451715, 0.0324477777, 0.116984457, 0.050259728, -0.0068517914, -0.00424313778, 0.0226478633, 0.0629376248, -0.00765806157, -0.011567439, -0.0432947762, -0.033815261, 0.0157372281, 0.0366843753, -0.0978522152, -0.0352547392, -0.044441741, -0.000378627883, -0.0203806777, 0.463642389, -0.0330315158, -0.0918604, -0.0729816109, 0.00584390899, -0.0138778016, 0.0200585965, 0.00529478397, -0.0107086841, -0.099268727, -0.0405842401, -0.00683561759, 0.021

In [None]:
# Test query
response = my_index_endpoint.find_neighbors(
  deployed_index_id = DEPLOYED_INDEX_ID,
  queries = [test_embeddings],
  num_neighbors = 2,
)

# show the result
import numpy as np
for idx, neighbor in enumerate(response[0]):
  id = np.int64(neighbor.id)
  similar = df.query("id == @id", engine = "python")
  print(f"{neighbor.distance:.4f} {similar.pagetext.values[0]}")

In [100]:
from vertexai.preview.generative_models import (Content,
                                                GenerationConfig,
                                                GenerativeModel,
                                                GenerationResponse,
                                                Image,
                                                HarmCategory,
                                                HarmBlockThreshold,
                                                Part)
i = 0
nn = []
for idx, neighbor in enumerate(response[0]):
  id = np.int64(neighbor.id)
  similar = df.query("id == @id", engine = "python")
  context = similar.pagetext.values[0]

# Craft Prompt and Invoke Model
prompt = f"""
Context: You are Q&A Agent, an expert in reading earnings document.
Use the following essay you wrote to give a detailed answer to any questions you receive: {context}
Question: {user_query}
    """

safety_settings={
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }
config = {
        "temperature": 0.8,
        "max_output_tokens": 2048,
        }

responses = model.generate_content(prompt,
                                       generation_config = config,
                                       safety_settings=safety_settings,
                                       stream=True)
final_response = []
for response in responses:
      try:
        # st.write(response.text)
        final_response.append(response.text)
      except IndexError:
        # st.write(response)
        final_response.append("")
        continue

print("Answer:")
print(final_response)

Answer:
['I apologize, but I cannot answer your question based on the information you have provided', ". The document does not include any information about the company's total revenue for the full year of 2023."]
