## Getting Started
### Install Vertex AI SDK, other packages and their dependencies

In [3]:
# Install Vertex AI LLM SDK
! pip install --user google-cloud-aiplatform==1.27.0 langchain==0.0.201

# Dependencies required by Unstructured PDF loader
! sudo apt -y -qq install tesseract-ocr libtesseract-dev 
! sudo apt-get -y -qq install poppler-utils
! pip install --user unstructured==0.7.5 pdf2image==1.16.3 pytesseract==0.3.10 pdfminer.six

!pip install --upgrade pypdf


Collecting langchain==0.0.201
  Using cached langchain-0.0.201-py3-none-any.whl (1.0 MB)
Installing collected packages: langchain
[0mSuccessfully installed langchain-0.0.201
libtesseract-dev is already the newest version (4.1.1-2.1).
tesseract-ocr is already the newest version (4.1.1-2.1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [7]:
## Utils for Matching Engine

!pip install github-clone
!ghclone https://github.com/GoogleCloudPlatform/generative-ai/tree/main/language/examples/document-qa/utils

Cloning into 'utils'...
done.


In [28]:
# Notebook-wide configuration. Everything a reader might need to tune lives here.
PROJECT_ID = "analytics-ml-ai"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Cloud Storage bucket (including the gs:// scheme) used for staging.
BUCKET_NAME = "gs://rfpbot-stage-me"
# Derived from BUCKET_NAME so the two values cannot drift apart when the bucket is renamed.
GCS_BUCKET_ROOT = f"{BUCKET_NAME}/products/"
LOCAL_DIR = "rfpbot/soc2"

# Text-splitting parameters passed to the text splitter further down.
CHUNK_SIZE = 4000
CHUNK_OVERLAP = 100


### Import libraries

In [2]:
import uuid
import json

# Utils
import time
import uuid
from typing import List

import numpy as np

# Vertex AI
from google.cloud import aiplatform
import vertexai


# Langchain
import langchain

print(f"LangChain version: {langchain.__version__}")

from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI


# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

LangChain version: 0.0.201


In [3]:
# Source PDF to ingest.
# For a local file, give the complete path:
filename_url = "../sources/GCP-[Winter-2023] GCP SOC 2..pdf"

# For a PDF served over HTTP, use its URL instead, e.g.:
# filename_url = "https://www2.ed.gov/about/reports/annual/2022report/agency-financial-report.pdf"

In [4]:
# Registry of ingested pages; a later cell fills it with one entry per page, keyed by UUID string.
all_docs = dict()

In [5]:
# Load the source PDF with LangChain's PyPDFLoader.
from langchain.document_loaders import PyPDFLoader

# Alternative input: US Department of Education 2022 Agency Financial Report
# urls = [
#     "https://www2.ed.gov/about/reports/annual/2022report/agency-financial-report.pdf",
#     # "https://www2.ed.gov/about/reports/annual/2021report/agency-financial-report.pdf"
# ]

# Start from an empty list so re-running the cell never accumulates duplicates.
documents = []

# `filename_url` is either a local path or an HTTP URL of a PDF file.
loader = PyPDFLoader(filename_url)
documents += loader.load()

print(f"# of pages loaded (pre-chunking) = {len(documents)}")

# of pages loaded (pre-chunking) = 162


In [6]:
# Peek at the first loaded Document to sanity-check its extracted text and metadata.
documents[0]

Document(page_content=' \n \n \n \n \n \nSystem and Organization Controls (SOC) 2 Type II Report  \nDescription of the Google Cloud Platform System  \nFor the Period 1 March 2022 to 28 February 2023  \nWith Independent Service Auditor’s Report  \nIncluding Tests Performed and Results Thereof  \n \n  \nryanrichard@google.comGoogle Confidental Information', metadata={'source': '../sources/GCP-[Winter-2023] GCP SOC 2..pdf', 'page': 0})

In [7]:
all_content = []

# Register every loaded page in `all_docs` under a fresh random UUID.
# Each entry stores the page text with newlines removed and the page's source path.
for article in documents:
    myid = str(uuid.uuid4())
    entry = all_docs[myid] = {}
    c = article.page_content.replace("\n", "")
    entry['content'] = c
    entry['url'] = article.metadata['source']

In [None]:
# Debug dump: print each UUID key on one line followed by its record on the next.
# (Add a `break` at the end of the loop body to show only the first entry.)
for k, v in all_docs.items():
    print(k, v, sep="\n")

In [54]:
# Set the project id
! gcloud config set project $PROJECT_ID
! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_NAME

Updated property [core/project].
Creating gs://rfpbot-stage/...
ServiceException: 409 A Cloud Storage bucket named 'rfpbot-stage' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the loaded pages using the notebook-wide CHUNK_SIZE / CHUNK_OVERLAP settings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
doc_splits = text_splitter.split_documents(documents)

for doc in doc_splits:
    # Tag each chunk with a random UUID (open question: hash the URL + heading instead?).
    doc.metadata.update(id=str(uuid.uuid4()))
    # Strip newlines from the chunk text; this happens after splitting.
    c = doc.page_content.replace("\n", "")
    doc.page_content = c

print(f"# of documents = {len(doc_splits)}")

# of documents = 162


In [23]:
# Show the metadata of the first chunk; it should now carry the 'id' assigned after splitting.
doc_splits[0].metadata

{'source': '../sources/GCP-[Winter-2023] GCP SOC 2..pdf',
 'page': 0,
 'id': '05176d42-3ca5-482b-80b0-5bd7df09a5da'}

In [17]:
REQUESTS_PER_MINUTE = 300

# Generation settings for the Vertex AI text model, collected in one place.
llm_settings = dict(
    model_name='text-bison@001',
    max_output_tokens=512,
    temperature=0.1,
    top_p=0.8,
    top_k=40,
    verbose=True,
)
llm = VertexAI(**llm_settings)

# Chat instance integrated with langChain (currently disabled)
#chat = VertexChat()

# Embeddings API integrated with langChain, constructed with the request rate set above.
embedding = VertexAIEmbeddings(requests_per_minute=REQUESTS_PER_MINUTE)

# Display the embedding client's configuration as the cell output.
embedding

VertexAIEmbeddings(client=<vertexai.language_models._language_models._PreviewTextEmbeddingModel object at 0x7f48747660e0>, model_name='textembedding-gecko', temperature=0.0, max_output_tokens=128, top_p=0.95, top_k=40, stop=None, project=None, location='us-central1', credentials=None)

In [18]:
# Helper for locating the Matching Engine index and endpoint.
# The region comes from the shared LOCATION constant rather than a repeated "us-central1"
# literal, so changing the region in the config cell is picked up here as well.
mengine = MatchingEngineUtils(PROJECT_ID, LOCATION, "rfpbot_all_products_stage")

In [19]:
# Look up the index and index-endpoint identifiers, then echo both for reference.
index_and_endpoint = mengine.get_index_and_endpoint()
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = index_and_endpoint
print(
    f"ME_INDEX_ID={ME_INDEX_ID}",
    f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}",
    sep="\n",
)


ME_INDEX_ID=projects/184378960328/locations/us-central1/indexes/9057504110734999552
ME_INDEX_ENDPOINT_ID=projects/184378960328/locations/us-central1/indexEndpoints/7247057060532060160


In [29]:
# Initialize the Matching Engine vector store from the index and endpoint looked up earlier.
# The bucket comes from the shared BUCKET_NAME constant instead of a repeated literal, so the
# vector store always uses the same bucket as the rest of the notebook.
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=LOCATION,
    gcs_bucket_name=BUCKET_NAME,
    embedding=embedding,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID
)

In [30]:
def _source_restriction(chunk):
    """Return the metadata list for one chunk: a single 'source' namespace entry."""
    return [
        {"namespace": "source", "allow_list": [chunk.metadata["source"]]},
        # Further namespaces, currently disabled:
        # {"namespace": "document_name", "allow_list": [chunk.metadata["id"]]},
        # {"namespace": "chunk", "allow_list": [str(chunk.metadata["chunk"])]},
    ]


# Parallel lists: the text of every chunk and its matching metadata entry.
texts = [chunk.page_content for chunk in doc_splits]
metadatas = [_source_restriction(chunk) for chunk in doc_splits]

In [31]:
# Add the chunk texts and their metadata to the Matching Engine vector store;
# the value returned by add_texts is kept in doc_ids.
doc_ids = me.add_texts(texts=texts, metadatas=metadatas)

INFO:root:Indexed 162 documents to Matching Engine.


In [32]:
# Smoke test: check that search against the vector store works by requesting
# k=2 results for a sample query.
me.similarity_search("AI Platform Data Labelling", k=2)

[Document(page_content="Google LLC | Descri ption of the Google Cloud Platform System  15 Artificial Intelligence (AI) and Machine Learning (ML)  AI Platform Data Labeling  AI Platform Data Labeling is a service that helps developers obtain data to train and evaluate their machine learning models. It supports labeling for image, video, text, and audio as well as centralized management of labeled data.  AI Platform Neural Architecture Search (NAS)  NAS is a managed service leveraging Google's neural architecture search technology to generate, evaluate, and train numerous model architectures for a customer's application. NAS training services facilitate management of large -scale experiments.  AI Platform Training and Prediction  AI Platform Training and Prediction is a managed service that enables users to easily build machine learning models with popular frameworks like TensorFlow, XGBoost and Scikit Learn. It provides scalable training and prediction services that work on large datase