In [1]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RAG Engine in Vertex AI


In [2]:
# @title Install packages
# Upgrade the Vertex AI SDK (with the agent_engines, langchain and ag2 extras)
# together with the google-genai package. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
%pip install --upgrade --user --quiet google-cloud-aiplatform[agent_engines,langchain,ag2] \
                                      google-genai

In [3]:
# Note: restart the kernel after this install finishes so that the upgraded
# packages are the ones imported by the cells below.
%pip install --upgrade --user --quiet langchain-google-vertexai

In [4]:
# @title Authenticate to access GCP

# Used to render the LLM's output as Markdown
from IPython.display import display, Markdown

# When running on Colab, sign in through OAuth to access the GCP environment.
# Outside Colab no authentication is performed by this cell.
import sys

running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

In [5]:
# @title Define constants
import os

# Google Cloud project passed to `vertexai.init` and `genai.Client`.
# Set the GOOGLE_CLOUD_PROJECT environment variable to run the notebook against
# a different project without editing this cell; when the variable is unset or
# empty, the original project ID is used.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or "ai-hangsik"

# Region passed to `vertexai.init` and `genai.Client`.
LOCATION = "us-central1"

# Gemini model used for the RAG-grounded generation step.
MODEL_NAME = "gemini-2.0-flash-001"

In [6]:
# @title Initialize Vertex AI

import os

import vertexai
from vertexai import rag
from vertexai.generative_models import GenerativeModel, Tool

# from google.genai.types import GenerateContentConfig, Retrieval, Tool, VertexRagStore
from google import genai


# Both SDKs are pointed at the same project and region.
gcp_target = {"project": PROJECT_ID, "location": LOCATION}

vertexai.init(**gcp_target)
client = genai.Client(vertexai=True, **gcp_target)

In [7]:
# @title Create a RAG Corpus
# Currently supports Google first-party embedding models
# https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#supported-models

#EMBEDDING_MODEL = "publishers/google/models/text-embedding-004"
EMBEDDING_MODEL = "publishers/google/models/text-multilingual-embedding-002"
CORPUS_DISPLAY_NAME = "my-rag-corpus"

# Re-running this cell used to create one more corpus with the same display
# name on every run. Look for an existing corpus first and reuse it, but only
# when it was built with the same embedding model, because a corpus created
# with a different model would not match EMBEDDING_MODEL.
rag_corpus = None
for existing_corpus in rag.list_corpora():
    embedding_endpoint = (
        existing_corpus.vector_db_config
        .rag_embedding_model_config
        .vertex_prediction_endpoint
        .endpoint
    )
    if (
        existing_corpus.display_name == CORPUS_DISPLAY_NAME
        and embedding_endpoint.endswith(EMBEDDING_MODEL)
    ):
        # Fetch through the SDK so `rag_corpus` has the same type as the
        # object returned by `rag.create_corpus`.
        rag_corpus = rag.get_corpus(name=existing_corpus.name)
        break

if rag_corpus is None:
    rag_corpus = rag.create_corpus(
        display_name=CORPUS_DISPLAY_NAME,
        backend_config=rag.RagVectorDbConfig(
            rag_embedding_model_config=rag.RagEmbeddingModelConfig(
                vertex_prediction_endpoint=rag.VertexPredictionEndpoint(
                    publisher_model=EMBEDDING_MODEL
                )
            )
        ),
    )

### Check the corpus just created

In [8]:
# Display the RAG corpora returned by the service to confirm the new corpus exists.
rag.list_corpora()

ListRagCorporaPager<rag_corpora {
  name: "projects/ai-hangsik/locations/us-central1/ragCorpora/2662753279682805760"
  display_name: "my-rag-corpus"
  create_time {
    seconds: 1741077278
    nanos: 256646000
  }
  update_time {
    seconds: 1741077278
    nanos: 256646000
  }
  corpus_status {
    state: ACTIVE
  }
  vector_db_config {
    rag_managed_db {
    }
    rag_embedding_model_config {
      vertex_prediction_endpoint {
        endpoint: "projects/721521243942/locations/us-central1/publishers/google/models/text-embedding-005"
      }
    }
  }
}
rag_corpora {
  name: "projects/ai-hangsik/locations/us-central1/ragCorpora/5359846506524311552"
  display_name: "my-rag-corpus"
  create_time {
    seconds: 1741136583
    nanos: 493126000
  }
  update_time {
    seconds: 1741136583
    nanos: 493126000
  }
  corpus_status {
    state: ACTIVE
  }
  vector_db_config {
    rag_managed_db {
    }
    rag_embedding_model_config {
      vertex_prediction_endpoint {
        endpoint: "pro

In [9]:
# @title Import files from Google Cloud Storage

# GCS location of the source PDFs. Individual objects can be listed instead:
# INPUT_GCS_BUCKET_LIST = ["gs://it_laws_kr/law_pdf/개인정보 보호법(법률)(제19234호)(20240315).pdf",
#         "gs://it_laws_kr/law_pdf/정보통신망 이용촉진 및 정보보호 등에 관한 법률(법률)(제20069호)(20240123).pdf",
#         ]
INPUT_GCS_BUCKET = "gs://it_laws_kr/law_pdf/"

# Chunking settings applied to the documents during import.
chunking_config = rag.ChunkingConfig(chunk_size=512, chunk_overlap=100)
transformation_config = rag.TransformationConfig(chunking_config=chunking_config)

# Import the files into the corpus created earlier.
response = rag.import_files(
    rag_corpus.name,
    paths=[INPUT_GCS_BUCKET],
    transformation_config=transformation_config,  # Optional
    max_embedding_requests_per_min=1000,  # Optional
)

In [10]:
# Display the files now registered in the corpus to confirm the import.
rag.list_files(rag_corpus.name)

ListRagFilesPager<rag_files {
  gcs_source {
    uris: "gs://it_laws_kr/law_pdf/개인정보 보호법(법률)(제19234호)(20240315).pdf"
  }
  name: "projects/721521243942/locations/us-central1/ragCorpora/6838716034162098176/ragFiles/5392607820359054834"
  display_name: "개인정보 보호법(법률)(제19234호)(20240315).pdf"
  create_time {
    seconds: 1742360575
    nanos: 434902000
  }
  update_time {
    seconds: 1742360575
    nanos: 434902000
  }
  file_status {
    state: ACTIVE
  }
}
rag_files {
  gcs_source {
    uris: "gs://it_laws_kr/law_pdf/신용정보의 이용 및 보호에 관한 법률(법률)(제19234호)(20230915).pdf"
  }
  name: "projects/721521243942/locations/us-central1/ragCorpora/6838716034162098176/ragFiles/5392607832646398257"
  display_name: "신용정보의 이용 및 보호에 관한 법률(법률)(제19234호)(20230915).pdf"
  create_time {
    seconds: 1742360577
    nanos: 21783000
  }
  update_time {
    seconds: 1742360577
    nanos: 21783000
  }
  file_status {
    state:

### Optional: Perform direct context retrieval

In [11]:
# Retrieval settings used when querying the corpus.
rag_retrieval_config = rag.RagRetrievalConfig(
    top_k=3,  # Optional
    filter=rag.Filter(vector_distance_threshold=0.5),  # Optional
)

# Query the corpus as a whole. To restrict the search to specific files, also
# pass rag_file_ids=["rag-file-1", "rag-file-2", ...] using IDs from `rag.list_files()`.
corpus_resource = rag.RagResource(rag_corpus=rag_corpus.name)
query_text = "개인정보 보호법에 대해서 설명해주세요."

# Direct context retrieval
response = rag.retrieval_query(
    rag_resources=[corpus_resource],
    text=query_text,
    rag_retrieval_config=rag_retrieval_config,
)
print(response)

# Optional: The retrieved context can be passed to any SDK or model generation API to generate final results.
# context = " ".join([context.text for context in response.contexts.contexts]).replace("\n", "")

contexts {
  contexts {
    source_uri: "gs://it_laws_kr/law_pdf/개인정보 보호법(법률)(제19234호)(20240315).pdf"
    source_display_name: "개인정보 보호법(법률)(제19234호)(20240315).pdf"
    text: "법제처 1 국가법령정보센터\r\n개인정보 보호법\r\n \r\n개인정보 보호법\r\n[시행 2024. 3. 15.] [법률 제19234호, 2023. 3. 14., 일부개정]\r\n개인정보보호위원회 (개인정보보호정책과 - 법령 제개정) 02-2100-3057\r\n개인정보보호위원회 (심사총괄담당관 - 법령 해석) 02-2100-3043\r\n 제1장 총칙\r\n제1조(목적) 이 법은 개인정보의 처리 및 보호에 관한 사항을 정함으로써 개인의 자유와 권리를 보호하고, 나아가 개인\r\n의 존엄과 가치를 구현함을 목적으로 한다. <개정 2014. 3. 24.>\r\n제2조(정의) 이 법에서 사용하는 용어의 뜻은 다음과 같다. <개정 2014. 3. 24., 2020. 2. 4., 2023. 3. 14.>\r\n1. “개인정보”란 살아 있는 개인에 관한 정보로서 다음 각 목의 어느 하나에 해당하는 정보를 말한다.\r\n가."
    score: 0.15580535208718393
  }
  contexts {
    source_uri: "gs://it_laws_kr/law_pdf/개인정보 보호법(법률)(제19234호)(20240315).pdf"
    source_display_name: "개인정보 보호법(법률)(제19234호)(20240315).pdf"
    text: "<개정 2023. 3. 14.>\r\n1. 개인정보의 수집 출처법제처 10 국가법령정보센터\r\n개인정보 보호법\r\n2. 개인정보의 처리 목적\r\n3. 제37조에 따른 

### [Preview] Create RAG Retrieval Tool

In [12]:
# Create a tool for the RAG Corpus

# The retrieval settings are defined in this cell so that it no longer depends
# on the optional direct-retrieval cell having been run first (skipping that
# cell previously raised NameError here). The values match that cell.
rag_retrieval_config = rag.RagRetrievalConfig(
    top_k=3,  # Optional
    filter=rag.Filter(vector_distance_threshold=0.5),  # Optional
)

rag_retrieval_tool = Tool.from_retrieval(
    retrieval=rag.Retrieval(
        source=rag.VertexRagStore(
            rag_resources=[
                rag.RagResource(
                    rag_corpus=rag_corpus.name,  # Currently only 1 corpus is allowed.
                    # Optional: supply IDs from `rag.list_files()`.
                    # rag_file_ids=["rag-file-1", "rag-file-2", ...],
                )
            ],
            rag_retrieval_config=rag_retrieval_config,
        ),
    )
)

In [13]:
# @title Generate Content with Gemini using Rag Retrieval Tool
# Attach the RAG retrieval tool to the Gemini model.
rag_gemini_model = GenerativeModel(model_name=MODEL_NAME, tools=[rag_retrieval_tool])

In [14]:
# Ask the RAG-enabled model a question and render its answer as Markdown.
prompt = "개인정보 보호법에 대해서 설명해주세요."
response = rag_gemini_model.generate_content(prompt)
display(Markdown(response.text))

개인정보 보호법은 개인 정보의 처리 및 보호에 관한 사항을 규정하여 개인의 자유와 권리를 보호하고 개인의 존엄과 가치를 구현하는 것을 목적으로 합니다. 이 법은 개인정보 보호에 관한 사무를 독립적으로 수행하기 위해 국무총리 소속으로 개인정보 보호위원회를 둡니다. 개인정보의 처리 및 보호에 관하여 다른 법률에 특별한 규정이 있는 경우를 제외하고는 이 법에서 정하는 바에 따르며, 다른 법률을 제정하거나 개정하는 경우에는 이 법의 목적과 원칙에 맞도록 해야 합니다.
