In [None]:
# Copyright 2025 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RAG Engine Management

https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/rag-api#python_1



### Install Vertex AI SDK for Python

In [None]:
# @title Install packages
# Installs/upgrades the Vertex AI SDK (google-cloud-aiplatform) with the
# agent_engines, adk, langchain, ag2 and llama_index extras, plus google-genai.
# The trailing backslash continues the %pip command onto the next line.
%pip install --upgrade --user --quiet google-cloud-aiplatform[agent_engines,adk,langchain,ag2,llama_index] \
                                      google-genai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m853.5 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.2/219.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.7/153.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m734.2/734.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.1/232.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install/upgrade the LangChain integrations used with Vertex AI.
# Note: the kernel needs to be restarted after these installs.
%pip install --upgrade --user --quiet langchain-google-vertexai
%pip install --upgrade --user --quiet langchain-community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m1.8/2.5 MB[0m [31m25.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25h

#### Note: <font color="red">Need to restart the kernel after installing</font>

In [None]:
# Exit the running kernel so that it is restarted after the package installs above.
# NOTE(review): presumably a "Run all" stops at this cell; continue from the next cell afterwards.
exit()

In [None]:
# @title Authenticate to GCP and define constants

# To use markdown for output data from LLM
from IPython.display import display, Markdown

# Project-wide constants used by the following cells.
PROJECT_ID = "ai-hangsik"
LOCATION = "us-central1"
MODEL_NAME = "gemini-2.0-flash"
# Plain string literal: the value has no placeholders, so no f-string is needed.
BUCKET_URI = "gs://agent-0417"

# Use OAuth to access the GCP environment.
# The Colab auth helper is only imported and called when running inside Colab.
import sys
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id = PROJECT_ID)

In [None]:
# @title Create a bucket.
# Runs `gsutil mb` in a shell, substituting the notebook's LOCATION, PROJECT_ID
# and BUCKET_URI values into the command line.
# NOTE(review): presumably reports an error if the bucket already exists — confirm before re-running.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### Package import and initialize model

In [None]:
# @title Initialize Vertex AI with Staging Bucket.

import vertexai
from vertexai import agent_engines
from google import genai

# Set the Vertex AI SDK defaults (project, region, staging bucket) for this session.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

# Gen AI SDK client created with vertexai=True for the same project and region.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

## RAG Engine management

### Corpus management helper functions

In [None]:
from vertexai import rag

#---------------------------------------------
def get_list_corpora():
  """
  Print a one-line summary of every RAG corpus and return the listing.

  Each printed line has the form ``[create_time][display_name][name]``.

  Returns:
    The object returned by ``rag.list_corpora()``. Note that it has already
    been iterated once here in order to print the summaries.
  """
  listing = rag.list_corpora()

  for item in listing:
    summary = f"[{item.create_time}][{item.display_name}][{item.name}]"
    print(summary)

  return listing

#---------------------------------------------

def get_corpus(corpus_name):
  """
  Fetch one RAG corpus, print it, and return it.

  Args:
    corpus_name: Passed to ``rag.get_corpus`` as ``name``.

  Returns:
    The value returned by ``rag.get_corpus`` for ``corpus_name``.
  """
  fetched = rag.get_corpus(name=corpus_name)
  print(fetched)
  return fetched

#---------------------------------------------

def del_corpus(corpus_name):
  """
  Delete a RAG corpus and print a confirmation message.

  Args:
    corpus_name: Passed to ``rag.delete_corpus`` as ``name``.

  Returns:
    None.
  """
  rag.delete_corpus(name=corpus_name)

  notice = f"Corpus {corpus_name} deleted. takes a little time to check the corpus is deleted."
  print(notice)


#### Get list of corpora that are registered

In [None]:
# Print one summary line per registered corpus and keep the returned listing in `corpora`.
corpora = get_list_corpora()

[2025-04-09 05:35:24.867152+00:00][ai_agent][projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680]
[2025-04-15 11:24:03.402169+00:00][Alphabet_10K_2024_corpus][projects/ai-hangsik/locations/us-central1/ragCorpora/2842897264777625600]
[2025-04-16 07:03:34.092830+00:00][it-laws][projects/ai-hangsik/locations/us-central1/ragCorpora/1113515007867355136]


#### Get a corpus

In [None]:
corpus_name = "projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680"
# get_corpus() already prints the corpus; binding the result keeps the cell
# from echoing the same object a second time as its last expression.
corpus = get_corpus(corpus_name)

RagCorpus(name='projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680', display_name='ai_agent', description='', vertex_ai_search_config=None, backend_config=RagVectorDbConfig(vector_db=RagManagedDb(), rag_embedding_model_config=RagEmbeddingModelConfig(vertex_prediction_endpoint=VertexPredictionEndpoint(endpoint=None, publisher_model='projects/ai-hangsik/locations/us-central1/publishers/google/models/text-multilingual-embedding-002', model=None, model_version_id=None))))


RagCorpus(name='projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680', display_name='ai_agent', description='', vertex_ai_search_config=None, backend_config=RagVectorDbConfig(vector_db=RagManagedDb(), rag_embedding_model_config=RagEmbeddingModelConfig(vertex_prediction_endpoint=VertexPredictionEndpoint(endpoint=None, publisher_model='projects/ai-hangsik/locations/us-central1/publishers/google/models/text-multilingual-embedding-002', model=None, model_version_id=None))))

#### delete a corpus

In [None]:
corpus_name = "projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680"
# The delete call is left commented out (presumably so the corpus is not removed
# by accident); uncomment the next line to delete the corpus.
# del_corpus(corpus_name)

### File management helper functions

In [None]:
from vertexai import rag

#---------------------------------------------
def upload_file(corpus_name,
                path,
                display_name,
                description
                ):
  """
  Upload a single file to a RAG corpus, print the result, and return it.

  Args:
    corpus_name: Forwarded to ``rag.upload_file`` as ``corpus_name``.
    path: Forwarded to ``rag.upload_file`` as ``path``.
    display_name: Forwarded to ``rag.upload_file`` as ``display_name``.
    description: Forwarded to ``rag.upload_file`` as ``description``.

  Returns:
    The value returned by ``rag.upload_file``.
  """
  upload_kwargs = dict(
    corpus_name=corpus_name,
    path=path,
    display_name=display_name,
    description=description,
  )
  uploaded = rag.upload_file(**upload_kwargs)

  print(uploaded)
  return uploaded

#---------------------------------------------

def import_files(corpus_name,
                 paths,
                 chunk_size,
                 chunk_overlap,
                 max_embedding_requests_per_min=900
                 ):
  """
  Import files into a RAG corpus using the given chunking settings.

  Args:
    corpus_name: Corpus resource name, e.g.
      "projects/{PROJECT_ID}/locations/us-central1/ragCorpora/{rag_corpus_id}".
    paths: List of sources, e.g.
      ["https://drive.google.com/file/123", "gs://my_bucket/my_files_dir"].
      Supports Google Cloud Storage and Google Drive links.
    chunk_size: Forwarded to ``rag.ChunkingConfig`` as ``chunk_size``.
    chunk_overlap: Forwarded to ``rag.ChunkingConfig`` as ``chunk_overlap``.
    max_embedding_requests_per_min: Forwarded to ``rag.import_files``.
      Optional; defaults to 900, the value this helper always used before.

  Returns:
    The response returned by ``rag.import_files``. Its
    ``imported_rag_files_count`` is printed before returning.
  """

  # Note: layout parser setting. To parse with a layout parser, build a parser
  # config and pass it to rag.import_files, e.g.:
  #
  #     parser = LayoutParserConfig(
  #         processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
  #         max_parsing_requests_per_min=120,
  #     )
  #     response = rag.import_files(
  #         corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
  #         paths=paths,
  #         parser=parser,
  #     )

  transformation_config = rag.TransformationConfig(
    chunking_config=rag.ChunkingConfig(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ),
  )
  response = rag.import_files(
      corpus_name=corpus_name,
      paths=paths,
      transformation_config=transformation_config,
      max_embedding_requests_per_min=max_embedding_requests_per_min,
  )

  print(f"Imported {response.imported_rag_files_count} files.")

  return response

#---------------------------------------------

def get_rag_files(corpus_name):
  """
  Print the display name and resource name of every file in a corpus.

  Args:
    corpus_name: Forwarded to ``rag.list_files`` as ``corpus_name``.

  Returns:
    The object returned by ``rag.list_files``. Note that it has already been
    iterated once here in order to print the entries.
  """
  listing = rag.list_files(corpus_name=corpus_name)

  for entry in listing:
    # Display name and resource name go on separate lines.
    print(entry.display_name, entry.name, sep="\n")

  return listing

#---------------------------------------------

def get_rag_file(file_name):
  """
  Fetch one RAG file, print it, and return it.

  Args:
    file_name: Passed to ``rag.get_file`` as ``name``.

  Returns:
    The value returned by ``rag.get_file`` for ``file_name``.
  """
  fetched = rag.get_file(name=file_name)
  print(fetched)
  return fetched

#---------------------------------------------

def del_rag_file(file_name):
  """
  Delete a RAG file and print a confirmation message.

  Args:
    file_name: Passed to ``rag.delete_file`` as ``name``.

  Returns:
    None.
  """

  rag.delete_file(name=file_name)
  print(f"File {file_name} deleted. takes a little time to check the file is deleted.")


# Backward-compatible alias: this helper was originally defined under the
# misspelled name `del_lag_file`, so existing calls keep working.
del_lag_file = del_rag_file


#### upload rag file from local storage

In [None]:
corpus_name = "projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680"
path = "./google-ai-agents-whitepaper.pdf"
display_name = "ai-agents-whitepaper"
description = "ai-agents-whitepaper desc"

# upload_file() prints the uploaded file itself, so an extra print() here
# would only show the same object twice.
rag_file = upload_file(corpus_name,
                path,
                display_name,
                description
                )

RagFile(name='projects/ai-hangsik/locations/us-central1/ragCorpora/2662753279682805760/ragFiles/5392603022199990948', display_name='ai-agents-whitepaper', description='ai-agents-whitepaper desc')
RagFile(name='projects/ai-hangsik/locations/us-central1/ragCorpora/2662753279682805760/ragFiles/5392603022199990948', display_name='ai-agents-whitepaper', description='ai-agents-whitepaper desc')


#### import rag files from Drive or GCS

In [None]:
corpus_name = "projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680"

# Cloud Storage objects to import into the corpus.
paths = [
    "gs://daou_office_manual/manual_org/DaouOffice 클라우드 관리자 가이드.pdf",
    "gs://daou_office_manual/manual_org/DaouOffice 클라우드 서비스 가이드 3.5.21.pdf",
]

# Import with 512-sized chunks and an overlap of 100 between chunks.
rag_files = import_files(
    corpus_name=corpus_name,
    paths=paths,
    chunk_size=512,
    chunk_overlap=100,
)

print(rag_files)

Imported 2 files.
imported_rag_files_count: 2



#### get rag file list

In [None]:
corpus_name = "projects/ai-hangsik/locations/us-central1/ragCorpora/7186056155423047680"
# get_rag_files() already prints each file's display name and resource name;
# binding the result keeps the cell from also dumping the full listing repr.
corpus_files = get_rag_files(corpus_name)

google-ai-agents-whitepaper.pdf
projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5407844854494745030
DaouOffice 클라우드 관리자 가이드.pdf
projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978057552637858
DaouOffice 클라우드 서비스 가이드 3.5.21.pdf
projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978319244496986


ListRagFilesPager<rag_files {
  name: "projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5407844854494745030"
  display_name: "google-ai-agents-whitepaper.pdf"
  create_time {
    seconds: 1744176970
    nanos: 970626000
  }
  update_time {
    seconds: 1744176970
    nanos: 970626000
  }
  direct_upload_source {
  }
  file_status {
    state: ACTIVE
  }
}
rag_files {
  name: "projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978057552637858"
  display_name: "DaouOffice 클라우드 관리자 가이드.pdf"
  create_time {
    seconds: 1744788897
    nanos: 620694000
  }
  update_time {
    seconds: 1744788897
    nanos: 620694000
  }
  gcs_source {
    uris: "gs://daou_office_manual/manual_org/DaouOffice 클라우드 관리자 가이드.pdf"
  }
  file_status {
    state: ACTIVE
  }
}
rag_files {
  name: "projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978319244496986"
  display_name: "Daou

#### get rag file

In [None]:
file_name = "projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978319244496986"
# get_rag_file() already prints the file; binding the result keeps the cell
# from echoing the same object a second time as its last expression.
fetched_file = get_rag_file(file_name)

RagFile(name='projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978319244496986', display_name='DaouOffice 클라우드 서비스 가이드 3.5.21.pdf', description='')


RagFile(name='projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978319244496986', display_name='DaouOffice 클라우드 서비스 가이드 3.5.21.pdf', description='')

#### delete a rag file

In [None]:
file_name = "projects/721521243942/locations/us-central1/ragCorpora/7186056155423047680/ragFiles/5412978319244496986"

# The delete call is left commented out (presumably so the file is not removed
# by accident); uncomment the next line to delete the file.
# del_lag_file(file_name)

## End of Notebook