## Creating an index and populating it with documents using Milvus

A simple example of how to ingest PDF documents, and then web page content, into a Milvus vector store.

Requirements:
- A Milvus instance, either standalone or cluster.
- Connection credentials for Milvus must be available as the environment variables `MILVUS_USERNAME` and `MILVUS_PASSWORD`.

### Needed packages and imports

In [None]:
!pip install -q einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0

In [None]:
import requests
import os
from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

### Base parameters, the Milvus connection info

In [None]:
# Milvus endpoint and target collection -- replace these values according to your deployment
MILVUS_HOST = "vectordb-milvus.milvus.svc.cluster.local"
MILVUS_PORT = 19530
MILVUS_COLLECTION = "demo_collection"

# Credentials are read from the environment; each one is None when its variable is unset
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD")

## Initial index creation and document ingestion

#### Download and load PDFs

In [None]:
# Version of the Red Hat OpenShift AI Self-Managed documentation to fetch
product_version = "2.6"

# Guide identifiers, exactly as they appear in the documentation URLs
documents = [
    "release_notes",
    "introduction_to_red_hat_openshift_ai_self-managed",
    "getting_started_with_red_hat_openshift_ai_self-managed",
    "openshift_ai_tutorial_-_fraud_detection_example",
    "developing_a_model",
    "integrating_data_from_amazon_s3",
    "working_on_data_science_projects",
    "serving_models",
    "monitoring_data_science_models",
    "managing_users",
    "managing_resources",
    "installing_and_uninstalling_openshift_ai_self-managed",
    "installing_and_uninstalling_openshift_ai_self-managed_in_a_disconnected_environment",
    "upgrading_openshift_ai_self-managed",
    "upgrading_openshift_ai_self-managed_in_a_disconnected_environment",
]

# Pieces shared by every URL: the versioned documentation root and each guide's PDF file stem
_docs_root = f"https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/{product_version}"
_pdf_stems = [f"red_hat_openshift_ai_self-managed-{product_version}-{doc}-en-us" for doc in documents]

# Download URL of each guide's PDF
pdfs = [f"{_docs_root}/pdf/{doc}/{stem}.pdf" for doc, stem in zip(documents, _pdf_stems)]
# PDF file stem -> URL of the html-single rendering of the same guide
pdfs_to_urls = {stem: f"{_docs_root}/html-single/{doc}/index" for doc, stem in zip(documents, _pdf_stems)}

In [None]:
# Local folder that receives the downloaded PDFs (one folder per product version)
docs_dir = f"rhoai-doc-{product_version}"

# exist_ok makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(docs_dir, exist_ok=True)

# Best-effort download: a document that cannot be fetched is reported and skipped
for pdf in pdfs:
    try:
        # The timeout keeps one unresponsive server from hanging the notebook indefinitely
        response = requests.get(pdf, timeout=60)
    except requests.RequestException as err:
        # Only network-level failures are skipped; KeyboardInterrupt and
        # programming errors propagate instead of being silently swallowed
        print(f"Skipped {pdf} ({err})")
        continue
    if response.status_code != 200:
        print(f"Skipped {pdf} (HTTP {response.status_code})")
        continue
    # The file is named after the last path segment of the URL
    with open(f"{docs_dir}/{pdf.split('/')[-1]}", 'wb') as f:
        f.write(response.content)

In [None]:
# Folder holding the PDFs to load, relative to the working directory
pdf_folder_path = f"./rhoai-doc-{product_version}"

# Load the PDF files found in that folder into LangChain documents
pdf_loader = PyPDFDirectoryLoader(pdf_folder_path)
pdf_docs = pdf_loader.load()

#### Inject metadata

In [None]:
from pathlib import Path

# Replace each document's local file path with the URL of the matching online guide,
# so that the stored "source" metadata points to something a reader can open.
for doc in pdf_docs:
    source_url = pdfs_to_urls.get(Path(doc.metadata["source"]).stem)
    # No match means the file is not one of the known guides, or this cell has already
    # been run (the source is then a URL whose stem is "index"): leave the source as is
    # instead of failing with a KeyError partway through the list.
    if source_url is not None:
        doc.metadata["source"] = source_url

#### Load websites

In [None]:
# Web pages to ingest in addition to the PDF guides
websites = [
    "https://ai-on-openshift.io/getting-started/openshift/",
    "https://ai-on-openshift.io/getting-started/opendatahub/",
    "https://ai-on-openshift.io/getting-started/openshift-ai/",
    "https://ai-on-openshift.io/odh-rhoai/configuration/",
    "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/",
    "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/",
    "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/",
    "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/",
    "https://ai-on-openshift.io/tools-and-applications/minio/minio/",
    "https://access.redhat.com/articles/7047935",
    "https://access.redhat.com/articles/rhoai-supported-configs",
]

In [None]:
# Fetch the pages listed in `websites` and load them as LangChain documents
website_loader = WebBaseLoader(websites)
website_docs = website_loader.load()

#### Merge both types of docs

In [None]:
# Single list of everything to index: the PDF documents first, then the web pages
docs = [*pdf_docs, *website_docs]

#### Split documents into chunks with some overlap

In [None]:
# Split every document into chunks of size 1024 with an overlap of 128 between
# consecutive chunks (presumably measured in characters -- confirm against the splitter's docs)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024,
                                               chunk_overlap=128)
all_splits = text_splitter.split_documents(docs)
# Display the first chunk as a quick sanity check (fails if no document was loaded)
all_splits[0]

#### Create the index and ingest the documents

In [None]:
# Embedding model used to vectorise the chunks; no model name is given, so the
# HuggingFaceEmbeddings default applies.
# If you don't want to use a GPU, you can remove the 'device': 'cuda' argument
model_kwargs = {'device': 'cuda'}
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    show_progress=True
)

# Vector store bound to the target Milvus collection.
# BEWARE: `drop_old` is set to True, so if the collection already exists it will be deleted first.
db = Milvus(
    embedding_function=embeddings,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
    collection_name=MILVUS_COLLECTION,
    metadata_field="metadata",
    text_field="page_content",
    auto_id=True,
    drop_old=True
    )

In [None]:
# Ingest all chunks into the Milvus collection through the vector store created above
db.add_documents(all_splits)

#### Alternatively, add new documents

In [None]:
# If you don't want to use a GPU, you can remove the 'device': 'cuda' argument
# model_kwargs = {'device': 'cuda'}
# embeddings = HuggingFaceEmbeddings(
#     model_kwargs=model_kwargs,
#     show_progress=True
# )

# db = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
#     collection_name=MILVUS_COLLECTION,
#     metadata_field="metadata",
#     text_field="page_content",
#     auto_id=True,
#     drop_old=False
#     )

# db.add_documents(all_splits)

#### Test query

In [None]:
# Sample question used to check that the ingested content can be retrieved
query = "How can I work with GPU and taints in OpenShift AI?"
# Each result is a (document, score) pair, as unpacked by the display loop that follows
docs_with_score = db.similarity_search_with_score(query)

In [None]:
# Print every hit with its score, framed by 80-character rules for readability
separator = "-" * 80
for hit, hit_score in docs_with_score:
    print(separator)
    print("Score: ", hit_score)
    print(hit.page_content)
    print(separator)