## Creating an index and populating it with documents using Milvus and Nomic AI Embeddings

Ingest PDF documents, then web pages content into a Milvus VectorStore.

### Needed packages and imports

In [1]:
#!pip install einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0
#!pip install -q einops==0.7.0 langchain==0.1.9 pymilvus==2.3.6

In [2]:
import requests
import os
from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

### Base parameters, the Milvus connection info

In [3]:
MILVUS_HOST = "vectordb-milvus.milvus.svc.cluster.local"
MILVUS_PORT = 19530
MILVUS_USERNAME = os.getenv('MILVUS_USERNAME')
MILVUS_PASSWORD = os.getenv('MILVUS_PASSWORD')
MILVUS_COLLECTION = "collection_nomicai_embeddings"

## Initial index creation and document ingestion

#### Download and load pdfs

In [4]:
product_version="2.13"
documents = [
    "release_notes",
    "introduction_to_red_hat_openshift_ai",
    "getting_started_with_red_hat_openshift_ai_self-managed",
    "openshift_ai_tutorial_-_fraud_detection_example",
    "developing_a_model",
    "integrating_data_from_amazon_s3",
    "working_on_data_science_projects",
    "serving_models",
    "monitoring_data_science_models",
    "managing_users",
    "managing_resources",
    "installing_and_uninstalling_openshift_ai_self-managed",
    "installing_and_uninstalling_openshift_ai_self-managed_in_a_disconnected_environment",
    "upgrading_openshift_ai_self-managed",
    "upgrading_openshift_ai_self-managed_in_a_disconnected_environment",   
]

pdfs = [f"https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/{product_version}/pdf/{doc}/red_hat_openshift_ai_self-managed-{product_version}-{doc}-en-us.pdf" for doc in documents]
pdfs_to_urls = {f"red_hat_openshift_ai_self-managed-{product_version}-{doc}-en-us": f"https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/{product_version}/html-single/{doc}/index" for doc in documents}

In [5]:
docs_dir = f"rhoai-doc-{product_version}"

if not os.path.exists(docs_dir):
    os.mkdir(docs_dir)

for pdf in pdfs:
    try:
        response = requests.get(pdf)
    except:
        print(f"Skipped {pdf}")
        continue
    if response.status_code!=200:
        print(f"Skipped {pdf}")
        continue  
    with open(f"{docs_dir}/{pdf.split('/')[-1]}", 'wb') as f:
        f.write(response.content)

Skipped https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.13/pdf/developing_a_model/red_hat_openshift_ai_self-managed-2.13-developing_a_model-en-us.pdf
Skipped https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.13/pdf/integrating_data_from_amazon_s3/red_hat_openshift_ai_self-managed-2.13-integrating_data_from_amazon_s3-en-us.pdf
Skipped https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.13/pdf/monitoring_data_science_models/red_hat_openshift_ai_self-managed-2.13-monitoring_data_science_models-en-us.pdf


In [6]:
pdf_folder_path = f"./rhoai-doc-{product_version}"

pdf_loader = PyPDFDirectoryLoader(pdf_folder_path)
pdf_docs = pdf_loader.load()

#### Inject metadata

In [7]:
from pathlib import Path

for doc in pdf_docs:
    doc.metadata["source"] = pdfs_to_urls[Path(doc.metadata["source"]).stem]

#### Load websites

In [8]:
websites = [
    "https://ai-on-openshift.io/getting-started/openshift/",
    "https://ai-on-openshift.io/getting-started/opendatahub/",
    "https://ai-on-openshift.io/getting-started/openshift-ai/",
    "https://ai-on-openshift.io/odh-rhoai/configuration/",
    "https://ai-on-openshift.io/odh-rhoai/custom-notebooks/",
    "https://ai-on-openshift.io/odh-rhoai/nvidia-gpus/",
    "https://ai-on-openshift.io/odh-rhoai/custom-runtime-triton/",
    "https://ai-on-openshift.io/odh-rhoai/openshift-group-management/",
    "https://ai-on-openshift.io/tools-and-applications/minio/minio/",
    "https://access.redhat.com/articles/7047935",
    "https://access.redhat.com/articles/rhoai-supported-configs",
]

In [9]:
website_loader = WebBaseLoader(websites)
website_docs = website_loader.load()

#### Merge both types of docs

In [10]:
docs = pdf_docs + website_docs

#### Split documents into chunks with some overlap

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024,
                                               chunk_overlap=40)
all_splits = text_splitter.split_documents(docs)
all_splits[0]

Document(page_content='Red Hat OpenShift AI Self-Managed\n2.13\nOpenShift AI tutorial - Fraud detection\nexample\nUse OpenShift AI to train an example model in JupyterLab, deploy the model, and\nrefine the model by using automated pipelines\nLast Updated: 2024-09-20', metadata={'source': 'https://access.redhat.com/documentation/en-us/red_hat_openshift_ai_self-managed/2.13/html-single/openshift_ai_tutorial_-_fraud_detection_example/index', 'page': 0})

In [12]:
#!pip install sentence-transformers
#!pip show sentence-transformers
#!pip uninstall -y sentence-transformers
#!pip install sentence-transformers

#### Create the index and ingest the documents

In [13]:
# If you don't want to use a GPU, you can remove the 'device': 'cuda' argument
model_kwargs = {'trust_remote_code': True, 'device': 'cuda'}
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs=model_kwargs,
    show_progress=True
)


db = Milvus(
    embedding_function=embeddings,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
    collection_name=MILVUS_COLLECTION,
    metadata_field="metadata",
    text_field="page_content",
    auto_id=True,
    drop_old=True
    )

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.4.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



<All keys matched successfully>


In [14]:
db.add_documents(all_splits)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

[453133124104520064,
 453133124104520065,
 453133124104520066,
 453133124104520067,
 453133124104520068,
 453133124104520069,
 453133124104520070,
 453133124104520071,
 453133124104520072,
 453133124104520073,
 453133124104520074,
 453133124104520075,
 453133124104520076,
 453133124104520077,
 453133124104520078,
 453133124104520079,
 453133124104520080,
 453133124104520081,
 453133124104520082,
 453133124104520083,
 453133124104520084,
 453133124104520085,
 453133124104520086,
 453133124104520087,
 453133124104520088,
 453133124104520089,
 453133124104520090,
 453133124104520091,
 453133124104520092,
 453133124104520093,
 453133124104520094,
 453133124104520095,
 453133124104520096,
 453133124104520097,
 453133124104520098,
 453133124104520099,
 453133124104520100,
 453133124104520101,
 453133124104520102,
 453133124104520103,
 453133124104520104,
 453133124104520105,
 453133124104520106,
 453133124104520107,
 453133124104520108,
 453133124104520109,
 453133124104520110,
 453133124104

#### Alternatively, add new documents

In [15]:
# If you don't want to use a GPU, you can remove the 'device': 'cuda' argument
# model_kwargs = {'trust_remote_code': True, 'device': 'cuda'}
# embeddings = HuggingFaceEmbeddings(
#     model_name="nomic-ai/nomic-embed-text-v1",
#     model_kwargs=model_kwargs,
#     show_progress=True
# )

# db = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
#     collection_name=MILVUS_COLLECTION,
#     metadata_field="metadata",
#     text_field="page_content",
#     auto_id=True,
#     drop_old=False
#     )

# db.add_documents(all_splits)

#### Test query

In [16]:
query = "How can I install OpenShift AI?"
docs_with_score = db.similarity_search_with_score(query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  0.4710683524608612
Pak for Data version 5.0.3 or greater. These versions of Cloud Pak for Data include
watsonx.ai. If this use case does not apply to your organization, see 
Installing and
deploying OpenShift AI in a disconnected environment
 for more generally applicable
instructions.
This procedure shows how to use the OpenShift command-line interface (CLI) to install the Red Hat
OpenShift AI Operator on your OpenShift cluster. You must install the Operator before you can manage
the installation of OpenShift AI components.
Prerequisites
You have a running OpenShift cluster, version 4.12 or greater, configured with a default storage
class that can be dynamically provisioned.
You have cluster administrator privileges for your OpenShift cluster.
You have downloaded and installed the OpenShift command-line interface (CLI). See 
Installing
the OpenShift CLI
.
You have mirrored the required container i