### Define the RAG Pipeline

In [24]:
from typing import List
from tqdm import tqdm
from pymilvus import MilvusClient
from langchain_core.messages import HumanMessage, SystemMessage

class RAG:
    """
    RAG (Retrieval-Augmented Generation) class built upon Milvus and HuggingFace + MLX.
    """

    def __init__(self, llm, embedding_model, milvus_client: MilvusClient):
        self._llm = llm
        self._embedding_model = embedding_model
        self._prepare_milvus(milvus_client)

    def _emb_text(self, text: str) -> List[float]: # np.floatarray, actually
        return self._embedding_model.encode([text])[0]

    def _prepare_milvus(
        self, milvus_client: MilvusClient, collection_name: str = "rag_collection"
    ):
        self.milvus_client = milvus_client
        self.collection_name = collection_name
        if self.milvus_client.has_collection(self.collection_name):
            self.milvus_client.drop_collection(self.collection_name)
        embedding_dim = len(self._emb_text("foo"))
        self.milvus_client.create_collection(
            collection_name=self.collection_name,
            dimension=embedding_dim,
            metric_type="IP",  # Inner product distance
            consistency_level="Strong",  # Strong consistency level
        )

    def load(self, texts: List[str]):
        """
        Load the text data into Milvus.
        """
        data = []
        for i, line in enumerate(tqdm(texts, desc="Creating embeddings")):
            data.append({"id": i, "vector": self._emb_text(line), "text": line})

        self.milvus_client.insert(collection_name=self.collection_name, data=data)

    def retrieve(self, question: str, top_k: int = 3) -> List[str]:
        """
        Retrieve the most similar text data to the given question.
        """
        search_res = self.milvus_client.search(
            collection_name=self.collection_name,
            data=[self._emb_text(question)],
            limit=top_k,
            search_params={"metric_type": "IP", "params": {}},  # Inner product distance
            output_fields=["text"],  # Return the text field
        )
        retrieved_texts = [res["entity"]["text"] for res in search_res[0]]
        return retrieved_texts[:top_k]

    def answer(
        self,
        question: str,
        retrieval_top_k: int = 3,
        return_retrieved_text: bool = False,
    ):
        """
        Answer the given question with the retrieved knowledge.
        """
        retrieved_texts = self.retrieve(question, top_k=retrieval_top_k)
        
        user_prompt = USER_PROMPT.format(
            context="\n".join(retrieved_texts), question=question
        )

        # DEBUG
        # print(user_prompt)

        messages = [
            SystemMessage(
                content=SYSTEM_PROMPT
            ),
            HumanMessage(
                content=user_prompt
            ),
        ]

        res = self._llm.invoke(messages)
        # print(res.content)

        # response = self.openai_client.chat.completions.create(
        #     model=self.llm_model,
        #     messages=[
        #         {"role": "system", "content": self.SYSTEM_PROMPT},
        #         {"role": "user", "content": user_prompt},
        #     ],
        # )

        if not return_retrieved_text:
            return res.content
        else:
            return res.content, retrieved_texts

### Prompts and Open Vector Database

In [2]:
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
USER_PROMPT = """
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [4]:
milvus_client = MilvusClient(uri="./milvus_demo2.db")

### Open Embedding Model and LLM

In [5]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding
from tokenizers import Tokenizer

# embedding_tokenizer = Tokenizer.from_pretrained("google-bert/bert-base-uncased")
# static_embedding = StaticEmbedding(embedding_tokenizer, embedding_dim=1024)
# embedding_model = SentenceTransformer(modules=[static_embedding])
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

from mlx_lm import load

model, tokenizer = load("mlx-community/phi-4-4bit")

from langchain_community.llms.mlx_pipeline import MLXPipeline
from langchain_community.chat_models.mlx import ChatMLX
from langchain_core.messages import HumanMessage

llm = MLXPipeline(
    model=model, tokenizer=tokenizer, pipeline_kwargs={"max_tokens": 1024, "temp": 0.1}
)

chat = ChatMLX(llm=llm)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

### Create RAG Pipeline

In [25]:
my_rag = RAG(llm=chat, embedding_model=embedding_model, milvus_client=milvus_client)

#### DEBUG

In [26]:
v = my_rag._emb_text("Hello, world!")
print(type(v), len(v), v)

<class 'numpy.ndarray'> 384 [-3.81771475e-02  3.29111144e-02 -5.45937615e-03  1.43699432e-02
 -4.02910337e-02 -1.16532452e-01  3.16876508e-02  1.91175763e-03
 -4.26223613e-02  2.91681066e-02  4.24266979e-02  3.20417061e-02
  2.98447404e-02  1.09803099e-02 -5.39396591e-02 -5.02772704e-02
 -2.35078186e-02  1.07793650e-02 -1.37707949e-01  4.11500549e-03
  2.93330774e-02  6.68411553e-02 -1.53894098e-02  4.84376550e-02
 -8.81497413e-02 -1.27268368e-02  4.14090268e-02  4.08314951e-02
 -5.01558892e-02 -5.81250526e-02  4.88014929e-02  6.88901097e-02
  5.87469302e-02  8.73099361e-03 -1.59182679e-02  8.51419792e-02
 -7.81474486e-02 -7.75167719e-02  2.07237769e-02  1.61942448e-02
  3.25105898e-02 -5.34888841e-02 -6.22287765e-02 -2.43146457e-02
  7.41276331e-03  2.39777416e-02  6.36097370e-03  5.11451066e-02
  7.27667212e-02  3.46497037e-02 -5.47711030e-02 -5.93284741e-02
 -7.16696167e-03  2.01377142e-02  3.58463563e-02  5.59091382e-03
  1.07735554e-02 -5.27637303e-02  1.01473741e-02 -8.73163342e-

In [16]:
print(my_rag.milvus_client)

<pymilvus.milvus_client.milvus_client.MilvusClient object at 0x12bb65fd0>


In [17]:
print(my_rag.milvus_client.list_collections())

['rag_collection']


In [18]:
my_rag.retrieve("testing")

[]

### Fill Vector Database with Embeddings

In [27]:
import os
import urllib.request

url = "https://raw.githubusercontent.com/milvus-io/milvus/master/DEVELOPMENT.md"
file_path = "./Milvus_DEVELOPMENT.md"

if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)
with open(file_path, "r") as file:
    file_text = file.read()

# We simply use "# " to separate the content in the file, which can roughly separate the content of each main part of the markdown file.
text_lines = file_text.split("# ")
my_rag.load(text_lines)

Creating embeddings: 100%|██████████| 47/47 [00:00<00:00, 76.79it/s]


### DEBUG

In [28]:
question = "what is the hardware requirements specification if I want to build Milvus and run from source code?"
my_rag.retrieve(question)

['Hardware Requirements\n\nThe following specification (either physical or virtual machine resources) is recommended for Milvus to build and run from source code.\n\n```\n- 8GB of RAM\n- 50GB of free disk space\n```\n\n##',
 "Software Requirements\n\nAll Linux distributions are available for Milvus development. However a majority of our contributor worked with Ubuntu or CentOS systems, with a small portion of Mac (both x86_64 and Apple Silicon) contributors. If you would like Milvus to build and run on other distributions, you are more than welcome to file an issue and contribute!\n\nHere's a list of verified OS types where Milvus can successfully build and run:\n\n- Debian/Ubuntu\n- Amazon Linux\n- MacOS (x86_64)\n- MacOS (Apple Silicon)\n\n##",
 'Building Milvus on a local OS/shell environment\n\nThe details below outline the hardware and software requirements for building on Linux and MacOS.\n\n##']

In [29]:
my_rag.answer(question, return_retrieved_text=True)

To build and run Milvus from source code, the recommended hardware requirements are:

- 8GB of RAM
- 50GB of free disk space


('To build and run Milvus from source code, the recommended hardware requirements are:\n\n- 8GB of RAM\n- 50GB of free disk space',
 ['Hardware Requirements\n\nThe following specification (either physical or virtual machine resources) is recommended for Milvus to build and run from source code.\n\n```\n- 8GB of RAM\n- 50GB of free disk space\n```\n\n##',
  "Software Requirements\n\nAll Linux distributions are available for Milvus development. However a majority of our contributor worked with Ubuntu or CentOS systems, with a small portion of Mac (both x86_64 and Apple Silicon) contributors. If you would like Milvus to build and run on other distributions, you are more than welcome to file an issue and contribute!\n\nHere's a list of verified OS types where Milvus can successfully build and run:\n\n- Debian/Ubuntu\n- Amazon Linux\n- MacOS (x86_64)\n- MacOS (Apple Silicon)\n\n##",
  'Building Milvus on a local OS/shell environment\n\nThe details below outline the hardware and software req

### Ground truth
Now let’s prepare some questions with its corresponding ground truth answers. We get answers and contexts from our RAG pipeline.

In [35]:
from datasets import Dataset
import pandas as pd

question_list = [
    "what is the hardware requirements specification if I want to build Milvus and run from source code?",
    "What is the programming language used to write Knowhere?",
    "What should be ensured before running code coverage?",
]
ground_truth_list = [
    "If you want to build Milvus and run from source code, the recommended hardware requirements specification is:\n\n- 8GB of RAM\n- 50GB of free disk space.",
    "The programming language used to write Knowhere is C++.",
    "Before running code coverage, you should make sure that your code changes are covered by unit tests.",
]
contexts_list = []
answer_list = []
for question in tqdm(question_list, desc="Answering questions"):
    answer, contexts = my_rag.answer(question, return_retrieved_text=True)
    contexts_list.append(contexts)
    answer_list.append(answer)

df = pd.DataFrame(
    {
        "question": question_list,
        "contexts": contexts_list,
        "answer": answer_list,
        "ground_truth": ground_truth_list,
    }
)
rag_results = Dataset.from_pandas(df)
df

Answering questions:   0%|          | 0/3 [00:00<?, ?it/s]



Answering questions:  33%|███▎      | 1/3 [00:08<00:16,  8.25s/it]

To build and run Milvus from source code, the recommended hardware specifications are:

- 8GB of RAM
- 50GB of free disk space

These requirements are applicable for both physical and virtual machine resources.


Answering questions:  67%|██████▋   | 2/3 [00:12<00:05,  5.85s/it]

The programming language used to write Knowhere is C++. This information is found in the context where it states, "The algorithm library of Milvus, Knowhere is written in C++."


Answering questions: 100%|██████████| 3/3 [00:17<00:00,  5.88s/it]

Before running code coverage, it should be ensured that the code change is covered by unit tests. This is important to verify before submitting a pull request. The context specifies that developers should make sure their code change is covered by unit tests before proceeding with code coverage checks.





Unnamed: 0,question,contexts,answer,ground_truth
0,what is the hardware requirements specificatio...,[Hardware Requirements\n\nThe following specif...,"To build and run Milvus from source code, the ...",If you want to build Milvus and run from sourc...
1,What is the programming language used to write...,[Unless required by applicable law or agreed t...,The programming language used to write Knowher...,The programming language used to write Knowher...
2,What should be ensured before running code cov...,[Code coverage\n\nBefore submitting your pull ...,"Before running code coverage, it should be ens...","Before running code coverage, you should make ..."


In [38]:
rag_results[0]

{'question': 'what is the hardware requirements specification if I want to build Milvus and run from source code?',
 'contexts': ['Hardware Requirements\n\nThe following specification (either physical or virtual machine resources) is recommended for Milvus to build and run from source code.\n\n```\n- 8GB of RAM\n- 50GB of free disk space\n```\n\n##',
  "Software Requirements\n\nAll Linux distributions are available for Milvus development. However a majority of our contributor worked with Ubuntu or CentOS systems, with a small portion of Mac (both x86_64 and Apple Silicon) contributors. If you would like Milvus to build and run on other distributions, you are more than welcome to file an issue and contribute!\n\nHere's a list of verified OS types where Milvus can successfully build and run:\n\n- Debian/Ubuntu\n- Amazon Linux\n- MacOS (x86_64)\n- MacOS (Apple Silicon)\n\n##",
  'Building Milvus on a local OS/shell environment\n\nThe details below outline the hardware and software require

### Evaluation with RAGAS

In [40]:
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import Faithfulness

sample = SingleTurnSample(
        user_input=rag_results[0]['question'],
        response=rag_results[0]['answer'],
        retrieved_contexts=rag_results[0]['contexts']
    )
scorer = Faithfulness(llm=chat)
await scorer.single_turn_ascore(sample)

TypeError: object of type 'StringPromptValue' has no len()