In [None]:
import os
from typing import List, Tuple

import openai
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from pymilvus import MilvusClient
from tqdm.notebook import tqdm

In [None]:
# --- Source document and vector-store location ---
PDF_FILE = "WinRAR.pdf"
DB_FILE_PATH = "milvus_db.db"
COLLECTION_NAME = "WinRAR"

# --- Embedding model; EMBED_DIM is used as the collection's vector dimension ---
EMBED_MODEL_NAME = "intfloat/e5-large-v2"
EMBED_DIM = 1024

# --- Chat model; the API key is read from the environment (KeyError if unset) ---
OPENAI_MODEL = "gpt-3.5-turbo"
openai.api_key = os.environ["OPENAI_API_KEY"]

# Prompt with two positional "{}" slots, filled via str.format:
# first the retrieved context, then the user's question.
prompt_template = "\n".join([
    "You are a helpful assistant.",
    "Based on the question from the user, I have prepared some context that may be related to the question, which is given below:",
    "{}",
    "",
    "And here is the question: {}",
    "",
    "Please provide a useful answer.",
])

In [None]:
# Connect to the Milvus database stored at DB_FILE_PATH.
milvus_client = MilvusClient(uri=DB_FILE_PATH)

# Start from an empty collection: the database file persists between runs, so
# without this a re-run of the notebook would insert every chunk a second time
# under the same ids and searches would return duplicated hits.
if milvus_client.has_collection(collection_name=COLLECTION_NAME):
    milvus_client.drop_collection(collection_name=COLLECTION_NAME)

milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=EMBED_DIM,
    metric_type="L2",
    consistency_level="Strong"
)

In [None]:
print("Loading PDF and splitting texts...")
loader = PyPDFLoader(PDF_FILE)
# Use load() rather than load_and_split(): the latter already applies a default
# text splitter, so the documents would be split twice and the chunk boundaries
# would not be governed solely by the splitter configured below.
docs = loader.load()

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=100
)
split_docs = text_splitter.split_documents(docs)

# Fail with a clear message instead of an IndexError on split_docs[0] when the
# PDF produced no text at all.
if not split_docs:
    raise ValueError(f"No text chunks could be extracted from {PDF_FILE!r}")

print(
    f"Total splitted docs have {len(split_docs)} chunks, first chunk is:\n{split_docs[0].page_content}")

In [None]:
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)

# Embed all chunks through the batch API in one call instead of invoking
# embed_query once per chunk.
# NOTE(review): the e5 model card recommends prefixing inputs with "passage: "
# (documents) / "query: " (queries); no prefix is added here or at query time.
# Confirm whether that is intentional.
chunk_texts = [doc.page_content for doc in split_docs]
chunk_vectors = embeddings.embed_documents(chunk_texts)

# One row per chunk: sequential integer id, its vector, and the chunk text
# flattened onto a single line.
data_batch = [
    {
        "id": idx,
        "vector": vec,
        "text": text.replace("\n", " ")
    }
    for idx, (vec, text) in enumerate(zip(chunk_vectors, chunk_texts))
]

print("Vectors ready...")

insert_res = milvus_client.insert(
    collection_name=COLLECTION_NAME,
    data=data_batch
)

print(f"Insert done. Total inserted: {len(data_batch)}")

In [None]:
def milvus_search(query: str, top_k: int = 3):
    """Return up to ``top_k`` stored chunks that Milvus ranks closest to ``query``.

    The query is embedded with the notebook-level ``embeddings`` object and
    searched in ``COLLECTION_NAME`` through ``milvus_client``.

    Returns:
        A list of ``{"text": ..., "score": ...}`` dicts in the order Milvus
        returned the hits; ``score`` is the hit's reported ``distance``.
    """
    search_result = milvus_client.search(
        collection_name=COLLECTION_NAME,
        data=[embeddings.embed_query(query)],
        limit=top_k,
        output_fields=["text"]
    )

    # A single query vector was sent, so only the first result set is used.
    return [
        {"text": hit["entity"]["text"], "score": hit["distance"]}
        for hit in search_result[0]
    ]


def generate_answer_with_gpt(query: str, context_list: List[str]) -> Tuple[str, str]:
    """Ask the OpenAI chat model to answer ``query`` using the given context.

    Args:
        query: The user's question.
        context_list: Retrieved text snippets; each becomes one ``- `` bullet
            line in the context section of the prompt.

    Returns:
        ``(prompt, answer)``: the exact prompt sent to the model and the
        model's reply with surrounding whitespace stripped. The answer is an
        empty string when the response carries no text content.
    """
    context_str = "\n".join(f"- {c}" for c in context_list)
    prompt = prompt_template.format(context_str, query)

    response = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )

    # message.content may be None; avoid calling .strip() on None.
    ans = response.choices[0].message.content or ""
    return prompt, ans.strip()

In [None]:
user_question = "What limitations does WinRAR have?"
print(f"\n[User Question] {user_question}")

# Retrieval: fetch the five closest chunks, echoing each hit while gathering
# its text as context for the prompt.
print("------ Searching Milvus... ------")
top_docs = milvus_search(user_question, top_k=5)
context_texts = []
for rank, hit in enumerate(top_docs, start=1):
    print(f"Doc {rank} - Score {hit['score']} : {hit['text']}")
    context_texts.append(hit["text"])

# Generation: send question + context to the chat model, then show both the
# prompt that was sent and the answer that came back.
print("------ Generating answer with GPT... ------")
prompt, answer = generate_answer_with_gpt(user_question, context_texts)
print("\n===== GPT Prompt =====", prompt, sep="\n")
print("\n===== RAG Answer =====", answer, sep="\n")