In [5]:
import os
import numpy as np
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.retrievers import EnsembleRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
# Initialize the LLM
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

folder_path = "document"  # Replace with the path to your folder containing PDFs

In [6]:
def load_documents_from_txt_folder(folder_path):
    """
    Load all TXT files from a specified folder and return them as a list of LangChain Documents.
    Args:
        folder_path (str): Path to the folder containing the TXT files.
    Returns:
        List[Document]: List of LangChain Documents.
    """
    documents = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            # Read text content from the file
            with open(file_path, "r", encoding="utf-8") as file:
                text_content = file.read()
            # Create a LangChain Document for each file
            doc = Document(page_content=text_content,
                           metadata={"source": file_name})
            documents.append(doc)
    return documents

In [7]:
# --- Step 2: Implement Small2Big Chunking ---
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document


def small2big_chunking(documents, min_chunk_size=200, max_chunk_size=1000):
    """
    Apply the Small2Big chunking strategy to a list of documents and provide statistics.

    Args:
        documents (List[Document]): List of documents to chunk.
        min_chunk_size (int): Minimum chunk size.
        max_chunk_size (int): Maximum chunk size.

    Returns:
        List[Document]: List of chunked documents.
    """
    chunked_docs = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=min_chunk_size, chunk_overlap=50)

    total_original_length = 0
    total_chunked_length = 0

    for doc in documents:
        # Keep track of the original length of the document
        total_original_length += len(doc.page_content)

        # Split into small chunks first
        small_chunks = text_splitter.split_text(doc.page_content)
        combined_chunk = ""

        for chunk in small_chunks:
            if len(combined_chunk) + len(chunk) <= max_chunk_size:
                combined_chunk += " " + chunk
            else:
                chunked_docs.append(
                    Document(page_content=combined_chunk.strip(), metadata=doc.metadata))
                combined_chunk = chunk

        # Add the last chunk if it's not empty
        if combined_chunk:
            chunked_docs.append(
                Document(page_content=combined_chunk.strip(), metadata=doc.metadata))

    # Calculate statistics
    total_chunked_length = sum(len(doc.page_content) for doc in chunked_docs)
    avg_chunk_size = total_chunked_length / \
        len(chunked_docs) if chunked_docs else 0

    print("=== Small2Big Chunking Statistics ===")
    print(f"Total Original Documents: {len(documents)}")
    print(f"Total Original Length: {total_original_length} characters")
    print(f"Total Chunks Created: {len(chunked_docs)}")
    print(f"Total Chunked Length: {total_chunked_length} characters")
    print(f"Average Chunk Size: {avg_chunk_size:.2f} characters")
    print(f"Minimum Chunk Size: {min_chunk_size} characters")
    print(f"Maximum Chunk Size: {max_chunk_size} characters")

    return chunked_docs

In [8]:
# --- Step 3: Multi-Query Expansion ---
from langchain.output_parsers import PydanticToolsParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate


class ParaphrasedQuery(BaseModel):
    """You have performed query expansion to generate a paraphrasing of a question."""
    paraphrased_query: str = Field(
        ...,
        description="A unique paraphrasing of the original question."
    )


system = """
You are an expert in query paraphrasing and expansion. 

Your task is to generate multiple different phrasings of the same user query. 
Ensure that each paraphrased query captures the original meaning while using different wording.

If there are multiple common ways to phrase the question, or common synonyms for key terms, ensure all are included.

You **must** return at least 3 distinct paraphrased versions of the input query.
"""


prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "{question}"),
])

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
llm_with_tools = llm.bind_tools([ParaphrasedQuery])
query_analyzer = prompt | llm_with_tools | PydanticToolsParser(tools=[
                                                               ParaphrasedQuery])


def expand_query(query):
    """
    Generate multiple semantically similar queries using an LLM.
    Args:
        query (str): Original query.
    Returns:
        List[str]: List of expanded queries.
    """
    try:
        response = query_analyzer.invoke({"question": query})
        # Extract paraphrased queries
        expanded_queries = [
            q.paraphrased_query for q in response if isinstance(q, ParaphrasedQuery)]

        if not expanded_queries or len(expanded_queries) < 3:
            print(
                "Warning: Less than 3 paraphrased queries returned. Consider adjusting the prompt.")

        print(f"Expanded Queries: {expanded_queries}\n")
        return expanded_queries

    except Exception as e:
        print(f"Error during query expansion: {e}")
        return []

In [9]:
# --- Step 4: Combine Results from Multi-Query Retrieval ---
def multi_query_retrieval(expanded_queries, retriever, top_k=5):
    """
    Perform retrieval using multiple queries and combine results.
    """
    combined_results = []
    for query in expanded_queries:
        results = retriever.get_relevant_documents(query)[:top_k]
        combined_results.extend(results)

    # Deduplicate results based on document content
    unique_docs = {doc.page_content: doc for doc in combined_results}
    return list(unique_docs.values())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity



def rank_with_cove(query, retrieved_docs, embeddings, top_k=5):
    """
    Rank retrieved documents based on relevance using embeddings.
    Args:
        query (str): The user's input query.
        retrieved_docs (List[Document]): List of retrieved documents.
        embeddings: The embedding model used for similarity calculations.
        top_k (int): The number of top documents to return.
    Returns:
        List[Document]: The top-ranked documents.
    """
    # Step 1: Generate embedding for the query
    query_embedding = embeddings.embed_query(query)
    # Step 2: Compute embeddings for the retrieved documents
    doc_contents = [doc.page_content for doc in retrieved_docs]

    doc_embeddings = embeddings.embed_documents(doc_contents)

    # Step 3: Calculate cosine similarity between query and document embeddings

    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]

    # Step 4: Rank documents by similarity score

    ranked_indices = sorted(range(len(similarities)),
                            key=lambda i: similarities[i], reverse=True)

    # Step 5: Select top-k documents

    ranked_docs = [retrieved_docs[i] for i in ranked_indices[:top_k]]

    return ranked_docs

In [11]:
def optimized_rag_chain(query, retriever, llm, embeddings, top_k=5):
    """
    Perform retrieval-augmented generation with query optimization.
    Args:
        query (str): The user's input query.
        retriever: The retrieval mechanism.
        llm: The language model.
        embeddings: The embedding model for ranking documents.
        top_k (int): The number of top documents to retrieve.

    Returns:
        dict: A dictionary with the generated response and retrieved documents.
    """
    # Step 1: Expand the query using the LLM
    expanded_queries = expand_query(query)

    # Step 2: Retrieve documents using expanded queries
    retrieved_docs = multi_query_retrieval(expanded_queries, retriever, top_k)

    # Step 3: Rank documents using CoVe
    ranked_docs = rank_with_cove(query, retrieved_docs, embeddings, top_k)

    # Step 4: Combine the retrieved documents into a single context
    context = "\n\n".join([doc.page_content for doc in ranked_docs])

    # Step 5: Define the system prompt with the context
    system_prompt = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know.\n\n"
        f"{context}\n\n"
        "Question: {query}"
    )

    # Step 6: Format the final prompt for the LLM
    final_prompt = system_prompt.format(query=query)
    # print(final_prompt)
    # Step 7: Get the answer from the LLM
    response = llm(final_prompt)  # Assuming llm takes input as a dictionary

    return {"query": query, "response": response, "retrieved_docs": ranked_docs}

In [12]:
documents = load_documents_from_txt_folder(folder_path)

In [13]:
# Apply Small2Big chunking
chunked_documents = small2big_chunking(documents)

=== Small2Big Chunking Statistics ===
Total Original Documents: 1
Total Original Length: 6005 characters
Total Chunks Created: 7
Total Chunked Length: 6517 characters
Average Chunk Size: 931.00 characters
Minimum Chunk Size: 200 characters
Maximum Chunk Size: 1000 characters


In [15]:
# Initialize retrievers
bm25_retriever = BM25Retriever.from_documents(chunked_documents)

In [16]:
vectorstore = InMemoryVectorStore.from_documents(
    documents=chunked_documents, embedding=OpenAIEmbeddings()
)
vector_retriever = vectorstore.as_retriever()

# Combine retrievers using EnsembleRetriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever], weights=[0.5, 0.5]
)

In [20]:
# List of queries
queries = [
    "Thông tin Usage Data bao gồm những gì?",
]

# Initialize LLM and embeddings
llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

# Initialize a list to store results
results_list = []

# Loop through each query, perform RAG, and store the results
for query in queries:
    result = optimized_rag_chain(
        query=query,
        retriever=ensemble_retriever,
        llm=llm,
        embeddings=embeddings,
        top_k=5,
    )
    # Append the result to the list
    results_list.append(result)

Expanded Queries: ['What does Usage Data information consist of?']



In [21]:
# Display results
for result in results_list:
    print("=== Query ===")
    print(result["query"])
    print(result["response"].pretty_repr())
    print("\n")

=== Query ===
Thông tin Usage Data bao gồm những gì?

Thông tin Usage Data bao gồm các dữ liệu như địa chỉ IP của máy tính, loại trình duyệt, phiên bản trình duyệt, các trang của dịch vụ mà bạn truy cập, thời gian và ngày tháng của chuyến thăm, thời gian bạn dành cho những trang đó, các định danh thiết bị duy nhất và các dữ liệu chẩn đoán khác.




In [22]:
for result in results_list:
    # Loop through the retrieved documents for this query
    for i, doc in enumerate(result["retrieved_docs"], start=1):
        if isinstance(doc, Document):
            print(f"--- Document {i} ---")
            print(f"Metadata: {doc.metadata}")
            print(f"Content: {doc.page_content}\n")

--- Document 1 ---
Metadata: {'source': 'privacy_policy.txt'}
Content: We may also collect information that your browser sends whenever you visit our Service or when you access the Service by or through a mobile device ("Usage Data"). This Usage Data may include ("Usage Data"). This Usage Data may include information such as your computer's Internet Protocol address (e.g. IP address), browser type, browser version, the pages of our Service that you visit, version, the pages of our Service that you visit, the time and date of your visit, the time spent on those pages, unique device identifiers, and other diagnostic data. USE OF DATA Presight uses the collected data for various purposes:
- To provide and maintain our Service
- To notify you about changes to our Service - To notify you about changes to our Service
- To allow you to participate in interactive features of our Service when you choose to do so
- To provide customer support

--- Document 2 ---
Metadata: {'source': 'privacy_pol