## **🚀 NaNsense**  
- **Database**: ChromaDB
- **Embedding Model**: sentence-transformers/all-MiniLM-L6-v2
- **LLM for Generation**: gpt-4o


# 1. Knowledge Base Preparation

## 1.1 Pre-process documents.

In [None]:
import os
from tqdm import tqdm
from src.preprocessing import filter_json_file

folder_path = "data/hackathon_data"

# The cleaned corpus is produced only when data/clean does not exist yet;
# if the directory is already there, the whole filtering pass is skipped.
if not os.path.exists("data/clean"):
    os.makedirs("data/clean")
    for filename in tqdm(os.listdir(folder_path)):
        # Only the JSON dumps are filtered; any other file is ignored.
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(folder_path, filename)
        filter_json_file(filepath, "data/clean")

## 2.3 Document Indexing and Storage

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import json
import os
from tqdm import tqdm

def chunk_documents(documents, chunk_size=500, chunk_overlap=100):
    """
    Break each document into overlapping text chunks for retrieval.

    Args:
        documents: Iterable of mappings, each with a "content" entry (the text
            to split) and a "metadata" entry (a mapping copied onto every
            chunk produced from that document).
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between consecutive chunks, also in characters.

    Returns:
        A flat list of ``Document`` objects. Each one carries the parent
        document's metadata plus a "chunk_id" giving the chunk's position
        within that parent document.
    """
    from langchain.schema import Document

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # tqdm wraps the outer iteration so progress is reported per document.
    return [
        Document(
            page_content=piece,
            metadata={**doc["metadata"], "chunk_id": position},
        )
        for doc in tqdm(documents)
        for position, piece in enumerate(splitter.split_text(doc["content"]))
    ]

#### Option 1: Without Metadata

In [2]:
# Walk the data/clean folder and collect one raw document per page URL.
# Each entry has the {"content": ..., "metadata": {...}} shape that
# chunk_documents() consumes.
documents = []
for filename in tqdm(os.listdir("data/clean")):
    if not filename.endswith(".json"):
        continue
    filepath = os.path.join("data/clean", filename)
    # Pin the encoding: without it open() falls back to the platform locale,
    # which can mis-decode or reject non-ASCII page text (e.g. on Windows).
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file is fully parsed at this point, so it can be closed before the
    # pages are iterated.
    for url, page_text in data["text_by_page_url"].items():
        documents.append({"content": page_text, "metadata": {"source": url}})

100%|██████████| 13144/13144 [00:12<00:00, 1070.66it/s]


#### Option 2: With Metadata

In [None]:
from src.fuzzy_metadata import fuzzy_is_meta

# Alternative to Option 1: rebinds `documents` with the result of the
# project helper instead of the plain per-URL list.
# NOTE(review): the result is fed to chunk_documents(), so it presumably is a
# list of {"content", "metadata"} mappings — confirm in src/fuzzy_metadata.
documents= fuzzy_is_meta(use_all_doc=True)

#### Continue as before

In [3]:
# Split every loaded document into chunks using the default
# chunk_size=500 / chunk_overlap=100 of chunk_documents().
chunked_documents = chunk_documents(documents)

100%|██████████| 258097/258097 [02:01<00:00, 2123.07it/s]


In [4]:
from src.keyword_retrieval import fuzzy_search

# Run the project's fuzzy keyword search over the chunked corpus.
# NOTE(review): the result is indexable and, per the sample output, holds
# Document objects; the exact matching rules live in src/keyword_retrieval —
# confirm there.
keywords = fuzzy_search(["France", "Cheese", "Wine"], chunked_documents)


Fuzzy searching: 100%|██████████| 3976253/3976253 [01:21<00:00, 49037.01it/s]


In [11]:
# Display one entry of the fuzzy-search result for manual inspection.
keywords[20]

Document(metadata={'source': 'https://www.thehenryrestaurant.com/locations/the-henry-west-hollywood/menus/dinner-menu/', 'chunk_id': 2}, page_content='fig, pumpkin seed, candied pecan, pecorino, mustard vinaigrette Entrées Wagyu Cheeseburger* 25 lettuce, tomato, pickle, charred onion, white cheddar, american cheese, henry sauce Scottish Salmon* 38 toasted quinoa, marcona almond pesto, crispy sweet potato, watercress, pomegranate glaze Filet Mignon* 56 horseradish gratin, roasted brussels sprout, wild mushroom, cipollini onion, burgundy sauce Add Lobster 24 Bolognese 29 garganelli pasta, truffle mushroom butter, herbed ricotta, garlic toast')

# 3. Retrieval Augmented Generation

## 3.1 Load Knowledge Database

In [17]:
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings

def get_local_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Build a locally-run HuggingFace embedding model.

    Args:
        model_name: Identifier of the HuggingFace model to load.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured to run on the GPU when
        CUDA is available (CPU otherwise) and to normalize its embeddings.
    """
    # Pick the GPU when torch reports one, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )

In [18]:
def create_vector_db(documents, persist_directory="./chroma_db"):
    """
    Embed the given documents into a Chroma store and persist it to disk.

    Args:
        documents: Sequence of ``Document`` objects to index (its length is
            reported in the confirmation message).
        persist_directory: Directory in which the Chroma store is kept.

    Returns:
        The populated ``Chroma`` vector store.
    """
    embedding_model = get_local_embeddings()

    store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
    store.persist()

    print(f"Vector database created with {len(documents)} chunks and saved to {persist_directory}")
    return store

In [None]:
# Index the chunked Document objects, not the raw `documents` list: the raw
# entries are plain {"content", "metadata"} dicts, which Chroma.from_documents
# cannot read (it needs objects exposing page_content / metadata).
vector_db = create_vector_db(chunked_documents)

## 3.2 Relevant Document Retrieval

Feel free to check and improve your retrieval performance, as it affects the generation results significantly.

In [25]:
def retrieve_documents(query, vectordb, k=1):
    """
    Fetch the documents most similar to ``query`` from the vector store.

    Args:
        query: The search string.
        vectordb: Vector store exposing ``as_retriever``.
        k: Number of documents requested from the similarity search.

    Returns:
        Whatever the store's similarity retriever returns for ``query``.
    """
    search_kwargs = {"k": k}
    return vectordb.as_retriever(
        search_type="similarity",
        search_kwargs=search_kwargs,
    ).get_relevant_documents(query)

## 3.3 Response Generation

In [None]:
from src.prompts import generate_answer, load_prompts

# End-to-end example: retrieve context for one question, then generate an answer.
query = "What company is located in 29010 Commerce Center Dr., Valencia, 91355, California, US?"
retrieved_docs = retrieve_documents(query, vector_db)
prompts = load_prompts()
prompt_template = prompts["rag_default"]
response = generate_answer(query, retrieved_texts=retrieved_docs, prompt_template=prompt_template, model="gpt-4o")

print("Query:", query)
# Show the text that was actually retrieved rather than a hard-coded
# placeholder, so the printed context matches what the model was given.
print("Retrieved Documents:", [doc.page_content for doc in retrieved_docs])
print("Generated Answer:", response)

# 4. Evaluation

In [None]:
from src.evaluate import evaluate_rag_system, save_evaluation_results

# Evaluation settings.
benchmark_file = "benchmark.json"
k = 3
prompt_template_name = "rag_default"
model = "gpt-4o"

print("Starting evaluation...")
evaluation_results = evaluate_rag_system(
    benchmark_file=benchmark_file,
    k=k,
    prompt_template_name=prompt_template_name,
    model=model,
)

# Metrics whose name contains "percentage" are shown with two decimals and a
# percent sign; everything else is printed as-is.
print("\nEvaluation Results:")
for metric, value in evaluation_results["metrics"].items():
    if "percentage" in metric:
        print(f"{metric}: {value:.2f}%")
    else:
        print(f"{metric}: {value}")

save_evaluation_results(evaluation_results, "evaluation_results.json")

# Show the first three benchmark examples for a quick sanity check.
print("\nExample Results:")
for i, result in enumerate(evaluation_results["results"][:3]):
    verdict = 'Yes' if result['exact_match'] == 1.0 else 'No'
    print(f"\nExample {i+1}:")
    print(f"Question: {result['question']}")
    print(f"Reference: {result['reference_answer']}")
    print(f"Prediction: {result['predicted_answer']}")
    print(f"Exact Match: {verdict}")