# **ChromaDB**
This part is just for understanding the basic methodology of ChromaDB, skip it if you want.

In [1]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.31.1-py

In [2]:
import chromadb
chroma_client = chromadb.Client()

In [3]:
# store the embeddings, documents, and any additional metadata
collection = chroma_client.get_or_create_collection(name="my_collection")

In [4]:
documents = [
    {"id": "id1", "text": "This is a document about orange"},
    {"id": "id2", "text": "This is a document about pineapple"}
]


In [5]:
# sample collection data, update with collection.upsert
for doc in documents:
    collection.upsert(
        documents=[doc["text"]],
        metadatas=[{"source": doc["id"]}],
        ids=[doc["id"]]
    )

print(documents)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:07<00:00, 11.0MiB/s]


[{'id': 'id1', 'text': 'This is a document about orange'}, {'id': 'id2', 'text': 'This is a document about pineapple'}]


In [6]:
query_text = "This is a query document about florida." # Chroma will embed this for you

results = collection.query(
    query_texts=[query_text],
    n_results=2 # how many results to return
)

print(results)


{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['This is a document about orange', 'This is a document about pineapple']], 'uris': None, 'data': None, 'metadatas': [[{'source': 'id1'}, {'source': 'id2'}]], 'distances': [[1.145798921585083, 1.3179377317428589]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [7]:
for idx, documents in enumerate(results["documents"][0]):
    dox_id = results["ids"][0][idx]
    distance = results["distances"][0][idx]
    print(f" For the query: {query_text}, \n Found similar document: {documents} with ID: {dox_id} and distance {round(distance, 4)}")
    # print(f"\n\nQuestion #{idx + 1}\n")
    # for document in documents:
        # print(document)

 For the query: This is a query document about florida., 
 Found similar document: This is a document about orange with ID: id1 and distance 1.1458
 For the query: This is a query document about florida., 
 Found similar document: This is a document about pineapple with ID: id2 and distance 1.3179


## Built RAG

In [8]:
pip install groq

Collecting groq
  Downloading groq-0.20.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.20.0-py3-none-any.whl (124 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/124.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.20.0


In [9]:
import os
from groq import Client
import openai
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions

In [10]:
# free api from Groq
groq_api_key  = "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"

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=groq_api_key, model_name="text-embedding-ada-002"
)

# Initialize the Chroma client with persistence
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function=openai_ef
)

In [11]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

# Load Hugging Face embedding model
hf_embedding_model = SentenceTransformer("BAAI/bge-large-en")

# Define a ChromaDB-compatible embedding function
class HuggingFaceEmbeddingFunction(embedding_functions.EmbeddingFunction):
    def __call__(self, texts):
        return hf_embedding_model.encode(texts).tolist()

# Initialize the Chroma client with persistence
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"

# Create an instance of our embedding function
hf_embedding_func = HuggingFaceEmbeddingFunction()

# Create collection using Hugging Face embeddings
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=hf_embedding_func
)

print("✅ ChromaDB collection initialized successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

✅ ChromaDB collection initialized successfully!


In [12]:
print(collection.peek(5))  # See if documents exist


{'ids': [], 'embeddings': array([], dtype=float64), 'documents': [], 'uris': None, 'data': None, 'metadatas': [], 'included': [<IncludeEnum.embeddings: 'embeddings'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [13]:
client = Client(api_key=groq_api_key )


In [14]:
resp = client.chat.completions.create(
    model="llama3-8b-8192",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},

  ]
)

In [15]:
print(resp)

ChatCompletion(id='chatcmpl-109017a6-b902-46a1-87cc-4c20c09ddd79', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The Los Angeles Dodgers won the World Series in 2020, defeating the Tampa Bay Rays in the Fall Classic, 4 games to 2. It was the Dodgers' first World Series title since 1988.", role='assistant', function_call=None, reasoning=None, tool_calls=None))], created=1742837165, model='llama3-8b-8192', object='chat.completion', system_fingerprint='fp_179b0f92c9', usage=CompletionUsage(completion_tokens=45, prompt_tokens=31, total_tokens=76, completion_time=0.0375, prompt_time=0.004468999, queue_time=0.021059901, total_time=0.041968999), x_groq={'id': 'req_01jq4k1btmfw8bgz01f68mqgj8'})


In [16]:
print(resp.choices[0].message.content)

The Los Angeles Dodgers won the World Series in 2020, defeating the Tampa Bay Rays in the Fall Classic, 4 games to 2. It was the Dodgers' first World Series title since 1988.


In [17]:
# Function to load documents from a directory
def load_documents_from_directory(directory_path):
    print("==== Loading documents from directory ====")
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(
                os.path.join(directory_path, filename), "r", encoding="utf-8"
            ) as file:
                documents.append({"id": filename, "text": file.read()})
    return documents


In [18]:
# Function to split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

In [19]:
!pip install google-colab

Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [21]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Load documents from the directory
directory_path = "/content/drive/My Drive/Colab Notebooks/NLP Project/dataset_txt"
documents = load_documents_from_directory(directory_path)

print(f"Loaded {len(documents)} documents")

==== Loading documents from directory ====
Loaded 45 documents


In [23]:
for i in documents:
  print(i["id"])

NLP - Winter 2025 - Course Outline.txt
SEP775-NLP-HandsOn-Session.txt
Image Transformer.txt
lecture02-Neural Nets.txt
Assignment4-2025.txt
Layer Normalization.txt
Dense Passage Retrieval for Open-Domain Question Answering.txt
BLEU- a Method for Automatic Evaluation of Machine Translation.txt
Natural Language Processing (Almost) from Scratch.txt
lecture08-Question Answering.txt
Assignment2-2025.txt
Improving Distributional Similarity with Lessons Learned from Word Embeddings.txt
BI-DIRECTIONAL ATTENTION FLOW FOR MACHINE COMPREHENSION.txt
Attention Is All You Need.txt
lecture01-wordvecs11.txt
SEP775-python-review.txt
lecture07-Prompting, Instruction Finetuning, and RLHF .txt
gradient-notes.txt
Efficient Estimation of Word Representations in Vector Space.txt
N-gram Language Models.txt
Contextual Word Representations- A Contextual Introduction.txt
lecture09-Multimodal Models.txt
Latent Retrieval for Weakly Supervised Open Domain Question Answering.txt
Fine-Tuning and Masked Language Models

In [24]:
# Split documents into chunks
chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    print("==== Splitting docs into chunks ====")
    for i, chunk in enumerate(chunks):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})

print(f"Split documents into {len(chunked_documents)} chunks")

==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
==== Splitting docs into chunks ====
=

In [25]:
print(chunked_documents[70])

{'id': 'Assignment4-2025.txt_chunk1', 'text': '\ufeffMcMaster University SEP 775\n\nAssignment 4\nParameter Efficient Supervised Fine-Tuning of a Pretrained\n\nLanguage Model (Total: 100 Points)\n\nObjectives\n\nIn this assignment, you will explore the process of supervised fine-tuning on a small pretrained\nlanguage model originally trained with the next-token prediction (autoregressive) and not\nadapted with any post-training modifications or chat-specific templates. You will research\nparameter-efficient fine-tuning approaches, select an appropriate dataset from Hugging Face,\nand evaluate how fine-tuning influences the model’s behavior.\n\n1. Background and Supervised Fine-Tuning\n\n• Research and describe the concept of supervised fine-tuning.\n\n• Explain the characteristics of a base model that has been pretrained with next-token\nprediction and has not undergone additional post-training adjustments. Demonstrate your\nclaim with an example.\n\n• Compare standard next-token predi

In [26]:
# List of available models
models = client.models.list()
for model in models.data:
    print(model.id)

whisper-large-v3-turbo
distil-whisper-large-v3-en
mistral-saba-24b
deepseek-r1-distill-qwen-32b
llama-3.2-1b-preview
llama-3.2-90b-vision-preview
llama-guard-3-8b
llama-3.1-8b-instant
llama-3.2-11b-vision-preview
gemma2-9b-it
qwen-2.5-coder-32b
llama3-70b-8192
llama3-8b-8192
llama-3.3-70b-versatile
qwen-2.5-32b
whisper-large-v3
qwen-qwq-32b
llama-3.2-3b-preview
deepseek-r1-distill-llama-70b
llama-3.3-70b-specdec
allam-2-7b


In [27]:
from sentence_transformers import SentenceTransformer

# Load the embedding model (Make sure you install `sentence-transformers` first)
model = SentenceTransformer("BAAI/bge-large-en")

def get_huggingface_embedding(text):
    embedding = model.encode(text).tolist()  # Convert to list for storage
    print("==== Generating embeddings... ====")
    return embedding


In [28]:
print(len(chunked_documents))

2096


In [29]:
# Generate embeddings for the document chunks
for doc in chunked_documents:
    print("==== Generating embeddings... ====")
    doc["embedding"] = get_huggingface_embedding(doc["text"])

print(doc["embedding"])

==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embe

In [30]:
# Upsert documents with embeddings into Chroma
for doc in chunked_documents:
    print("==== Inserting chunks into db;;; ====")
    collection.upsert(
        ids=[doc["id"]], documents=[doc["text"]], embeddings=[doc["embedding"]]
    )

==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserti

In [31]:
# Function to query documents
def query_documents(question, n_results=2):
    # query_embedding = get_openai_embedding(question)
    results = collection.query(query_texts=[question], n_results=n_results)

    # Extract the relevant chunks
    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    print("==== Returning relevant chunks ====")
    return relevant_chunks

    # The following code was indented incorrectly and will not execute properly
    # It seems intended to print information about the returned results
    # for idx, document in enumerate(results["documents"][0]):
    #     doc_id = results["ids"][0][idx]
    #     distance = results["distances"][0][idx]
    #     print(f"Found document chunk: {document} (ID: {doc_id}, Distance: {distance})")



    # Extract the relevant chunks
    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    print("==== Returning relevant chunks ====")
    return relevant_chunks

    for idx, document in enumerate(results["documents"][0]):
        doc_id = results["ids"][0][idx]
        distance = results["distances"][0][idx]
        print(f"Found document chunk: {document} (ID: {doc_id}, Distance: {distance})")


In [32]:
def query_documents(question, n_results=2):
    # Ensure the input is wrapped in a list
    results = collection.query(query_texts=[question], n_results=n_results)

    # Check if the response contains expected keys
    if "documents" not in results or not results["documents"]:
        print("==== No relevant documents found ====")
        return []

    # Extract the relevant chunks
    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    print("==== Returning relevant chunks ====")

    # Print relevant documents and their metadata
    for idx, document in enumerate(results["documents"][0]):
        doc_id = results["ids"][0][idx]
        distance = results["distances"][0][idx]
        print(f"Found document chunk: {document} (ID: {doc_id}, Distance: {distance})")

    return relevant_chunks  # Ensure this comes after the for loop


In [33]:
# Function to generate a response from OpenAI
def generate_response(question, relevant_chunks):
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the answer concise."
        "\n\nContext:\n" + context + "\n\nQuestion:\n" + question
    )

    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
            {
                "role": "user",
                "content": question,
            },
        ],
    )

    answer = response.choices[0].message
    return answer

In [36]:
# Example query
# Example query and response generation
question = "tell me about Natural Language Processing."
relevant_chunks = query_documents(question)
answer = generate_response(question, relevant_chunks)

==== Returning relevant chunks ====
Found document chunk:  amounts of mostly unlabeled training data. This work is then used as a basis for
building a freely available tagging system with good performance and minimal computational re-
quirements.
Keywords: natural language processing, neural networks

1. Introduction

Will a computer program ever be able to convert a piece of English text into a programmer friendly
data structure that describes the meaning of the natural language text? Unfortunately, no consensus
has emerged about the form or the existence of such a data structure. Until such fundamental
Articial Intelligence problems are resolved, computer scientists must settle for the reduced objective
of extracting simpler representations that describe limited aspects of the textual information.

These simpler representations are often motivated by specific applications (for instance, bag-
of-words variants for information retrieval), or by our belief that they capture something mo

In [37]:
print(answer)

ChatCompletionMessage(content='Natural Language Processing (NLP) is a subfield of artificial intelligence that deals with the interaction between computers and human language. It involves the development of algorithms and statistical models that enable computers to process, understand, and generate natural language data.', role='assistant', function_call=None, reasoning=None, tool_calls=None)
