In [1]:
!pip install chromadb pypdf2 python-docx sentence-transformers --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m84.2 MB/s[0m eta [36m0:

# Document Processing
We will support three input document formats:


*   Text Files
*   PDF Files
*   Docx





In [2]:
import docx
import PyPDF2
import os

def read_text_file(file_path: str):
  """Return the entire contents of a UTF-8 text file as a single string."""
  with open(file_path, mode='r', encoding="utf-8") as handle:
    contents = handle.read()
  return contents

def read_pdf_file(file_path: str):
  """Read content from a PDF file.

  Args:
    file_path: Path of the PDF to open (read in binary mode).

  Returns:
    The text extracted from every page, in page order, with a newline
    appended after each page. A page that yields no text contributes only
    its newline.
  """
  page_texts = []
  with open(file_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    for page in pdf_reader.pages:
      # Defensive: if extract_text() hands back None (e.g. a page with no
      # text layer), concatenating it with "\n" would raise TypeError.
      page_texts.append((page.extract_text() or "") + "\n")
  # Join once instead of growing a string with += on every page.
  return "".join(page_texts)

def read_docx_file(file_path: str):
  """Return the text of a .docx file, one line per paragraph."""
  document = docx.Document(file_path)
  paragraph_texts = (para.text for para in document.paragraphs)
  return "\n".join(paragraph_texts)

In [3]:
# creating a unified function to read any document

def read_document(file_path: str):
  """Read a document by dispatching on its (case-insensitive) file extension.

  Supports '.txt', '.pdf' and '.docx'; any other extension raises ValueError.
  """
  file_extension = os.path.splitext(file_path)[1].lower()

  if file_extension == '.txt':
    return read_text_file(file_path)
  if file_extension == '.pdf':
    return read_pdf_file(file_path)
  if file_extension == '.docx':
    return read_docx_file(file_path)

  raise ValueError(f"unsupported file extension: {file_extension}")

# Creating Text Chunks

In [4]:
def split_text(text: str, chunk_size: int = 500):
  """Split extracted document text into sentence-aligned chunks.

  Newlines are flattened to spaces and the text is split on '. '. Sentences
  are then packed greedily into chunks of at most ``chunk_size`` characters,
  counting the single space that joins neighbouring sentences.

  Args:
    text: Raw text extracted from a document.
    chunk_size: Maximum chunk length in characters. A single sentence longer
      than this is kept whole and becomes its own oversized chunk.

  Returns:
    A list of chunk strings in document order; empty if ``text`` holds no
    sentences.
  """
  sentences = text.replace('\n', ' ').split('. ')
  chunks = []
  current_chunk = []
  current_size = 0

  for sentence in sentences:
    sentence = sentence.strip()
    if not sentence:
      continue
    # Splitting on '. ' drops the period; restore it, but leave sentences that
    # already end in terminal punctuation alone (avoids "?." and "!.").
    if not sentence.endswith(('.', '!', '?')):
      sentence += '.'

    # The chunk is built with ' '.join, so every sentence after the first
    # costs one extra character for the separator.
    added_size = len(sentence) + (1 if current_chunk else 0)

    if current_chunk and current_size + added_size > chunk_size:
      chunks.append(' '.join(current_chunk))
      current_chunk = [sentence]
      current_size = len(sentence)
    else:
      current_chunk.append(sentence)
      current_size += added_size

  if current_chunk:
    chunks.append(' '.join(current_chunk))

  return chunks

# Setting up ChromaDB

In [5]:
import chromadb
from chromadb.utils import embedding_functions

# Persistent Chroma client rooted at the relative path "chromadb".
# NOTE(review): because the store is persistent, re-running the ingestion
# cell will presumably see IDs from earlier runs — confirm how duplicates
# are handled by the installed chromadb version.
client = chromadb.PersistentClient(path = "chromadb")

# Embed documents and queries with the "all-MiniLM-L6-v2" sentence-transformers model.
sentence_transformer_embedding = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name = "all-MiniLM-L6-v2"
)

# Fetch the "documents" collection (creating it if needed, per the method name)
# and attach the embedding function defined above.
collection = client.get_or_create_collection(
    name = "documents",
    embedding_function = sentence_transformer_embedding
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Inserting Data in ChromaDB

In [6]:
def process_document(file_path: str):
  """Prepare one document for insertion into ChromaDB.

  Reads the file, splits its text into chunks and builds one ID and one
  metadata record per chunk. IDs have the form ``"<file name>_chunk_<i>"``;
  metadata records are ``{"source": <file name>, "chunk": <i>}``.

  Args:
    file_path: Path of the document to process.

  Returns:
    A tuple ``(ids, chunks, metadata)`` of three equally long lists. Three
    empty lists are returned when the document has no text or when any error
    occurs; in both cases a message is printed instead of raising, so one bad
    file does not stop a batch run.
  """
  try:
    content = read_document(file_path)
    # Treat None, "" and whitespace-only text alike: nothing to index.
    if content is None or not content.strip():
      print(f"Warning: No content extracted from {file_path}")
      return [], [], []
    chunks = split_text(content)

    file_name = os.path.basename(file_path)
    # Named `ids` rather than `id` to avoid shadowing the builtin.
    ids = [f"{file_name}_chunk_{i}" for i in range(len(chunks))]
    metadata = [{"source": file_name, "chunk": i} for i in range(len(chunks))]

    return ids, chunks, metadata

  except Exception as e:
    # Deliberate best-effort: report the failure and yield nothing for this file.
    print(f"Error Processing Data from {file_path}: {str(e)}")
    return [], [], []

In [7]:
def add_to_collection(collection, id, texts, metadata):
  """Insert chunks into the collection, at most 100 per ``collection.add`` call.

  Does nothing when ``texts`` is empty. ``id``, ``texts`` and ``metadata``
  are sliced with the same bounds, so they are expected to be parallel lists.
  """
  if not texts:
    return

  batch_size = 100
  total = len(texts)
  start = 0
  while start < total:
    stop = min(start + batch_size, total)
    collection.add(
        documents = texts[start:stop],
        metadatas = metadata[start:stop],
        ids = id[start:stop]
    )
    start += batch_size

In [8]:
def process_and_add_documents(collection, folder_path: str):
  """Ingest every regular file directly inside ``folder_path``.

  Sub-directories are skipped (no recursion). Each file is chunked with
  ``process_document`` and stored with ``add_to_collection``; progress is
  printed per file.
  """
  candidates = (os.path.join(folder_path, entry) for entry in os.listdir(folder_path))
  files = [path for path in candidates if os.path.isfile(path)]

  for file_path in files:
    print(f"Processing {os.path.basename(file_path)}")
    chunk_ids, texts, metadata = process_document(file_path)
    add_to_collection(collection, chunk_ids, texts, metadata)
    print(f"Added {len(texts)} chunks to collection")

In [9]:
# Ingest every file in the folder below into the collection.
# NOTE(review): the path points into a mounted Google Drive, but no
# drive.mount(...) call is visible in this notebook — confirm it is mounted first.
folder_path = "/content/drive/MyDrive/Docs"
process_and_add_documents(collection, folder_path)

Processing Company_ QuantumNext Systems.docx
Added 2 chunks to collection
Processing Company_ GreenFields BioTech.docx
Added 2 chunks to collection
Processing Company_ TechWave Innovations.docx
Added 1 chunks to collection
Processing GreenGrow Innovations_ Company History.docx
Added 5 chunks to collection
Processing GreenGrow's EcoHarvest System_ A Revolution in Farming.pdf
Added 6 chunks to collection


# Semantic Search

In [10]:
def semantic_search(collection, query: str, n_results: int = 2):
  """Query the collection with one text and return the raw result unchanged.

  ``n_results`` is forwarded as the number of matches to request.
  """
  return collection.query(query_texts = [query], n_results = n_results)

In [11]:
def get_context_with_sources(results):
  """Turn a query result into a context string and a list of source labels.

  Only the first entry of ``results['documents']`` and
  ``results['metadatas']`` is used (the result for the first query text).
  Documents are joined with blank lines; each label reads
  ``"<source> (chunk <n>)"``.
  """
  first_docs = results['documents'][0]
  context = "\n\n".join(first_docs)

  sources = []
  for meta in results['metadatas'][0]:
    sources.append(f"{meta['source']} (chunk {meta['chunk']})")

  return context, sources

In [12]:
# Run a sample search and display the raw result dictionary.
# NOTE(review): "GreenGroq" looks like a typo for "GreenGrow" — confirm whether
# the misspelling is intentional (e.g. to show that semantic search tolerates it).
query = "When was GreenGroq invented?"
results = semantic_search(collection, query)
results

{'ids': [['GreenGrow Innovations_ Company History.docx_chunk_0',
   'GreenGrow Innovations_ Company History.docx_chunk_1']],
 'embeddings': None,
 'documents': [['GreenGrow Innovations was founded in 2010 by Sarah Chen and Michael Rodriguez, two agricultural engineers with a passion for sustainable farming. The company started in a small garage in Portland, Oregon, with a simple mission: to make farming more environmentally friendly and efficient. In its early days, GreenGrow focused on developing smart irrigation systems that could significantly reduce water usage in agriculture.',
   'Their first product, the WaterWise Sensor, was launched in 2012 and quickly gained popularity among local farmers. This success allowed the company to expand its research and development efforts. By 2015, GreenGrow had outgrown its garage origins and moved into a proper office and research facility in the outskirts of Portland.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'd

The raw result dictionary is hard to read, so let's print it in a cleaner format.

In [13]:
def print_search_results(results):
  """Pretty-print each hit of a query result: rank, source, distance and text.

  Only the first entry of each result list is read (the result for the
  first query text).
  """
  header = "\nSearch Results:\n" + "-" * 50
  print(header)

  for index, doc in enumerate(results['documents'][0]):
    metadata = results['metadatas'][0][index]
    distance = results['distances'][0][index]

    print(f"\nResult: {index+1}")
    print(f"Source: {metadata['source']}, Chunk {metadata['chunk']}")
    print(f"Distance: {distance}")
    print(f"Content: {doc}")

In [14]:
# Show the results of the sample query in the readable format defined above.
print_search_results(results)


Search Results:
--------------------------------------------------

Result: 1
Source: GreenGrow Innovations_ Company History.docx, Chunk 0
Distance: 0.7566683292388916
Content: GreenGrow Innovations was founded in 2010 by Sarah Chen and Michael Rodriguez, two agricultural engineers with a passion for sustainable farming. The company started in a small garage in Portland, Oregon, with a simple mission: to make farming more environmentally friendly and efficient. In its early days, GreenGrow focused on developing smart irrigation systems that could significantly reduce water usage in agriculture.

Result: 2
Source: GreenGrow Innovations_ Company History.docx, Chunk 1
Distance: 0.8583546876907349
Content: Their first product, the WaterWise Sensor, was launched in 2012 and quickly gained popularity among local farmers. This success allowed the company to expand its research and development efforts. By 2015, GreenGrow had outgrown its garage origins and moved into a proper office and resea

# Setting up Gemini

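The Gemini API key is loaded from the Colab secret `GOOGLE_API_KEY` in the next cell. Never paste the key itself into the notebook; any key that was previously written here should be revoked and rotated.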

In [15]:
import google.generativeai as genai
from google.colab import userdata
import os

# Read the Gemini key from the Colab secret named 'GOOGLE_API_KEY' so the key
# never appears in the notebook itself.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Configure the google.generativeai client with that key for all later calls.
genai.configure(api_key=GOOGLE_API_KEY)

In [16]:
def get_prompt(context: str, conversation_history: str, query: str):
  """Assemble the Gemini prompt from retrieved context, chat history and question.

  The template asks the model to answer from the context, to fall back to the
  conversation history, and otherwise to reply that it cannot answer.
  """
  return f""" based on the following contextand conversation history, please provide a
  relevant aand contexual response. If the answer cannot be derived from the context,
  only use the conversation history or say 'I cannot answer this based on the provided information'

  Context from documents: {context}
  previous conversation: {conversation_history}
  Human: {query}
  Assistant:"""

In [17]:
def generate_response(query: str, context: str, conversation_history: str = ""):
    """Ask Gemini ("gemini-2.5-flash") to answer ``query`` from the given context.

    The prompt is built by ``get_prompt``. Generation uses temperature 0.0 and
    at most 500 output tokens. Any exception raised while generating is not
    propagated; it is returned as an "Error generating response: ..." string.
    """
    full_prompt = get_prompt(context, conversation_history, query)
    gemini = genai.GenerativeModel("gemini-2.5-flash")
    settings = {
        "temperature": 0.0,
        "max_output_tokens": 500,
    }
    try:
        reply = gemini.generate_content(full_prompt, generation_config=settings)
        return reply.text
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Testing our RAG

In [18]:
def rag_query(collection, query:str, n_chunks: int = 2):
  """Retrieve the most relevant chunks and generate an answer from them.

  Returns a ``(response, sources)`` pair, where ``sources`` labels the chunks
  that were supplied to the model as context.
  """
  search_results = semantic_search(collection, query, n_chunks)
  context, sources = get_context_with_sources(search_results)
  return generate_response(query, context), sources

In [19]:
# End-to-end check with a question the indexed documents can answer.
query = "When was GreenGrow innovations founded?"
response, sources = rag_query(collection, query)

# Show the question, the generated answer and the chunks used as context.
print('\nQuery:', query)
print('\nAnswer:', response)
print('\nSources used:')
for source in sources:
  print(f" - {source}")


Query: When was GreenGrow innovations founded?

Answer: GreenGrow Innovations was founded in 2010.

Sources used:
 - GreenGrow Innovations_ Company History.docx (chunk 0)
 - GreenGrow Innovations_ Company History.docx (chunk 4)


In [20]:
# Negative check: the question is about a company not covered by the indexed
# documents, so the prompt's "I cannot answer..." fallback should be returned.
query = "When was Tesla founded?"
response, sources = rag_query(collection, query)

# Sources are still listed because retrieval always returns the nearest chunks.
print('\nQuery:', query)
print('\nAnswer:', response)
print('\nSources used:')
for source in sources:
  print(f" - {source}")


Query: When was Tesla founded?

Answer: I cannot answer this based on the provided information.

Sources used:
 - Company_ GreenFields BioTech.docx (chunk 0)
 - GreenGrow Innovations_ Company History.docx (chunk 0)


# Session & Message Management

In [21]:
import uuid
from datetime import datetime
import json

# In-memory store mapping session_id -> list of message dicts
# ({"role", "content", "timestamp"}); nothing in this notebook persists it.
conversations = {}

def create_session():
  """Register an empty conversation under a fresh random UUID and return that id."""
  new_id = str(uuid.uuid4())
  conversations.update({new_id: []})
  return new_id

In [22]:
def add_message(session_id: str, role: str, content: str):
  """Append one timestamped message to a session, creating the session if missing.

  The stored record has the keys "role", "content" and "timestamp" (local
  time in ISO-8601 format).
  """
  session_messages = conversations.setdefault(session_id, [])
  entry = {
      "role": role,
      "content": content,
      "timestamp": datetime.now().isoformat()
  }
  session_messages.append(entry)

In [23]:
def get_conversation_history(session_id: str, max_messages: int = None):
  """Get conversation history for a session.

  Args:
    session_id: Key of the session in ``conversations``.
    max_messages: Maximum number of most recent messages to return.
      ``None`` (the default) returns the full history; zero or a negative
      number returns an empty list.

  Returns:
    A new list of message dicts, oldest first. Unknown sessions give an
    empty list. The list is always a copy, so callers cannot accidentally
    mutate the stored history.
  """
  if session_id not in conversations:
    return []

  history = conversations[session_id]
  if max_messages is None:
    return list(history)
  # history[-0:] would be the whole list, so non-positive limits are handled
  # explicitly instead of relying on slice arithmetic.
  if max_messages <= 0:
    return []

  return history[-max_messages:]

In [24]:
# Build a formatted conversation history, i.e. label each message with the appropriate role

def format_conversation_history(session_id: str, max_messages: int = 5):
  """Render the latest messages of a session as prompt-ready text.

  Each message becomes "Human: ..." when its role is "user" and
  "Assistant: ..." for any other role; messages are separated by a blank
  line and the result is stripped of surrounding whitespace.
  """
  rendered = []
  for msg in get_conversation_history(session_id, max_messages):
    speaker = "Assistant" if msg["role"] != "user" else "Human"
    rendered.append(f"{speaker}: {msg['content']}")

  return "\n\n".join(rendered).strip()

In [25]:
def contextualize_query(query: str, conversation_history: str):
    """
    Reformulate follow-up questions into standalone queries using Gemini.

    Args:
        query: The latest user question.
        conversation_history: Formatted prior conversation; may be empty.

    Returns:
        The rewritten standalone question. The original ``query`` is returned
        unchanged when there is no history to resolve references against,
        when the model call fails, or when the model returns an empty rewrite.
    """
    # With no prior turns there is nothing to contextualize, so skip the
    # model round-trip entirely.
    if not conversation_history or not conversation_history.strip():
        return query

    prompt = """Given a chat history and the latest user question
which might reference context in the chat history, formulate a standalone
question which can be understood without the chat history.
Do NOT answer the question, just reformulate it if needed and otherwise return it as is.

Chat history:
{history}

Question:
{question}

Rewritten standalone question:""".format(
        history=conversation_history.strip(),
        question=query.strip()
    )
    model = genai.GenerativeModel("gemini-2.5-flash")
    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "temperature": 0.0,
                "max_output_tokens": 100,
            }
        )
        rewritten = response.text.strip()
    except Exception as e:
        print(f"Error contextualizing query: {str(e)}")
        return query  # Fallback to original query

    # An empty rewrite would be useless as a search query; keep the original.
    return rewritten or query

# Combining RAG with history

In [26]:
def get_prompt(context, conversation_history, query):
  """Assemble the Gemini prompt from retrieved context, chat history and question.

  This redefines the earlier ``get_prompt`` in the notebook with a different
  template; the model is asked to answer from the context, fall back to the
  conversation history, or state that it cannot answer.
  """
  return f"""Based on the following context and conversation history, please provide
           a relevant and contexual respone. If the answer cannot be deriveed from the
           content, only use the conversation history or say 'I cannot answer this
           based on the provided context.'
           context from documents: {context}
           previous conversation: {conversation_history}
           Human: {query}
           Assistant: """

In [27]:
def generate_response(query: str, context: str, conversation_history: str = ""):
    """Answer ``query`` with Gemini ("gemini-2.5-flash") using context and history.

    The prompt comes from ``get_prompt``; generation runs at temperature 0.0
    with a 500-token output cap. Failures during generation are returned as
    an "Error generating response: ..." string rather than raised.
    """
    prompt_text = get_prompt(context, conversation_history, query)
    llm = genai.GenerativeModel("gemini-2.5-flash")
    try:
        result = llm.generate_content(
            prompt_text,
            generation_config=dict(temperature=0.0, max_output_tokens=500),
        )
        answer = result.text
    except Exception as e:
        return f"Error generating response: {str(e)}"
    return answer

# Combining everything & creating our final RAG

In [28]:
def conversational_rag(collection, query: str, session_id: str, n_chunks: int = 2):
  """Answer a question with retrieval plus the session's conversation history.

  The question is first rewritten into a standalone query; that rewritten
  query (not the raw input) is used for retrieval and generation and is what
  gets stored as the user message. The rewritten query, the context and the
  sources are printed along the way. Returns ``(response, sources)``.
  """
  history_text = format_conversation_history(session_id)
  standalone_query = contextualize_query(query, history_text)
  print("Contexualised query:", standalone_query)

  search_results = semantic_search(collection, standalone_query, n_chunks)
  context, sources = get_context_with_sources(search_results)
  print("Context:", context)
  print("Sources:", sources)

  answer = generate_response(standalone_query, context, history_text)
  for role, content in (("user", standalone_query), ("assistant", answer)):
    add_message(session_id, role, content)
  return answer, sources

In [29]:
# Test the conversational pipeline: start a fresh session and ask a first,
# self-contained question (there is no history yet).

session_id = create_session()

query = "When was GreenGrow innovations founded?"
response, sources = conversational_rag(collection, query, session_id)
print(response)

Contexualised query: When was GreenGrow innovations founded?
Context: GreenGrow Innovations was founded in 2010 by Sarah Chen and Michael Rodriguez, two agricultural engineers with a passion for sustainable farming. The company started in a small garage in Portland, Oregon, with a simple mission: to make farming more environmentally friendly and efficient. In its early days, GreenGrow focused on developing smart irrigation systems that could significantly reduce water usage in agriculture.

Despite its growth, GreenGrow remains committed to its original mission of promoting sustainable farming practices. The company regularly partners with universities and research institutions to advance the field of agricultural technology and hosts annual conferences to share knowledge with farmers and other industry professionals.
Sources: ['GreenGrow Innovations_ Company History.docx (chunk 0)', 'GreenGrow Innovations_ Company History.docx (chunk 4)']
GreenGrow Innovations was founded in 2010.


In [30]:
# Follow-up in the same session: "it" can only be resolved from the stored
# history, which exercises the query-contextualization step.
query = "Where is it located?"
response, sources = conversational_rag(collection, query, session_id)
print(response)

Contexualised query: Where is GreenGrow Innovations located?
Context: GreenGrow Innovations was founded in 2010 by Sarah Chen and Michael Rodriguez, two agricultural engineers with a passion for sustainable farming. The company started in a small garage in Portland, Oregon, with a simple mission: to make farming more environmentally friendly and efficient. In its early days, GreenGrow focused on developing smart irrigation systems that could significantly reduce water usage in agriculture.

This system caught the attention of large-scale farmers across the United States, propelling GreenGrow to national prominence. Today, GreenGrow Innovations employs over 200 people and has expanded its operations to include offices in California and Iowa. The company continues to focus on developing sustainable agricultural technologies, with ongoing projects in vertical farming, drought-resistant crop development, and AI-powered farm management systems.
Sources: ['GreenGrow Innovations_ Company Hist