<a href="https://colab.research.google.com/github/shreenn/Restaurant-RAG-chatbot/blob/main/restroRAGchatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain langchain-community langchain-openai pypdf chromadb python-dotenv

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.28-py3-none-any.whl.metadata (2.3 kB)
Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-man

In [2]:
from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

print("OpenAI API Key loaded from Colab Secrets.")

OpenAI API Key loaded from Colab Secrets.


In [7]:

%%writefile ingest_data.py
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Path to PDF document (make sure you uploaded it!)
PDF_PATH = "Restaurant QA.pdf"
# Directory where ChromaDB will store its data
CHROMA_DB_DIR = "chroma_db"
# Embedding model to use
EMBEDDING_MODEL = "text-embedding-ada-002"
# Chunk size for text splitting
CHUNK_SIZE = 1000
# Overlap between chunks to maintain context
CHUNK_OVERLAP = 200

def ingest_documents():
    """
    Ingests documents from a PDF, splits them into chunks,
    generates embeddings, and stores them in a ChromaDB vector store.
    """
    print(f"--- Starting Data Ingestion ---")

    #  Load the PDF document
    print(f"Loading document from: {PDF_PATH}")
    try:
        loader = PyPDFLoader(PDF_PATH)
        documents = loader.load()
        print(f"Loaded {len(documents)} pages from the PDF.")
    except Exception as e:
        print(f"Error loading PDF: {e}")
        print(f"Please ensure '{PDF_PATH}' exists in your Colab files.")
        return

    # Split the documents into smaller, manageable chunks
    print(f"Splitting documents into chunks (size: {CHUNK_SIZE}, overlap: {CHUNK_OVERLAP})...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len, # character length for splitting
        add_start_index=True, # metadata about where the chunk starts in the original doc
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} text chunks.")

    #Initialize the OpenAI Embeddings model
    print(f"Initializing OpenAI Embeddings model: {EMBEDDING_MODEL}...")
    # OPENAI_API_KEY is set in the environment
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: OPENAI_API_KEY not found in environment variables.")
        print("Please ensure it's set in Colab Secrets or directly in a previous cell.")
        return

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
    print("Embeddings model initialized.")

    # Store the embeddings in ChromaDB
    print(f"Storing embeddings in ChromaDB at: {CHROMA_DB_DIR}...")
    # create or load the ChromaDB instance
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DB_DIR
    )
    # Persist the collection to disk
    vectorstore.persist()
    print(f"Embeddings successfully stored in ChromaDB.")
    print(f"--- Data Ingestion Complete ---")

if __name__ == "__main__":
    ingest_documents()


Overwriting ingest_data.py


In [9]:
# Run Data Ingestion

!python ingest_data.py


--- Starting Data Ingestion ---
Loading document from: Restaurant QA.pdf
Loaded 2 pages from the PDF.
Splitting documents into chunks (size: 1000, overlap: 200)...
Created 3 text chunks.
Initializing OpenAI Embeddings model: text-embedding-ada-002...
Embeddings model initialized.
Storing embeddings in ChromaDB at: chroma_db...
Traceback (most recent call last):
  File "/content/ingest_data.py", line 79, in <module>
    ingest_documents()
  File "/content/ingest_data.py", line 68, in ingest_documents
    vectorstore = Chroma.from_documents(
                  ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/langchain_community/vectorstores/chroma.py", line 887, in from_documents
    return cls.from_texts(
           ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/langchain_community/vectorstores/chroma.py", line 843, in from_texts
    chroma_collection.add_texts(
  File "/usr/local/lib/python3.11/dist-packages/langchain_community/vectorstores/chroma.

In [10]:

%%writefile chatbot_app.py
import os

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Directory where ChromaDB stored its data
CHROMA_DB_DIR = "chroma_db"
# Embedding model used during ingestion
EMBEDDING_MODEL = "text-embedding-ada-002"
# LLM model to use for generating answers
LLM_MODEL = "gpt-3.5-turbo"

# --- Prompt Template
PROMPT_TEMPLATE = """
You are a helpful restaurant assistant.
Answer the user's question ONLY based on the provided context.
If the answer is not found in the context, politely state that you don't have information on that.
Do not make up information.

Context:
{context}

Question: {input}
"""

def run_chatbot():
    """
    Initializes the chatbot, loads the vector store, and allows the user to ask questions.
    """
    print(f"--- Starting Restaurant Chatbot ---")

    # OPENAI_API_KEY is set in the environment
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: OPENAI_API_KEY not found in environment variables.")
        print("Please ensure it's set in Colab Secrets or directly in a previous cell.")
        return

    # 1. Load Embeddings Model
    print(f"Initializing OpenAI Embeddings model: {EMBEDDING_MODEL}...")
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
    print("Embeddings model initialized.")

    # 2. Load Vector Store
    print(f"Loading vector store from: {CHROMA_DB_DIR}...")
    try:

        vectorstore = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embeddings)
        print("Vector store loaded successfully.")
    except Exception as e:
        print(f"Error loading vector store: {e}")
        print("Please ensure 'ingest_data.py' was run successfully and created the 'chroma_db' folder.")
        return

    # 3. Initialize LLM
    print(f"Initializing LLM model: {LLM_MODEL}...")
    llm = ChatOpenAI(model=LLM_MODEL, temperature=0) # temperature=0 makes it more factual
    print("LLM model initialized.")

    # 4. Create the Prompt Template
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

    # 5. Create the Document Chain
    document_chain = create_stuff_documents_chain(llm, prompt)

    # 6. Create the Retrieval Chain
    retriever = vectorstore.as_retriever() # Convert vector store into a retriever
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    print("\nChatbot is ready! Type your questions below. Type 'exit' to quit.")

    # Interaction Loop
    while True:
        user_input = input("\nYour Question: ")
        if user_input.lower() == 'exit':
            print("Exiting chatbot. Goodbye!")
            break

        print("Thinking...")
        try:
            # Invoke the retrieval chain with the user's question
            response = retrieval_chain.invoke({"input": user_input})

            #  response will contain the answer and potentially the retrieved documents
            print("\nChatbot Answer:")
            print(response["answer"])

        except Exception as e:
            print(f"An error occurred: {e}")
            print("Please ensure your OpenAI API key is correct and you have an internet connection.")
            print("If this persists, try restarting the runtime (Runtime -> Restart runtime) and re-running cells.")

if __name__ == "__main__":
    run_chatbot()


Writing chatbot_app.py


In [11]:
# Run Chatbot Application

!python chatbot_app.py

--- Starting Restaurant Chatbot ---
Initializing OpenAI Embeddings model: text-embedding-ada-002...
Embeddings model initialized.
Loading vector store from: chroma_db...
  vectorstore = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embeddings)
Vector store loaded successfully.
Initializing LLM model: gpt-3.5-turbo...
LLM model initialized.

Chatbot is ready! Type your questions below. Type 'exit' to quit.

Your Question: whats the restaurants name
Thinking...
An error occurred: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Please ensure your OpenAI API key is correct and you have an internet connection.
If this persists, try restarting the runtime (Runtime -> Restart runtime) and re-running cells.

Your Question: