<a href="https://colab.research.google.com/github/sivamutukuri/Creating_Chatbot_By_Using_Prompt_Engineering/blob/main/Untitled15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install streamlit langchain langchain-openai langchain-community chromadb




In [11]:
!pip install pypdf python-docx unstructured pdfplumber



In [12]:
!pip install langchain-Chroma



In [13]:
!pip install pytesseract pillow




In [14]:
!pip install PyMuPDF



In [15]:
import os
import streamlit as st
from google.colab import userdata
# Read the OpenAI API key from the Colab secret named 'chat' and export it as
# OPENAI_API_KEY so that processes started from this kernel inherit it.
# NOTE(review): app.py builds OpenAIEmbeddings()/ChatOpenAI(...) without passing a key,
# so it presumably relies on this environment variable — confirm.
openai_key=userdata.get('chat')
os.environ['OPENAI_API_KEY']=openai_key

In [16]:
%%writefile app.py

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader # Correctly imports PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from operator import itemgetter
import pandas as pd
import streamlit as st
import chromadb
import tempfile
import os

# Customize initial app landing page: browser-tab title/icon, then the main heading.
# NOTE(review): Streamlit has historically required set_page_config to be the first
# Streamlit command in the script — keep it ahead of any other st.* call.
st.set_page_config(page_title="File QA Chatbot", page_icon="🤖")
st.title("Welcome to File QA RAG Chatbot 🤖")

@st.cache_resource(ttl="1h")
def configure_retriever(uploaded_files):
    """Build a retriever over the uploaded PDF files.

    The PDFs are written to a temporary directory, loaded with PyMuPDFLoader,
    split into overlapping chunks, embedded with OpenAIEmbeddings and stored in
    the "document_collection" collection of a persistent Chroma DB under
    ./chroma_db. The result is cached by Streamlit (ttl of one hour).

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``.name`` and
            ``.getvalue()`` (as returned by ``st.file_uploader``).

    Returns:
        A retriever over the vector store that returns the top 3 chunks per query.
    """
    # Read documents
    docs = []
    # The context manager removes the temporary copies as soon as loading is done
    # instead of leaving cleanup to whenever the directory object is collected.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            # basename() keeps a crafted upload name (e.g. "../x.pdf") inside temp_dir.
            temp_filepath = os.path.join(temp_dir, os.path.basename(file.name))
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())

            # PyMuPDFLoader only handles PDFs, so no need for explicit file type check here
            # since file_uploader already restricts to PDFs.
            loader = PyMuPDFLoader(temp_filepath)
            # load() is called while the file still exists; the returned documents
            # are what the rest of the function works on.
            docs.extend(loader.load())

    # Split into documents chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                                   chunk_overlap=200)
    doc_chunks = text_splitter.split_documents(docs)

    # Create document embeddings and store in Vector DB
    embeddings_model = OpenAIEmbeddings()

    client = chromadb.PersistentClient(path="./chroma_db")

    # Clear any existing collection to ensure a fresh start.
    # Deliberately best-effort: the collection may simply not exist yet.
    try:
        client.delete_collection("document_collection")
    except Exception:
        pass

    # Use Chroma from LangChain
    vectordb = Chroma.from_documents(
        documents=doc_chunks,
        embedding=embeddings_model,
        client=client,
        collection_name="document_collection"
    )

    # Define retriever object
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
    return retriever

# Streams the LLM answer into the UI: each new token is appended to the text
# accumulated so far and the whole text is re-rendered as Markdown.
class StreamHandler(BaseCallbackHandler):
    """Callback handler that mirrors streamed LLM tokens into a Streamlit container."""

    def __init__(self, container, initial_text=""):
        # `container` must expose .markdown(); `initial_text` seeds the displayed text.
        self.text = initial_text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append `token` to the running text and redraw the container."""
        accumulated = self.text + token
        self.text = accumulated
        self.container.markdown(accumulated)

# Creates UI element (in the sidebar) to accept one or more PDF uploads
uploaded_files = st.sidebar.file_uploader(
    label="Upload PDF files", type=["pdf"], # Restricted to PDF only
    accept_multiple_files=True
)
# Gate: without at least one upload, show a hint and halt this script run here,
# so nothing below (retriever, LLM, chat UI) executes.
if not uploaded_files:
    st.info("Please upload PDF documents to continue.")
    st.stop()

# Create retriever object based on uploaded PDFs
# (configure_retriever is wrapped in st.cache_resource, so its result is cached)
retriever = configure_retriever(uploaded_files)

# Load a connection to ChatGPT LLM
# streaming=True is set so the answer can be delivered token by token
# (StreamHandler.on_llm_new_token renders those tokens).
chatgpt = ChatOpenAI(model_name='gpt-4o-mini', temperature=0.1,
                     streaming=True)

# Create a prompt template for QA RAG System
# {context} and {question} are the two template variables; the chain below
# supplies both under exactly these keys.
qa_template = """
Use only the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know,
don't try to make up an answer. Keep the answer as concise as possible.

{context}

Question: {question}
"""
qa_prompt = ChatPromptTemplate.from_template(qa_template)

# Flattens the retrieved documents into a single context string for the prompt
def format_docs(docs):
    """Return the page_content of every document, separated by blank lines."""
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)

# Create a QA RAG System Chain
# Context branch: take the user question, retrieve matching chunks, merge them into one string
context_from_question = itemgetter("question") | retriever | format_docs

qa_rag_chain = (
    {
        "context": context_from_question,
        "question": itemgetter("question"),  # the user question, passed through unchanged
    }
    | qa_prompt  # fills the template with the context and question above
    | chatgpt    # the rendered prompt is sent to the LLM for the response
)

# Conversation history lives in Streamlit session state under "langchain_messages"
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")

# Seed an empty history with the greeting shown when the app starts
if not streamlit_msg_history.messages:
    streamlit_msg_history.add_ai_message("Please ask your question?")

# Replay every stored message as a chat bubble of the matching role
for past_msg in streamlit_msg_history.messages:
    bubble = st.chat_message(past_msg.type)
    bubble.write(past_msg.content)

# Callback handler which does some post-processing on the LLM response
# Used to post the top 3 document sources used by the LLM in RAG response
class PostMessageHandler(BaseCallbackHandler):
    """Collects the documents returned by the retriever and shows them after the answer.

    Args:
        msg: Streamlit placeholder (e.g. the object returned by ``st.empty()``)
            that the sources table is rendered into. If it is ``None`` or has no
            ``container()`` method, the table is written through ``st`` directly.
    """

    def __init__(self, msg):
        BaseCallbackHandler.__init__(self)
        self.msg = msg
        self.sources = []

    def on_retriever_end(self, documents, *, run_id, parent_run_id, **kwargs):
        """Record one entry per unique (source, page) among the retrieved documents."""
        seen = set()
        for d in documents: # retrieved documents from retriever based on user query
            # Fall back to "N/A" when the loader did not provide a field
            metadata = {
                "source": d.metadata.get("source", "N/A"),
                "page": d.metadata.get("page", "N/A"),
                "content": d.page_content[:200]  # short preview only
            }
            idx = (metadata["source"], metadata["page"])
            if idx not in seen: # store unique source documents
                seen.add(idx)
                self.sources.append(metadata)

    def on_llm_end(self, response, *, run_id, parent_run_id, **kwargs):
        """Render up to three collected sources once the LLM has finished."""
        if not self.sources:
            return
        # Draw into the placeholder handed to the constructor so the table lands
        # where the caller reserved space; otherwise write via st as before.
        if self.msg is not None and hasattr(self.msg, "container"):
            target = self.msg.container()
        else:
            target = st
        target.markdown("---") # Add a separator for clarity
        target.markdown("**Sources:**") # Bold the header
        # Convert list of dicts to a DataFrame for display
        df_sources = pd.DataFrame(self.sources[:3])
        target.dataframe(data=df_sources, hide_index=True) # Hide default index for cleaner look


# If user inputs a new prompt, display it and show the response
if user_prompt := st.chat_input():
    st.chat_message("human").write(user_prompt)
    # This is where response from the LLM is shown
    with st.chat_message("ai"):
        # Initializing an empty data stream
        stream_handler = StreamHandler(st.empty())
        # UI element to write RAG sources after LLM response
        sources_container = st.empty() # Use st.empty() for dynamic content
        pm_handler = PostMessageHandler(sources_container)
        config = {"callbacks": [stream_handler, pm_handler]}
        # Get LLM response
        response = qa_rag_chain.invoke({"question": user_prompt},
                                        config)
    # Persist the completed exchange. Streamlit reruns the script on every
    # interaction and only messages stored in the history are re-rendered,
    # so without this the previous question and answer vanish on the next prompt.
    streamlit_msg_history.add_user_message(user_prompt)
    streamlit_msg_history.add_ai_message(response.content)

Overwriting app.py


In [21]:
from google.colab import userdata
# Read the ngrok auth token from the Colab secret named 'ngrock' and export it
# as NGROK_AUTHTOKEN for this kernel and the processes it starts.
# NOTE(review): presumably picked up by pyngrok when the tunnel is opened — confirm.
ngrok_auth_token=userdata.get('ngrock')
os.environ["NGROK_AUTHTOKEN"] = ngrok_auth_token


In [19]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.11


In [22]:


# Launch the Streamlit app in the background and expose it through an ngrok tunnel.
import subprocess
import threading

from pyngrok import ngrok

STREAMLIT_PORT = 8501  # single source of truth for the server and the tunnel


def run_streamlit():
    """Run the Streamlit server for app.py; blocks until the server process exits."""
    # An argument list (no shell string) avoids quoting pitfalls, and the cell no
    # longer depends on `os` having been imported by an earlier cell.
    subprocess.run(
        [
            "streamlit", "run", "app.py",
            "--server.port", str(STREAMLIT_PORT),
            "--server.address", "0.0.0.0",
        ],
        check=False,
    )


# Daemon thread: the blocking server call must not keep the interpreter alive on shutdown.
thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()

# Create ngrok tunnel to the Streamlit port
public_url = ngrok.connect(STREAMLIT_PORT)
print(f"Streamlit app is available at: {public_url}")

Streamlit app is available at: NgrokTunnel: "https://bfe3-35-239-235-43.ngrok-free.app" -> "http://localhost:8501"
