RAG OVER PDF - GUI Using Streamlit 

Import Libraries and Packages

Recursive character text splitter - chunking text

OpenAI (embeddings) and DeepSeek (answer generation) models

FAISS - vector db

Prompt template for formatting the prompt

Pdf reader

In [None]:
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from PyPDF2 import PdfReader

Extracting Document Text function

In [None]:
def get_pdf_content(documents):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        documents: Iterable of PDF sources, each passed as-is to
            ``PdfReader`` (the GUI passes the Streamlit uploaded file).

    Returns:
        One string holding the text of all pages, in document and page
        order. Empty when ``documents`` is empty or no page yields text.
    """
    page_texts = []

    for document in documents:
        pdf_reader = PdfReader(document)
        for page in pdf_reader.pages:
            # Defensive: a page without a text layer (e.g. a scanned image)
            # may yield an empty or missing result; coerce it to "" so the
            # final join cannot fail on a non-string.
            page_texts.append(page.extract_text() or "")

    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)


Chunking text using RecursiveCharacterTextSplitter

In [None]:
def get_chunks(text):
    """Split raw text into overlapping chunks ready for embedding.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=200``.

    Args:
        text: The full text to split.

    Returns:
        The result of ``create_documents`` for the single input text.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
    return chunker.create_documents([text])


Get embeddings using the OpenAI text-embedding-3-small model and store them in FAISS

In [None]:
def get_embeddings_openai(chunks):
    """Embed the chunks with OpenAI and index them in a FAISS vector store.

    The API key is read from the ``OPENAI_API_KEY`` environment variable so
    that no credential has to be committed to the notebook; when the variable
    is unset the original placeholder literal is used as a fallback.

    Args:
        chunks: Sequence of chunk documents (as produced by ``get_chunks``).

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty. Failing here gives a clear
            message instead of an error from inside the index builder.
    """
    if not chunks:
        raise ValueError(
            "No text chunks to embed; the document produced no extractable text."
        )

    embeddings = OpenAIEmbeddings(
        api_key=os.environ.get("OPENAI_API_KEY", "api_key"),
        model="text-embedding-3-small",
    )
    return FAISS.from_documents(chunks, embeddings)

Build a retriever that returns the top-k (k = 4) most similar chunks from the vector store using similarity search

In [None]:
def top_k_queries(vector_store, k=4):
    """Build a similarity-search retriever over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever`` (the FAISS store built
            by ``get_embeddings_openai`` in this notebook).
        k: Number of results the retriever is asked to return per query.
            Defaults to 4, the value previously hard-coded here.

    Returns:
        The retriever returned by ``vector_store.as_retriever``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    return vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

Initialising Deepseek LLM

In [None]:
# DeepSeek chat model used for answer generation.
# The key is read from the DEEPSEEK_API_KEY environment variable so that no
# credential lives in the notebook; the original placeholder literal remains
# the fallback when the variable is unset.
llm = ChatDeepSeek(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "api_key"),
    model="deepseek-chat",
    temperature=0.2,   # low temperature requested for the answers
    max_tokens=None,   # no explicit output-token cap set here
    timeout=None,      # no explicit request timeout set here
    max_retries=2,
)

Prompt Template for the LLM

In [None]:
# Prompt sent to the LLM: the retrieved chunks fill {context} and the user's
# query fills {question}.
# NOTE(review): the instruction text refers to a "transcript" and to
# "conversation history", yet only `context` and `question` are declared as
# inputs, and "understandingthe" is missing a space. The literal is left
# verbatim because it is runtime text -- confirm the intended wording.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
      You are a helpful assistant.
      Answer ONLY from understandingthe provided transcript context and the conversation history.
      If the context and history are insufficient, just say you don't know.

      {context}

      Question: {question}
    """,
)

Basic Streamlit GUI for conversations 

In [None]:
import streamlit as st

st.title("RAG PDF Chatbot- Q2")

# Streamlit re-runs this whole script on every interaction, so anything that
# must survive a rerun is kept in st.session_state.
if "history" not in st.session_state:
    # (question, answer) pairs for this session. It must exist before it is
    # read below; previously it was never created and reading it raised.
    st.session_state.history = []

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:

    # Index the PDF only when a different file is uploaded; otherwise every
    # question would re-parse the PDF and re-embed all chunks.
    # Name + size is used as a cheap identity for the upload.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("indexed_file") != file_key:
        text = get_pdf_content([uploaded_file])  # get text
        split_text = get_chunks(text)  # get chunks

        if not split_text:
            # Nothing to embed (e.g. an image-only PDF): report it and end
            # this run instead of failing inside the embedding step.
            st.error("No extractable text was found in this PDF.")
            st.stop()

        embedding_index = get_embeddings_openai(split_text)  # get embeddings
        # initialise the similarity search
        st.session_state.retriever = top_k_queries(embedding_index)
        st.session_state.indexed_file = file_key

    similar_chunks = st.session_state.retriever

    st.write("PDF text split in to chunks.")  # progress print
    st.write("embedding peformed on chunks.")
    st.write("PDF text extracted. You can now ask questions.")

    user_question = st.text_input("Ask a question about the PDF:")

    if user_question:
        # invoke the similarity search for the question
        retrieved_docs = similar_chunks.invoke(user_question)
        # join the retrieved top chunks into one context string
        context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

        final_prompt = prompt.invoke({
            "context": context_text,
            "question": user_question
        })  # make final prompt

        answer = llm.invoke(final_prompt)  # get answer from deepseek

        st.write(f"Answer: {answer.content}")  # display the answer in the GUI

        # Record the turn, then log the running history to the server console.
        st.session_state.history.append((user_question, answer.content))
        print(st.session_state.history)
        # print(history_string)
