In [1]:
pip install langchain openai streamlit chromadb unstructured python-docx PyMuPDF





In [2]:
import fitz  # PyMuPDF for PDFs
from docx import Document

def extract_text_from_pdf(file):
    """Return the text of a PDF supplied as a binary file-like object.

    Args:
        file: Object exposing ``read()`` that returns the PDF's bytes.

    Returns:
        str: The result of ``get_text()`` for every page, in iteration
        order, joined with newline characters.
    """
    # The document is opened from the in-memory bytes. Using it as a context
    # manager closes it as soon as the text has been collected, instead of
    # leaving the handle open until garbage collection.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)

def extract_text_from_docx(file):
    """Return the paragraph text of a Word document as a single string.

    Args:
        file: Whatever ``Document`` accepts as its source (passed through
            unchanged).

    Returns:
        str: The ``text`` of each entry in ``doc.paragraphs``, in order,
        joined with newline characters.
    """
    paragraph_texts = []
    for paragraph in Document(file).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)


In [3]:
pip install -U langchain-community





In [4]:
pip install -U langchain


Note: you may need to restart the kernel to use updated packages.


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma  # or FAISS

def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value this function previously hard-coded.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 100, the value this function previously hard-coded.

    Returns:
        Whatever ``split_text`` returns for ``text`` (a list of string
        chunks).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

def embed_and_store(chunks):
    """Build a Chroma vector store from text chunks.

    Args:
        chunks: The texts handed to ``Chroma.from_texts``.

    Returns:
        The object returned by ``Chroma.from_texts`` when given ``chunks``
        and a default-constructed ``OpenAIEmbeddings`` instance.
    """
    return Chroma.from_texts(chunks, OpenAIEmbeddings())


In [6]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [7]:
def build_qa_chain(vectorstore):
    """Create a ``RetrievalQA`` chain backed by ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever(search_kwargs=...)``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` using a
        ``ChatOpenAI(model="gpt-4")`` model and a retriever configured with
        ``k=4``.
    """
    top_k_retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))
    chat_model = ChatOpenAI(model="gpt-4")
    return RetrievalQA.from_chain_type(retriever=top_k_retriever, llm=chat_model)

In [8]:
import streamlit as st

In [22]:
import streamlit as st
import fitz  # PyMuPDF for PDFs
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import tempfile
import os

In [24]:
# Page-level configuration and heading for the app.
st.set_page_config(layout="centered", page_title="Document QA GenAI")
st.title("📄 Document-based QA (GenAI)")

# The two user inputs; both are consulted later before any processing runs.
uploaded_file = st.file_uploader(
    label="Upload a PDF or Word Document",
    type=["pdf", "docx"],
)
question = st.text_input(label="Ask a question based on the document")

In [26]:
def extract_text_from_pdf(file_path):
    """Return the text of the PDF stored at ``file_path``.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        str: The result of ``get_text()`` for every page, in iteration
        order, joined with newline characters.
    """
    # Close the document before returning: the caller removes the file right
    # after extraction, and an open handle can block that removal on
    # platforms that refuse to delete files that are still open.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def extract_text_from_docx(file_path):
    """Return the paragraph text of the Word document at ``file_path``.

    Args:
        file_path: Source passed unchanged to ``Document``.

    Returns:
        str: The ``text`` of each entry in ``doc.paragraphs``, in order,
        joined with newline characters.
    """
    collected = []
    for paragraph in Document(file_path).paragraphs:
        collected.append(paragraph.text)
    return "\n".join(collected)

In [28]:
def chunk_and_embed(text):
    """Split ``text`` into chunks and load them into a Chroma vector store.

    Args:
        text: The string to split.

    Returns:
        The object returned by ``Chroma.from_texts`` for the chunks produced
        by a ``RecursiveCharacterTextSplitter`` (``chunk_size=500``,
        ``chunk_overlap=50``) and a default-constructed ``OpenAIEmbeddings``.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    ).split_text(text)
    return Chroma.from_texts(pieces, OpenAIEmbeddings())


In [30]:
def run_qa_with_sources(vectorstore, question):
    """Answer ``question`` from ``vectorstore`` and return the supporting chunks.

    Args:
        vectorstore: Object exposing ``as_retriever(search_kwargs=...)``.
        question: The question text sent to the chain.

    Returns:
        tuple: ``(answer, source_documents)`` taken from the ``'result'`` and
        ``'source_documents'`` keys of the chain output.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    llm = ChatOpenAI(model="gpt-4", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )
    # Calling the chain object directly (``qa_chain(question)``) is deprecated
    # in current LangChain releases; ``invoke`` is the supported entry point.
    # "query" is RetrievalQA's input key.
    result = qa_chain.invoke({"query": question})
    return result['result'], result['source_documents']

In [32]:
# NOTE(review): this cell repeats the first step of the later
# `if uploaded_file and question:` cell, which also extracts text and removes
# the file. The temp file written here (delete=False) is not removed by this
# cell.
if uploaded_file and question:
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        # getvalue() returns the full upload without advancing the stream
        # position, so code that reads `uploaded_file` afterwards still sees
        # the whole file instead of empty bytes.
        tmp_file.write(uploaded_file.getvalue())
        file_path = tmp_file.name

In [48]:
# Main handler: runs only once both a document and a question are present.
if uploaded_file and question:
    # The extractors take a filesystem path, so write the upload to a temp
    # file. getvalue() returns the complete upload regardless of the current
    # stream position (read() returns b"" once the stream has been consumed).
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        file_path = tmp_file.name

    try:
        if uploaded_file.type == "application/pdf":
            text = extract_text_from_pdf(file_path)
        else:
            text = extract_text_from_docx(file_path)
    finally:
        # Remove the temp file even when extraction raises.
        os.unlink(file_path)

    if not text.strip():
        # Nothing to chunk or embed (e.g. the document yielded no text).
        st.warning("No extractable text was found in the uploaded document.")
    else:
        with st.spinner("Processing document and generating answer..."):
            vectorstore = chunk_and_embed(text)
            answer, sources = run_qa_with_sources(vectorstore, question)

        st.success("Answer:")
        st.write(answer)

        st.info("🔍 Source Chunks Used:")
        for i, doc in enumerate(sources, start=1):
            st.markdown(f"**Chunk {i}:**")
            # st.code renders the chunk verbatim. The previous inline
            # ```...``` fence had no line breaks around the content, which
            # is not a well-formed fenced block in Markdown.
            st.code(doc.page_content.strip(), language=None)
