In [72]:
import os
import glob
import requests
import tempfile
import gradio as gr
from dotenv import load_dotenv

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from langchain_chroma import Chroma

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from langchain_community.document_loaders import PyPDFLoader

In [45]:
# Notebook-wide configuration: model identifiers and the vector-store location.
GPT_MODEL = "gpt-5-nano"  # OpenAI chat model; only referenced by the commented-out ChatOpenAI alternative
LLAMA_MODEL = "llama3.1:8b"  # model name passed to ChatOllama
EMBEDDING_MODEL = "embeddinggemma"  # model name passed to OllamaEmbeddings
db_name = "vector_db"  # directory passed to Chroma as persist_directory

In [33]:
# Read variables from a local .env file; override=True lets the .env values
# replace anything already present in the process environment.
load_dotenv(override=True)
# Make sure OPENAI_API_KEY exists: an existing value is kept as-is, otherwise the
# placeholder string is stored.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [34]:
# Load the bundled sample invoice (path is relative to the notebook's working directory).
loader = PyPDFLoader("data/sample-invoice.pdf")
# `pages` holds the documents returned by the loader — presumably one per PDF page; confirm in the PyPDFLoader docs.
pages = loader.load()

In [35]:
# Split the loaded pages into chunks for embedding (chunk_size=500, chunk_overlap=50).
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(pages)
# Quick sanity check on how many chunks were produced.
print(f"Total number of chunks: {len(chunks)}")

Total number of chunks: 3


In [36]:
# Embedding backend for the vector store. The two commented lines are drop-in alternatives.
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", encode_kwargs = {"normalize_embeddings": False})
# embeddings = OpenAIEmbeddings()
# Use the shared EMBEDDING_MODEL constant (same one load_pdf() uses) so both code paths
# always embed into the persisted collection with the same model.
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

In [37]:
# If the persist directory is already on disk, open it and drop its collection so
# the store built in the next cell does not inherit chunks from an earlier run.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [38]:
# Embed the chunks and store them in a Chroma collection persisted under db_name.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# _collection is a private attribute of the wrapper; it is read here only to report the stored count.
print(f"Vectorstore created with {vectorstore._collection.count()} chunks.")

Vectorstore created with 3 chunks.


In [54]:
# Chat model that answers the questions; the OpenAI variant is kept as a commented-out alternative.
# llm = ChatOpenAI(temperature=0.7, model_name=GPT_MODEL)
# NOTE(review): validate_model_on_init=True presumably verifies at construction time that the
# model is available on the Ollama server — confirm against the langchain_ollama docs.
llm = ChatOllama(temperature=0.7, model=LLAMA_MODEL,  validate_model_on_init=True)

In [55]:
# Conversation history for the chain, kept under the 'chat_history' key (return_messages=True).
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

In [56]:
# Expose the vector store as a retriever, using its default search settings.
retriever = vectorstore.as_retriever()

In [57]:
# Combine the LLM, the retriever and the memory into one conversational retrieval chain.
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [58]:
# Smoke test: ask the chain for the key invoice fields and print the answer it returns.
query = "Fetch following details from the pdf: Invoice No, Customer No, Invoice Period, Date, Gross Amount with and without Vat."
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Here are the requested details:

1. **Invoice No**: 123100401
2. **Customer No**: 12345
3. **Invoice Period**: Not specified in the format "01.02.2024 to 29.02.2024"
4. **Date**: 01.03.2024 (note: this is not a date range, but a specific date when the invoice was generated)
5. **Gross Amount with VAT**: 453.53 €
6. **Gross Amount without VAT**: 381.12 €


In [78]:
conversation_chain = None  # rebuilt by load_pdf() for each uploaded PDF; read by chat()

def load_pdf(file):
    """Index an uploaded PDF and (re)build the global conversation chain.

    Args:
        file: Value delivered by the upload widget: a filesystem path
            (``str`` / ``os.PathLike``), an object exposing the path as
            ``.name``, or ``None`` when the upload has been cleared.

    Returns:
        A short status message for display in the UI.

    Side effects:
        Replaces the module-level ``conversation_chain``. It is reset to
        ``None`` when there is no file or no indexable text, so a chain built
        for a previous PDF is never reused for a different upload. When
        indexing proceeds, any existing collection under ``db_name`` is
        dropped and recreated.
    """
    global conversation_chain

    # Clearing the upload fires the change event with None; reset instead of
    # failing on `file.name`.
    if file is None:
        conversation_chain = None
        return "⚠️ Please upload a PDF first."

    # Accept a plain path as well as a file-like object carrying its path in `.name`.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    loader = PyPDFLoader(file_path)
    docs = loader.load()

    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)

    # A PDF without extractable text (e.g. a scanned image) yields no chunks;
    # report that instead of handing an empty batch to the vector store.
    if not chunks:
        conversation_chain = None
        return "⚠️ No text could be extracted from this PDF."

    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

    # Drop whatever collection is already persisted so only the new PDF is indexed.
    if os.path.exists(db_name):
        Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

    vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

    llm = ChatOllama(temperature=0.7, model=LLAMA_MODEL, validate_model_on_init=True)
    # Fresh memory per upload so history from a previous document does not carry over.
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    retriever = vectorstore.as_retriever()
    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

    return "✅ PDF uploaded and indexed. You can now ask questions!"

In [79]:
def chat(question, memory):
    """Answer a question about the indexed PDF through the global conversation chain.

    Args:
        question: The user's message text.
        memory: Second positional argument supplied by the caller; not used here.
            Presumably the chat history handed over by the UI — the chain
            tracks history through its own memory object instead.

    Returns:
        The chain's ``"answer"`` value, or a reminder to upload a PDF when no
        chain has been built yet.
    """
    chain = conversation_chain
    if chain is not None:
        return chain.invoke({"question": question})["answer"]
    return "⚠️ Please upload a PDF first."

In [81]:
# Assemble the UI: an upload widget with a status line, followed by the chat panel.
with gr.Blocks() as demo:
    gr.Markdown("## Open Source PDF Extraction with QA Chatbot")

    pdf_upload = gr.File(file_types=[".pdf"], label="Upload your PDF")
    status = gr.Textbox(interactive=False, label="Status")

    # Re-index whenever the uploaded file changes; load_pdf's return value is shown in the status box.
    pdf_upload.change(fn=load_pdf, inputs=pdf_upload, outputs=status)

    gr.ChatInterface(
        chat,
        description="Ask questions about your uploaded PDF.",
        type="messages",
    )

# Start the local server and open the app in the default browser.
demo.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7874
* To create a public link, set `share=True` in `launch()`.


