<a href="https://colab.research.google.com/github/shr968/marvel/blob/main/PDFQuery_Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=6e9840506f6f48984a07a0e944052d3e42a44a965c13deef633b53cb8f757475
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [24]:
from fpdf import FPDF

# Create a simple PDF with sample text
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

text = """This is a test PDF document.
It contains multiple pages to test text extraction.
The purpose of this document is to check PDF querying.
The conclusion of this document is that LangChain works fine.
"""

# Add text to multiple pages
for i in range(1, 4):
    pdf.cell(200, 10, f"Page {i}", ln=True, align='C')
    pdf.multi_cell(190, 10, text)
    pdf.ln(10)

# Save the PDF
pdf_path = "/content/test_document.pdf"
pdf.output(pdf_path)
print(f"Sample PDF created at {pdf_path}")


Sample PDF created at /content/test_document.pdf


In [30]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def load_pdf(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    return docs

def create_faiss_index(docs):
    full_text = "\n".join([doc.page_content for doc in docs])
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts([full_text], embeddings)
    return vector_store, embeddings



def query_pdf(vector_store, embeddings, query):
    query_embedding = embeddings.embed_query(query)
    results = vector_store.similarity_search_by_vector(query_embedding, k=1)
    return results

if __name__ == "__main__":
    file_path = "/content/test_document.pdf"
    query = "What is the conclusion of the document?"

    print("Loading PDF...")
    docs = load_pdf(file_path)

    print("Creating FAISS index...")
    vector_store, embeddings = create_faiss_index(docs)

    print(f"Querying: {query}")
    results = query_pdf(vector_store, embeddings, query)

    print("\nTop Matching Sections:")
    for res in results:
        print(res.page_content)
        print("-" * 50)


Loading PDF...
Creating FAISS index...
Querying: What is the conclusion of the document?

Top Matching Sections:
Page 1
This is a test PDF document. 
It contains multiple pages to test text extraction.
The purpose of this document is to check PDF querying.
The conclusion of this document is that LangChain works fine.
Page 2
This is a test PDF document. 
It contains multiple pages to test text extraction.
The purpose of this document is to check PDF querying.
The conclusion of this document is that LangChain works fine.
Page 3
This is a test PDF document. 
It contains multiple pages to test text extraction.
The purpose of this document is to check PDF querying.
The conclusion of this document is that LangChain works fine.
--------------------------------------------------
