 # Extract Text from PDFs using LangChain

In [3]:
import os
from langchain.document_loaders import PyPDFLoader

# Directory where the PDFs are stored
pdf_directory = '/Users/shivavardhineedi/Desktop/HPC-data/major-project/POC/course-documents'

# Load all PDFs from the specified folder
def load_pdfs_from_directory(directory):
    pdf_files = [f for f in os.listdir(directory) if f.endswith('.pdf')]
    documents = []
    for pdf_file in pdf_files:
        file_path = os.path.join(directory, pdf_file)
        loader = PyPDFLoader(file_path)
        docs = loader.load()  # Load the document (each page is a document)
        documents.extend(docs)
    return documents

documents = load_pdfs_from_directory(pdf_directory)


In [4]:
print(f"Total number of documents loaded: {len(documents)}")

Total number of documents loaded: 434


In [7]:
# Check the content of the first few documents (first 500 characters)
for i, doc in enumerate(documents[:3]):
    print(f"Document {i+1}:")
    print(f"Content: {doc.page_content[:100]}")
    print(f"Metadata: {doc.metadata}")
    print("="*120)


Document 1:
Content: Holub Holub onPatternsLearning Design Patterns
by Looking at CodeCYAN
MAGENTAYELLOW
BLACKPANTONE 123
Metadata: {'source': '/Users/shivavardhineedi/Desktop/HPC-data/major-project/POC/course-documents/designpatterns.pdf', 'page': 0}
Document 2:
Content: 
Metadata: {'source': '/Users/shivavardhineedi/Desktop/HPC-data/major-project/POC/course-documents/designpatterns.pdf', 'page': 1}
Document 3:
Content: Holub on Patterns: 
Learning Design Patterns 
by Looking at Code
ALLEN HOLUB388x_Ch00_FINAL.qxd  8/2
Metadata: {'source': '/Users/shivavardhineedi/Desktop/HPC-data/major-project/POC/course-documents/designpatterns.pdf', 'page': 2}


# Chunk Text using LangChain’s Text Splitter

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define the chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Split documents into smaller chunks
chunked_docs = text_splitter.split_documents(documents)

print(f"Total number of chunks created: {len(chunked_docs)}")


Total number of chunks created: 2723


In [12]:
import logging
import sys

# Set up logging to print to Jupyter cells
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)

# Log the total number of chunks created
print(f"Total number of chunks created: {len(chunked_docs)}")

# Log the content and length of the first few chunks
for i, chunk in enumerate(chunked_docs[:3]):  # Check the first 3 chunks
    print(f"Chunk {i+1}: {chunk.page_content[:300]}...")  # Log the first 300 characters
    print(f"Chunk {i+1} length: {len(chunk.page_content)} characters")
    print(f"Chunk {i+1} metadata: {chunk.metadata}")

Total number of chunks created: 2723
Chunk 1: Holub Holub onPatternsLearning Design Patterns
by Looking at CodeCYAN
MAGENTAYELLOW
BLACKPANTONE 123 CV
this print for content only—size & color not accurate 7" x 9-1/4" / CASEBOUND / MALLOY
(0.8125 INCH BULK --432 pages --60# Thor)
THE EXPERT’S VOICE®IN SOFTWARE ENGINEERING
Allen HolubHolub on
Patt...
Chunk 1 length: 468 characters
Chunk 1 metadata: {'source': '/Users/shivavardhineedi/Desktop/HPC-data/major-project/POC/course-documents/designpatterns.pdf', 'page': 0}
Chunk 2: Holub on Patterns:
Learning Design Patterns by Looking at Code
Dear Reader,
To be a good object-oriented designer, you have to know the design patterns...
Chunk 2 length: 151 characters
Chunk 2 metadata: {'source': '/Users/shivavardhineedi/Desktop/HPC-data/major-project/POC/course-documents/designpatterns.pdf', 'page': 0}
Chunk 3: cold, not just what they are but how to apply them to solve real problems. Mostbooks on the subject leave you in the lurch in the how-to-ap

# Generate and Store Embeddings using LangChain’s VectorStore

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

# Load the Hugging Face sentence-transformer model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Generate embeddings for each chunk
embeddings = [embedding_model.embed_query(chunk.page_content) for chunk in chunked_docs]

# Log how many embeddings were generated
print(f"Generated {len(embeddings)} embeddings for {len(chunked_docs)} chunks.")


Generated 2723 embeddings for 2723 chunks.


In [20]:
for emb in embeddings[:3]:
    print(emb)

[-0.09179039299488068, 0.03948359191417694, -0.09033330529928207, -0.05470029637217522, -0.04472817853093147, -0.07503479719161987, -0.00730796018615365, 0.002942960010841489, -0.12652342021465302, -0.032230693846940994, -0.035229042172431946, 0.05404297262430191, 0.006537171546369791, -0.05250120535492897, 0.005927739664912224, -0.015933411195874214, -0.014999765902757645, 0.01940457709133625, 0.03137839585542679, -0.02537115104496479, 0.02363095059990883, -0.006030644290149212, -0.03076545149087906, -0.04608825966715813, -0.0234681386500597, 0.06872662156820297, 0.05556819587945938, -0.005924264434725046, 0.06094113364815712, -0.03421691805124283, 0.01918952539563179, 0.11704275012016296, 0.050682876259088516, -0.0026116210501641035, 0.020013686269521713, 0.021404240280389786, -0.0199571680277586, 0.0509633831679821, 0.07041429728269577, 0.023366600275039673, -0.046808164566755295, -0.006475499365478754, -0.012446006760001183, 0.0565275177359581, 0.01649893820285797, -0.0571824163198

In [21]:
from langchain.vectorstores import FAISS

# Create a FAISS vector store from the chunked documents and their embeddings
faiss_index = FAISS.from_documents(chunked_docs, embedding_model)

# Log that the embeddings have been successfully stored in FAISS
logging.info("Embeddings stored in FAISS vector store.")


In [22]:
# Save the FAISS index locally for future use
faiss_index.save_local("faiss_index")
print("FAISS index saved locally as 'faiss_index'.")


FAISS index saved locally as 'faiss_index'.


# Query the FAISS Index for Retrieval

In [24]:
# Load the FAISS index from disk, allowing pickle-based deserialization
faiss_index = FAISS.load_local(
    "faiss_index", 
    embedding_model, 
    allow_dangerous_deserialization=True
)

# Define a query for similarity search
query = "What is an observer design pattern?"

# Perform a similarity search in the FAISS index
results = faiss_index.similarity_search(query, k=5)

# Log and display the top 5 results
for i, result in enumerate(results):
    print(f"Result {i+1}:\n{result.page_content[:300]}...")  # Log the first 300 characters


Result 1:
Defines an interface for notifying
Observers.
Concrete Observer : Implements
the Observer interface to do
something when notified.APPENDIX ■A DESIGN-PATTERN QUICK REFERENCE 390
What Problem Does It Solve?
In Chain of Responsibility, a button notifies a
parent of a press event like this:
class Window...
Result 2:
The Clock Subsystem: Observer
Now let’ s look at the code itself. I’ll start describing the classes at the edges of the system—
looking at the ancillary pieces used by the core abstractions. These pieces form stand-alonesubsystems, so they’re easy to look at in isolation.
You’ll see the clock subsys...
Result 3:
rather than through some visual intermediary.
Now let’ s apply the Observer pattern to Life. The Clock class, shown in Listing 3-1, uses
Observer to notify a subscriber (the Universe object) of clock-tick events. The Clock has the
role of Subject/Publisher. The Concrete Observer/Subscriber role is f...
Result 4:
Observer interface defines only one method ( voi

# Retrieve Relevant Embeddings for Query