In [18]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizer

# Data Ingestion

In [1]:
from langchain.document_loaders import PyMuPDFLoader

# Point the loader at the source PDF and read it into memory
loader = PyMuPDFLoader("data/RAMAYANA.pdf")
docs = loader.load()

# Every loaded document exposes `page_content` (the extracted text) and
# `metadata` (a dict; the recorded output shows keys such as 'source',
# 'total_pages' and 'page').
for doc in docs:
    # First 200 characters of text on one line, the metadata dict on the next
    print(doc.page_content[:200], doc.metadata, sep="\n")

SRI RAMA JAYAM 
RAMAYANA FOR CHILDREN 
Compiled by  
 
Visalakshi Gopalan 
14-Apr-13 
 
 
 
For children’s reading
{'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 0}
1 
 
Contents 
 
1 RAMAYANA FOR CHILDREN ............................................................................................... 2 
1.1 THE BIRTH OF RAMA ......................................
{'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5',

# Data Chunking

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into smaller, overlapping chunks.
#
# Separators are tried in order, coarsest first: blank line, newline,
# sentence end, space. The trailing "" is the last-resort fallback
# (character-level split): without it, a stretch of text that contains none
# of the other separators would be emitted as a single chunk longer than
# chunk_size.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,     # target maximum chunk length
    chunk_overlap=50,   # amount of text shared between neighbouring chunks
    separators=["\n\n", "\n", ".", " ", ""],  # custom separators + "" fallback
)
chunks = splitter.split_documents(docs)

# Data Embedding
## OpenAI

In [None]:
import os

from langchain.embeddings import OpenAIEmbeddings

# Resolve the OpenAI API key.
# 1. Read it from the local key file when that file exists (original behavior).
# 2. Otherwise, or if the file is empty, fall back to the OPENAI_API_KEY
#    environment variable.
# 3. Fail early with a clear message instead of sending an empty key to the API.
KEY_FILE = "data/key.txt"
api_key = ""
if os.path.exists(KEY_FILE):
    with open(KEY_FILE, "r", encoding="utf-8") as f:
        api_key = f.read().strip()  # strip the trailing newline editors add
if not api_key:
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not api_key:
    raise ValueError(
        f"No OpenAI API key found: put it in {KEY_FILE} or set OPENAI_API_KEY."
    )

# Build the embedding client
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=api_key,
)

# Embed only the first three chunks as a sample
vectors = embedding_model.embed_documents(
    [chunk.page_content for chunk in chunks[:3]]
)

In [16]:
# Report how many embeddings were produced
print(f"Generated {len(vectors)} embedding vectors.")

# Walk the embeddings by position and preview each one
for i in range(len(vectors)):
    vec = vectors[i]
    print(f"\n--- Embedding for Chunk {i+1} (length: {len(vec)}) ---")
    preview = vec[:10]  # only the first 10 values, for readability
    print(preview, "...")

Generated 3 embedding vectors.

--- Embedding for Chunk 1 (length: 1536) ---
[0.04541555729203853, -0.0027762369333961803, -0.005980003542346092, -0.011332241661385037, -0.049095559185383025, -0.03961414043883742, -0.03240566655823034, -0.006829650978047406, -0.005344120711591003, 0.004334825962359828] ...

--- Embedding for Chunk 2 (length: 1536) ---
[0.05980083492335891, 0.013278535891342362, 0.03146538960252122, -0.00701865477855608, -0.014867218574190018, -0.04531300551036028, -0.003204039609036614, 0.023759095578180806, -0.010771027097757615, 0.005400333219709112] ...

--- Embedding for Chunk 3 (length: 1536) ---
[0.02635293400073436, 0.000685837212484199, 0.048369663403806, -0.03424202822829093, -0.016323712543248298, 0.0032031963791768086, -0.023667284545314946, 0.004752340499666927, 0.012896711654735445, 0.020114394915421094] ...


## HuggingFace

In [21]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model, placed on the CPU
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

# Collect the text of the first three chunks, then encode them in one call.
# NOTE: this rebinds `vectors`, replacing whatever the OpenAI cell stored there.
sample_texts = [chunk.page_content for chunk in chunks[0:3]]
vectors = model.encode(sample_texts)


In [22]:
# Report how many embeddings the model returned
print(f"Generated {len(vectors)} embedding vectors.")

# Preview every embedding: a header line, then its leading values
for i, vec in zip(range(len(vectors)), vectors):
    header = f"\n--- Embedding for Chunk {i+1} (length: {len(vec)}) ---"
    leading_values = vec[:10]  # truncated to 10 values for readability
    print(header)
    print(leading_values, "...")

Generated 3 embedding vectors.

--- Embedding for Chunk 1 (length: 384) ---
[-0.0242353   0.06597026 -0.00560062  0.06157163 -0.04379062  0.03772251
  0.00235332 -0.00226355  0.03798074  0.05338313] ...

--- Embedding for Chunk 2 (length: 384) ---
[-0.05809603  0.00846758 -0.00365997  0.01260815 -0.01566969  0.03913204
  0.07374223 -0.0118803   0.01346084  0.05691848] ...

--- Embedding for Chunk 3 (length: 384) ---
[-0.04248373 -0.00755953 -0.04094534 -0.05896254 -0.08073539  0.06576404
  0.08860552  0.03759485 -0.06575926  0.05521883] ...


In [23]:
from langchain.schema import Document

# (title, text) pairs for two small example documents
samples = [
    ("Introduction to AI", "AI enables automation of complex tasks."),
    ("Applications of LLMs", "LLMs are used in chatbots and assistants."),
]

# Wrap each pair in a Document, storing the title in its metadata so it can
# be used later to boost or filter during retrieval
documents = [
    Document(page_content=text, metadata={"title": title})
    for title, text in samples
]

# Show the title attached to the first document
first_title = documents[0].metadata['title']
print(first_title)  # Output: Introduction to AI

Introduction to AI
