In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
loader = PyPDFLoader('../Data/RP1.pdf')

docs = list(loader.lazy_load())

docs

[Document(metadata={'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'source': '../Data/RP1.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='13 V May 2025\nhttps://doi.org/10.22214/ijraset.2025.70977'),
 Document(metadata={'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'source': '../Data/RP1.pdf', 'total_pages': 12, 'page': 1, 'page_label': '2'}, page_content='International Journal for Research in Applied Science & Engineering Technology (IJRASET) \n                                                                                           ISSN: 2321-9653; IC Value: 45.98; SJ Impact Factor: 7.538 \n                                                                                                                Volume 13 Issue V May 2025- Available at www.ijraset.com \n     \n \n3461 \n©IJRASET: All Rights are Reserved | SJ Impact Factor 7.538 | ISRA Journal Impact Factor 7.894 | \n \nPrepmania: an AI-Po

In [7]:
len(docs)

12

In [5]:
# chunking

splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
)

chunk = splitter.split_documents(docs)

chunk


[Document(metadata={'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'source': '../Data/RP1.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='13 V May 2025\nhttps://doi.org/10.22214/ijraset.2025.70977'),
 Document(metadata={'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'source': '../Data/RP1.pdf', 'total_pages': 12, 'page': 1, 'page_label': '2'}, page_content='International Journal for Research in Applied Science & Engineering Technology (IJRASET)'),
 Document(metadata={'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'source': '../Data/RP1.pdf', 'total_pages': 12, 'page': 1, 'page_label': '2'}, page_content='ISSN:'),
 Document(metadata={'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'source': '../Data/RP1.pdf', 'total_pages': 12, 'page': 1, 'page_label': '2'}, page_content='ISSN: 2321-9653; IC Value: 45.98; SJ Impact Factor: 7.538'),


In [6]:
len(chunk)

379

In [10]:
# embedding of chunks to store in vector db

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [None]:
# db to store embeddings
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory='chroma_chunks',
    collection_name='sample2'
)

  vector_store = Chroma(


In [14]:
# addings embeddings to db
vector_store.add_documents(chunk)

['77d95c03-2428-48b8-9f8a-7cb38710019e',
 'a68ea041-41b6-4067-806b-7006031c63d0',
 '3d1d10fe-7861-40cb-a235-590225177cf2',
 '46619b21-1275-4849-8b9e-987c864e7080',
 '9ab4734b-15bd-40bf-8c63-2206faffe1ca',
 '68371767-6ee7-439b-88b6-9c3a862b9371',
 'bd0e0df2-f1ee-42c9-ba34-fe8cc60b150e',
 '8dcebd81-9cb5-4c80-bdcc-6817ec7b3902',
 '7bb82ef3-4489-4ebb-8f5c-013f948910dd',
 'df51725d-7c79-400e-a460-e13075be9907',
 '743b6fd9-9c75-4b50-bcf4-7a9322eeeac5',
 'e1ab6feb-489e-4419-9150-ab72b79a51eb',
 '0d94299b-8169-4299-8c1f-2d90ba94c529',
 'e81cc53e-9866-4624-8d01-e65dc6e56211',
 '3fac0c6a-3585-4af4-b837-94b1afc5d585',
 '35fbaf65-6561-4e9c-9a89-213aeeda783a',
 '6b4a076a-234a-4ddf-b4a5-95bdcf79dd23',
 '8b138ecd-701d-4deb-91f6-fae6f0a45c54',
 '5d5b0bc6-eb70-47ca-ba8b-832906fd2781',
 'c8eef204-3e38-4d93-bc23-097b3daecc39',
 '2b5fab67-5bbc-414b-a4bf-c856f4273ebc',
 'fcdff695-aaa5-4ee4-9ae3-e23345c4abd0',
 'fe79486b-8966-4eb5-a9cc-1a87eeab630a',
 '79a4e224-9ada-4c62-be5a-3eacba5b9639',
 '4c56ad74-4522-

In [15]:
# looking how the data looks in the db

vector_store.get(include=['embeddings','documents','metadatas'])

{'ids': ['77d95c03-2428-48b8-9f8a-7cb38710019e',
  'a68ea041-41b6-4067-806b-7006031c63d0',
  '3d1d10fe-7861-40cb-a235-590225177cf2',
  '46619b21-1275-4849-8b9e-987c864e7080',
  '9ab4734b-15bd-40bf-8c63-2206faffe1ca',
  '68371767-6ee7-439b-88b6-9c3a862b9371',
  'bd0e0df2-f1ee-42c9-ba34-fe8cc60b150e',
  '8dcebd81-9cb5-4c80-bdcc-6817ec7b3902',
  '7bb82ef3-4489-4ebb-8f5c-013f948910dd',
  'df51725d-7c79-400e-a460-e13075be9907',
  '743b6fd9-9c75-4b50-bcf4-7a9322eeeac5',
  'e1ab6feb-489e-4419-9150-ab72b79a51eb',
  '0d94299b-8169-4299-8c1f-2d90ba94c529',
  'e81cc53e-9866-4624-8d01-e65dc6e56211',
  '3fac0c6a-3585-4af4-b837-94b1afc5d585',
  '35fbaf65-6561-4e9c-9a89-213aeeda783a',
  '6b4a076a-234a-4ddf-b4a5-95bdcf79dd23',
  '8b138ecd-701d-4deb-91f6-fae6f0a45c54',
  '5d5b0bc6-eb70-47ca-ba8b-832906fd2781',
  'c8eef204-3e38-4d93-bc23-097b3daecc39',
  '2b5fab67-5bbc-414b-a4bf-c856f4273ebc',
  'fcdff695-aaa5-4ee4-9ae3-e23345c4abd0',
  'fe79486b-8966-4eb5-a9cc-1a87eeab630a',
  '79a4e224-9ada-4c62-be5a-

In [16]:
vector_store.similarity_search('tell me about the system architecture',k=3)

[Document(metadata={'creationdate': 'D:20250911155137', 'page': 3, 'producer': 'FPDF 1.82', 'creator': 'PyPDF', 'total_pages': 12, 'page_label': '4', 'source': '../Data/RP1.pdf'}, page_content='III. SYSTEM ARCHITECTURE'),
 Document(metadata={'page_label': '6', 'creator': 'PyPDF', 'creationdate': 'D:20250911155137', 'page': 5, 'total_pages': 12, 'producer': 'FPDF 1.82', 'source': '../Data/RP1.pdf'}, page_content='IV. METHODOLOGY \n1) System Design: Prepmania uses a full-stack architecture:'),
 Document(metadata={'page_label': '4', 'page': 3, 'creator': 'PyPDF', 'source': '../Data/RP1.pdf', 'creationdate': 'D:20250911155137', 'total_pages': 12, 'producer': 'FPDF 1.82'}, page_content='Fig. 1 System Architecture of Prepmania')]