In [1]:
!pip install -U langchain langchain-community chromadb transformers sentence-transformers


Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting chromadb
  Downloading chromadb-1.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Do

In [9]:
!pip install pypdf




In [13]:
import os
from transformers import AutoTokenizer
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

#Use your actual file path
PDF_PATH = "/5008_Federalist Papers.pdf"
COLLECTION_NAME = "federalist_index"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
CHROMA_PERSIST_DIR = "./chroma_db"

#Split PDF into chunks
def pdf_to_chunks(pdf_file):
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        separators=["\n\n", "\n", " ", ""],
        chunk_size=512,
        chunk_overlap=0,
    )
    loader = PyPDFLoader(pdf_file)
    docs = loader.load_and_split(text_splitter)
    return docs

#Save chunks + embeddings in Chroma DB
def create_index_chroma(docs, embeddings, persist_dir):
    db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=persist_dir,
    )
    db.persist()
    return db

#Main Flow
def main():
    print("Loading and splitting PDF...")
    docs = pdf_to_chunks(PDF_PATH)
    print(f"Total chunks: {len(docs)}")

    print("Generating embeddings...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print("Saving to Chroma DB...")
    db = create_index_chroma(docs, embeddings, CHROMA_PERSIST_DIR)

    print("Done! Embeddings saved to local vector DB.")

main()


Loading and splitting PDF...
Total chunks: 593
Generating embeddings...
Saving to Chroma DB...
Done! Embeddings saved to local vector DB.


  db.persist()


In [14]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Reconnect to your saved Chroma DB
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_model,
    collection_name="federalist_index"
)

#Ask a question
query = "Why is the Union important according to the Federalist Papers?"
results = db.similarity_search(query, k=1)

#Print the top 3 most relevant chunks
for i, doc in enumerate(results):
    print(f"\n📄 Result {i+1}:\n{doc.page_content}")



📄 Result 1:
3
- The utility of the UNION to your political prosperity  
- The insufficiency of the present Confederation to preserve that Union  
- The necessity of a government at least equally energetic with the one proposed, to the attainment 
of this object  
- The conformity of the proposed Constitution to th e true principles of republican government - Its 
analogy to your own State constitution  
- and lastly, The additional security which its adoption will afford to the preservation of that species 
of government to liberty, and to property.   
 
In the progress of this discussion I shall endeavor to  give a satisfactory answer to all the objections 
which shall have made their appearance, that may seem to have any claim to your attention.   
 
It may perhaps be thought superfluous to offer arguments to prove the utility of the UNION, a 
point, no doubt, deeply engraved on the hearts of the great body of the people in every State, and 
one, which it may be imagined, has no adv