In [23]:
from dotenv import load_dotenv
load_dotenv()

from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash-preview-04-17')

from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name = 'BAAI/bge-base-en-v1.5')

In [24]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

documents = TextLoader(file_path='state_of_union.txt',autodetect_encoding=True)
documents = documents.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

In [25]:
texts

[Document(metadata={'source': 'state_of_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.'),
 Document(metadata={'source': 'state_of_union.txt'}, page_content='And with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he nev

In [26]:
from langchain_community.vectorstores import FAISS
vectordb = FAISS.from_documents(documents=texts, embedding=embedding_model)
retriever = vectordb.as_retriever()

In [27]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

In [28]:
compressed_docs = compression_retriever.invoke("What were the top three priorities outlined in the most recent State of the Union address?")
compressed_docs

[Document(metadata={'source': 'state_of_union.txt'}, page_content='So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together.\n\nFirst, beat the opioid epidemic.'),
 Document(metadata={'source': 'state_of_union.txt'}, page_content='Invest in America. Educate Americans. Grow the workforce.'),
 Document(metadata={'source': 'state_of_union.txt'}, page_content='Let’s pass the Paycheck Fairness Act and paid leave.\n\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty.\n\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.\n\nAnd let’s pass the PRO Act when a majority of workers want to form a union—they shouldn’t be stopped.')]

In [29]:
from langchain.retrievers.document_compressors import LLMChainFilter

filter = LLMChainFilter.from_llm(llm)
compression_retriever2 = ContextualCompressionRetriever(base_compressor=filter, base_retriever=retriever)
compressed_docs2 = compression_retriever2.invoke("What were the top three priorities outlined in the most recent State of the Union address?")
compressed_docs2

[Document(id='2ad7cc22-4e20-4e9d-aec0-b4ea2acee764', metadata={'source': 'state_of_union.txt'}, page_content='And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together.  \n\nFirst, beat the opioid epidemic. \n\nThere is so much we can do. Increase funding for prevention, treatment, harm reduction, and recovery.'),
 Document(id='acbe618e-2ef3-42fe-8e3a-49b8cae5c015', metadata={'source': 'state_of_union.txt'}, page_content='Let’s pass the Paycheck Fairness Act and paid leave.  \n\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \n\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community co

In [30]:
from langchain.retrievers.document_compressors import EmbeddingsFilter
embeddings_filter = EmbeddingsFilter(embeddings=embedding_model)
compression_retriever3 = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)
compressed_docs3 = compression_retriever3.invoke("What were the top three priorities outlined in the most recent State of the Union address?")
compressed_docs3

[_DocumentWithState(metadata={'source': 'state_of_union.txt'}, page_content='And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together.  \n\nFirst, beat the opioid epidemic. \n\nThere is so much we can do. Increase funding for prevention, treatment, harm reduction, and recovery.', state={'embedded_doc': [-0.01875210925936699, -0.051563967019319534, -0.012639066204428673, -0.015856551006436348, -0.006436474621295929, 0.045767638832330704, 0.05876805633306503, -0.005884653888642788, -0.04514610022306442, -0.04062095284461975, 0.050369106233119965, 0.001644234056584537, -0.08319339901208878, 0.033796198666095734, 0.001673919497989118, 0.06262464076280594, 0.08327729254961014, 0.005334349814802408, 0.02231842838227749, 0.013441515155136585, -0.009698674082756042, 0.075989603

In [31]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)
relevant_filter = EmbeddingsFilter(embeddings=embedding_model, similarity_threshold=0.76)

pipeline_compressor = DocumentCompressorPipeline(transformers=[splitter, redundant_filter, relevant_filter])

compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)

compressed_docs = compression_retriever.invoke("What were the top three priorities outlined in the most recent State of the Union address?")

compressed_docs





[]