### Contextual Compression in Document Retrieval

This code demonstrates the implementation of contextual compression in a document retrieval system using LangChain and OpenAI's language models. The technique aims to improve the relevance and conciseness of retrieved information by compressing and extracting the most pertinent parts of documents in the context of a given query.

In [5]:
import os
import sys
from dotenv import load_dotenv
load_dotenv()
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from utility import encode_pdf, show_context, retrieve_context_per_question
from langchain_core.output_parsers import StrOutputParser
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_community.docstore.in_memory import InMemoryDocstore
from tqdm import tqdm
from langchain.vectorstores import Chroma, FAISS
import faiss
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from utility import replace_t_with_space
from langchain_experimental.text_splitter import SemanticChunker

In [2]:
file_path = "data/Understanding_Climate_Change.pdf"
vector_store = encode_pdf(file_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Base retriever 
retriever = vector_store.as_retriever(search_kwargs={'k':3})

from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

#SEtup the compressor using LLM
groq_api_key=os.getenv("GROQ_API_KEY")
llm=ChatGroq(groq_api_key=groq_api_key,model_name="Llama3-8b-8192")
compressor = LLMChainExtractor.from_llm(llm)

#Create contextual compression retriever 
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor
)

In [6]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
system_prompt = """ 
    Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}
"""
prompt = ChatPromptTemplate(
    [
        ("system",system_prompt),
        ("user","{input}")
    ]
)

qa_chain = create_stuff_documents_chain(llm,prompt)
retriever_chain = create_retrieval_chain(retriever,qa_chain)

In [7]:
query = "What is the main topic of the document?"
result = retriever_chain.invoke({"input":query})

In [8]:
result

{'input': 'What is the main topic of the document?',
 'context': [Document(id='ecda88c8-3e10-4c41-838a-e28691f33ffb', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': 'data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 26, 'page_label': '27'}, page_content='challenges. This includes collaboration between scientists, policymakers, businesses, and \ncommunities. Interdisciplinary research and solutions are more holistic and effective. \nCitizen Science \nCitizen science involves engaging the public in scientific research and data collection. This \nempowers individuals to contribute to climate knowledge and action. Citizen science projects \ncan enhance data accuracy, raise awareness, and foster community engagement. \nHope and Inspiration \nPositive Narratives'),
  Document(id='59043a38-45d4-4b25-abd1-4a143d54599b', metadata={'pr