# RAG with PDF using LangChain

## Step 1: Import the necessary libraries and instantiate models

In [2]:
# libraries and models setup
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import SelfQueryRetriever
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv();
# the return value is assigned to `_` so the cell does not display it.
_ = load_dotenv(find_dotenv())

Instantiate the `AzureChatOpenAI` and `AzureOpenAIEmbeddings` models, as done previously.

In [None]:
# Chat model: endpoint, key, deployment and model name are read from environment
# variables; the API version is pinned in code.
llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_API_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_deployment=os.getenv("OPENAI_DEPLOYMENT_NAME"),
    openai_api_version="2023-09-01-preview",
    model_name=os.getenv("OPENAI_MODEL_NAME"),
    # NOTE(review): model_version is filled from OPENAI_API_VERSION while the API
    # version itself is hard-coded above -- confirm this is the intended variable.
    model_version=os.getenv("OPENAI_API_VERSION"),
    temperature=0.7,
)

# Embeddings model: same endpoint and key as the chat model, but its own
# deployment (OPENAI_DEPLOYMENT_NAME_EMBEDDING).
embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_API_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_deployment=os.getenv("OPENAI_DEPLOYMENT_NAME_EMBEDDING"),
    openai_api_version="2023-09-01-preview",
)

## Step 2: Load the data from the PDF

Use the `PyPDFLoader` class to load the PDF file from the `data` folder. Once the loader has been instantiated with the file path, call its `load` method to load the document.

In [4]:
# Location of the PDF to index (relative to the notebook's working directory)
pdf_file_path = "data/2005.11401v4.pdf"

# Build the loader for that file, then read the PDF into `data`
loader = PyPDFLoader(file_path=pdf_file_path)
data = loader.load()

## Step 3: Split the PDF content into smaller chunks

Use `RecursiveCharacterTextSplitter` with the following parameters: `chunk_size=1000, chunk_overlap=150`.

In [5]:
# Splitter configured with a chunk size of 1000 and an overlap of 150
# between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1000,
)

# Break the loaded PDF documents into chunks
docs = text_splitter.split_documents(documents=data)

## Step 4: Create a Chroma vector store to store the embeddings

In [6]:
# Directory where the Chroma database is persisted. A dedicated sub-folder is
# used so the database files do not land among the notebook's own files.
db_path = "./chroma_db"

# Chroma.from_documents adds the chunks to whatever collection is already stored
# in persist_directory, so calling it on every run would embed and store the same
# chunks again and retrieval would start returning duplicates. Build the store
# only when nothing has been persisted yet; otherwise reopen the existing one.
# To re-index (e.g. after changing the PDF or the chunking), delete db_path first.
if os.path.isdir(db_path) and os.listdir(db_path):
    chroma_db = Chroma(persist_directory=db_path, embedding_function=embedding_model)
else:
    chroma_db = Chroma.from_documents(docs, embedding_model, persist_directory=db_path)


## Step 5: Retrieve relevant documents (RETRIEVAL)

In [8]:
# Create a retriever object using Chroma
retriever = chroma_db.as_retriever()

# Search for relevant documents based on a user query.
# The result gets its own name: reusing `docs` would overwrite the chunk list
# produced in Step 3, so re-running Step 4 afterwards would index only these
# few retrieved documents instead of the whole PDF.
query = "What is the topic of the PDF?"
retrieved_docs = retriever.invoke(query)
retrieved_docs

[Document(page_content='Jonathan Berant. Coarse-to-ﬁne question answering for long documents. In Proceedings of the\n55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) ,\npages 209–220, Vancouver, Canada, July 2017. Association for Computational Linguistics. doi:\n10.18653/v1/P17-1020. URL https://www.aclweb.org/anthology/P17-1020 .\n10', metadata={'page': 9, 'source': 'data/2005.11401v4.pdf'}),
 Document(page_content='Processing , pages 3950–3959, Brussels, Belgium, October-November 2018. Association for\nComputational Linguistics. doi: 10.18653/v1/D18-1429. URL https://www.aclweb.org/\nanthology/D18-1429 .\n[43] Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, Rangan Majumder,\nand Li Deng. MS MARCO: A human generated machine reading comprehension dataset. In\nTarek Richard Besold, Antoine Bordes, Artur S. d’Avila Garcez, and Greg Wayne, editors,\nProceedings of the Workshop on Cognitive Computation: Integrating neural and s

## Step 6: Perform RAG using RetrievalQA

In [10]:
# Retriever over the Chroma store; search_kwargs requests k=4 documents per query
retriever = chroma_db.as_retriever(search_kwargs=dict(k=4))

# Question-answering chain: the chat model combined with the retriever,
# using the "refine" chain type and returning the answer without source documents
rag_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="refine",
    return_source_documents=False,
)

# Run the chain on a question and print the full response dictionary
question = "What is the main topic discussed in the PDF?"
rag_inputs = {"query": question}
response = rag_chain.invoke(rag_inputs)
print(response)


{'query': 'What is the main topic discussed in the PDF?', 'result': 'The main topic discussed in the PDF is the Retrieval-Augmented Generation (RAG) model, which is a combination of a parametric neural network and a non-parametric memory. The PDF presents the design and evaluation of RAG in the context of natural language generation and fact verification tasks, with a focus on its ability to generate coherent and plausible text while also being able to verify the truthfulness of the generated information. Results are presented for various benchmarks, including the FEVER dataset, where RAG achieves competitive performance compared to state-of-the-art models without requiring intermediate retrieval supervision.'}
