<h1> A Beginner’s Guide to Retrieval-Augmented Generation (RAG) </h1>

In [1]:
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA




In [2]:
# read PDF document
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: The PDF to read, handed unchanged to ``PyPDF2.PdfReader``
            (this notebook passes a path string).

    Returns:
        str: The text of every page that yielded text, in page order, with a
        single newline between pages. An empty string if no page yielded any.
    """
    reader = PyPDF2.PdfReader(file)
    # A page may yield nothing (empty string, or None) when it has no text
    # layer; skip those so one blank page cannot break the whole extraction.
    page_texts = [
        page_text
        for page_text in (page.extract_text() for page in reader.pages)
        if page_text
    ]
    # Join once with a separator: plain concatenation fuses the last word of
    # one page with the first word of the next, which hurts chunking/retrieval.
    return "\n".join(page_texts)

In [3]:
# Chunk text
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The full document text to split.
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Amount of content shared between consecutive chunks,
            forwarded to ``RecursiveCharacterTextSplitter``. Defaults to 100.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

In [4]:
# Create embedding and index
def create_embeddings_and_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed text chunks and build a FAISS vector store over them.

    Args:
        chunks: Iterable of text chunks to index.
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``. Defaults to
            ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        The vector store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty (for example, the source PDF
            yielded no extractable text).
    """
    chunk_list = list(chunks)
    # Fail early with a clear message instead of letting an empty list reach
    # the index constructor, where the resulting error is hard to interpret.
    if not chunk_list:
        raise ValueError("Cannot build a vector index from an empty list of chunks.")

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunk_list, embedding=embeddings)

In [5]:
def query_document(llm, vector_store, query):
    """Answer ``query`` using documents retrieved from ``vector_store``.

    Args:
        llm: The language model handed to ``RetrievalQA.from_chain_type``.
        vector_store: A vector store exposing ``as_retriever()``; its
            retriever supplies the context for the answer.
        query: The question to ask.

    Returns:
        The chain's answer, i.e. the ``"result"`` entry of its output.
    """
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())
    # ``Chain.run`` is deprecated in LangChain; ``invoke`` is its replacement.
    # RetrievalQA reads the question from the "query" key and puts the answer
    # under "result", so this returns the same string ``run(query)`` did.
    outputs = qa_chain.invoke({"query": query})
    return outputs["result"]

In [None]:
def rag_system(llm, document, query):
    """Run the full pipeline: chunk the document, index it, answer the query.

    The index is built from ``document`` on every call.

    Args:
        llm: The language model passed through to ``query_document``.
        document: The document text, passed to ``chunk_text``.
        query: The question to answer.

    Returns:
        Whatever ``query_document`` returns for the query.
    """
    vector_store = create_embeddings_and_index(chunk_text(document))
    return query_document(llm, vector_store, query)

In [None]:
# Language model used to answer queries: the "llama2" model via the Ollama wrapper.
# NOTE(review): presumably needs a reachable Ollama server with this model
# already pulled — confirm before running.
llm = Ollama(model="llama2")

In [None]:
# Placeholder: replace with the path of the PDF to query before running this cell.
document_path = r"path_of_pdf_document"
# Extract the PDF's text once; `document` holds the text, not the file path.
document = read_pdf(document_path)
query = "What is profit after tax of this financial year?"
# Chunk and index the document, then answer the query with the `llm` created
# in the previous cell.
response = rag_system(llm, document, query)
print(response)