In [1]:
# Installation (prerequisites)
!pip install -U langchain langchain-community faiss-cpu openai langchain-openai



In [2]:
from google.colab import files
uploaded = files.upload()

Saving attention-is-all-you-need.pdf to attention-is-all-you-need (3).pdf


In [3]:
import os
print(os.path.exists("attention-is-all-you-need.pdf"))

True


In [5]:
# OpenAI
from google.colab import userdata
OPENAI_API_KEY = userdata.get("OpenAI-key")
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, openai_api_key=OPENAI_API_KEY)

# Parent-child chunking
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("attention-is-all-you-need.pdf")
docs = loader.load()

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=512)

parent_docs = parent_splitter.split_documents(docs)
child_docs_nested = [child_splitter.split_documents([doc]) for doc in parent_docs]

from itertools import chain

def parent_child_chunking(child_docs_nested):
  child_docs = list(chain.from_iterable(child_docs_nested))
  return child_docs

child_docs = parent_child_chunking(child_docs_nested)

# Hybrid Search
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
child_vectorstore = FAISS.from_documents(child_docs, embedder)

def hybrid_search(query , child_vectorstore, k_retrieve):
    vector_results = child_vectorstore.similarity_search(query, k=k_retrieve)
    keyword_results = child_vectorstore.max_marginal_relevance_search(query, k=k_retrieve)
    return vector_results + keyword_results

# Reranking Setup
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
def rerank(query, docs):
    scores = cross_encoder.predict([(query, doc.page_content) for doc in docs])
    ranked_docs = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, score in ranked_docs[:5]]

# Prompt_engineering
def prompt_engineering(reranked, query):
  context = "\n".join([f"Document {i}:: {doc.page_content}" for i, doc in enumerate(reranked)])
  prompt = f"""Answer using ONLY these documents:
{context}
Question: {query}"""
  return prompt

# RAG Pipeline Function
def advanced_rag(query, llm, child_docs_nested, child_vectorstore, k_retrieve=30, k_final=5):

    # Parent_child_chunking
    parent_child_chunking(child_docs_nested)

    # Hybrid retrieval
    combined_docs = hybrid_search(query, child_vectorstore,  k_retrieve )

    # Reranking
    reranked = rerank(query, combined_docs)

    # Format prompt
    prompt = prompt_engineering(reranked, query)
    return llm.invoke(prompt)

# Example Usage
query = "What is the self-attention mechanism in Transformers?"
response = advanced_rag(query, llm, child_docs_nested, child_vectorstore)

print(response)

content='The self-attention mechanism in Transformers is a mechanism that relates different positions of a single sequence in order to compute a representation of the sequence. It involves averaging attention-weighted positions to reduce the number of operations, but this can lead to reduced effective resolution, which is counteracted by Multi-Head Attention.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 61, 'prompt_tokens': 531, 'total_tokens': 592, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-BbUPUoEPalqwlr1UyGsYA0w9em7h9', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--6278282d-a285-477d-a2d5-2c9268ac83fc-0' usage_metadata={'input_tokens': 531, 'output_tokens': 61, 'to