In [None]:
# Notebook shell magic: install/upgrade the third-party packages this notebook depends on.
!pip install -U langchain langchain-community faiss-cpu openai langchain-openai sentence_transformers pypdf



In [None]:
# Prompt for a file upload through Colab's `files` helper and keep the result in `uploaded`.
from google.colab import files
uploaded = files.upload()

In [None]:
import os
# Sanity check: report whether the PDF is present in the current working directory.
print(os.path.exists("attention-is-all-you-need.pdf"))

True


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from itertools import chain
from google.colab import userdata
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from huggingface_hub import login
from sentence_transformers import CrossEncoder
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Parent-child chunking
# Load the uploaded PDF so that `docs` exists before it is split
# (PyPDFLoader.load() returns a list of LangChain Document objects).
PDF_PATH = "attention-is-all-you-need.pdf"
docs = PyPDFLoader(PDF_PATH).load()

# Two-level split: coarse parents (chunk_size=2000), each of which is
# re-split into finer children (chunk_size=512).
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=512)

parent_docs = parent_splitter.split_documents(docs)
# One list of child chunks per parent, in parent order.
child_docs_nested = [child_splitter.split_documents([doc]) for doc in parent_docs]

def parent_child_chunking(child_docs_nested):
    """Flatten the per-parent lists of child chunks into one list, keeping order."""
    return [child for siblings in child_docs_nested for child in siblings]

# Flat list of every child chunk; this is what gets embedded into the vector store below.
child_docs = parent_child_chunking(child_docs_nested)


# Rewriting Query
def rewrite_query(query, llm):
    """Ask the LLM for a retrieval-oriented rephrasing of ``query``.

    Prints both the original and the rewritten query, then returns the
    rewritten text as a plain string.
    """
    rewrite_instructions = """
    You are a helpful assistant that rewrites user queries to improve document retrieval.
    Consider the following user query and generate a more effective query for a search engine.
    The goal is to broaden or clarify the search to find more relevant information.

    Original query: {query}

    Rewritten query:
    """
    # prompt -> model -> string parser, so invoke() yields text rather than a message object.
    rewrite_pipeline = (
        ChatPromptTemplate.from_template(rewrite_instructions)
        | llm
        | StrOutputParser()
    )
    rewritten_query = rewrite_pipeline.invoke({"query": query})
    print(f"Original Query: {query}")
    print(f"Rewritten Query: {rewritten_query}")
    return rewritten_query



# Hybrid Search
# The OpenAI API key is read from Colab's `userdata` under the name "OpenAI-key".
OPENAI_API_KEY = userdata.get("OpenAI-key")
# Chat model passed into the pipeline; it is used for query rewriting and for the final answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, openai_api_key=OPENAI_API_KEY)
# Embedding model used to vectorize the child chunks (no model name given, so the library default applies).
embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Build the FAISS index over the flattened child chunks.
child_vectorstore = FAISS.from_documents(child_docs, embedder)

def hybrid_search(query, child_vectorstore, k_retrieve, fetch_k=None):
    """Retrieve candidates by plain similarity and by MMR, then merge them.

    Both legs are dense-vector searches against the same store; there is no
    lexical/keyword component. The first leg is a top-``k_retrieve``
    similarity search, the second a maximal-marginal-relevance (MMR) search
    that trades some relevance for diversity.

    Args:
        query: Search text.
        child_vectorstore: Vector store exposing ``similarity_search`` and
            ``max_marginal_relevance_search``.
        k_retrieve: Number of documents requested from each leg.
        fetch_k: Size of the candidate pool MMR selects from. Defaults to
            ``max(20, 4 * k_retrieve)``.

    Returns:
        A list of documents that is unique by ``page_content``, ordered by
        first appearance (similarity hits first, then MMR-only hits). When
        the same text is returned twice, the later document object is kept.
    """
    if fetch_k is None:
        # MMR chooses its k results from the top fetch_k similarity hits. With
        # the library default (fetch_k=20) and k_retrieve >= 20, every MMR hit
        # is already among the similarity hits, so the second leg contributes
        # nothing. Widen the pool so MMR can surface different chunks.
        fetch_k = max(20, 4 * k_retrieve)

    similarity_hits = child_vectorstore.similarity_search(query, k=k_retrieve)
    mmr_hits = child_vectorstore.max_marginal_relevance_search(
        query, k=k_retrieve, fetch_k=fetch_k
    )

    # Deduplicate on chunk text; dict insertion order preserves first-seen rank.
    unique_by_text = {}
    for doc in similarity_hits + mmr_hits:
        unique_by_text[doc.page_content] = doc
    return list(unique_by_text.values())

# Reranking
# The Hugging Face token is read from Colab's `userdata` under the name "HFAPI-key" and passed to `login`.
HF_API_KEY = userdata.get("HFAPI-key")
login(HF_API_KEY)
# Cross-encoder used by rerank() to score (query, passage) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank(query, docs, top_k=5):
    """Order ``docs`` by cross-encoder relevance to ``query`` and keep the best.

    Args:
        query: Text the documents are scored against.
        docs: Candidate documents; each must expose ``page_content``.
        top_k: Maximum number of documents to return (default 5).

    Returns:
        Up to ``top_k`` documents, highest score first. Returns an empty list
        when there are no candidates or ``top_k`` is not positive.
    """
    # Nothing to score: return early instead of handing the model an empty batch.
    if not docs or top_k <= 0:
        return []

    pairs = [(query, doc.page_content) for doc in docs]
    scores = cross_encoder.predict(pairs)
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]

# Prompt Engineering
def prompt_engineering(reranked, query):
    """Build the answer prompt from the reranked documents and the question.

    Documents are rendered as ``Document <i>:: <text>`` with ``i`` counting
    from 0 and joined by newlines, framed by an instruction line above and
    the question below.
    """
    numbered_passages = (
        f"Document {position}:: {passage.page_content}"
        for position, passage in enumerate(reranked)
    )
    context = "\n".join(numbered_passages)
    return f"""Answer using ONLY these documents:
{context}
Question: {query}"""



# Evaluation
def evaluate_rag_response(query, retrieved_docs, final_response):
    """Print a human-readable report of one pipeline run.

    The report shows the query, the first 200 characters of each retrieved
    document (numbered from 1), and the generated response. Nothing is
    returned.
    """
    divider = "-" * 30

    print("\n--- RAG Pipeline Evaluation ---")
    print(f"Original Query: {query}")

    print("\nRetrieved and Reranked Documents:")
    for number, document in enumerate(retrieved_docs, start=1):
        preview = document.page_content[:200]  # keep the report short
        print(f"Document {number}:")
        print(f"Content: {preview}...")
        print(divider)

    print("\nFinal Generated Response:")
    print(final_response)

    print("\n--- End Evaluation ---")


# RAG Pipeline Function
def advanced_rag(query, llm, child_docs_nested, child_vectorstore, k_retrieve=30, k_final=5, enable_query_rewriting=True):
    """Run the pipeline: optional query rewrite -> retrieval -> rerank -> answer.

    Args:
        query: The user's question.
        llm: Chat model used for query rewriting and for the final answer.
        child_docs_nested: Not used; retrieval runs against
            ``child_vectorstore``. Kept so existing call sites keep working.
        child_vectorstore: Vector store searched by ``hybrid_search``.
        k_retrieve: Number of candidates requested from each retrieval leg.
        k_final: Maximum number of reranked documents placed in the prompt
            (``rerank`` may itself return fewer).
        enable_query_rewriting: When true, retrieval and reranking use an
            LLM-rewritten version of ``query``.

    Returns:
        Tuple ``(original_query, reranked_docs, final_response)`` where
        ``final_response`` is the answer text.
    """
    original_query = query  # reported back and used as the question to answer

    # The rewrite exists only to improve retrieval, so it gets its own name
    # and never replaces the user's question.
    search_query = rewrite_query(query, llm) if enable_query_rewriting else query

    # Candidate retrieval followed by cross-encoder reranking.
    combined_docs = hybrid_search(search_query, child_vectorstore, k_retrieve)
    reranked = rerank(search_query, combined_docs)[:k_final]

    # Ask the user's actual question, grounded in the retrieved context.
    prompt = prompt_engineering(reranked, original_query)
    final_response = llm.invoke(prompt).content  # message object -> text

    return original_query, reranked, final_response

# Example Usage
# Run 1: retrieval and reranking use the LLM-rewritten query.
print("--- Running RAG with Query Rewriting and Evaluation ---")
query = "What is the self-attention mechanism in Transformers?"
# advanced_rag returns (original query, reranked docs, answer text); despite its name,
# `final_response_obj` holds the answer string.
original_query, retrieved_docs_for_eval, final_response_obj = advanced_rag(query, llm, child_docs_nested, child_vectorstore, enable_query_rewriting=True)

# Print the retrieved documents and the answer for the rewritten-query run
evaluate_rag_response(original_query, retrieved_docs_for_eval, final_response_obj)

# Run 2: same question with rewriting disabled, so the two reports can be compared.
print("\n\n--- Running RAG without Query Rewriting (for comparison) ---")
query_no_rewrite = "What is the self-attention mechanism in Transformers?"
original_query_no_rewrite, retrieved_docs_no_rewrite, final_response_no_rewrite = advanced_rag(query_no_rewrite, llm, child_docs_nested, child_vectorstore, enable_query_rewriting=False)

# Print the retrieved documents and the answer for the no-rewrite run
evaluate_rag_response(original_query_no_rewrite, retrieved_docs_no_rewrite, final_response_no_rewrite)

--- Running RAG with Query Rewriting and Evaluation ---
Original Query: What is the self-attention mechanism in Transformers?
Rewritten Query: Explain the concept of self-attention mechanism in Transformers and its significance.

--- RAG Pipeline Evaluation ---
Original Query: What is the self-attention mechanism in Transformers?

Retrieved and Reranked Documents:
Document 1:
Content: reduced to a constant number of operations, albeit at the cost of reduced effective resolution due
to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as
descr...
------------------------------
Document 2:
Content: typical encoder-decoder attention mechanisms in sequence-to-sequence models such as
[38, 2, 9].
• The encoder contains self-attention layers. In a self-attention layer all of the keys, values
and quer...
------------------------------
Document 3:
Content: encoder.
• Similarly, self-attention layers in the decoder allow each position in the decoder to att