## __RAG function__

In [1]:
import re
import os
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Function to clean text
def clean_text(text):
    # Replace non-breaking space with regular space
    text = text.replace('\xa0', ' ')
    
    # Remove any HTML tags (if any)
    text = re.sub(r'<[^>]+>', '', text)  # Removes HTML tags
    
    # Remove references in brackets (e.g., [7], [39])
    text = re.sub(r'\[.*?\]', '', text)  # Removes references inside square brackets
    
    # Remove extra spaces and newlines
    text = ' '.join(text.split())  # This will remove extra spaces and newline characters
    
    return text

def rag_function(query, db_path, index_name):
    api_key=os.environ["GROQ_API_KEY"]

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

    db=FAISS.load_local(folder_path=db_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)

    retrieved_docs = db.similarity_search(query, k=3)

    retrieved_context=[clean_text(retrieved_docs[0].page_content + retrieved_docs[1].page_content + retrieved_docs[2].page_content)]


    augmented_prompt=f"""

    Given the context below answer the question.

    Question: {query} 

    Context : {retrieved_context}

    Remember to answer only based on the context provided and not from any other source. 

    If the question cannot be answered based on the provided context, say I don’t know.

    """

    llm = ChatGroq(
        model="llama-3.1-8b-instant", # Recommended Groq model
        temperature=0.1**50,
        max_tokens=None,
        timeout=None,
        max_retries=2
        # The api_key parameter is automatically inferred from the GROQ_API_KEY 
        # environment variable if not provided explicitly
    )

    messages = [("human", augmented_prompt)]

    # Invoke the LLM
    ai_msg = llm.invoke(messages)

    # Extract the answer from the response object
    response = ai_msg.content

    # print(response)

    return retrieved_context, response

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
rag_function(query="Who won National Film Award 2025 ?", db_path="./Assets/Data", index_name="CWC_index")

(['National Film Awards71st National Film Awards71st National Film Awards'],
 "I don't know. \n\nThe context provided does not contain any information about the winner of the National Film Award 2025.")

In [4]:
rag_function(query="Name three personalities from the Indian film industry was appointed to evaluate the award ?", db_path="./Assets/Data", index_name="CWC_index")

(['A committee consisting three personalities from the Indian film industry was appointed to evaluate the award. Following were the jury members: Jury MembersAward ceremony for Indian films of 2023Directorate of Film Festivals Ministry of Information and Broadcasting Cinema of India Lifetime Achievement'],
 "I don't know. \n\nThe context only mentions that a committee consisting of three personalities from the Indian film industry was appointed to evaluate the award, but it does not provide the names of the jury members.")

In [3]:
rag_function(query="what is Dadasaheb Phalke Award ?", db_path="./Assets/Data", index_name="CWC_index")

(["Dadasaheb Phalke Award 2 Dadasaheb Phalke AwardIntroduced in 1969, the Dadasaheb Phalke Award is India's highest award in the field of cinema given to recognise the contributions of film personalities towards the development of Indian cinema and for distinguished contributions to the medium, its growth and promotion. The recipient is awarded with 'Golden Lotus Award' (Swarna Kamal), cash prize of ₹ 10 lakh (US$12,000), medallion and a shawl. The 2023 recipient is Mohanlal ."],
 "The Dadasaheb Phalke Award is India's highest award in the field of cinema. It was introduced in 1969 and is given to recognise the contributions of film personalities towards the development of Indian cinema and for distinguished contributions to the medium, its growth and promotion.")

If you observe our RAG system is still not properly aligned to act on similar meaning words.