### Install all necessary dependencies

In [None]:
!pip install wikipedia-api faiss-cpu numpy tqdm sentence-transformers langdetect transformers rank-bm25 indic-nlp-library sacrebleu deep-translator datasets

### Import all necessary dependencies

In [None]:
import wikipediaapi
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langdetect import detect
from transformers import T5Tokenizer, T5ForConditionalGeneration
from deep_translator import GoogleTranslator
import torch

### Configuration Parameters

This section outlines the configurable parameters used in the RAG pipeline. To adjust the system's behavior, modify the following variables as needed:
* MODEL_NAME
* LANGUAGE
* TOP_K

If you add an extra language, don't forget to add a matching entry for it to both the **Topic Mapping** and the **Language Code Mapping**.

In [None]:
# --- Generator model --------------------------------------------------------
# Keep exactly one MODEL_NAME assignment active; the commented-out lines are
# ready-made alternatives.
# MODEL_NAME = "t5-small"
MODEL_NAME = "t5-base"
# MODEL_NAME = "t5-large"

# --- Working language -------------------------------------------------------
# Keep exactly one LANGUAGE assignment active. The value is used as the key
# into LANGUAGE_MAPPING and TOPIC_MAPPING below.
# LANGUAGE = "hi"  # Hindi
LANGUAGE = "en"  # English
# LANGUAGE = "bn"  # Bengali
# LANGUAGE = "mr"  # Marathi
# LANGUAGE = "ta"  # Tamil
# LANGUAGE = "te"  # Telugu

# --- Retrieval --------------------------------------------------------------
# Number of documents fetched from the index for each query.
TOP_K = 5

# --- Language code mapping --------------------------------------------------
# Notebook language code -> language code handed to the Wikipedia client in
# scrape_wikipedia().
LANGUAGE_MAPPING = dict(
    hi="hi",  # Hindi
    en="en",  # English
    bn="bn",  # Bengali
    mr="mr",  # Marathi
    ta="ta",  # Tamil
    te="te",  # Telugu
)

# --- Topic mapping ----------------------------------------------------------
# Title of the seed Wikipedia page ("Culture") in each supported language.
TOPIC_MAPPING = dict(
    hi="संस्कृति",  # Hindi
    en="Culture",  # English
    bn="সংস্কৃতি",  # Bengali
    mr="संस्कृती",  # Marathi
    ta="பண்பாடு",  # Tamil
    te="సంస్కృతి",  # Telugu
)

**The RAG pipeline encompasses these key functions:**

* Wikipedia Scraping
* Data Preprocessing
* FAISS-based Indexing (Dense Retrieval)
* Language Detection and Translation
* RAG Response Generation

In [None]:
# Step 1: Wikipedia Scraping
def scrape_wikipedia(pages=5000, language=LANGUAGE):
    """Collect cleaned article texts linked from the language's seed topic page.

    Args:
        pages: Upper bound on the number of articles returned.
        language: Key looked up in LANGUAGE_MAPPING and TOPIC_MAPPING. Unknown
            keys fall back to the "en" edition and the "Culture" topic. The
            default is bound to LANGUAGE when this function is defined.

    Returns:
        A list of preprocessed article texts, each longer than 50
        whitespace-separated words. The list is empty when the seed page
        does not exist.
    """
    edition = LANGUAGE_MAPPING.get(language, "en")
    seed_title = TOPIC_MAPPING.get(language, "Culture")
    client = wikipediaapi.Wikipedia(language=edition, user_agent="RAG-INDI/1.0 (sanjaydeo96@gmail.com)")
    seed_page = client.page(seed_title)

    if not seed_page.exists():
        print(f"⚠️ Error: Wikipedia page '{seed_title}' does not exist in {edition}.")
        return []

    collected = []
    for linked in seed_page.links.values():
        # Stop as soon as the requested number of articles has been gathered.
        if len(collected) >= pages:
            break
        try:
            candidate = client.page(linked.title)
            if not candidate.exists():
                continue
            cleaned = preprocess_text(candidate.text)
            # Very short pages are skipped.
            if len(cleaned.split()) > 50:
                collected.append(cleaned)
        except Exception as err:
            # One failing page must not abort the whole crawl.
            print(f"Error retrieving page '{linked.title}': {err}")

    return collected

# Step 2: Data Preprocessing
def preprocess_text(text):
    """Strip numeric citation markers and normalise whitespace.

    Args:
        text: Raw text.

    Returns:
        The text with bracketed numeric markers such as "[12]" (and empty
        "[]") removed, every run of whitespace collapsed to a single space,
        and leading/trailing whitespace trimmed.
    """
    # Remove markers first: deleting "[1]" from "foo [1] bar" leaves two
    # adjacent spaces, which the whitespace pass below then collapses.
    text = re.sub(r'\[[0-9]*\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Step 3: Indexing using FAISS (Dense Retrieval)
class Indexing:
    """Dense retriever: sentence-transformer embeddings in a FAISS flat L2 index.

    Attributes:
        documents: Preprocessed document texts, in the order they were given.
        embedding_model: SentenceTransformer used for documents and queries.
        doc_embeddings: float32 array of document embeddings.
        index: faiss.IndexFlatL2 holding doc_embeddings.
    """

    def __init__(self, documents, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
        """Embed the documents and build the index.

        Args:
            documents: Iterable of document strings.
            model_name: SentenceTransformer model identifier.

        Raises:
            ValueError: If no documents are supplied.
        """
        self.documents = [preprocess_text(doc) for doc in documents]
        if not self.documents:
            # The index dimension is read from the embeddings, so an empty
            # corpus cannot be indexed. Fail early with a clear message,
            # before the embedding model is loaded.
            raise ValueError("Indexing requires at least one document.")

        self.embedding_model = SentenceTransformer(model_name)

        self.doc_embeddings = np.array(self.embedding_model.encode(self.documents), dtype=np.float32)
        self.index = faiss.IndexFlatL2(self.doc_embeddings.shape[1])
        self.index.add(self.doc_embeddings)

    def retrieve(self, query, k=TOP_K):
        """Return up to k documents closest to the query.

        Args:
            query: Query string.
            k: Maximum number of documents to return.

        Returns:
            A list of document strings in the order returned by the index.
            It holds fewer than k items when the corpus is smaller than k,
            and is empty when k is not positive.
        """
        # Never ask for more neighbours than there are documents.
        k = min(k, len(self.documents))
        if k <= 0:
            return []

        query_embedding = np.array(self.embedding_model.encode([query]), dtype=np.float32)
        _, top_faiss_idx = self.index.search(query_embedding, k)
        # FAISS pads missing results with label -1; indexing the list with it
        # would silently return the last document, so drop negative labels.
        return [self.documents[i] for i in top_faiss_idx[0] if i >= 0]

# Step 4: Language Detection & Translation
def detect_and_translate(query, target_lang="en"):
    """Detect the query's language and translate it when it differs from the target.

    Args:
        query: Query text.
        target_lang: Language code the query should end up in.

    Returns:
        A (text, detected_language) tuple. The text is the original query
        when the detected language already equals target_lang, otherwise
        the translation produced by GoogleTranslator.
    """
    detected = detect(query)
    if detected == target_lang:
        return query, detected

    translator = GoogleTranslator(source=detected, target=target_lang)
    return translator.translate(query), detected

# Step 5: RAG Response Generation (T5)
def generate_response(query, retrieved_docs, language, model_name=MODEL_NAME, max_response_length=200):
    """Generate an answer to the query from the retrieved documents with T5.

    Args:
        query: Question text inserted into the prompt.
        retrieved_docs: List of document strings used as context.
        language: Language code for the final answer. Any value other than
            "en" makes the generated text get translated from English into
            that language.
        model_name: Checkpoint name passed to the T5 tokenizer and model.
        max_response_length: max_length passed to model.generate.

    Returns:
        The generated (and possibly translated) answer, or a fixed error
        message if anything in the process raises.
    """
    # Bound up front so the cleanup in `finally` is safe even when loading fails.
    model = inputs = outputs = None
    try:
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # The context is capped by character count (not tokens).
        max_context_length = 512
        context = " ".join(retrieved_docs)[:max_context_length]
        prompt = f"Answer the following question based on the context: Question: {query} Context: {context}"

        # truncation=True caps the encoded prompt at the tokenizer's maximum
        # input length. The character cutoff above does not bound the token
        # count, for example when the query itself is long.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
        outputs = model.generate(
            **inputs,
            max_length=max_response_length,
            num_beams=4,
            early_stopping=True,
            num_return_sequences=1
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if language != "en":
            response = GoogleTranslator(source="en", target=language).translate(response)

        return response

    except Exception as e:
        print(f"Error generating response: {e}")
        return "An error occurred while generating the response."

    finally:
        # Runs on success and on failure: drop the tensor/model references
        # first so that emptying the CUDA cache can release their memory.
        del model, inputs, outputs
        torch.cuda.empty_cache()

### Running the pipeline: choose a sample **QUERY** in the cell below

In [None]:
# Sample queries. Keep exactly one QUERY assignment active. Each one picks the
# wording that matches LANGUAGE and falls back to English for unknown codes.

# Cultural Query 1: World Festivals
QUERY = {
    "hi": "दुनिया में सबसे लोकप्रिय त्योहार कौन सा है?",
    "en": "What is the most popular festival in the world?",
    "bn": "পৃথিবীতে সবচেয়ে জনপ্রিয় উৎসব কোনটি?",
    "mr": "जगातील सर्वात लोकप्रिय सण कोणता आहे?",
    "ta": "உலகில் மிகவும் பிரபலமான பண்டிகை எது?",
    "te": "ప్రపంచంలో అత్యంత ప్రజాదరణ పొందిన పండుగ ఏది?",
}.get(LANGUAGE, "What is the most popular festival in the world?")

# Cultural Query 2: World Music Genres
# QUERY = {
#     "hi": "दुनिया में सबसे लोकप्रिय संगीत शैली कौन सी है?",
#     "en": "What is the most popular music genre in the world?",
#     "bn": "পৃথিবীতে সবচেয়ে জনপ্রিয় সঙ্গীত ধারা কোনটি?",
#     "mr": "जगातील सर्वात लोकप्रिय संगीत प्रकार कोणता आहे?",
#     "ta": "உலகில் மிகவும் பிரபலமான இசை வகை எது?",
#     "te": "ప్రపంచంలో అత్యంత ప్రజాదరణ పొందిన సంగీత శైలి ఏది?",
# }.get(LANGUAGE, "What is the most popular music genre in the world?")

# Cultural Query 3: World Dance Forms
# QUERY = {
#     "hi": "दुनिया में सबसे लोकप्रिय नृत्य रूप कौन सा है?",
#     "en": "What is the most popular dance form in the world?",
#     "bn": "পৃথিবীতে সবচেয়ে জনপ্রিয় নৃত্যশৈলী কোনটি?",
#     "mr": "जगातील सर्वात लोकप्रिय नृत्य प्रकार कोणता आहे?",
#     "ta": "உலகில் மிகவும் பிரபலமான நடன வடிவம் எது?",
#     "te": "ప్రపంచంలో అత్యంత ప్రజాదరణ పొందిన నృత్య రూపం ఏది?",
# }.get(LANGUAGE, "What is the most popular dance form in the world?")

In [None]:
# Step 6: Running the Pipeline
# Scrape -> index -> detect/translate the query -> retrieve -> generate.
if __name__ == "__main__":
    print("Scraping Wikipedia...")
    documents = scrape_wikipedia()

    if not documents:
        # Do not call exit() here: inside a notebook it shuts down the kernel
        # instead of just ending this cell. Skipping the remaining steps is
        # enough.
        print("No documents found. Exiting...")
    else:
        print("Indexing Documents...")
        indexer = Indexing(documents)

        query = QUERY
        # Retrieval and generation work on the English form of the query.
        # The detected language is kept so the answer can be translated back.
        translated_query, original_lang = detect_and_translate(query)

        print(f"Query Detected Language: {original_lang}")
        print(f"Translated Query: {translated_query}")

        retrieved_docs = indexer.retrieve(translated_query, k=TOP_K)

        print("Generating Response...")
        response = generate_response(translated_query, retrieved_docs, original_lang)

        print("\nFinal Response:", response)