In [9]:
!pip install -q sentence-transformers faiss-cpu gradio datasets transformers accelerate
import os, torch, numpy as np, faiss
from datetime import datetime
gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")
if gpu_available:
print("GPU available but using FAISS-CPU (still very fast for semantic search)")
else:
print("Using FAISS-CPU (optimal for most RAG applications)")
PROJECT = "multilingual_semantic_rag"
ASSETS = os.path.join(PROJECT, "assets")
os.makedirs(ASSETS, exist_ok=True)
LANGS = ["en", "hi", "fr", "es"]
print(" Setup complete", datetime.now().isoformat())
!python -V

GPU Available: True
GPU available but using FAISS-CPU (still very fast for semantic search)
 Setup complete 2025-08-11T13:48:29.754566
Python 3.11.13


In [10]:
from google.colab import drive
drive.mount('/content/drive')
BASE = "/content/drive/MyDrive/semantic_rag"
os.makedirs(BASE, exist_ok=True)
print("Saving project assets to:", BASE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saving project assets to: /content/drive/MyDrive/semantic_rag


In [11]:
!pip -q install wikipedia
import os, json
from collections import Counter
import wikipedia
from datasets import Dataset
import pandas as pd
if "LANGS" not in globals():
LANGS = ["en", "hi", "fr", "es"]
if "ASSETS" not in globals():
ASSETS = "multilingual_semantic_rag/assets"
os.makedirs(ASSETS, exist_ok=True)
TOPICS = [
"Biodiversity",
"Climate change",
"Renewable energy",
"Sustainable agriculture",
"Water scarcity",
"Public health",
"Artificial intelligence",
"Soil erosion",
"Food security",
"Waste management",
]
def get_summary_for_topic(lang: str, topic: str, sentences: int = 3):
"""
Tries to retrieve a short summary paragraph for `topic` in the given `lang`.
1) Attempt the exact title (with autosuggest/redirects).
2) If that fails (title differs across languages), search and try the top few results.
Returns: (summary_text, resolved_title) or (None, None) if nothing found.
"""
wikipedia.set_lang(lang)
try:
text = wikipedia.summary(topic, sentences=sentences, auto_suggest=True, redirect=True)
return text, topic
except Exception:
try:
results = wikipedia.search(topic)
for title in results[:3]:
try:
text = wikipedia.summary(title, sentences=sentences, auto_suggest=False, redirect=True)
return text, title
except Exception:
continue
except Exception:
pass
return None, None
docs = []
doc_id = 1
for lang in LANGS:
for topic in TOPICS:
text, resolved_title = get_summary_for_topic(lang, topic, sentences=3)
if not text:
continue
text = " ".join(text.split())
if len(text) < 120:
continue
docs.append({
"id": doc_id,
"lang": lang,
"title": resolved_title,
"text": text
})
doc_id += 1
print(f"Collected {len(docs)} passages across {len(LANGS)} languages and {len(TOPICS)} topics.")
seen = set()
unique_docs = []
for d in docs:
key = (d["lang"], d["text"])
if key in seen:
continue
seen.add(key)
unique_docs.append(d)
docs = unique_docs
print(f"After de-duplication: {len(docs)} passages.")
dataset = Dataset.from_list(docs)
counts_by_lang = Counter(dataset["lang"])
print("Counts per language:", dict(counts_by_lang))
display(pd.DataFrame(docs).head(8))
DATA_DIR = os.path.join(ASSETS, "wiki_dataset")
os.makedirs(DATA_DIR, exist_ok=True)
dataset.save_to_disk(DATA_DIR)
jsonl_path = os.path.join(ASSETS, "wiki_docs.jsonl")
with open(jsonl_path, "w", encoding="utf-8") as f:
for row in docs:
f.write(json.dumps(row, ensure_ascii=False) + "\n")
print("Saved HF dataset to:", DATA_DIR)
print("Saved JSONL to:", jsonl_path)

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone




  lis = BeautifulSoup(html).find_all('li')


Collected 40 passages across 4 languages and 10 topics.
After de-duplication: 40 passages.
Counts per language: {'en': 10, 'hi': 10, 'fr': 10, 'es': 10}


Unnamed: 0,id,lang,title,text
0,1,en,Biodiversity,Biodiversity refers to the variety and variabi...
1,2,en,Climate change,Present-day climate change includes both globa...
2,3,en,Renewable energy,Renewable energy (also called green energy) is...
3,4,en,Sustainable agriculture,Sustainable agriculture is farming in sustaina...
4,5,en,Water scarcity,Water scarcity (closely related to water stres...
5,6,en,Public health,"Public health is ""the science and art of preve..."
6,7,en,Artificial intelligence,Artificial intelligence (AI) is the capability...
7,8,en,Soil erosion,Soil erosion is the denudation or wearing away...


Saving the dataset (0/1 shards):   0%|          | 0/40 [00:00<?, ? examples/s]

Saved HF dataset to: multilingual_semantic_rag/assets/wiki_dataset
Saved JSONL to: multilingual_semantic_rag/assets/wiki_docs.jsonl


In [13]:
import os, numpy as np, faiss, math, json
from datasets import load_from_disk
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
if "ASSETS" not in globals():
ASSETS = "multilingual_semantic_rag/assets"
DATA_DIR = os.path.join(ASSETS, "wiki_dataset")
assert os.path.isdir(DATA_DIR), "Dataset directory not found. Run Step 2 first."
dataset = load_from_disk(DATA_DIR)
print("Loaded rows:", len(dataset), "| columns:", dataset.column_names)
MODEL_NAME = "intfloat/multilingual-e5-base"
embedder = SentenceTransformer(MODEL_NAME)
def embed_texts(texts, batch_size=64, is_query=False):
"""
Encodes a list of strings into L2-normalized vectors.
E5 expects 'query: ...' for queries and 'passage: ...' for documents.
"""
prefix = "query: " if is_query else "passage: "
inputs = [prefix + t for t in texts]
vecs = embedder.encode(
inputs,
batch_size=batch_size,
normalize_embeddings=True,
convert_to_numpy=True,
show_progress_bar=True
)
return vecs
doc_texts = dataset["text"]
doc_ids   = dataset["id"]
doc_langs = dataset["lang"]
doc_titles= dataset["title"]
doc_vecs = embed_texts(doc_texts, batch_size=64, is_query=False)
dim = doc_vecs.shape[1]
print(f"Embedding dim = {dim}, num docs = {len(doc_vecs)}")
EMB_PATH = os.path.join(ASSETS, "doc_vecs.npy")
np.save(EMB_PATH, doc_vecs)
META_PATH = os.path.join(ASSETS, "metadata.jsonl")
with open(META_PATH, "w", encoding="utf-8") as f:
for i in range(len(dataset)):
row = {
"id": int(doc_ids[i]),
"lang": doc_langs[i],
"title": doc_titles[i],
"text": doc_texts[i],
}
f.write(json.dumps(row, ensure_ascii=False) + "\n")
print("Saved embeddings ->", EMB_PATH)
print("Saved metadata   ->", META_PATH)
index = faiss.IndexFlatIP(dim)
index.add(doc_vecs)
print("FAISS ntotal =", index.ntotal)
INDEX_PATH = os.path.join(ASSETS, "faiss.index")
faiss.write_index(index, INDEX_PATH)
print("Wrote FAISS index ->", INDEX_PATH)
def search_raw(query_text, k=5):
q_vec = embed_texts([query_text], is_query=True)
D, I = index.search(q_vec, k)
scores = D[0].tolist()
idxs   = [int(x) for x in I[0].tolist()]
hits = []
for rank, (score, idx) in enumerate(zip(scores, idxs), start=1):
hits.append({
"rank":  rank,
"score": float(score),
"id":    int(doc_ids[idx]),
"lang":  doc_langs[idx],
"title": doc_titles[idx],
"text":  (doc_texts[idx][:200] + ("..." if len(doc_texts[idx]) > 200 else ""))
})
return hits
for q in ["biodiversity importance", "खाद्य सुरक्षा क्या है? (What is food security?)"]:
print("\nQuery:", q)
for h in search_raw(q, k=3):
print(f"  [{h['rank']}] score={h['score']:.3f} | {h['lang']} | {h['title']}")
print("      ", h["text"])

Loaded rows: 40 | columns: ['id', 'lang', 'title', 'text']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding dim = 768, num docs = 40
Saved embeddings -> multilingual_semantic_rag/assets/doc_vecs.npy
Saved metadata   -> multilingual_semantic_rag/assets/metadata.jsonl
FAISS ntotal = 40
Wrote FAISS index -> multilingual_semantic_rag/assets/faiss.index

Query: biodiversity importance


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  [1] score=0.857 | en | Biodiversity
       Biodiversity refers to the variety and variability of life on Earth. It can be measured at multiple levels, including genetic variability, species diversity, ecosystem diversity and phylogenetic diver...
  [2] score=0.831 | fr | Biodiversity
       La biodiversité désigne la variété des formes de vie sur la Terre. Ce terme est composé du préfixe bio (du grec βίος / bíos, « vie ») et du mot « diversité ». Elle s'apprécie en considérant la diversi...
  [3] score=0.814 | es | Biodiversity
       Neodiversity. A journal of neotropical biodiversity (abreviado Neodiversity)​ es una revista con ilustraciones y descripciones botánicas que es editada en Brasil desde el año 2006.

Query: खाद्य सुरक्षा क्या है? (What is food security?)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  [1] score=0.909 | hi | Food security
       खाद्य सुरक्षा (food security) से तात्पर्य खाद्य पदार्थों की सुनिश्चित आपूर्ति एवं जनसामान्य के लिये भोज्य पदार्थों की उपलब्धता से है। पूरे इतिहास में खाद्य सुरक्षा सदा से एक चिन्ता का विषय रहा है। सन ...
  [2] score=0.882 | en | Food security
       Food security is the state of having reliable access to a sufficient quantity of affordable, healthy food. The availability of food for people of any class, gender, ethnicity, or religion is another e...
  [3] score=0.813 | hi | Sustainable agriculture
       स्थाई कृषि या टिकाऊ खेती या संधारणीय कृषि (Sustainable agriculture) पादप एवं जानवरों के उत्पादन की समन्वित कृषि प्रणाली है जो पर्यावरणीय सिद्धान्तों को ध्यान में रखकर की जाती है। संधारणीय कृषि दीर्घाव...


In [14]:
from typing import List, Dict
def semantic_search(query: str, k: int = 5) -> List[Dict]:
"""
Run cross-lingual semantic search.
Returns a list of dicts with score + metadata, sorted by score desc.
"""
q_vec = embed_texts([query], is_query=True)
D, I = index.search(q_vec, k)
scores = D[0].tolist()
idxs   = [int(x) for x in I[0].tolist()]
hits = []
for rank, (score, idx) in enumerate(zip(scores, idxs), start=1):
hits.append({
"rank":  rank,
"score": float(score),
"doc_id": int(doc_ids[idx]),
"lang":   doc_langs[idx],
"title":  doc_titles[idx],
"text":   doc_texts[idx],
})
return hits
for q in ["biodiversity importance", "खाद्य सुरक्षा क्या है?"]:
print("\nQuery:", q)
for h in semantic_search(q, k=3):
print(f"[{h['rank']}] {h['score']:.3f} | {h['lang']} | {h['title']}")


Query: biodiversity importance


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1] 0.857 | en | Biodiversity
[2] 0.831 | fr | Biodiversity
[3] 0.814 | es | Biodiversity

Query: खाद्य सुरक्षा क्या है?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1] 0.916 | hi | Food security
[2] 0.868 | en | Food security
[3] 0.807 | fr | Food security


In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from typing import List
TRANS_NAME = "facebook/nllb-200-distilled-600M"
trans_tok = AutoTokenizer.from_pretrained(TRANS_NAME, src_lang="eng_Latn", tgt_lang="eng_Latn")
trans_model = AutoModelForSeq2SeqLM.from_pretrained(TRANS_NAME).to("cuda" if torch.cuda.is_available() else "cpu").eval()
NLLB_CODE = {
"en": "eng_Latn",
"fr": "fra_Latn",
"es": "spa_Latn",
"hi": "hin_Deva",
}
def get_lang_id(lang_code: str) -> int:
return trans_tok.convert_tokens_to_ids(lang_code)
_translation_cache = {}
def translate_once(text: str, src: str, tgt: str, max_new_tokens: int = 256) -> str:
if src == tgt:
return text
key = (text, src, tgt)
if key in _translation_cache:
return _translation_cache[key]
enc = trans_tok(text, return_tensors="pt", truncation=True).to(trans_model.device)
out = trans_model.generate(
**enc,
forced_bos_token_id=get_lang_id(NLLB_CODE[tgt]),
max_new_tokens=max_new_tokens,
num_beams=2,
no_repeat_ngram_size=3
)
translated = trans_tok.batch_decode(out, skip_special_tokens=True)[0]
_translation_cache[key] = translated
return translated
def translate_batch(texts: List[str], srcs: List[str], tgt: str) -> List[str]:
"""
Batched translation where each text may have its own source language.
We loop (since sources differ), but you still benefit from model warm state.
"""
out = []
for t, s in zip(texts, srcs):
out.append(translate_once(t, s, tgt))
return out
hits = semantic_search("renewable energy benefits", k=3)
texts = [h["text"] for h in hits]
srcs  = [h["lang"] for h in hits]
to_hi = translate_batch(texts, srcs, tgt="hi")
for i, (h, t) in enumerate(zip(hits, to_hi), 1):
print(f"\nHit {i}: {h['lang']} → hi | {h['title']}")
print("Translated snippet:", t[:200] + ("..." if len(t) > 200 else ""))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Hit 1: en → hi | Renewable energy
Translated snippet: नवीकरणीय ऊर्जा (ग्रीन एनर्जी भी कहा जाता है) नवीकरनीय प्राकृतिक संसाधनों से बनाई गई ऊर्जा है जो मानव समय के पैमाने पर पुनःपूर्ति की जाती है। सबसे व्यापक रूप से उपयोग की जाने वाली अक्षय ऊर्जा प्रकार सौ...

Hit 2: hi → hi | Renewable energy
Translated snippet: अक्षय उर्जा या नवीकरणीय ऊर्जा (अंग्रेजी:renewable energy) में वे सारी उर्जा शामिल हैं जो प्रदूषणकारक नहीं हैं तथा जिनके स्रोत का क्षय नहीं होता, या जिनके स्रोत का पुनः-भरण होता रहता है। सौर ऊर्जा, पवन...

Hit 3: es → hi | Renewable energy
Translated snippet: एनआरईएल एक अमेरिकी प्रयोगशाला है जो नवीकरणीय ऊर्जा के अनुसंधान और विकास, ऊर्जा दक्षता, ऊर्जा प्रणालियों के एकीकरण और सतत परिवहन में विशेषज्ञता प्राप्त है। एनआरएल एक संघीय निधि के साथ वित्त पोषित अनुसं...


In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
GEN_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
gen_tok   = AutoTokenizer.from_pretrained(GEN_NAME)
dtype     = torch.float16 if torch.cuda.is_available() else torch.float32
gen_model = AutoModelForCausalLM.from_pretrained(
GEN_NAME, torch_dtype=dtype
).to("cuda" if torch.cuda.is_available() else "cpu").eval()
MAX_TOKENS_CONTEXT = 800
MAX_TOKENS_SNIPPET = 120
OUTPUT_BUDGET      = 220
def trim_snippet(snippet, max_tokens=MAX_TOKENS_SNIPPET):
toks = gen_tok.encode(snippet, add_special_tokens=False)
if len(toks) <= max_tokens:
return snippet
return gen_tok.decode(toks[:max_tokens], skip_special_tokens=True)
def build_prompt(docs_translated, query, q_lang):
bullets = "\n".join(f"- {t.strip()}" for t in docs_translated)
return (
f"You are a precise multilingual assistant. Use ONLY the context to answer in {q_lang}.\n"
f"If the context is insufficient, say you don't know.\n\n"
f"Context:\n{bullets}\n\n"
f"Question ({q_lang}): {query}\n\n"
f"Answer ({q_lang}):"
)
@torch.inference_mode()
def generate_answer(prompt, max_new_tokens=OUTPUT_BUDGET):
inputs = gen_tok(prompt, return_tensors="pt", truncation=True, max_length=2048).to(gen_model.device)
output_ids = gen_model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=False,
eos_token_id=gen_tok.eos_token_id,
pad_token_id=gen_tok.eos_token_id
)
text = gen_tok.decode(output_ids[0], skip_special_tokens=True)
return text[text.rfind("Answer"):].split(":", 1)[-1].strip()
def rag_answer(query: str, q_lang: str = "en", k: int = 4):
hits = semantic_search(query, k=k)
processed, total_tokens = [], 0
for h in hits:
txt = translate_once(h["text"], h["lang"], q_lang) if h["lang"] != q_lang else h["text"]
txt = trim_snippet(txt)
tok_count = len(gen_tok.encode(txt, add_special_tokens=False))
if total_tokens + tok_count <= MAX_TOKENS_CONTEXT:
processed.append(txt)
total_tokens += tok_count
else:
break
prompt = build_prompt(processed, query, q_lang)
answer = generate_answer(prompt)
return answer, hits
ans, hits = rag_answer("नवीकरणीय ऊर्जा के क्या लाभ हैं?", q_lang="hi", k=4)
print("🟢 RAG Answer:\n", ans, "\n")
print("🔎 Sources:")
for h in hits:
print(f"- [{h['lang']}] {h['title']}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4088 > 2048). Running this sequence through the model will result in indexing errors


🟢 RAG Answer:
 नवीकरणीय ऊर्जा के लाभ हैं जो प्रदूषणकारक हैं और जिनके स्रोत नहीं हैं। इस ऊर्जा के लिए स्थाई कृषि या टिकाऊ खेती या संधारणीय कृषि प्रणाली है। जलवायु परिवर्तन पर अंतर-सरकारी पैनल (आयपीसीसी; Intergovernmental Panel on Climate Change; IPCC) 

🔎 Sources:
- [hi] Renewable energy
- [en] Renewable energy
- [hi] Sustainable agriculture
- [hi] Climate change


In [21]:
import gradio as gr
def _short(text, n=160):
return (text[:n] + "…") if len(text) > n else text
def app_pipeline(query, answer_lang, k):
answer, hits = rag_answer(query, q_lang=answer_lang, k=int(k))
lines = []
for h in hits:
lines.append(f"[{h['lang']}] {h['title']}  —  score={h['score']:.3f}\n  {_short(h['text'])}")
sources = "\n\n".join(lines) if lines else "No sources found."
return answer, sources
with gr.Blocks(title="Multilingual Semantic Search + RAG") as demo:
gr.Markdown("
with gr.Row():
query      = gr.Textbox(label="Your question", placeholder="e.g., ¿Qué es la biodiversidad?")
answer_lang= gr.Dropdown(choices=["en","hi","fr","es"], value="en", label="Answer language")
k          = gr.Slider(1, 8, value=4, step=1, label="Top-k documents")
run_btn  = gr.Button("Search & Answer 🚀")
answer_o = gr.Textbox(label="Answer", lines=6)
sources_o= gr.Textbox(label="Sources (original language)", lines=12)
run_btn.click(app_pipeline, inputs=[query, answer_lang, k], outputs=[answer_o, sources_o])
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b8d9753d327f7be524.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


