In [1]:
!pip install transformers sentencepiece accelerate tqdm




In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Hugging Face checkpoint used for all generation in this notebook.
MODEL_NAME = "google/flan-t5-base"   # Free, no API key

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the seq2seq weights, placing the model on `device`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

print("✅ FLAN-T5 loaded on", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ FLAN-T5 loaded on cuda


In [3]:
def flan_generate(prompt, max_tokens=200, temperature=0.9, do_sample=True):
    """Generate text for ``prompt`` with the module-level model and tokenizer.

    Args:
        prompt: Input text passed to the tokenizer.
        max_tokens: Upper bound on newly generated tokens
            (forwarded as ``max_new_tokens``).
        temperature: Sampling temperature; only forwarded when ``do_sample``
            is true. Defaults to 0.9, the value previously hard-coded here.
        do_sample: Whether to sample instead of decoding deterministically.
            Defaults to True, matching the previous behaviour.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    generation_kwargs = {"max_new_tokens": max_tokens, "do_sample": do_sample}
    if do_sample:
        # Temperature is a sampling-only setting, so it is left out when
        # sampling is disabled.
        generation_kwargs["temperature"] = temperature

    outputs = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [4]:
import random, json
from tqdm import tqdm

class LocalDataGenerator:
    """Builds a synthetic passage/query retrieval dataset via ``flan_generate``.

    Passages and queries are each assigned a randomly chosen language code and
    topic. Every query is linked to one positive passage that shares its topic;
    the query's language is drawn independently of that passage's language.
    """

    def __init__(self):
        """Set up the language codes, their display names, and the topic list."""
        self.languages = ['en', 'es', 'fr', 'de', 'ja', 'ko', 'zh', 'ar']
        self.language_names = {
            'en': 'English', 'es': 'Spanish', 'fr': 'French', 'de': 'German',
            'ja': 'Japanese', 'ko': 'Korean', 'zh': 'Chinese', 'ar': 'Arabic'
        }
        self.topics = [
            'artificial intelligence', 'climate change', 'healthcare',
            'renewable energy', 'space exploration', 'quantum computing',
            'biodiversity', 'financial technology'
        ]

    def gen_passage(self, lang, topic):
        """Generate one passage about ``topic``, requested in language ``lang``.

        Args:
            lang: Language code; must be a key of ``self.language_names``.
            topic: Topic text inserted into the prompt.

        Returns:
            The generated text (up to 150 new tokens).

        Raises:
            KeyError: If ``lang`` is not a known language code.
        """
        ln = self.language_names[lang]
        prompt = f"""
Write a 120-word educational article paragraph about {topic} in {ln}.
Make it informative, factual, and clear.
"""
        return flan_generate(prompt, max_tokens=150)

    def gen_query(self, lang, topic):
        """Generate one search query about ``topic``, requested in language ``lang``.

        Args:
            lang: Language code; must be a key of ``self.language_names``.
            topic: Topic text inserted into the prompt.

        Returns:
            The generated text (up to 40 new tokens).

        Raises:
            KeyError: If ``lang`` is not a known language code.
        """
        ln = self.language_names[lang]
        prompt = f"""
Write a natural search query in {ln} asking about {topic}.
Make it look like something a real person would search.
"""
        return flan_generate(prompt, max_tokens=40)

    def generate_passages(self, n=400):
        """Generate ``n`` passages with random language and topic.

        Args:
            n: Number of passages to create.

        Returns:
            A list of dicts with keys ``id`` (``passage_<i>``), ``language``,
            ``topic`` and ``text``.
        """
        passages = []
        for i in tqdm(range(n), desc="📝 Generating passages"):
            lang = random.choice(self.languages)
            topic = random.choice(self.topics)
            passages.append({
                "id": f"passage_{i}",
                "language": lang,
                "topic": topic,
                "text": self.gen_passage(lang, topic)
            })
        return passages

    def generate_queries(self, passages, n=100):
        """Generate ``n`` queries, each tied to one positive passage of its topic.

        Topics are drawn only from those entries of ``self.topics`` that have
        at least one passage, so a query can always be given a positive
        example even when ``passages`` does not cover every topic.

        Args:
            passages: Passage dicts carrying at least ``id`` and ``topic``.
            n: Number of queries to create.

        Returns:
            A list of dicts with keys ``id`` (``query_<i>``), ``language``,
            ``topic``, ``text`` and ``positive_passage_ids`` (a one-element
            list holding the chosen passage id).

        Raises:
            ValueError: If ``n`` is positive but no passage matches any topic
                in ``self.topics``.
        """
        # Index passages by topic once instead of rescanning the whole list
        # for every query.
        passages_by_topic = {}
        for passage in passages:
            passages_by_topic.setdefault(passage["topic"], []).append(passage)

        # Keep the order of self.topics so that, when every topic is covered,
        # the random draws match a plain choice over self.topics.
        eligible_topics = [t for t in self.topics if t in passages_by_topic]
        if n > 0 and not eligible_topics:
            raise ValueError(
                "Cannot generate queries: no passages match any known topic."
            )

        queries = []
        for i in tqdm(range(n), desc="🔍 Generating queries"):
            lang = random.choice(self.languages)
            topic = random.choice(eligible_topics)

            pos = random.choice(passages_by_topic[topic])

            queries.append({
                "id": f"query_{i}",
                "language": lang,
                "topic": topic,
                "text": self.gen_query(lang, topic),
                "positive_passage_ids": [pos["id"]]
            })
        return queries

    def save(self, passages, queries, prefix="flant5"):
        """Write passages and queries to ``<prefix>_passages.json`` / ``<prefix>_queries.json``.

        Files are written as UTF-8 with ``ensure_ascii=False`` so non-ASCII
        text is stored as-is rather than as escape sequences.

        Args:
            passages: JSON-serialisable passage records.
            queries: JSON-serialisable query records.
            prefix: Filename prefix for both output files.
        """
        with open(f"{prefix}_passages.json", "w", encoding="utf-8") as f:
            json.dump(passages, f, indent=2, ensure_ascii=False)

        with open(f"{prefix}_queries.json", "w", encoding="utf-8") as f:
            json.dump(queries, f, indent=2, ensure_ascii=False)

        print("💾 Dataset saved successfully!")


In [5]:
# Seed the RNGs so a fresh "Restart & Run All" can regenerate the same dataset:
# `random` drives the language/topic/positive-passage choices, and the torch
# seed governs sampling during generation (best effort; exact GPU
# reproducibility also depends on hardware and library versions).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Dataset size; lower these for a quick smoke run.
N_PASSAGES = 400
N_QUERIES = 100

generator = LocalDataGenerator()

passages = generator.generate_passages(n=N_PASSAGES)
queries   = generator.generate_queries(passages, n=N_QUERIES)

generator.save(passages, queries)


📝 Generating passages: 100%|██████████| 400/400 [09:35<00:00,  1.44s/it]
🔍 Generating queries: 100%|██████████| 100/100 [00:37<00:00,  2.63it/s]

💾 Dataset saved successfully!





In [6]:
# Show the first generated record of each kind as a quick sanity check.
sample_passage = passages[0]
print("Sample Passage:\n", sample_passage)
sample_query = queries[0]
print("\nSample Query:\n", sample_query)


Sample Passage:
 {'id': 'passage_0', 'language': 'en', 'topic': 'quantum computing', 'text': "The first word of this article is: 1. How do you use to understand how a person perceives quantum information? 2. By definition, it is the ability to comprehend and interpret the information in a manner that is at least in the least oblique of the mind or body. 3. How do you get a human being to interact with quantum information? 4. How do you understand the power of quantum information? 5. What have you learned that is so powerful and ingenious that we do not think they're intelligent at all? 6. What are the practicalities of quantum computation and how do they affect society? 7. How does the human consciousness use quantum information? 8. Which of these can be the most useful methods of understanding quantum information"}

Sample Query:
 {'id': 'query_0', 'language': 'ko', 'topic': 'climate change', 'text': 'climate change is an environmental issue', 'positive_passage_ids': ['passage_379']}
