In [1]:
# Block 1: Install dependencies and imports
!pip install datasets
!pip install openpyxl
!pip install -q -U google-genai
!pip install google-generativeai

from datasets import load_dataset
import random
import google.generativeai as genai
import time
import pandas as pd

from datasets import load_dataset
from langdetect import detect



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Stream the training split rather than downloading it in full.
dataset = load_dataset(
    "allenai/tulu-3-sft-mixture",
    split="train",
    streaming=True,
)

from langdetect import detect_langs

def is_english(messages, min_prob=0.90):
    """Return True when a conversation's combined text is detected as English.

    Args:
        messages: Iterable of message dicts. The ``"content"`` value of every
            message is concatenated (space separated) and classified as a whole.
        min_prob: Minimum probability that ``detect_langs`` must report for
            ``"en"`` for the conversation to count as English.

    Returns:
        bool: True if English is reported with probability >= ``min_prob``.
        False for empty text, for other languages, or when detection fails.
    """
    # A message may lack "content" or carry None / a non-string value; treat
    # those as empty text so one malformed record cannot abort the whole pass.
    contents = (m.get("content", "") for m in messages)
    text = " ".join(c if isinstance(c, str) else "" for c in contents).strip()
    if not text:
        return False
    try:
        # detect_langs returns a list like [en:0.99, es:0.01]
        # NOTE(review): langdetect results can vary between runs unless its
        # detector seed is fixed - confirm whether reproducibility matters here.
        return any(
            lang.lang == "en" and lang.prob >= min_prob
            for lang in detect_langs(text)
        )
    except Exception:
        # Best-effort filter: a detection failure means "not English".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long streaming run can still be stopped.
        return False

# Filter lazily. IterableDataset.filter keeps the stream lazy but, unlike a
# one-shot generator expression, can be iterated again from the start, so any
# cell that consumes `ds` stays re-runnable.
ds = dataset.filter(lambda example: is_english(example["messages"]))

# Nothing has been evaluated yet: the filter runs as `ds` is iterated.
print("English filtering applied successfully")

English filtering applied successfully


In [3]:
# Block 3: Extract questions and answers from Tulu dataset
def extract_qa_from_tulu(item):
    """Pull one (question, answer) pair out of a dataset record.

    For records with a 'messages' list, the question is the last user message
    seen before the first assistant message, and the answer is that first
    assistant message (either may be an empty string). Records without
    'messages' fall back to 'prompt'/'completion', then 'input'/'output'.
    Returns (None, None) for unrecognised records or when extraction raises.
    """
    try:
        if 'messages' not in item:
            # Alternative flat layouts, tried in order of preference.
            for q_key, a_key in (('prompt', 'completion'), ('input', 'output')):
                if q_key in item and a_key in item:
                    return item[q_key], item[a_key]
            return None, None

        user_text, assistant_text = "", ""
        for turn in item['messages']:
            role = turn.get('role')
            if role == 'user':
                user_text = turn.get('content', '')
            elif role == 'assistant':
                assistant_text = turn.get('content', '')
                break  # only the first assistant reply is used
        return user_text, assistant_text

    except Exception as e:
        print(f"Error extracting Q&A: {e}")
        return None, None

In [4]:
def get_diverse_tulu_examples(ds, n_samples=5, total_samples=5000, max_per_bucket=100):
    """Pick up to ``n_samples`` (question, answer) pairs of varied length.

    The first ``total_samples`` records of ``ds`` are scanned. Each usable pair
    is assigned to a bucket by combined character length (short < 300,
    medium < 1000, long otherwise). Every bucket keeps a uniform random sample
    of at most ``max_per_bucket`` pairs. The result takes roughly
    ``n_samples // 3`` pairs from each non-empty bucket and tops up from the
    remaining pairs.

    Args:
        ds: Iterable of dataset records accepted by ``extract_qa_from_tulu``.
        n_samples: Maximum number of pairs to return.
        total_samples: Maximum number of records to read from ``ds``.
        max_per_bucket: Maximum number of pairs retained per length bucket.

    Returns:
        list[tuple]: At most ``n_samples`` (question, answer) tuples; fewer
        when the scanned records do not contain enough distinct pairs.
    """
    buckets = {
        'short': [],   # 0-300 chars
        'medium': [],  # 300-1000 chars
        'long': [],    # 1000+ chars
    }
    # Number of pairs offered to each bucket so far (needed for reservoir sampling).
    seen = dict.fromkeys(buckets, 0)

    # zip() asks range() first, so no extra record is pulled from (and lost
    # from) the stream once the scan limit is reached.
    for _, item in zip(range(total_samples), ds):
        question, answer = extract_qa_from_tulu(item)
        if not (question and answer):
            continue

        content_length = len(question) + len(answer)
        if content_length < 300:
            bucket_name = 'short'
        elif content_length < 1000:
            bucket_name = 'medium'
        else:
            bucket_name = 'long'

        bucket = buckets[bucket_name]
        seen[bucket_name] += 1
        if len(bucket) < max_per_bucket:
            bucket.append((question, answer))
        else:
            # Reservoir sampling (Algorithm R): the i-th pair replaces a random
            # slot with probability max_per_bucket / i, so every pair seen so
            # far is equally likely to be retained.
            slot = random.randrange(seen[bucket_name])
            if slot < max_per_bucket:
                bucket[slot] = (question, answer)

    # Take an even share from every non-empty bucket for diversity.
    per_bucket = max(1, n_samples // 3)
    diverse_examples = []
    for bucket_name in ('short', 'medium', 'long'):
        bucket = buckets[bucket_name]
        if bucket:
            diverse_examples.extend(random.sample(bucket, min(len(bucket), per_bucket)))

    # Top up round-robin across buckets. Stop as soon as a full pass adds
    # nothing, otherwise a small or empty stream would loop forever.
    while len(diverse_examples) < n_samples:
        added = False
        for bucket in buckets.values():
            if len(diverse_examples) >= n_samples:
                break
            remaining = [ex for ex in bucket if ex not in diverse_examples]
            if remaining:
                diverse_examples.append(random.choice(remaining))
                added = True
        if not added:
            break

    return diverse_examples[:n_samples]

In [5]:
# Block 5: sample length-diverse pairs, report their sizes, then split them
# into parallel question / answer lists for few-shot prompting.
examples = get_diverse_tulu_examples(ds, n_samples=5, total_samples=10000)

print(f"Selected {len(examples)} diverse examples:")
for i, (q, a) in enumerate(examples):
    print(f"Example {i + 1}: Q length = {len(q)} chars, A length = {len(a)} chars")

questions = [pair[0] for pair in examples]
answers = [pair[1] for pair in examples]

# Setup Gemini API
# SECURITY: the key is read from the environment (or prompted for) instead of
# being stored in the notebook. Any key that was ever saved in a notebook or
# committed to version control should be treated as leaked and rotated.
import os
from getpass import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-pro')

Selected 5 diverse examples:
Example 1: Q length = 199 chars, A length = 69 chars
Example 2: Q length = 165 chars, A length = 276 chars
Example 3: Q length = 1237 chars, A length = 12 chars
Example 4: Q length = 195 chars, A length = 102 chars
Example 5: Q length = 157 chars, A length = 173 chars


In [6]:
# Block 6: Enhanced Translation functions
def translate_to_hebrew(text, text_type="text"):
    """Ask the Gemini model for a Hebrew translation of ``text``.

    ``text_type`` selects the instruction set: "question" and "answer" use
    prompts aimed at technical content, any other value uses a generic prompt.
    A few known label prefixes are stripped from the model's reply. If anything
    fails, the error is printed and ``text`` is returned unchanged.
    """
    question_template = """You are an expert Hebrew translator specializing in technical and academic content. Translate the following question from English to Hebrew with these specific requirements:

1. Preserve ALL technical terms, code snippets, mathematical expressions, formulas, and programming syntax exactly as they appear
2. Use natural, fluent Hebrew for explanatory text while maintaining the original meaning and tone
3. Keep proper nouns, brand names, and specific terminology in their original form
4. Maintain the question structure and formatting
5. Use appropriate Hebrew question words and syntax
6. Ensure the translation sounds natural to native Hebrew speakers
7. If there are any ambiguities, choose the most contextually appropriate translation

Original Question:
{text}

Hebrew Translation:"""

    answer_template = """You are an expert Hebrew translator specializing in technical and academic content. Translate the following answer from English to Hebrew with these specific requirements:

1. Preserve ALL code blocks, mathematical expressions, formulas, technical syntax, and programming elements exactly as written
2. Translate explanatory text into clear, natural Hebrew while maintaining technical accuracy
3. Keep function names, variable names, library names, and technical terminology unchanged
4. Maintain the logical flow and structure of the explanation
5. Use appropriate Hebrew technical vocabulary where it exists
6. Ensure code comments are translated if they contain natural language
7. Keep bullet points, numbering, and formatting intact
8. Make the Hebrew text sound professional and clear for technical audiences

Original Answer:
{text}

Hebrew Translation:"""

    generic_template = """You are an expert Hebrew translator. Translate the following text from English to Hebrew with these requirements:

1. Maintain the original meaning and tone
2. Use natural, fluent Hebrew appropriate for the context
3. Preserve any technical terms, proper nouns, or specialized vocabulary
4. Keep formatting and structure intact
5. Ensure the translation reads naturally to Hebrew speakers

Original Text:
{text}

Hebrew Translation:"""

    # Labels the model sometimes echoes before the actual translation.
    echoed_labels = (
        "Hebrew Translation:",
        "תרגום לעברית:",
        "התרגום לעברית:",
        "בעברית:",
    )

    try:
        specialised = (("question", question_template), ("answer", answer_template))
        template = next(
            (tpl for kind, tpl in specialised if text_type == kind),
            generic_template,
        )
        prompt = template.format(text=text)

        response = model.generate_content(prompt)
        time.sleep(0.5)  # Rate limiting

        cleaned = response.text.strip()
        for label in echoed_labels:
            if cleaned.startswith(label):
                cleaned = cleaned[len(label):].strip()

        return cleaned

    except Exception as e:
        print(f"Translation error: {e}")
        return text

def validate_hebrew_translation(original_text, translated_text):
    """Cheap sanity check on a translation.

    Returns False when the translation is empty/None or identical to the
    original; otherwise returns True only if at least one character lies in
    the Hebrew Unicode block (U+0590..U+05FF).
    """
    unchanged = not translated_text or translated_text == original_text
    if unchanged:
        return False

    for char in translated_text:
        if '\u0590' <= char <= '\u05FF':
            return True
    return False

In [7]:
def create_hebrew_answer_prompt(english_questions, english_answers, new_english_question):
    """Assemble the few-shot prompt that asks for a Hebrew answer.

    The prompt consists of fixed instructions, one numbered example per entry
    of ``english_questions`` (paired by position with ``english_answers``, with
    a placeholder where the Hebrew answer would go), and finally the new
    question to be answered in Hebrew. Returns the prompt as a single string.
    """
    sections = ["""You are an expert assistant who provides clear, precise, and comprehensive answers in Hebrew. Given questions and their answers in English, you create high-quality Hebrew answers that are understandable and well-structured.

Your Hebrew answers should be:
- Clear and easy to understand
- Comprehensive but concise
- Accurate and precise
- Use proper Hebrew terminology
- Keep mathematical expressions, code, and technical terms in their original form
- Well-structured and organized

Here are examples:

"""]

    # Few-shot examples, numbered from 1; answers are looked up by position.
    for number, example_question in enumerate(english_questions, start=1):
        example_answer = english_answers[number - 1]
        sections.append(f"""Example {number}:
English Question: {example_question}

English Answer: {example_answer}

Hebrew Answer:
[This would be filled with a clear, comprehensive Hebrew answer]

---

""")

    sections.append(f"""Now provide a clear, comprehensive Hebrew answer for this question:

English Question: {new_english_question}

Hebrew Answer:""")

    return "".join(sections)

In [8]:
def generate_tulu_hebrew_dataset(english_questions, english_answers, num_questions=10, max_scanned=10000):
    """Generate Hebrew answers for questions streamed from the Tulu dataset.

    A fresh, English-filtered stream of the dataset is opened. For each record
    whose question and answer are both longer than 10 characters (after
    stripping), the model is asked for a Hebrew answer using the supplied
    English pairs as few-shot examples.

    Args:
        english_questions: Few-shot example questions.
        english_answers: Few-shot example answers, aligned with the questions.
        num_questions: Number of successfully generated rows to collect.
        max_scanned: Safety limit on how many filtered records are examined,
            whether or not they produce a row.

    Returns:
        pandas.DataFrame: One row per generated answer with the columns
        question_english, answer_english, question_hebrew,
        answer_hebrew_generated and answer_hebrew_translated. The two
        "question_hebrew" / "answer_hebrew_translated" columns are left empty
        for a later translation step. The columns exist even when no row was
        produced.
    """
    columns = [
        'question_english',
        'answer_english',
        'question_hebrew',
        'answer_hebrew_generated',
        'answer_hebrew_translated',
    ]

    # Load fresh dataset stream
    ds_fresh = load_dataset("allenai/tulu-3-sft-mixture", split="train", streaming=True)
    ds_fresh = (example for example in ds_fresh if is_english(example["messages"]))
    results = []
    scanned = 0
    processed = 0

    print(f"Processing {num_questions} questions from Tulu dataset...")

    if num_questions <= 0 or max_scanned <= 0:
        return pd.DataFrame(results, columns=columns)

    for item in ds_fresh:
        scanned += 1
        question, answer = extract_qa_from_tulu(item)

        if question and answer and len(question.strip()) > 10 and len(answer.strip()) > 10:
            try:
                print(f"Processing question {processed+1}/{num_questions}")
                print(f"Question preview: {question[:100]}...")

                # Generate Hebrew answer using few-shot learning
                answer_prompt = create_hebrew_answer_prompt(english_questions, english_answers, question)
                response = model.generate_content(answer_prompt)
                hebrew_answer_generated = response.text

                results.append({
                    'question_english': question,
                    'answer_english': answer,
                    'question_hebrew': '',  # Will fill later
                    'answer_hebrew_generated': hebrew_answer_generated,
                    'answer_hebrew_translated': ''     # Will fill later
                })
                print(f"✅ Generated Hebrew answer for question {processed+1}")

                processed += 1
                time.sleep(1)  # Rate limiting

            except Exception as e:
                # Deliberately no `continue`: a failed record must still reach
                # the limit check below, otherwise a persistently failing API
                # would make the loop scan the entire stream.
                print(f"❌ Error processing question {processed+1}: {e}")

        # Checked at the end of the iteration so that no further record is
        # pulled through the language filter once we are done.
        if processed >= num_questions or scanned >= max_scanned:
            break

    return pd.DataFrame(results, columns=columns)

In [9]:
# Generate Hebrew answers for 10 streamed questions, using the sampled
# question/answer pairs as few-shot examples.
hebrew_dataset = generate_tulu_hebrew_dataset(questions, answers, num_questions=10)

Processing 10 questions from Tulu dataset...
Processing question 1/10
Question preview: Create a snippet of Terraform HCL code that create an AWS autoscaling group, and an ALB in front to ...
✅ Generated Hebrew answer for question 1
Processing question 2/10
Question preview: Which languages are spoken in Costa Rica?...
✅ Generated Hebrew answer for question 2
Processing question 3/10
Question preview: Can u summarize me story from the book Harry Potter and the Philosopher's Stone?...
✅ Generated Hebrew answer for question 3
Processing question 4/10
Question preview: Did Jesus realy exist...
✅ Generated Hebrew answer for question 4
Processing question 5/10
Question preview: ¿Puedes darme un ejemplo del patrón de diseño Factory en el lenguaje de programación Java?...
✅ Generated Hebrew answer for question 5
Processing question 6/10
Question preview: Write five lines of iambic pentameter about a subject of your choosing. Do not use any trochaic subs...
✅ Generated Hebrew answer for questi

In [10]:
# Pass 1: translate every English question, in row order.
print("Translating questions to Hebrew...")
hebrew_questions = []
for i, question in enumerate(hebrew_dataset['question_english']):
    print(f"Translating question {i+1}/{len(hebrew_dataset)}")
    question_he = translate_to_hebrew(question, text_type="question")
    hebrew_questions.append(question_he)
    time.sleep(0.5)  # pause between requests

# Pass 2: translate the original English answers the same way.
print("Translating answers to Hebrew...")
hebrew_answers_translated = []
for i, answer in enumerate(hebrew_dataset['answer_english']):
    print(f"Translating answer {i+1}/{len(hebrew_dataset)}")
    answer_he = translate_to_hebrew(answer, text_type="answer")
    hebrew_answers_translated.append(answer_he)
    time.sleep(0.5)  # pause between requests

Translating questions to Hebrew...
Translating question 1/10
Translating question 2/10
Translating question 3/10
Translating question 4/10
Translating question 5/10
Translating question 6/10
Translating question 7/10
Translating question 8/10
Translating question 9/10
Translating question 10/10
Translating answers to Hebrew...
Translating answer 1/10
Translating answer 2/10
Translating answer 3/10
Translating answer 4/10
Translating answer 5/10
Translating answer 6/10
Translating answer 7/10
Translating answer 8/10
Translating answer 9/10
Translating answer 10/10


In [11]:
import os
from pathlib import Path

# Attach the translated columns; both lists are in the same row order as the frame.
hebrew_dataset['question_hebrew'] = hebrew_questions
hebrew_dataset['answer_hebrew_translated'] = hebrew_answers_translated

final_dataset = hebrew_dataset[[
    'question_english',
    'answer_english',
    'question_hebrew',
    'answer_hebrew_generated',
    'answer_hebrew_translated'
]]

print("Final dataset structure:")
print(final_dataset.head())

# Save to CSV
# The destination is no longer tied to one user's machine: set HEBREW_TULU_CSV
# to choose a location, otherwise the file goes to ~/Downloads of whoever runs
# the notebook (created if missing).
output_csv = Path(os.environ.get(
    "HEBREW_TULU_CSV",
    Path.home() / "Downloads" / "hebrew_tulu_dataset.csv",
))
output_csv.parent.mkdir(parents=True, exist_ok=True)
hebrew_dataset.to_csv(output_csv)

Final dataset structure:
                                    question_english  \
0  Create a snippet of Terraform HCL code that cr...   
1          Which languages are spoken in Costa Rica?   
2  Can u summarize me story from the book Harry P...   
3                              Did Jesus realy exist   
4  ¿Puedes darme un ejemplo del patrón de diseño ...   

                                      answer_english  \
0  Sure, here's an example Terraform HCL code tha...   
1  The primary language spoken in Costa Rica is S...   
2  Harry Potter, an orphan, discovers he is a wiz...   
3  As an AI language model, I cannot provide a de...   
4  interface Shape {\n   void draw();\n}\n\nclass...   

                                     question_hebrew  \
0  צור קטע קוד Terraform HCL המקים קבוצת `autosca...   
1                     אילו שפות מדוברות בקוסטה ריקה?   
2  האם תוכל/י לסכם לי את הסיפור מהספר "הארי פוטר ...   
3                               האם ישו באמת התקיים?   
4  תוכל לתת לי דוגמה 

In [12]:
# Display the assembled table: English source columns plus the generated and
# translated Hebrew columns.
hebrew_dataset

Unnamed: 0,question_english,answer_english,question_hebrew,answer_hebrew_generated,answer_hebrew_translated
0,Create a snippet of Terraform HCL code that cr...,"Sure, here's an example Terraform HCL code tha...",צור קטע קוד Terraform HCL המקים קבוצת `autosca...,"בטח, הנה דוגמת קוד `Terraform` (HCL) ליצירת תש...","כמובן, להלן דוגמת קוד Terraform HCL היוצרת AWS..."
1,Which languages are spoken in Costa Rica?,The primary language spoken in Costa Rica is S...,אילו שפות מדוברות בקוסטה ריקה?,"השפה הרשמית של קוסטה ריקה היא ספרדית, והיא השפ...","השפה העיקרית המדוברת בקוסטה ריקה היא ספרדית, ע..."
2,Can u summarize me story from the book Harry P...,"Harry Potter, an orphan, discovers he is a wiz...","האם תוכל/י לסכם לי את הסיפור מהספר ""הארי פוטר ...","בטח, הנה סיכום עלילת הספר ""הארי פוטר ואבן החכמ...","הארי פוטר, יתום, מגלה שהוא קוסם ומתחיל את לימו..."
3,Did Jesus realy exist,"As an AI language model, I cannot provide a de...",האם ישו באמת התקיים?,שאלה זו היא אחת מהשאלות המרכזיות בהיסטוריה של ...,"כמודל שפה של בינה מלאכותית, אין ביכולתי לספק ת..."
4,¿Puedes darme un ejemplo del patrón de diseño ...,interface Shape {\n void draw();\n}\n\nclass...,תוכל לתת לי דוגמה לתבנית העיצוב Factory בשפת ה...,"כמובן, הנה דוגמה מפורטת של תבנית העיצוב Factor...",```java\ninterface Shape {\n void draw();\n}...
5,Write five lines of iambic pentameter about a ...,The big brown cat jumped down the tallest tree...,כתוב חמש שורות ב**פנטמטר יאמבי** (iambic penta...,"להלן חמש שורות שיר במשקל פנטמטר יאמבי, ללא חרי...",החתול החום הגדול קפץ למטה מהעץ הגבוה ביותר\nכי...
6,"You will create a table with macronutrients, m...","Here is a table showing the macronutrients, mi...",עליך ליצור טבלה המפרטת את ערכי ה-macronutrient...,"בטח, הנה טבלה מפורטת עם הערכים התזונתיים עבור ...","להלן טבלה המציגה את המאקרו-נוטריינטים, המיקרו-..."
7,Write me POSIX-shellscript to convert Celsius ...,Sure! Here is a POSIX shell script to convert ...,כתוב לי `POSIX-shellscript` שממיר `Celsius` ל-...,"בטח, הנה סקריפט shell בתקן POSIX להמרת טמפרטור...","כמובן! להלן תרגום של התשובה לעברית, בהתאם לדרי..."
8,How do I compile a Java application to native ...,To compile a Java application to native with G...,איך אני מקמפל אפליקציית Java ל-native באמצעות ...,כדי להדר (לקמפל) יישום Java לקוד נייטיב (nativ...,כדי לקמפל יישום Java לקובץ הרצה נייטיבי (nativ...
9,"In JavaScript, why is the value of ""this"" keyw...","In JavaScript, the value of the this keyword i...","ב-JavaScript, מדוע ערכו של `this` בתוך פונקציי...","ב-JavaScript, ערך מילת המפתח `this` בתוך פונקצ...","ב-JavaScript, הערך של מילת המפתח `this` בתוך פ..."


In [13]:
# Write the dataset to CSV. The destination is configurable through the
# HEBREW_TULU_CSV environment variable and defaults to ~/Downloads of the
# current user, instead of a hard-coded path on one specific machine.
import os
from pathlib import Path

output_csv = Path(os.environ.get(
    "HEBREW_TULU_CSV",
    Path.home() / "Downloads" / "hebrew_tulu_dataset.csv",
))
output_csv.parent.mkdir(parents=True, exist_ok=True)
hebrew_dataset.to_csv(output_csv)