In [30]:
# Block 1: Install dependencies and imports
!pip install datasets
!pip install openpyxl
!pip install -q -U google-genai
!pip install google-generativeai

from datasets import load_dataset
import random
import google.generativeai as genai
import time
import pandas as pd

from datasets import load_dataset
from langdetect import detect






In [40]:
# Open the training split in streaming mode so examples are fetched lazily
# instead of downloading the whole dataset up front.
dataset = load_dataset(
    "allenai/tulu-3-sft-mixture",
    split="train",
    streaming=True,
)

# Probability-returning detector used by is_english below.
from langdetect import detect_langs

def is_english(messages, min_prob=0.90):
    """Return True when the concatenated conversation is detected as English.

    Args:
        messages: iterable of dict-like messages; the "content" value of each
            is joined (space separated) into the text passed to the detector.
        min_prob: minimum probability the detector must assign to "en".

    Returns:
        True if ``detect_langs`` reports "en" with probability >= ``min_prob``;
        False for empty text, for any other detection result, or when
        detection fails.
    """
    # `or ""` treats a missing or None content as empty instead of letting
    # str.join raise a TypeError on it.
    text = " ".join((m.get("content") or "") for m in messages).strip()
    if not text:
        return False
    try:
        # detect_langs returns a list like [en:0.99, es:0.01]
        langs = detect_langs(text)
        return any(lang.lang == "en" and lang.prob >= min_prob for lang in langs)
    except Exception:
        # Detection failures mean "not English". Catching Exception (not a
        # bare except) lets KeyboardInterrupt stop a long streaming filter.
        return False


# Lazily keep only the examples whose messages pass the English check.
# The result is a single-pass iterator over the stream.
ds = filter(lambda example: is_english(example["messages"]), dataset)

print("English filtering applied successfully")

English filtering applied successfully


In [41]:
# Block 3: Extract questions and answers from Tulu dataset
def extract_qa_from_tulu(item):
    """Pull a (question, answer) pair out of one dataset record.

    For records with a 'messages' list, the question is the content of the
    last user message seen before the first assistant message, and the answer
    is that first assistant message ("" when either is absent). Records with
    'prompt'/'completion' or 'input'/'output' fields return those values.
    Any other record, or any error while reading it, yields (None, None).
    """
    try:
        if 'messages' in item:
            last_user_text = ""
            first_reply = ""
            for turn in item['messages']:
                role = turn.get('role')
                if role == 'assistant':
                    # Stop at the first assistant turn.
                    first_reply = turn.get('content', '')
                    break
                if role == 'user':
                    last_user_text = turn.get('content', '')
            return last_user_text, first_reply

        # Fallback layouts, tried in order.
        for question_key, answer_key in (('prompt', 'completion'), ('input', 'output')):
            if question_key in item and answer_key in item:
                return item[question_key], item[answer_key]

        return None, None

    except Exception as err:
        print(f"Error extracting Q&A: {err}")
        return None, None

In [42]:
def get_diverse_tulu_examples(ds, n_samples=5, total_samples=5000):
    """Pick up to ``n_samples`` (question, answer) pairs of varied length.

    Scans at most ``total_samples`` items of ``ds``, extracts a Q&A pair from
    each with ``extract_qa_from_tulu`` and sorts the pairs into three buckets
    by combined character length. Each bucket keeps a uniform random sample of
    at most 100 pairs. The result draws roughly ``n_samples // 3`` pairs (at
    least one) from every non-empty bucket and then tops up round-robin.

    Args:
        ds: iterable of dataset records.
        n_samples: number of examples wanted.
        total_samples: maximum number of records to scan.

    Returns:
        A list of (question, answer) tuples. It is shorter than ``n_samples``
        when the scan did not yield enough distinct pairs.
    """
    max_per_bucket = 100

    # Initialize buckets for different content lengths
    buckets = {
        'short': [],   # 0-300 chars
        'medium': [],  # 300-1000 chars
        'long': [],    # 1000+ chars
    }
    # Number of pairs offered to each bucket so far (needed for reservoir sampling).
    seen = dict.fromkeys(buckets, 0)

    count = 0
    for item in ds:
        if count >= total_samples:
            break

        question, answer = extract_qa_from_tulu(item)

        if question and answer:
            content_length = len(question) + len(answer)

            # Determine bucket
            if content_length < 300:
                bucket_name = 'short'
            elif content_length < 1000:
                bucket_name = 'medium'
            else:
                bucket_name = 'long'

            bucket = buckets[bucket_name]
            seen[bucket_name] += 1
            if len(bucket) < max_per_bucket:
                bucket.append((question, answer))
            else:
                # Reservoir sampling (Algorithm R): the k-th pair offered
                # replaces a random slot with probability max_per_bucket / k,
                # so every pair seen is equally likely to be retained.
                replace_idx = random.randrange(seen[bucket_name])
                if replace_idx < max_per_bucket:
                    bucket[replace_idx] = (question, answer)

        count += 1

    # Select examples from buckets for diversity
    diverse_examples = []
    per_bucket = max(1, n_samples // 3)
    for bucket_name in ['short', 'medium', 'long']:
        bucket = buckets[bucket_name]
        if bucket:
            diverse_examples.extend(random.sample(bucket, min(len(bucket), per_bucket)))

    # Fill remaining slots round-robin across buckets.
    while len(diverse_examples) < n_samples:
        added = False
        for bucket in buckets.values():
            if len(diverse_examples) >= n_samples:
                break
            remaining = [ex for ex in bucket if ex not in diverse_examples]
            if remaining:
                diverse_examples.append(random.choice(remaining))
                added = True
        if not added:
            # Every bucket is exhausted: return what we have rather than
            # looping forever.
            break

    return diverse_examples[:max(n_samples, 0)]

In [43]:
# Block 5: Draw the few-shot examples, report their sizes, and split them
# into parallel question / answer lists.
examples = get_diverse_tulu_examples(ds, n_samples=5, total_samples=5000)

print(f"Selected {len(examples)} diverse examples:")
for number, (q_text, a_text) in enumerate(examples, start=1):
    print(f"Example {number}: Q length = {len(q_text)} chars, A length = {len(a_text)} chars")

questions = [pair[0] for pair in examples]
answers = [pair[1] for pair in examples]


Selected 5 diverse examples:
Example 1: Q length = 255 chars, A length = 19 chars
Example 2: Q length = 829 chars, A length = 131 chars
Example 3: Q length = 1364 chars, A length = 7 chars
Example 4: Q length = 161 chars, A length = 28 chars
Example 5: Q length = 709 chars, A length = 59 chars


In [44]:

# Block 6: Setup Gemini API
# The key is taken from the GEMINI_API_KEY environment variable, or prompted
# for without echo, so it never lands in the notebook source or its history.
# NOTE(review): a key was previously hard-coded here; it should be treated as
# leaked and revoked.
import os
from getpass import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-pro')

In [45]:
# Block 7: Translation functions
def translate_to_hebrew(text, text_type="text"):
    """Ask the Gemini model for a Hebrew translation of ``text``.

    ``text_type`` selects the instruction sent with the text: "question" and
    "answer" use wording that asks the model to leave code, maths and
    technical terms untouched; any other value uses a plain translation
    request. On any error the message is printed and the original ``text``
    is returned unchanged.
    """
    try:
        if text_type == "question":
            instruction = "Translate the following question to Hebrew. Keep any mathematical expressions, code, or technical terms unchanged. Only translate the natural language portions:"
        elif text_type == "answer":
            instruction = "Translate the following answer to Hebrew. Keep any mathematical expressions, code, formulas, or technical terms unchanged. Only translate the explanatory text:"
        else:
            instruction = "Translate the following to Hebrew:"

        # Instruction, blank line, then the text to translate.
        request = f"{instruction}\n\n{text}"

        reply = model.generate_content(request)
        time.sleep(0.5)  # pause between API calls
        return reply.text
    except Exception as err:
        print(f"Translation error: {err}")
        return text

In [46]:
def create_hebrew_cot_prompt(english_questions, english_answers, new_english_question):
    """Assemble the few-shot prompt requesting a Hebrew step-by-step explanation.

    The prompt is a fixed instruction header, one numbered example section per
    entry of ``english_questions`` (paired by index with ``english_answers``,
    each with a placeholder where the Hebrew explanation would go), and a
    closing request for ``new_english_question``.
    """
    sections = ["""You are an expert tutor who provides detailed step-by-step explanations in Hebrew. Given questions and their answers in English, you create comprehensive Hebrew explanations that show the complete reasoning process.

Your Hebrew explanations should be:
- Detailed and show all reasoning steps
- Clear and easy to follow
- Use proper Hebrew terminology
- Keep mathematical expressions, code, and technical terms in their original form
- Provide step-by-step logical progression

Here are examples:

"""]

    # One section per few-shot example, numbered from 1.
    for idx in range(len(english_questions)):
        shot_question = english_questions[idx]
        shot_answer = english_answers[idx]
        sections.append(f"""Example {idx + 1}:
English Question: {shot_question}

English Answer: {shot_answer}

Hebrew Step-by-Step Explanation:
[This would be filled with a detailed Hebrew explanation showing the reasoning process]

---

""")

    # Closing request for the question to be explained.
    sections.append(f"""Now provide a detailed Hebrew step-by-step explanation for this question:

English Question: {new_english_question}

Hebrew Step-by-Step Explanation:""")

    return "".join(sections)


In [49]:
def generate_tulu_hebrew_dataset(english_questions, english_answers, num_questions=10, max_scanned=10000):
    """Generate Hebrew chain-of-thought explanations for Tulu questions.

    Opens a fresh stream of the Tulu dataset, filters it with ``is_english``
    and, for each record whose question and answer are both longer than 10
    characters, asks the Gemini model for a Hebrew step-by-step explanation
    using ``create_hebrew_cot_prompt`` with the given few-shot examples.

    Args:
        english_questions: few-shot example questions.
        english_answers: few-shot example answers, parallel to the questions.
        num_questions: number of explanations to generate.
        max_scanned: safety limit on records examined, including records that
            are skipped or whose generation fails.

    Returns:
        A DataFrame with columns question_english, answer_english, cot_hebrew,
        question_hebrew and answer_hebrew (the last two left empty for a later
        step). The columns are present even when no row was generated.
    """
    columns = ['question_english', 'answer_english', 'cot_hebrew',
               'question_hebrew', 'answer_hebrew']

    # Load fresh dataset stream
    ds_fresh = load_dataset("allenai/tulu-3-sft-mixture", split="train", streaming=True)
    ds_fresh = (example for example in ds_fresh if is_english(example["messages"]))
    results = []
    count = 0
    processed = 0

    print(f"Processing {num_questions} questions from Tulu dataset...")

    for item in ds_fresh:
        if processed >= num_questions:
            break

        question, answer = extract_qa_from_tulu(item)

        if question and answer and len(question.strip()) > 10 and len(answer.strip()) > 10:
            try:
                print(f"Processing question {processed+1}/{num_questions}")
                print(f"Question preview: {question[:100]}...")

                # Generate Hebrew CoT using few-shot learning
                cot_prompt = create_hebrew_cot_prompt(english_questions, english_answers, question)
                response = model.generate_content(cot_prompt)
                hebrew_cot = response.text

                results.append({
                    'question_english': question,
                    'answer_english': answer,
                    'cot_hebrew': hebrew_cot,
                    'question_hebrew': '',  # Will fill later
                    'answer_hebrew': ''     # Will fill later
                })
                print(f"✓ Generated Hebrew CoT for question {processed+1}")

                processed += 1
                time.sleep(1)  # Rate limiting

            except Exception as e:
                # No `continue` here: a failed record must still count towards
                # the safety limit below, otherwise a persistent API failure
                # would walk the entire stream.
                print(f"✗ Error processing question {processed+1}: {e}")

        count += 1
        if count >= max_scanned:  # Safety limit
            break

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=columns)

In [50]:
# Build the Hebrew CoT frame for 10 questions, using the selected examples
# as the few-shot context.
hebrew_dataset = generate_tulu_hebrew_dataset(questions, answers, num_questions=10)

Processing 10 questions from Tulu dataset...
Processing question 1/10
Question preview: Create a snippet of Terraform HCL code that create an AWS autoscaling group, and an ALB in front to ...
✓ Generated Hebrew CoT for question 1
Processing question 2/10
Question preview: Which languages are spoken in Costa Rica?...
✓ Generated Hebrew CoT for question 2
Processing question 3/10
Question preview: Can u summarize me story from the book Harry Potter and the Philosopher's Stone?...
✓ Generated Hebrew CoT for question 3
Processing question 4/10
Question preview: Did Jesus realy exist...
✓ Generated Hebrew CoT for question 4
Processing question 5/10
Question preview: ¿Puedes darme un ejemplo del patrón de diseño Factory en el lenguaje de programación Java?...
✓ Generated Hebrew CoT for question 5
Processing question 6/10
Question preview: Write five lines of iambic pentameter about a subject of your choosing. Do not use any trochaic subs...
✓ Generated Hebrew CoT for question 6
Processing qu

In [51]:
# Translate the English questions, then the English answers, one API call per
# row, pausing after each call.
n_rows = len(hebrew_dataset)

print("Translating questions to Hebrew...")
hebrew_questions = []
for position, source_question in enumerate(hebrew_dataset['question_english'], start=1):
    print(f"Translating question {position}/{n_rows}")
    hebrew_questions.append(translate_to_hebrew(source_question, "question"))
    time.sleep(0.5)

print("Translating answers to Hebrew...")
hebrew_answers = []
for position, source_answer in enumerate(hebrew_dataset['answer_english'], start=1):
    print(f"Translating answer {position}/{n_rows}")
    hebrew_answers.append(translate_to_hebrew(source_answer, "answer"))
    time.sleep(0.5)


Translating questions to Hebrew...
Translating question 1/10
Translating question 2/10
Translating question 3/10
Translating question 4/10
Translating question 5/10
Translating question 6/10
Translating question 7/10
Translating question 8/10
Translating question 9/10
Translating question 10/10
Translating answers to Hebrew...
Translating answer 1/10
Translating answer 2/10
Translating answer 3/10
Translating answer 4/10
Translating answer 5/10
Translating answer 6/10
Translating answer 7/10
Translating answer 8/10
Translating answer 9/10
Translating answer 10/10


In [55]:
# Write the translated lists into the Hebrew columns of the results frame.
for column_name, translated in (
    ('question_hebrew', hebrew_questions),
    ('answer_hebrew', hebrew_answers),
):
    hebrew_dataset[column_name] = translated

In [56]:
# Column order for the final view: English pair first, then the Hebrew
# question, explanation and answer.
final_column_order = [
    'question_english',
    'answer_english',
    'question_hebrew',
    'cot_hebrew',
    'answer_hebrew',
]
final_dataset = hebrew_dataset[final_column_order]

print("Final dataset structure:")
print(final_dataset.head())

Final dataset structure:
                                    question_english  \
0  Create a snippet of Terraform HCL code that cr...   
1          Which languages are spoken in Costa Rica?   
2  Can u summarize me story from the book Harry P...   
3                              Did Jesus realy exist   
4  ¿Puedes darme un ejemplo del patrón de diseño ...   

                                      answer_english  \
0  Sure, here's an example Terraform HCL code tha...   
1  The primary language spoken in Costa Rica is S...   
2  Harry Potter, an orphan, discovers he is a wiz...   
3  As an AI language model, I cannot provide a de...   
4  interface Shape {\n   void draw();\n}\n\nclass...   

                                     question_hebrew  \
0  צור קטע קוד Terraform HCL שיוצר AWS autoscalin...   
1                     אילו שפות מדוברות בקוסטה ריקה?   
2  האם תוכל/י לסכם לי את הסיפור מהספר Harry Potte...   
3                               האם ישו באמת התקיים?   
4  ?תוכל לתת לי דוגמה

In [57]:
# Save the results frame (row index included) as CSV.
csv_output_path = '/Users/oribar-joseph/Downloads/cot_tulu.csv'
hebrew_dataset.to_csv(csv_output_path)

{'id': 'oasst1_5921', 'messages': [{'content': 'Create a snippet of Terraform HCL code that create an AWS autoscaling group, and an ALB in front to expose an application to internet.', 'role': 'user'}, {'content': 'Sure, here\'s an example Terraform HCL code that creates an AWS Autoscaling Group and an Application Load Balancer to expose an application to the internet:\n``` \n# Configure the AWS provider\nprovider "aws" {\n  region = "us-east-1"\n}\n\n# Create a security group to allow traffic to the ALB\nresource "aws_security_group" "alb_sg" {\n  name_prefix = "alb_sg"\n  ingress {\n    from_port = 80\n    to_port = 80\n    protocol = "tcp"\n    cidr_blocks = ["0.0.0.0/0"]\n  }\n}\n\n# Create an ALB and target group\nresource "aws_lb" "alb" {\n  name               = "example-alb"\n  internal           = false\n  load_balancer_type = "application"\n\n  subnets = ["subnet-12345678", "subnet-87654321"]\n\n  security_groups = [aws_security_group.alb_sg.id]\n\n  tags = {\n    Environment 