In [21]:
# Block 1: Install dependencies and imports (unchanged)
!pip install datasets
!pip install openpyxl
!pip install -q -U google-genai
!pip install google-generativeai

import os
import random
import time

import google.generativeai as genai
import pandas as pd
from datasets import load_dataset
from langdetect import detect_langs

# Block 2: Open the Alpaca training split in streaming mode
dataset = load_dataset("tatsu-lab/alpaca", streaming=True, split="train")
print("Loaded Alpaca dataset as stream")


Loaded Alpaca dataset as stream


In [22]:
def extract_qa_from_alpaca(item):
    """Build a (question, answer) pair from one Alpaca record.

    The question is the stripped 'instruction' field, followed by a blank
    line and the stripped 'input' field when that field is non-empty. The
    answer is the stripped 'output' field. Missing keys count as empty
    strings.

    Returns:
        (question, answer) on success, or (None, None) if reading the record
        raised any exception (the error is printed).
    """
    try:
        # Read the fields in a fixed order: instruction, input, output.
        instruction, context, reply = [
            item.get(field, '').strip()
            for field in ('instruction', 'input', 'output')
        ]
        prompt_text = f"{instruction}\n\n{context}" if context else instruction
        return prompt_text, reply
    except Exception as err:
        print(f"Error extracting Q&A: {err}")
        return None, None

# Walk the stream and keep every record whose question and answer are both
# longer than 10 characters; stop once 10000 records have been examined.
all_examples = []
print("Extracting Q&A pairs from stream...")

count = 0
for item in dataset:
    if count >= 10000:
        break

    # Progress report every 1000 records (including the very first one).
    if not count % 1000:
        print(f"Processed {count} examples, found {len(all_examples)} valid examples")

    question, answer = extract_qa_from_alpaca(item)
    count += 1

    # Failed extractions come back as None; empty strings are rejected too.
    if not question or not answer:
        continue
    if min(len(question.strip()), len(answer.strip())) > 10:
        all_examples.append((question, answer))

print(f"Found {len(all_examples)} valid Q&A pairs from {count} processed examples")

Extracting Q&A pairs from stream...
Processed 0 examples, found 0 valid examples
Processed 1000 examples, found 932 valid examples
Processed 2000 examples, found 1877 valid examples
Processed 3000 examples, found 2822 valid examples
Processed 4000 examples, found 3759 valid examples
Processed 5000 examples, found 4705 valid examples
Processed 6000 examples, found 5644 valid examples
Processed 7000 examples, found 6570 valid examples
Processed 8000 examples, found 7501 valid examples
Processed 9000 examples, found 8443 valid examples
Found 9395 valid Q&A pairs from 10000 processed examples


In [23]:
# Block 3: Sample diverse examples for few-shot learning (unchanged)
def get_diverse_alpaca_examples(examples, n_samples=5):
    """Pick few-shot examples spread across short, medium and long pairs.

    Each (question, answer) pair is bucketed by combined character length:
    under 300 is 'short', under 1000 is 'medium', anything else is 'long'.
    Every non-empty bucket contributes max(1, n_samples // 3) random pairs
    (or the whole bucket if it is smaller). Any shortfall is then filled with
    random pairs not already chosen.

    Args:
        examples: Iterable of (question, answer) pairs.
        n_samples: Maximum number of pairs to return.

    Returns:
        A list of at most n_samples (question, answer) tuples.
    """
    buckets = {'short': [], 'medium': [], 'long': []}
    for q_text, a_text in examples:
        total_chars = len(q_text) + len(a_text)
        if total_chars >= 1000:
            size_label = 'long'
        elif total_chars >= 300:
            size_label = 'medium'
        else:
            size_label = 'short'
        buckets[size_label].append((q_text, a_text))

    # First pass: an equal share from each bucket, visited short -> long.
    share = max(1, n_samples // 3)
    picked = []
    for size_label in ('short', 'medium', 'long'):
        pool = buckets[size_label]
        if pool:
            picked.extend(random.sample(pool, min(len(pool), share)))

    # Second pass: top up one at a time from pairs that are not yet chosen.
    everything = buckets['short'] + buckets['medium'] + buckets['long']
    while len(picked) < n_samples and everything:
        unused = [pair for pair in everything if pair not in picked]
        if not unused:
            break
        picked.append(random.choice(unused))

    return picked[:n_samples]

# Choose the demonstration pairs used later in the answer-generation prompt.
few_shot_examples = get_diverse_alpaca_examples(all_examples, 5)
print(f"Selected {len(few_shot_examples)} diverse examples for few-shot learning")


Selected 5 diverse examples for few-shot learning


In [24]:
# Block 4: Setup Gemini API
# The key is read from the environment so that it never lives in the notebook.
# SECURITY: a literal key was previously stored in this cell; treat that key
# as compromised and revoke/rotate it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail here with a clear message rather than on the first API request.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment before running this cell."
    )
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-pro')

In [25]:
# Block 5: Enhanced translation function
def translate_to_hebrew(text, text_type="text"):
    """Translate text to Hebrew using Gemini API with enhanced prompts.

    Args:
        text: Text to translate.
        text_type: "question" or "answer" selects a prompt tailored to that
            kind of content; any other value uses a generic prompt.

    Returns:
        The model's response with surrounding whitespace removed. The input
        is returned unchanged when it is None or blank (no request is sent),
        or when the request fails for any reason (the error is printed).
    """
    # Nothing to translate: skip the API call and its rate-limit pause.
    # Without this guard None would be formatted into the prompt as "None".
    if text is None or (isinstance(text, str) and not text.strip()):
        return text

    try:
        if text_type == "question":
            prompt = f"""You are a professional Hebrew translator specializing in educational content.

Translate the following question to Hebrew while maintaining:
- Natural Hebrew flow and readability
- Technical accuracy for any domain-specific terms
- Proper Hebrew sentence structure and grammar
- Mathematical expressions, code snippets, formulas, and technical identifiers should remain unchanged
- Use appropriate Hebrew academic/educational terminology where applicable
- Maintain the original question's intent and clarity

Question to translate:
{text}

Provide only the Hebrew translation:"""

        elif text_type == "answer":
            prompt = f"""You are a professional Hebrew translator specializing in educational content.

Translate the following answer to Hebrew while ensuring:
- Clear, natural Hebrew expression that maintains the original meaning
- Technical terms are appropriately translated or kept in original form when standard practice
- Mathematical expressions, code, formulas, URLs, and technical identifiers remain unchanged
- Proper Hebrew academic/educational writing style
- Logical flow and coherence in Hebrew
- Appropriate use of Hebrew punctuation and formatting

Answer to translate:
{text}

Provide only the Hebrew translation:"""

        else:
            prompt = f"""Translate the following text to Hebrew while maintaining natural flow and accuracy:

{text}

Provide only the Hebrew translation:"""

        response = model.generate_content(prompt)
        time.sleep(0.5)  # Rate limiting
        # Reading .text can raise; that case is handled by the except below.
        return response.text.strip()
    except Exception as e:
        # Best-effort fallback: report the problem and hand back the source text.
        print(f"Translation error: {e}")
        return text

In [26]:
# Block 6: Hebrew answer generation function (replaces CoT)
def create_hebrew_answer_prompt(few_shot_examples, new_english_question):
    """Assemble the few-shot prompt that asks for a Hebrew answer.

    The prompt has three parts: fixed instructions describing the expected
    answer quality, one numbered block per few-shot (question, answer) pair,
    and a closing request containing the new English question.

    Args:
        few_shot_examples: Iterable of (question, answer) pairs.
        new_english_question: The question the model should answer in Hebrew.

    Returns:
        The complete prompt as a single string.
    """
    header = """You are an expert educational assistant who provides clear, comprehensive, and well-structured answers in Hebrew. You have deep knowledge across various domains and can explain complex concepts in an accessible way.

Your Hebrew answers should be:
- Clear, precise, and comprehensive
- Well-structured with logical flow
- Accurate and informative
- Use proper Hebrew terminology and academic language
- Keep mathematical expressions, code, formulas, and technical identifiers in their original form
- Include relevant examples or explanations when helpful
- Be appropriately detailed for the question's complexity
- Maintain educational value and clarity

Here are examples of the style and quality expected:

"""

    # One demonstration block per pair, numbered from 1.
    demo_blocks = []
    for number, (question, answer) in enumerate(few_shot_examples, start=1):
        demo_blocks.append(f"""Example {number}:
English Question: {question}

English Answer: {answer}

Expected Hebrew Answer Style: [A clear, comprehensive Hebrew response that thoroughly addresses the question with proper structure and terminology]

---

""")

    closing = f"""Now provide a comprehensive, well-structured Hebrew answer for this question:

English Question: {new_english_question}

Hebrew Answer:"""

    return header + "".join(demo_blocks) + closing

In [27]:
# Block 7: Generate Hebrew dataset function
def generate_alpaca_hebrew_dataset(few_shot_examples, num_questions=10, examples=None):
    """Generate Hebrew answer dataset from Alpaca examples.

    Randomly samples question/answer pairs, asks the model for a Hebrew
    answer to each question using the few-shot prompt, and collects the
    results. Questions that fail are reported and skipped.

    Args:
        few_shot_examples: (question, answer) pairs used as demonstrations in
            the prompt; their questions are excluded from the sampled set.
        num_questions: Number of questions to answer successfully.
        examples: Pool of (question, answer) pairs to sample from. Defaults to
            the notebook-level ``all_examples`` list.

    Returns:
        pandas.DataFrame with one row per answered question and the columns
        question_english, answer_english, question_hebrew,
        answer_hebrew_generated and answer_hebrew_translated. The two
        translation columns are empty strings to be filled in later. The
        columns are present even when no question could be answered.
    """
    # Fall back to the notebook global only when no explicit pool is given.
    if examples is None:
        examples = all_examples

    columns = [
        'question_english',
        'answer_english',
        'question_hebrew',
        'answer_hebrew_generated',
        'answer_hebrew_translated',
    ]

    results = []
    processed = 0

    print(f"Processing {num_questions} questions from Alpaca dataset...")

    # Sample from our examples, avoiding few-shot duplicates.
    # Twice as many candidates as needed are drawn so that skipped few-shot
    # questions and failed requests can be replaced.
    few_shot_questions = {ex[0] for ex in few_shot_examples}
    sample_examples = random.sample(examples, min(num_questions * 2, len(examples)))

    for question, answer in sample_examples:
        if processed >= num_questions:
            break

        # Skip if this question is already in our few-shot examples
        if question in few_shot_questions:
            continue

        try:
            print(f"Processing question {processed+1}/{num_questions}")
            print(f"Question preview: {question[:100]}...")

            # Generate Hebrew answer using few-shot learning
            answer_prompt = create_hebrew_answer_prompt(few_shot_examples, question)
            response = model.generate_content(answer_prompt)
            hebrew_answer_generated = response.text.strip()

            # Store result
            result = {
                'question_english': question,
                'answer_english': answer,
                'question_hebrew': '',  # Will fill later
                'answer_hebrew_generated': hebrew_answer_generated,
                'answer_hebrew_translated': ''  # Will fill later
            }

            results.append(result)
            print(f"Generated Hebrew answer for question {processed+1}")

            processed += 1
            time.sleep(1)  # Rate limiting

        except Exception as e:
            # Best effort: report the failure and move on to the next candidate.
            print(f"Error processing question {processed+1}: {e}")
            continue

    # Passing the columns explicitly keeps the schema when results is empty,
    # so downstream column lookups do not fail with KeyError.
    return pd.DataFrame(results, columns=columns)

# Block 8: Build the dataset of model-generated Hebrew answers
hebrew_dataset = generate_alpaca_hebrew_dataset(few_shot_examples, num_questions=10)

print(f"Generated {len(hebrew_dataset)} Hebrew answer examples")

Processing 10 questions from Alpaca dataset...
Processing question 1/10
Question preview: Brainstorm a relevant title for the following article.

The Benefits of Utilizing Recycled Materials...
Generated Hebrew answer for question 1
Processing question 2/10
Question preview: What do you think are the major causes of poverty in developing countries?

No input required....
Generated Hebrew answer for question 2
Processing question 3/10
Question preview: Generate a wish for someone's birthday.

Recipient: My Best Friend...
Generated Hebrew answer for question 3
Processing question 4/10
Question preview: Name three cities in the United States with population over 1 million....
Generated Hebrew answer for question 4
Processing question 5/10
Question preview: What are the methods available for sentiment analysis?...
Generated Hebrew answer for question 5
Processing question 6/10
Question preview: Describe the basic principles of thermodynamics....
Generated Hebrew answer for question 6
Proce

In [28]:
# Block 9: Translate questions and answers to Hebrew
# translate_to_hebrew already pauses after each successful request; the extra
# sleep in each loop adds further spacing between calls.
print("Translating questions to Hebrew...")
hebrew_questions = []
for i, question in enumerate(hebrew_dataset['question_english']):
    print(f"Translating question {i+1}/{len(hebrew_dataset)}")
    hebrew_questions.append(translate_to_hebrew(question, "question"))
    time.sleep(0.5)

print("Translating answers to Hebrew...")
hebrew_answers_translated = []
for i, answer in enumerate(hebrew_dataset['answer_english']):
    print(f"Translating answer {i+1}/{len(hebrew_dataset)}")
    hebrew_answers_translated.append(translate_to_hebrew(answer, "answer"))
    time.sleep(0.5)

# Add translations to dataset (fills the placeholder columns created earlier)
hebrew_dataset['question_hebrew'] = hebrew_questions
hebrew_dataset['answer_hebrew_translated'] = hebrew_answers_translated

# Block 10: Create final dataset and save
final_dataset = hebrew_dataset[[
    'question_english',
    'answer_english',
    'question_hebrew',
    'answer_hebrew_generated',
    'answer_hebrew_translated'
]]

print("Final dataset structure:")
print(final_dataset.head())
print(f"\nDataset shape: {final_dataset.shape}")
print(f"Columns: {list(final_dataset.columns)}")

# Save to CSV
# A path relative to the working directory replaces the previous absolute
# path inside one user's home directory, which fails on any other machine.
# It also makes the message printed below accurate.
OUTPUT_CSV_PATH = 'alpaca_hebrew_answers_dataset.csv'
final_dataset.to_csv(OUTPUT_CSV_PATH, index=False)
print("Dataset saved to 'alpaca_hebrew_answers_dataset.csv'")

# Display sample (each field truncated to its first 150 characters)
print("\nSample from final dataset:")
for i, row in final_dataset.head(2).iterrows():
    print(f"\n--- Example {i+1} ---")
    print(f"English Question: {row['question_english'][:150]}...")
    print(f"English Answer: {row['answer_english'][:150]}...")
    print(f"Hebrew Question: {row['question_hebrew'][:150]}...")
    print(f"Hebrew Generated Answer: {row['answer_hebrew_generated'][:150]}...")
    print(f"Hebrew Translated Answer: {row['answer_hebrew_translated'][:150]}...")


Translating questions to Hebrew...
Translating question 1/10
Translating question 2/10
Translating question 3/10
Translating question 4/10
Translating question 5/10
Translating question 6/10
Translating question 7/10
Translating question 8/10
Translating question 9/10
Translating question 10/10
Translating answers to Hebrew...
Translating answer 1/10
Translating answer 2/10
Translating answer 3/10
Translating answer 4/10
Translating answer 5/10
Translating answer 6/10
Translating answer 7/10
Translating answer 8/10
Translating answer 9/10
Translating answer 10/10
Final dataset structure:
                                    question_english  \
0  Brainstorm a relevant title for the following ...   
1  What do you think are the major causes of pove...   
2  Generate a wish for someone's birthday.\n\nRec...   
3  Name three cities in the United States with po...   
4  What are the methods available for sentiment a...   

                                      answer_english  \
0  "Giving N