In [67]:
!pip install datasets
!pip install openpyxl
!pip install -q -U google-genai
!pwd

/Users/galastra/Projects/qwen-hebrew-finetuning/synth


In [68]:
from datasets import load_dataset
import random
!pip install google-generativeai
import google.generativeai as genai
import time
import pandas as pd



In [69]:
import google.generativeai as genai

In [70]:
ds = load_dataset("nvidia/OpenMathReasoning", split="cot", streaming=True)

In [71]:
# Block 2: Fast diversity sampling function
def get_diverse_examples_fast(ds, n_samples=5, total_samples=5000):
    """Fast method using reservoir sampling to get examples with diverse COT lengths"""

    # Initialize buckets for different length ranges
    buckets = {
        'short': [],  # 0-500 chars
        'medium': [],  # 500-1500 chars
        'long': [],  # 1500+ chars
    }

    count = 0
    for item in ds:
        if count >= total_samples:
            break

        cot_length = len(item["generated_solution"])

        # Determine bucket
        if cot_length < 500:
            bucket_name = 'short'
        elif cot_length < 1500:
            bucket_name = 'medium'
        else:
            bucket_name = 'long'

        # Add to bucket (keep max 100 per bucket)
        bucket = buckets[bucket_name]
        if len(bucket) < 100:
            bucket.append((item["problem"], item["generated_solution"], item["expected_answer"]))
        else:
            # Replace random item (reservoir sampling)
            replace_idx = random.randint(0, len(bucket))
            if replace_idx < len(bucket):
                bucket[replace_idx] = (item["problem"], item["generated_solution"], item["expected_answer"])

        count += 1

    # Select examples from buckets to get diversity
    diverse_examples = []
    for bucket_name in ['short', 'medium', 'long']:
        if buckets[bucket_name]:
            # Take multiple from each bucket if needed
            samples_from_bucket = min(len(buckets[bucket_name]), max(1, n_samples // 3))
            diverse_examples.extend(random.sample(buckets[bucket_name], samples_from_bucket))

    # If we need more samples, fill from any bucket
    while len(diverse_examples) < n_samples:
        for bucket in buckets.values():
            if bucket and len(diverse_examples) < n_samples:
                remaining = [ex for ex in bucket if ex not in diverse_examples]
                if remaining:
                    diverse_examples.append(random.choice(remaining))

    return diverse_examples[:n_samples]

In [6]:
# Pair them together
# Block 3: Get diverse examples (this replaces your slow sorting)
examples = get_diverse_examples_fast(ds, n_samples=5, total_samples=5000)

# Block 4: Check results
print(f"Selected {len(examples)} diverse examples:")
for i, (q, cot, ans) in enumerate(examples):
    print(f"Example {i + 1}: COT length = {len(cot)} chars")

# Block 5: Extract fields if you need them separately
questions = [ex[0] for ex in examples]
cot_solutions = [ex[1] for ex in examples]
expected_answers = [ex[2] for ex in examples]

Selected 5 diverse examples:
Example 1: COT length = 1278 chars
Example 2: COT length = 25241 chars
Example 3: COT length = 29271 chars
Example 4: COT length = 50501 chars
Example 5: COT length = 45700 chars


In [7]:
GEMINI_API_KEY = ""  # Replace with your actual API key
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-pro')

In [8]:
def translate_to_hebrew(text, text_type="text"):
    """Translate text to Hebrew using Gemini API with rate limiting"""
    try:
        if text_type == "math_problem":
            prompt = f"Translate the following mathematical problem to Hebrew. Keep all mathematical expressions, numbers, and formulas unchanged. Only translate the text portions:\n\n{text}"
        elif text_type == "solution":
            prompt = f"Translate the following mathematical solution to Hebrew. Keep all mathematical expressions, calculations, and formulas unchanged. Only translate the explanatory text:\n\n{text}"
        else:
            prompt = f"Translate the following to Hebrew:\n\n{text}"

        response = model.generate_content(prompt)
        time.sleep(0.5)  # Rate limiting - adjust as needed
        return response.text
    except Exception as e:
        print(f"Translation error: {e}")
        return text

In [9]:
# Translate all content
print("Translating questions...")
translated_questions = []
for i, question in enumerate(questions):
    print(f"Translating question {i+1}/{len(questions)}")
    translated_questions.append(translate_to_hebrew(question, "math_problem"))

print("Translating COT solutions...")
translated_cot_solutions = []
for i, cot in enumerate(cot_solutions):
    print(f"Translating solution {i+1}/{len(cot_solutions)}")
    translated_cot_solutions.append(translate_to_hebrew(cot, "solution"))

print("Translating answers...")
translated_answers = []
for i, answer in enumerate(expected_answers):
    print(f"Translating answer {i+1}/{len(expected_answers)}")
    translated_answers.append(translate_to_hebrew(answer))

print("Translation complete!")

Translating questions...
Translating question 1/5
Translating question 2/5
Translating question 3/5
Translating question 4/5
Translating question 5/5
Translating COT solutions...
Translating solution 1/5
Translating solution 2/5
Translating solution 3/5
Translating solution 4/5
Translating solution 5/5
Translating answers...
Translating answer 1/5
Translating answer 2/5
Translating answer 3/5
Translating answer 4/5
Translating answer 5/5
Translation complete!


In [None]:
def create_few_shot_prompt(translated_questions, translated_cot_solutions, translated_answers, new_english_question, new_english_solution):
    """Create a few-shot prompt using translated examples to generate Hebrew CoT"""

    prompt = """You are an expert mathematics tutor who provides detailed step-by-step solutions in fluent Hebrew. Your explanations should be comprehensive, clear, and educational.

Here are examples of how to solve mathematical problems with detailed Hebrew explanations:

"""

    # Add few-shot examples (Hebrew question -> Hebrew CoT)
    for i in range(len(translated_questions)):
        prompt += f"""Example {i+1}:
Question: {translated_questions[i]}

Solution: {translated_cot_solutions[i]}

Answer: {translated_answers[i]}

---

"""

    # Add the new English question to solve
    prompt += f"""Now solve this question with a detailed step-by-step explanation in Hebrew while I provide you with the solution:

Question: {new_english_question}

Solution: {new_english_solution}

Please provide a comprehensive, fluent Hebrew explanation showing all steps. Make sure your explanation is:
- Clear and easy to follow
- Shows all mathematical steps
- Uses proper Hebrew mathematical terminology
- Informative and educational

Answer:"""

    return prompt

In [None]:
def translate_few_shot_prompt(questions, english_answers, hebrew_answers, new_english_question, new_english_answer):
    """Create a few-shot prompt using translated examples to generate Hebrew CoT"""

    prompt = """You are an expert mathematics tutor who provides detailed step-by-step solutions in fluent Hebrew by translating the answers from English. Your explanations should be comprehensive, clear, and educational.

Here are examples of translations givem the English questions, English answers and their Hebrew translations of the answers:

"""

    # Add few-shot examples (Hebrew question -> Hebrew CoT)
    for i in range(len(questions)):
        prompt += f"""Example {i+1}:
English Question: {questions[i]}

English Answer: {english_answers[i]}

Hebrew Answer: {hebrew_answers[i]}

---

"""

    # Add the new English question to solve
    prompt += f"""Now translate the answer to this question into Hebrew with a detailed step-by-step explanation in Hebrew.

English Question: {new_english_question}

English Answer: {new_english_answer}

Please provide a comprehensive, fluent Hebrew explanation showing all steps. Make sure your explanation is:
- Clear and easy to follow
- Shows all mathematical steps
- Uses proper Hebrew mathematical terminology
- Informative and educational

Hebrew Answer:"""

    return prompt

In [None]:
def generate_hebrew_cot_for_dataset(translated_questions, translated_cot_solutions, translated_answers,
                                  num_questions=10, model=model):
    """
    Generate Hebrew CoT for questions from the original dataset

    Args:
        translated_questions: List of Hebrew translated questions (few-shot examples)
        translated_cot_solutions: List of Hebrew translated CoT solutions (few-shot examples)
        translated_answers: List of Hebrew translated answers (few-shot examples)
        num_questions: Number of questions to process from dataset
        model: Gemini model instance

    Returns:
        pandas DataFrame with columns: question_english, generated_cot_hebrew, real_cot_english, real_answer_english
    """

    # Load the dataset
    ds = load_dataset("nvidia/OpenMathReasoning", split="cot", streaming=True)

    results = []
    count = 0

    print(f"Processing {num_questions} questions from the dataset...")

    for item in ds:
        if count >= num_questions:
            break

        try:
            # Extract data from dataset
            english_question = item["problem"]
            real_english_cot = item["generated_solution"]
            real_english_answer = item["expected_answer"]

            print(f"Processing question {count+1}/{num_questions}")
            print(f"Question preview: {english_question[:100]}...")

            # Create few-shot prompt
            prompt = create_few_shot_prompt(
                translated_questions,
                translated_cot_solutions,
                translated_answers,
                english_question, real_english_answer
            )

            # Generate Hebrew CoT
            response = model.generate_content(prompt)
            generated_hebrew_cot = response.text

            # Add to results
            result = {
                'question_english': english_question,
                'generated_cot_hebrew': generated_hebrew_cot,
                'real_cot_english': real_english_cot,
                'real_answer_english': real_english_answer
            }

            results.append(result)
            print(f"✓ Successfully generated Hebrew CoT for question {count+1}")

            # Rate limiting
            time.sleep(1)  # Adjust as needed for API rate limits

        except Exception as e:
            print(f"✗ Error processing question {count+1}: {e}")
            # Add error result to maintain count
            result = {
                'question_english': item.get("problem", "Error loading question"),
                'generated_cot_hebrew': f"Error: {str(e)}",
                'real_cot_english': item.get("generated_solution", "Error loading CoT"),
                'real_answer_english': item.get("expected_answer", "Error loading answer")
            }
            results.append(result)

        count += 1

    # Convert to DataFrame
    df = pd.DataFrame(results)
    return df

In [None]:
df = generate_hebrew_cot_for_dataset(
    english_questions=questions,  # Your original English questions
    translated_cot_solutions=translated_cot_solutions,  # Hebrew CoT solutions
    english_answers=expected_answers,  # Your original English answers
    num_questions=10,
    model=model
)

NameError: name 'questions' is not defined

## Load pre-generated dataframe

In [None]:
try:
    df = pd.read_csv("synth_fewshots_results.csv")
    df
except:
    print('no file pre-generated or wrong path.')

Unnamed: 0.1,Unnamed: 0,question_english,generated_cot_hebrew,real_cot_english,real_answer_english
0,0,Given a group of \( N \) balls consisting of \...,"<think>\nטוב, הבעיה הזאת עוסקת בחישוב הסתברות ...","<think>\nOkay, so I need to find the probabili...",\(\frac{C_{n_1}^{a_1} \cdot C_{n_2}^{a_2} \cdo...
1,1,How many lines can be drawn that are equidista...,"<think>\nטוב, אז השאלה שואלת כמה ישרים אפשר לש...",,\frac{n(n-1)}{2}
2,2,Find all functions \( f: \mathbb{R} \to \mathb...,"<think>\nאוקיי, צריך למצוא את כל הפונקציות \(f...","<think>\nOkay, let's try to solve this functio...",\( f(x) = f(1)x \)
3,3,Find the sum of the roots of the equation \((x...,"<think>\nאוקיי, צריך למצוא את סכום השורשים של ...","<think>\nOkay, let's see. I need to find the s...",2
4,4,Determine how many 1000 digit numbers \( A \) ...,"<think>\nטוב, אז השאלה היא למצוא כמה מספרים בנ...","<think>\nOkay, so I need to figure out how man...",32
5,5,Calculate the integral\n\n\[\n\int^{\frac{3\pi...,"<think>\nאוקיי, צריך לחשב את האינטגרל הזה:\n\n...","<think>\nOkay, let me try to solve this integr...",\(\frac{2}{\pi} + \frac{32}{9\pi^2}\)
6,6,"In $\triangle ABC$ with incenter $I$, points $...","<think>\nאוקיי, בואו ננתח את הבעיה. יש לנו משו...","<think>\nAlright, let me try to tackle this ge...",1
7,7,"Let \( x_0 = 1 \), and \( x_{n+1} = 2x_n + \sq...","<think>\nטוב, אז יש לנו נוסחת נסיגה: \( x_{n+1...","<think>\nOkay, let's see. The problem is about...",\(\frac{(3-\sqrt{3})(2-\sqrt{3})^{2016}+(3+\sq...
8,8,"Which is greater, $\sqrt[2012]{2013!}$ or $\sq...","<think>\nאוקיי, צריך להשוות בין \(\sqrt[2012]{...","<think>\nOkay, so I need to figure out whether...",$\sqrt[2012]{2013!} > \sqrt[2013]{2012!}$
9,9,"On average, how long will you have to flip a c...","<think>\nאוקיי, השאלה שואלת כמה הטלות מטבע בממ...","<think>\nOkay, so I need to figure out the exp...",20


# Translation

_Assuming we already applied the synthesization..._

We will now iterate on the dataframe:
- Adding a translation
- Judging if the better solution is the synthesized or the translated one

In [None]:
import pandas as pd
from prompts.open_math_prompts import GSM_ENGLISH_QUESTIONS, GSM_ENGLISH_ANSWERS, GSM_HEBREW_ANSWERS

def translate_answer(english_question, english_answer):
    res = model.generate_content(translate_few_shot_prompt(GSM_ENGLISH_QUESTIONS, GSM_ENGLISH_ANSWERS, GSM_HEBREW_ANSWERS, english_question, english_answer))
    return res.text


df = pd.read_csv("synth_fewshots_results.csv")
df = df[df.real_cot_english.notna()]

# translations = []
# for index, row in df.iterrows():
#     print(f"Translating row {index+1}/{len(df)}")
#     translations.append(translate_answer(row['question_english'], row['real_cot_english']))
df['translated_cot'] = df.apply(lambda row: translate_answer(row['question_english'], row['real_cot_english']), axis=1)
df

## LLM-as-a-Judge

In [None]:
import enum
from src.call_models import call_gemini, all_string_gemini_config, google_connect
from my_access_keys import google_access_key

google_client = google_connect(google_access_key)

class COT_VERSION(enum.Enum):
  TRANSLATED = "Translated"
  SYNTHESIZED = "Synthesized"


def judge_cot_version(trans: str, synth: str):
    content = f"""
    You are a judge evaluating two answers to a question. The answers are in Hebrew, and you need to assess them based on the following criteria:
    1. Correctness – Is the answer factually accurate? Make sure the following mathematical steps and calculations are correct.
    2. Logic – Does the reasoning follow a clear and valid path?
    3. Coherence – Is the answer internally consistent and well-structured?
    4. Didactic quality – How well does the answer explain the concept to a reader?
    5. Hebrew fluency – Is the answer written in fluent, natural Hebrew? Check for grammar, vocabulary, and overall readability.
    
    Here are the answers:
    
    <translated>
    {trans}
    </translated>
    
    <synthesized>
    {synth}
    </synthesized>
    
    Which answer is better overall?  
    """

    try:
        generate_content_config = all_string_gemini_config(['decision'], 'ALWAYS THINK IN HEBREW BEFORE ANSWERING!', think_bud=200, enum=COT_VERSION)
        response = call_gemini(google_client, content, generate_content_config)
        return response.text
    except Exception as E:
        return str(E)

In [None]:
df.to_csv('/Users/oribar-joseph/Downloads/cot.csv')