In [1]:
!pip install datasets
!pip install openpyxl
!pip install -q -U google-genai
# !pip install transformers
# !pip install accelerate
# !pip install peft
# !pip install bitsandbytes

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=

# Imports

In [5]:
%load_ext autoreload
%autoreload 2

import re
import sys

import pandas as pd

# Call models
from src.call_models import bedrock_connect, call_claude_bedrock
from src.call_models import google_connect, call_gemini, all_string_gemini_config, all_int_gemini_config
from src.translate_func import claude_translation, gemini_translation

# Datasets
from src.benchmarks_code import arc_ai
from src.benchmarks_code import gsm8k
from prompts import arc_prompts
from prompts import gsm_prompts
#from prompts import hellaswag_prompts

# Access keys
from my_access_keys import google_access_key, aws_access_key, aws_secret_key

# .csv utils
from src.save_utils import add_dataset_to_csv

# Remove annoying warning
from IPython.core.display_functions import display

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
from my_access_keys import google_project_id

In [15]:
# Build the two API client objects from the credentials imported from my_access_keys.
# bedrock_client is passed to claude_translation and google_client to
# call_gemini / gemini_translation in the cells below.
bedrock_client = bedrock_connect(aws_access_key, aws_secret_key)
google_client = google_connect(google_access_key)

# Check Model Calls

In [16]:
# Smoke test for the Gemini client: request a structured answer with two fields,
# print the raw text, then print the keys of the parsed response.
print('Gemini:')
smoke_fields = ['recipe', 'ingredients']
smoke_prompt = "List a popular cookie recipe, and include the amounts of ingredients."
generate_content_config = all_string_gemini_config(
    smoke_fields,
    'ALWAYS THINK BEFORE ANSWERING!',
    think_bud=200,
)
response = call_gemini(google_client, smoke_prompt, generate_content_config)
# Raw text first, then a separator line, exactly as two consecutive prints would emit.
print(response.text, '---', sep='\n')
my_recipes = response.parsed
print(my_recipes.keys())

Gemini:
{"recipe": "Classic Chocolate Chip Cookies", "ingredients": "1 cup (2 sticks) unsalted butter, softened; 3/4 cup granulated sugar; 3/4 cup packed light brown sugar; 2 large eggs; 1 teaspoon vanilla extract; 2 1/4 cups all-purpose flour; 1 teaspoon baking soda; 1/2 teaspoon salt; 12 ounces chocolate chips (semi-sweet or milk chocolate)"}
---
dict_keys(['recipe', 'ingredients'])


# ARC_AI2

## Get Dataset

In [29]:
# Load the ARC splits, then replace the challenge-train split with a window of it:
# skip the first 5 rows and keep the next 90.
# NOTE(review): the CSV name below says "top_100" while only 90 rows are kept here —
# confirm which of the two is intended.
arc_dataset = arc_ai.get_arc_ai2_datasets()
arc_train_window = arc_dataset['arc_challenge_train'].skip(5).take(90)
arc_dataset['arc_challenge_train'] = arc_train_window
file_name = 'compare_csv/arc_ai2_chall_train_top_100_for_comparison.csv'

print(file_name)
arc_dataset

compare_csv/arc_ai2_chall_train_top_100_for_comparison.csv


{'arc_challenge_train': Dataset({
     features: ['id', 'question', 'choices', 'answerKey'],
     num_rows: 90
 })}

In [36]:
# Reload the two ARC comparison tables from disk: the main CSV and its "-text"
# companion (same path with the 4-character ".csv" suffix swapped for "-text.csv").
text_file_name = file_name[:-4] + '-text.csv'
df = pd.read_csv(file_name)
text_df = pd.read_csv(text_file_name)
# Show the first two rows of each table.
display(df.head(2), text_df.head(2))

Unnamed: 0,original,claude_v2_refine
0,<question>Which land form is the result of the...,<question>איזו צורת נוף היא תוצאה של הכוח הבונ...
1,<question>Which statement best compares single...,<question>איזה משפט משווה בצורה הטובה ביותר בי...


Unnamed: 0,original,claude_v2_refine text
0,<question>Which land form is the result of the...,First translation:\n<question>איזו צורת קרקע ה...
1,<question>Which statement best compares single...,First translation:\n<question>איזו אמירה משווה...


## Run Translation

### Claude

In [12]:
# Experiment label: used in the following cell as the column name for this run
# in both comparison CSVs.
exp_name = 'claude_v2_refine'

# Run claude_translation (src.translate_func) over arc_dataset. The arguments are
# passed positionally; judging by the names passed they are: Bedrock client, datasets,
# instruction prompt, few-shot examples, output-format prompt, and the ARC
# sample->dict / dict->sample converters — confirm the order against the helper.
# Both returned objects are indexed by split name in the cell that follows.
# NOTE(review): the saved progress bar below reads 0/100 while the dataset cell now
# keeps 90 rows — this output looks stale; re-run on a fresh kernel to confirm.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    arc_dataset,
    arc_prompts.ARC_INSTRUCT_V2_CLAUDE_REFINE,
    arc_prompts.ARC_FEW_SHOTS,
    arc_prompts.ARC_FORMAT,
    arc_ai.arc_sample_to_dict,
    arc_ai.arc_dict_to_sample,
)

Translating arc_challenge_train...


  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
# Record this experiment's ARC results: the translated samples go through
# add_dataset_to_csv under the experiment name, and the raw model outputs are
# added to the "-text" table, which is then written back to disk.
arc_split = 'arc_challenge_train'
text_csv_path = file_name[:-4] + '-text.csv'

df = add_dataset_to_csv(
    file_name,
    exp_name,
    hebrew_datasets[arc_split],
    arc_ai.arc_sample_to_dict,
)
text_df[f'{exp_name} text'] = text_output[arc_split]
text_df.to_csv(text_csv_path, index=False)
display(df.head(2), text_df.head(2))

Unnamed: 0,original,claude_v2_refine
0,<question>Which land form is the result of the...,<question>איזו צורת נוף היא תוצאה של הכוח הבונ...
1,<question>Which statement best compares single...,<question>איזה משפט משווה בצורה הטובה ביותר בי...


Unnamed: 0,original,claude_v2_refine text
0,<question>Which land form is the result of the...,First translation:\n<question>איזו צורת קרקע ה...
1,<question>Which statement best compares single...,First translation:\n<question>איזו אמירה משווה...


### Gemini

In [44]:
# Experiment label: used in a later cell as the column name for this run
# in both comparison CSVs.
exp_name = 'gemini_pro_think_v2'

# Run gemini_translation (src.translate_func) over arc_dataset with the Google client,
# the Gemini V2 instruction prompt, the ARC few-shot examples and the ARC converters.
# NOTE(review): if_pro presumably selects the Pro model and think_bud presumably sets a
# thinking budget — confirm both in src.translate_func / src.call_models.
# The commented-out lines are alternative arguments kept from earlier runs.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    arc_dataset,
    # small,
    arc_prompts.ARC_INSTRUCT_V2_GEMINI,
    arc_prompts.ARC_FEW_SHOTS,
    arc_ai.arc_sample_to_dict,
    arc_ai.arc_dict_to_sample,
    if_pro=True,
    # if_pro=False,
    think_bud=20_000
    # think_bud=2_000
)

Translating arc_challenge_train...


  0%|          | 0/90 [00:00<?, ?it/s]

Sleeping in the While.... Done!                   

In [45]:
# Inspect the splits returned by the most recent translation run (features and row counts).
hebrew_datasets

{'arc_challenge_train': Dataset({
     features: ['id', 'question', 'choices', 'answerKey', 'translation_status'],
     num_rows: 90
 })}

In [46]:
# Record this experiment's ARC results: the translated samples go through
# add_dataset_to_csv under the experiment name, and the raw model outputs are
# added to the "-text" table, which is then written back to disk.
arc_split = 'arc_challenge_train'
text_csv_path = file_name[:-4] + '-text.csv'

df = add_dataset_to_csv(
    file_name,
    exp_name,
    hebrew_datasets[arc_split],
    arc_ai.arc_sample_to_dict,
)
text_df[f'{exp_name} text'] = text_output[arc_split]
text_df.to_csv(text_csv_path, index=False)
display(df.head(2), text_df.head(2))

Unnamed: 0,original,claude_v2_refine,gemini_pro_think_v2
0,<question>Which land form is the result of the...,<question>איזו צורת נוף היא תוצאה של הכוח הבונ...,<question>איזו תצורת נוף היא תוצאה של הכוח הבו...
1,<question>Which statement best compares single...,<question>איזה משפט משווה בצורה הטובה ביותר בי...,<question>איזה משפט משווה בצורה הטובה ביותר בי...


Unnamed: 0,original,claude_v2_refine text,gemini_pro_think_v2 text
0,<question>Which land form is the result of the...,First translation:\n<question>איזו צורת קרקע ה...,**Hebrew Translation of a Glacial Geology Ques...
1,<question>Which statement best compares single...,First translation:\n<question>איזו אמירה משווה...,**Thought Process: Translating a Science Quest...


### Multi-options Translation - Gemini

### Claude vs Gemini (using Gemini as judge)

# GSM8K

## Get Dataset

In [20]:
# Load the GSM8K splits and keep only the first 200 rows of the test split;
# results for this subset are tracked in the CSV named below.
gsm_dataset = gsm8k.get_gsm8k_datasets()
gsm_test_head = gsm_dataset['gsm8k_test'].take(200)
gsm_dataset['gsm8k_test'] = gsm_test_head
gsm_file_name = 'compare_csv/gsm8k_main_test_top_200.csv'

print(gsm_file_name)
gsm_dataset

compare_csv/gsm8k_main_test_top_200.csv


{'gsm8k_test': Dataset({
     features: ['question', 'answer'],
     num_rows: 200
 })}

In [22]:
# Start both GSM8K comparison tables (main and "-text") with an 'original' column
# built from the untranslated test split.
# NOTE(review): add_dataset_to_csv presumably also writes the CSV at the given path —
# confirm in src.save_utils.
gsm_text_file_name = gsm_file_name[:-4] + '-text.csv'
gsm_test_split = gsm_dataset['gsm8k_test']

df = add_dataset_to_csv(gsm_file_name, 'original', gsm_test_split, gsm8k.gsm8k_sample_to_dict)
text_df = add_dataset_to_csv(gsm_text_file_name, 'original', gsm_test_split, gsm8k.gsm8k_sample_to_dict)
display(df.head(2), text_df.head(2))

Unnamed: 0,original
0,<question>Janet’s ducks lay 16 eggs per day. S...
1,<question>A robe takes 2 bolts of blue fiber a...


Unnamed: 0,original
0,<question>Janet’s ducks lay 16 eggs per day. S...
1,<question>A robe takes 2 bolts of blue fiber a...


## Run Translation

### Claude

In [56]:
# Experiment label: used in later cells as the column name for this run
# in both GSM8K comparison CSVs.
exp_name = 'claude_v2_refine'

# Run claude_translation (src.translate_func) over gsm_dataset with the Bedrock client,
# the V2 "refine" instruction prompt, the GSM few-shot examples, the refine output
# format and the GSM8K converters (argument meaning inferred from the names passed —
# confirm against the helper). The commented-out lines are alternatives from earlier runs.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    # small,
    gsm_dataset,
    gsm_prompts.GSM_INSTRUCT_CLAUDE_REFINE_V2,
    gsm_prompts.GSM_FEW_SHOTS,
    # gsm_prompts.GSM_FORMAT,
    gsm_prompts.GSM_FORMAT_REFINE,
    gsm8k.gsm8k_sample_to_dict,
    gsm8k.gsm8k_dict_to_sample,
)

Translating gsm8k_test...


  0%|          | 0/200 [00:00<?, ?it/s]

In [68]:
def len_mat(text):
    """Count the ``<tag>...</tag>`` pairs found in ``text``.

    A pair is an opening ``<name>`` followed later by a closing ``</name>`` with
    exactly the same name; the enclosed content may span several lines. Opening
    tags that begin with the whole word ``response_format`` are skipped. Matches
    do not overlap, so a pair nested inside an already-counted pair is not
    counted again.

    Args:
        text: String to scan.

    Returns:
        Number of tag pairs found (0 when there are none).
    """
    tag_pair = re.compile(r"<(?!response_format\b)([^>]+)>(.*?)</\1>", re.DOTALL)
    return sum(1 for _ in tag_pair.finditer(text))

In [69]:
# Sanity check on the raw GSM8K outputs: count the tag pairs in every response and
# show how often each count occurs. A single row in the result means every response
# contains the same number of tag pairs.
# (`re`, needed by len_mat, is imported in the notebook's import cell rather than here.)
pd.Series([len_mat(i) for i in text_output['gsm8k_test']]).value_counts()

5    200
Name: count, dtype: int64

In [70]:
# Add this experiment's translated GSM8K test split to the main comparison table
# and preview the first two rows.
gsm_split = 'gsm8k_test'
df = add_dataset_to_csv(
    gsm_file_name,
    exp_name,
    hebrew_datasets[gsm_split],
    gsm8k.gsm8k_sample_to_dict,
)
df.head(2)

Unnamed: 0,original,claude_v2_refine
0,<question>Janet’s ducks lay 16 eggs per day. S...,<question>הברווזים של יעל מטילים 16 ביצים ביום...
1,<question>A robe takes 2 bolts of blue fiber a...,<question>לתפירת חלוק נדרשים 2 גלילי אריג כחול...


In [71]:
# Store the raw model outputs for this experiment in the "-text" table, write the
# table back to disk (without the index), and preview the first two rows.
text_column = f'{exp_name} text'
text_csv_path = gsm_file_name[:-4] + '-text.csv'

text_df[text_column] = text_output['gsm8k_test']
text_df.to_csv(text_csv_path, index=False)
text_df.head(2)

Unnamed: 0,original,claude_v2_refine text
0,<question>Janet’s ducks lay 16 eggs per day. S...,First translation attempt:\n<question>התרנגולו...
1,<question>A robe takes 2 bolts of blue fiber a...,First translation attempt:\n<question>גלימה דו...


In [73]:
# Print the first few raw Claude outputs, each followed by a dashed separator, for
# manual inspection. Slicing with .iloc[:N] (instead of indexing .iloc[i] for a fixed
# range) keeps the cell from raising IndexError when the table has fewer than N rows.
PREVIEW_ROWS = 10
for raw_output in text_df['claude_v2_refine text'].iloc[:PREVIEW_ROWS]:
    print(raw_output)
    print('\n - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n')

First translation attempt:
<question>התרנגולות של ינט מטילות 16 ביצים ביום. היא אוכלת שלוש לארוחת בוקר כל בוקר ואופה מאפינס לחבריה כל יום עם ארבע. היא מוכרת את השאר בשוק האיכרים מדי יום ב-2$ לביצת ברווז טרייה. כמה דולרים היא מרוויחה כל יום בשוק האיכרים?</question>
<answer>ינט מוכרת 16 - 3 - 4 = <<16-3-4=9>>9 ביצי ברווז ביום.
היא מרוויחה 9 * 2 = $<<9*2=18>>18 כל יום בשוק האיכרים.
#### 18</answer>

<explain>
The translation needs several improvements:
1. "Janet" should be changed to a more common Israeli name like "יעל"
2. The original mentions "ducks" but I mistakenly translated to "תרנגולות" (chickens) in the first sentence
3. Need to change dollars ($) to shekels (₪)
4. "Farmers' market" should be translated to the more common Israeli term "שוק איכרים" 
5. Need to ensure consistency with "duck eggs" throughout the translation
6. The formatting and structure should be preserved but with Hebrew right-to-left orientation
</explain>

Improved translation:
<question>הברווזים של יעל מטילים 

### Gemini

In [None]:
# Experiment label: used in later cells as the column name for this run
# in both GSM8K comparison CSVs.
exp_name = 'gemini_pro_think_v2'

# Run gemini_translation (src.translate_func) over gsm_dataset with the Google client,
# the Gemini V2 instruction prompt, the GSM few-shot examples and the GSM8K converters.
# NOTE(review): if_pro presumably selects the Pro model and think_bud presumably sets a
# thinking budget — confirm both in src.translate_func / src.call_models.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    # small,
    gsm_dataset,
    gsm_prompts.GSM_INSTRUCT_GEMINI_V2,
    gsm_prompts.GSM_FEW_SHOTS,
    gsm8k.gsm8k_sample_to_dict,
    gsm8k.gsm8k_dict_to_sample,
    if_pro=True,
    think_bud=4_000,
)

Translating gsm8k_test...


  0%|          | 0/200 [00:00<?, ?it/s]

In [94]:
# Add this experiment's translated GSM8K test split to the main comparison table
# and preview the first two rows.
gsm_split = 'gsm8k_test'
df = add_dataset_to_csv(
    gsm_file_name,
    exp_name,
    hebrew_datasets[gsm_split],
    gsm8k.gsm8k_sample_to_dict,
)
df.head(2)

Unnamed: 0,original,claude_v2_refine,gemini_pro_think_v2
0,<question>Janet’s ducks lay 16 eggs per day. S...,<question>הברווזים של יעל מטילים 16 ביצים ביום...,<question>הברווזים של יעל מטילים 16 ביצים ביום...
1,<question>A robe takes 2 bolts of blue fiber a...,<question>לתפירת חלוק נדרשים 2 גלילי אריג כחול...,<question>להכנת חלוק צריך 2 גלילי בד כחול וחצי...


In [96]:
# Store the raw model outputs for this experiment in the "-text" table, write the
# table back to disk (without the index), and preview the first two rows.
text_column = f'{exp_name} text'
text_csv_path = gsm_file_name[:-4] + '-text.csv'

text_df[text_column] = text_output['gsm8k_test']
text_df.to_csv(text_csv_path, index=False)
text_df.head(2)

Unnamed: 0,original,claude_v2_refine text,gemini_pro_think_v2 text
0,<question>Janet’s ducks lay 16 eggs per day. S...,First translation attempt:\n<question>התרנגולו...,**My Thought Process: Translating a Math Probl...
1,<question>A robe takes 2 bolts of blue fiber a...,First translation attempt:\n<question>גלימה דו...,"**Problem Solving in Translation**\n\nOkay, so..."


In [91]:
# Completion marker: printed once execution reaches the end of the notebook.
print('Done!')

Done!
