In [None]:
!pip install datasets
!pip install openpyxl
!pip install -q -U google-genai
# !pip install transformers
# !pip install accelerate
# !pip install peft
# !pip install bitsandbytes
# Imports

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import sys

# Call models
from src.call_models import bedrock_connect, call_claude_bedrock
from src.call_models import google_connect, call_gemini, all_string_gemini_config, all_int_gemini_config
from src.translate_func import claude_translation, gemini_translation

# Datasets
from prompts import mmlu_prompts
from src.benchmarks_code import mmlu
# Access keys
from my_access_keys import google_access_key, aws_access_key, aws_secret_key

# .csv utils
from src.save_utils import add_dataset_to_csv

# Remove annoying warning
from IPython.core.display_functions import display

In [2]:
# Create the API clients passed to the translation calls further down, using the
# credentials imported from my_access_keys.
bedrock_client = bedrock_connect(aws_access_key, aws_secret_key)
google_client = google_connect(google_access_key)

# Get Dataset

In [3]:
# Load the MMLU test split and build two pandas views of it:
#   df_mmlu_full - every row of the split
#   df_mmlu_part - every SAMPLE_STEP-th row, indexed by its position in the split
SAMPLE_STEP = 90

mmlu_dataset = mmlu.get_mmlu_datasets()
size = mmlu_dataset['mmlu_test'].num_rows
part_positions = np.arange(0, size, SAMPLE_STEP)

df_mmlu_full = pd.DataFrame(mmlu_dataset['mmlu_test'])
df_mmlu_part = pd.DataFrame(mmlu_dataset['mmlu_test'].select(part_positions))
# Keep the original split positions as the index so these rows can be matched
# against df_mmlu_full by index.
df_mmlu_part.index = part_positions

print(df_mmlu_full.shape, df_mmlu_part.shape)

mmlu_dataset

(14042, 4) (157, 4)


{'mmlu_test': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 14042
 })}

## How many samples are still needed from each subject?

In [4]:
# Load the labeled-samples CSV and show its (rows, columns) shape.
df_mmlu_labeled = pd.read_csv('labeled_files/mmlu_test_labeled_gradio.csv')
df_mmlu_labeled.shape

(177, 18)

In [5]:
# Start from a quota of 20 samples for every subject in the full split.
all_subjects = df_mmlu_full['subject'].unique()
need_from_each = pd.Series(20, index=all_subjects)

# Count, per subject, the sampled rows whose label is anything other than 'SKIP'.
# The mask is positional: labeled row i is paired with row i of df_mmlu_part.
labeled_head = df_mmlu_labeled.iloc[:df_mmlu_part.shape[0]]
not_skipped = (labeled_head['rating'] != 'SKIP').values
remove = df_mmlu_part[not_skipped]['subject'].value_counts()
remove = remove.reindex(need_from_each.index, fill_value=0)

# Remaining quota per subject, largest first.
need_from_each = (need_from_each - remove).sort_values(ascending=False)
display(need_from_each.head(3))
print()
display(need_from_each.tail(3))

electrical_engineering          20
econometrics                    20
high_school_computer_science    20
dtype: int64




miscellaneous       13
moral_scenarios     12
professional_law     6
dtype: int64

In [6]:
# First subject group. In the cells visible here it is only used to total the
# remaining per-subject quota; the rows selected for translation come from
# side_subjects.
use_subjects = [
    "professional_psychology",
    "high_school_psychology",
    "high_school_macroeconomics",
    "elementary_mathematics",
    "prehistory",
    "philosophy",
    "high_school_biology",
    "nutrition",
    "professional_accounting",
    "professional_medicine",
    "high_school_mathematics",
    "clinical_knowledge",
    "security_studies",
    "high_school_microeconomics",
    "high_school_world_history",
    "conceptual_physics",
    "marketing",
    "human_aging",
    "high_school_statistics",
    "high_school_chemistry",
    "sociology",
    "high_school_geography",
    "college_medicine",
    "world_religions",
    "virology",
    "high_school_european_history",
    "logical_fallacies",
    "astronomy",
    "high_school_physics",
    "electrical_engineering",
    "college_biology",
    "anatomy",
    "human_sexuality",
    "formal_logic",
    "international_law",
    "econometrics",
    "machine_learning",
    "public_relations",
    "management",
    "college_physics",
    "college_computer_science",
    "college_mathematics",
    "global_facts",
    "high_school_computer_science",
    "computer_security",
    "abstract_algebra",
    "business_ethics",
    "college_chemistry",
    "medical_genetics",
]

# Second subject group: the rows of these subjects are the ones selected from
# the full split for translation in this notebook.
side_subjects = [
    "professional_law",
    "moral_scenarios",
    "miscellaneous",
    "moral_disputes",
    "high_school_us_history",
    "high_school_government_and_politics",
    "jurisprudence",
    "us_foreign_policy",
]

In [7]:
# Total remaining quota per subject group, in the order (use_subjects, side_subjects).
tuple(
    need_from_each[need_from_each.index.isin(group)].sum()
    for group in (use_subjects, side_subjects)
)

(892, 124)

In [8]:
# Drop the rows that were already sampled into df_mmlu_part (matched by index).
# .copy() makes the result an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning.
already_sampled = df_mmlu_full.index.isin(df_mmlu_part.index)
df_mmlu_full = df_mmlu_full[~already_sampled].copy()
df_mmlu_full.shape

(13885, 4)

In [9]:
# Remaining quota restricted to the side subjects.
needed_use_only = need_from_each[need_from_each.index.isin(side_subjects)]
print(needed_use_only.shape)

# Extra rows taken per subject on top of the remaining quota.
# NOTE(review): presumably a buffer for samples that end up skipped - confirm.
EXTRA_PER_SUBJECT = 5

# Number every row within its subject (0, 1, 2, ...). assign() returns a new frame
# instead of writing a column into a frame that may be a filtered slice.
df_mmlu_full = df_mmlu_full.assign(subject_cnt=df_mmlu_full.groupby('subject').cumcount())

take_indices = df_mmlu_full[df_mmlu_full['subject'].isin(side_subjects)]
# Keep the first (quota + EXTRA_PER_SUBJECT) rows of each subject. Mapping the subject
# column through the quota Series replaces a row-by-row apply(axis=1) lookup.
row_limit = take_indices['subject'].map(needed_use_only) + EXTRA_PER_SUBJECT
take_indices = take_indices[take_indices['subject_cnt'] < row_limit]
print(take_indices.shape)

# NOTE: this overwrites mmlu_dataset['mmlu_test'] with the selected rows, so the cell is
# not idempotent - reload the dataset before running it a second time.
mmlu_dataset['mmlu_test'] = mmlu_dataset['mmlu_test'].select(take_indices.index)
mmlu_dataset

(8,)
(164, 5)


{'mmlu_test': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 164
 })}

In [10]:
# Number of selected rows per subject.
take_indices['subject'].value_counts()

subject
high_school_us_history                 25
high_school_government_and_politics    24
us_foreign_policy                      24
jurisprudence                          23
moral_disputes                         22
miscellaneous                          18
moral_scenarios                        17
professional_law                       11
Name: count, dtype: int64

In [11]:
# Compares the per-subject row counts of the selected split with the remaining quota.
# NOTE(review): the selection keeps up to quota + 5 rows per subject, which would explain
# why this evaluates to False - confirm whether an exact match is actually expected.
(pd.DataFrame(mmlu_dataset['mmlu_test'])['subject'].value_counts().sort_index() == needed_use_only.sort_index()).all()

False

## Create save file

In [12]:
# Show the current state of the dataset dict before the save files are created.
mmlu_dataset

{'mmlu_test': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 164
 })}

In [13]:
# Path of the CSV that collects the original samples and one column per translation run.
mmlu_file_name = 'compare_csv/mmlu/mmlu_test_prob_sub.csv'

In [14]:
# NOTE(review): self-assignment - stores the entry back under the same key, so the
# dict is unchanged. Candidate for removal.
mmlu_dataset['mmlu_test'] = mmlu_dataset['mmlu_test']

In [108]:
# Keep every position of the 164-row split except 24 and 61.
dropped_positions = (24, 61)
kept_positions = [pos for pos in range(164) if pos not in dropped_positions]
mmlu_dataset['mmlu_test'] = mmlu_dataset['mmlu_test'].select(kept_positions)
mmlu_dataset['mmlu_test']

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 162
})

In [109]:
# Register the untranslated samples under the name 'original' in both CSVs:
# the comparison file and its '-text' companion.
text_file_name = f"{mmlu_file_name[:-4]}-text.csv"
df = add_dataset_to_csv(mmlu_file_name, 'original', mmlu_dataset['mmlu_test'], mmlu.mmlu_sample_to_dict)
text_df = add_dataset_to_csv(text_file_name, 'original', mmlu_dataset['mmlu_test'], mmlu.mmlu_sample_to_dict)
for frame in (df, text_df):
    display(frame.head(2))

Unnamed: 0,original
0,<question>Which of the following best describe...
1,<question>Which of the following statements do...


Unnamed: 0,original
0,<question>Which of the following best describe...
1,<question>Which of the following statements do...


In [110]:
# Re-read both CSVs from disk (comparison file first, then its '-text' companion).
df, text_df = (
    pd.read_csv(csv_path)
    for csv_path in (mmlu_file_name, mmlu_file_name[:-4] + '-text.csv')
)
for frame in (df, text_df):
    display(frame.head(2))

Unnamed: 0,original
0,<question>Which of the following best describe...
1,<question>Which of the following statements do...


Unnamed: 0,original
0,<question>Which of the following best describe...
1,<question>Which of the following statements do...


In [111]:
# Both frames should have the same number of rows (one per sample).
df.shape, text_df.shape

((162, 1), (162, 1))

# Run Translation

## Claude

In [10]:
%%time

# exp_name = 'claude_3-7_v7_thinking'
# Experiment label; reused further down as the column name when results are saved.
exp_name = 'claude_thinking_v2'

# Translate the splits in mmlu_dataset with Claude through the Bedrock client.
# Both return values are indexed by split name ('mmlu_test') in the cells that follow:
# hebrew_datasets holds the translated samples, text_output the model's text per sample.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    mmlu_dataset,
    # small,
    mmlu_prompts.MMLU_INSTRUCT_CLAUDE_V2,
    mmlu_prompts.MMLU_FEW_SHOTS,
    mmlu_prompts.MMLU_FORMAT,
    mmlu.mmlu_sample_to_dict,
    mmlu.mmlu_dict_to_sample,
    if_four=False,
)

Translating mmlu_test...


  0%|          | 0/157 [00:00<?, ?it/s]

CPU times: user 1.11 s, sys: 161 ms, total: 1.27 s
Wall time: 41min 55s


In [11]:
def len_mat(text):
    """Count the ``<tag>...</tag>`` segments found in ``text``.

    A segment is an opening tag, the shortest possible content (newlines
    included) and a closing tag with exactly the same name. An opening tag
    whose name starts with the word ``response_format`` never starts a
    segment, although tags nested inside it are still found. Matches do not
    overlap, so tags nested inside a counted segment are not counted again.

    Args:
        text: The string to scan.

    Returns:
        The number of non-overlapping segments (0 for an empty string).
    """
    # Imported locally so the function does not depend on an `import re`
    # executed in a different notebook cell.
    import re

    pattern = r"<(?!response_format\b)([^>]+)>(.*?)</\1>"
    # Count lazily instead of materialising the list of captured groups.
    return sum(1 for _ in re.finditer(pattern, text, re.DOTALL))

In [12]:
import re
# Distribution of the number of tagged segments per model output.
pd.Series(list(map(len_mat, text_output['mmlu_test']))).value_counts()

6     129
5      24
10      2
15      1
11      1
Name: count, dtype: int64

In [20]:
# Same distribution, counting only what follows the last 'Text:' marker in each
# output (the whole output when the marker is absent).
pd.Series(
    [len_mat(raw.rsplit('Text:', 1)[-1]) for raw in text_output['mmlu_test']]
).value_counts()

6    132
5     25
Name: count, dtype: int64

In [21]:
# Inspect the model output stored at position 3.
print(text_output['mmlu_test'][3])

Thinking:
I need to translate this astronomy multiple-choice question about Jupiter's magnetic field from English to Hebrew. Let's break down the key terms and translate them accurately:

1. "Jupiter's magnetic field" - "השדה המגנטי של צדק"
2. "20000 times stronger" - "חזק פי 20000"
3. "Earth's magnetic field" - "השדה המגנטי של כדור הארץ"
4. "traps charged particles" - "לוכד חלקיקים טעונים"
5. "Io's volcanoes" - "הרי הגעש של איו" (Io is one of Jupiter's moons)
6. "plasma torus" - "טורוס פלזמה" (scientific term that would likely be used as-is in Hebrew with Hebrew pronunciation)
7. "auroral displays" - "מופעי זוהר" or "תצוגות זוהר קוטבי"
8. "north and south poles" - "הקטבים הצפוני והדרומי"
9. "switches polarity" - "מחליף קוטביות"
10. "every 11 years" - "כל 11 שנים"

This is a universal subject (astronomy/planetary science), so I will perform a direct translation without needing to adapt to an Israeli context.

Now for the full translation:

Question: "מה אינו נכון לגבי השדה המגנטי של צד

In [14]:
# Add the translated split to the comparison CSV under the experiment name, then preview it.
df = add_dataset_to_csv(mmlu_file_name, exp_name, hebrew_datasets['mmlu_test'], mmlu.mmlu_sample_to_dict)
df.head(2)

Unnamed: 0,original,claude_thinking_v2
0,<question>Find the degree for the given field ...,<question>מצאו את המעלה עבור הרחבת השדה הנתונה...
1,<question>Statement 1 | If a finite group has ...,<question>היגד 1 | אם חבורה סופית היא מסדר n א...


In [15]:
# Store the model outputs in a '<exp_name> text' column and write the '-text' CSV
# without the index column.
text_df[exp_name + ' text'] = text_output['mmlu_test']
text_df.to_csv(mmlu_file_name[:-4] + '-text.csv', index=False)
text_df.head(2)

Unnamed: 0,original,claude_thinking_v2 text
0,<question>Find the degree for the given field ...,Thinking:\nI need to translate the given Engli...
1,<question>Statement 1 | If a finite group has ...,Thinking:\nI need to translate the given abstr...


## Gemini

In [32]:
# jumped over 24? and 61?
# Subset holding the rows from position 62 to the end of the current mmlu_test split.
# NOTE(review): the recorded output shows 102 rows, which implies the split had 164 rows
# when this ran, i.e. before positions 24 and 61 were dropped - confirm the cell order.
small = {}
small['mmlu_test'] = mmlu_dataset['mmlu_test'].select(list(range(62, mmlu_dataset['mmlu_test'].num_rows)))
small

{'mmlu_test': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 102
 })}

In [33]:
# Experiment label; reused further down as the column name when results are saved.
exp_name = 'gemini'

# Translate the `small` subset with Gemini; the full mmlu_dataset is left commented out.
# As with the Claude run, both return values are indexed by split name afterwards.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    # mmlu_dataset,
    small,
    mmlu_prompts.MMLU_INSTRUCT_V1_GEMINI,
    mmlu_prompts.MMLU_FEW_SHOTS,
    mmlu.mmlu_sample_to_dict,
    mmlu.mmlu_dict_to_sample,
    if_pro=True,
    think_bud=4_096,
)

Translating mmlu_test...


  0%|          | 0/102 [00:00<?, ?it/s]

-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|

In [34]:
# Inspect the translated splits returned by the Gemini run.
hebrew_datasets

{'mmlu_test': Dataset({
     features: ['question', 'subject', 'choices', 'answer', 'translation_status'],
     num_rows: 102
 })}

In [35]:
import pickle
from datasets import Dataset, concatenate_datasets

# Checkpoint files written by earlier partial Gemini runs, in sample order.
# Each stem has a '<stem>.pkl' (samples) and a '<stem>_text.pkl' (model text) file.
checkpoint_stems = [
    'gemini_cp/ck1 - gemini_mmlu_test_15',
    'gemini_cp/ck2 - gemini_mmlu_test_45',
]

# SECURITY: pickle.load can execute arbitrary code - only load checkpoint files
# that were produced locally by this project.
lst_1, lst_1_text = [], []
for stem in checkpoint_stems:
    with open(stem + '.pkl', 'rb') as f:
        lst_1 += pickle.load(f)
    with open(stem + '_text.pkl', 'rb') as f:
        lst_1_text += pickle.load(f)

# Every checkpointed sample must have exactly one matching text entry.
assert len(lst_1) == len(lst_1_text), "sample/text checkpoint lengths differ"

len(lst_1), len(lst_1_text)

(60, 60)

In [36]:
# NOTE: lst_1 is rebound here from a list of samples to a Dataset.
lst_1 = Dataset.from_list(lst_1)

# Put the checkpointed samples (and their texts) in front of the results of the latest
# run, and store the combined result under the 'mmlu_test_fixed' key.
hebrew_datasets['mmlu_test_fixed'] = concatenate_datasets([lst_1, hebrew_datasets['mmlu_test']])
text_output['mmlu_test_fixed'] = lst_1_text + text_output['mmlu_test']

In [112]:
# Inspect the combined split (checkpoints + latest run).
hebrew_datasets['mmlu_test_fixed']

Dataset({
    features: ['question', 'subject', 'choices', 'answer', 'translation_status'],
    num_rows: 162
})

In [113]:
# Add the combined Gemini translations to the comparison CSV under the experiment
# name, and store the matching model texts in the '-text' CSV.
df = add_dataset_to_csv(mmlu_file_name, exp_name, hebrew_datasets['mmlu_test_fixed'], mmlu.mmlu_sample_to_dict)
text_column = f"{exp_name} text"
text_df[text_column] = text_output['mmlu_test_fixed']
text_df.to_csv(f"{mmlu_file_name[:-4]}-text.csv", index=False)

# Preview the first and last rows of the comparison file and the first rows of the text file.
for preview in (df.head(2), df.tail(2), text_df.head(2)):
    display(preview)

Unnamed: 0,original,gemini
0,<question>Which of the following best describe...,<question>איזו מהאפשרויות הבאות מתארת בצורה הט...
1,<question>Which of the following statements do...,<question>איזו מהטענות הבאות אינה מתארת נכונה ...


Unnamed: 0,original,gemini
160,<question>The dominant course for foreign poli...,"<question>התפיסה האסטרטגית, שהייתה בעלת השפעה ..."
161,<question>What led Britain to impose new taxes...,<question>מה הייתה סיבה מרכזית להטלת מיסים על ...


Unnamed: 0,original,gemini text
0,<question>Which of the following best describe...,**Analysis of a Translation Task: US Constitut...
1,<question>Which of the following statements do...,"**Okay, let's break down this task, step by st..."


In [114]:
# Attach the answer label and subject of every sample, then rewrite the comparison CSV.
fixed_split = hebrew_datasets['mmlu_test_fixed']
answer_labels = list(fixed_split['answer'])
subject_names = list(fixed_split['subject'])

# Plain lists are assigned by position and pandas raises if the lengths differ.
# Wrapping them in pd.Series would align on df's index labels instead and could
# silently insert NaN for labels that do not match.
df['answer_label'] = answer_labels
df['subject'] = subject_names
# Positional check that the stored column equals the dataset's answers.
print(df['answer_label'].tolist() == answer_labels)
df.to_csv(mmlu_file_name, index=False)

True


In [115]:
# Preview the first rows of the saved comparison frame.
df.head()

Unnamed: 0,original,gemini,answer_label,subject
0,<question>Which of the following best describe...,<question>איזו מהאפשרויות הבאות מתארת בצורה הט...,ד,high_school_government_and_politics
1,<question>Which of the following statements do...,<question>איזו מהטענות הבאות אינה מתארת נכונה ...,ב,high_school_government_and_politics
2,<question>Which of the following plays the mos...,<question>לאיזה מהבאים יש את התפקיד המשמעותי ב...,ב,high_school_government_and_politics
3,<question>What power was granted to the states...,"<question>איזו מהסמכויות הבאות, שהוחזקה על ידי...",א,high_school_government_and_politics
4,<question>The primary function of political ac...,<question>תפקידן העיקרי של ועדות פעולה פוליטיו...,א,high_school_government_and_politics


In [116]:
# Preview the last rows of the saved comparison frame.
df.tail()

Unnamed: 0,original,gemini,answer_label,subject
157,<question>Why do Liberal Internationalists arg...,<question>מדוע ליברלים אינטרנציונליסטים טוענים...,ב,us_foreign_policy
158,<question>What was the significance of the Tru...,<question>מה הייתה משמעותה של דוקטרינת טרומן?<...,ד,us_foreign_policy
159,<question>What tend to be the effects of oil a...,<question>מהן ההשפעות של סחר בנפט ובמשאבי טבע ...,ג,us_foreign_policy
160,<question>The dominant course for foreign poli...,"<question>התפיסה האסטרטגית, שהייתה בעלת השפעה ...",ג,us_foreign_policy
161,<question>What led Britain to impose new taxes...,<question>מה הייתה סיבה מרכזית להטלת מיסים על ...,ב,us_foreign_policy
