# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import sys
import re
import json

from src.benchmarks_code import gsm8k
from src.benchmarks_code import arc_ai
from src.benchmarks_code import mmlu
from src.benchmarks_code import hellaswag

from src.parse_labeling import parse_from_gradio, parse_single_file_gradio, map_rating_to_model
from src.translate_func import dict_to_prompt

In [2]:
def query_to_sample(text):
    """Parse ``<tag>value</tag>`` pairs found in ``text`` into a dict.

    The closing tag must repeat the opening tag's name exactly. Opening tags
    that begin with the word ``response_format`` are ignored. Matches are
    non-overlapping, so tags nested inside an already matched element stay
    part of that element's value.

    Args:
        text: String containing zero or more tagged fields.

    Returns:
        dict mapping each tag name to its whitespace-stripped content. When a
        tag name occurs more than once, the last occurrence wins.
    """
    # DOTALL lets a field's value span several lines.
    tag_pair = re.compile(r"<(?!response_format\b)([^>]+)>(.*?)</\1>", re.DOTALL)
    sample = {}
    for found in tag_pair.finditer(text):
        sample[found.group(1)] = found.group(2).strip()
    return sample

In [3]:
def get_gold(or_df, return_original_sample=False):
    """Pick the final ("gold") translation for every non-skipped row.

    Selection order per row:
      1. ``'rating model' == 'SKIP'``: the row is dropped entirely.
      2. A non-empty ``'gold'`` value is used, with any ``'Option 1:\\n'`` /
         ``'Option 2:\\n'`` prefix text removed.
      3. ``'rating model' == 'BOTH'``: the option produced by the model whose
         name contains ``'gemini'`` is preferred (``'option 1'`` if
         ``'model 1'`` contains it, otherwise ``'option 2'``).
      4. Otherwise the option of the model named in ``'rating model'``.

    Args:
        or_df: DataFrame with the columns 'rating model', 'gold', 'option 1',
            'option 2', 'model 1', 'model 2' (and 'original' when
            ``return_original_sample`` is True).
        return_original_sample: When True, also return the 'original' text of
            every kept row, in the same order as the parsed samples.

    Returns:
        A list of dicts produced by ``query_to_sample``; or the tuple
        ``(samples, original_texts)`` when ``return_original_sample`` is True.

    Raises:
        ValueError: If a row's 'rating model' is none of 'SKIP', 'BOTH',
            the row's 'model 1' or the row's 'model 2' (and 'gold' is empty).
    """
    gold_texts = []
    original_texts = []

    for idx, row in or_df.iterrows():
        rating_model = row['rating model']
        if rating_model == 'SKIP':
            continue

        if row['gold'] != '':
            chosen = row['gold'].replace('Option 1:\n', '').replace('Option 2:\n', '')
        elif rating_model == 'BOTH':
            chosen = row['option 1'] if 'gemini' in row['model 1'] else row['option 2']
        elif rating_model == row['model 1']:
            chosen = row['option 1']
        elif rating_model == row['model 2']:
            chosen = row['option 2']
        else:
            # An explicit exception (instead of `assert False`) still fires
            # under `python -O`, so the two output lists can never silently
            # go out of step.
            raise ValueError(
                f"Row {idx}: 'rating model' is {rating_model!r}, expected 'SKIP', 'BOTH', "
                f"{row['model 1']!r} or {row['model 2']!r}"
            )

        gold_texts.append(chosen)
        if return_original_sample:
            # Keep the original text for every kept (non-SKIP) row.
            original_texts.append(row['original'])

    samples = [query_to_sample(s) for s in gold_texts]

    if return_original_sample:
        return samples, original_texts
    return samples

# ARC-AI2

In [4]:
# Load the English ARC datasets through the project helper; the displayed
# value lists the splits that are available for the index mapping below.
arc_en = arc_ai.get_arc_ai2_datasets()
arc_en

README.md: 0.00B [00:00, ?B/s]

ARC-Challenge/train-00000-of-00001.parqu(…):   0%|          | 0.00/190k [00:00<?, ?B/s]

ARC-Challenge/test-00000-of-00001.parque(…):   0%|          | 0.00/204k [00:00<?, ?B/s]

ARC-Challenge/validation-00000-of-00001.(…):   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1119 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1172 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/299 [00:00<?, ? examples/s]

{'arc_challenge_train': Dataset({
     features: ['id', 'question', 'choices', 'answerKey'],
     num_rows: 1119
 }),
 'arc_challenge_test': Dataset({
     features: ['id', 'question', 'choices', 'answerKey'],
     num_rows: 1172
 })}

In [5]:
# Parse the labeled ARC train file together with its comparison CSV.
# NOTE(review): the exact schema of the two returned frames is defined in
# src.parse_labeling.parse_from_gradio — confirm there.
arc_label_train, arc_or_train = parse_from_gradio(
    'labeled_files/arc_ai_labeled_gradio.csv',
    'manual_compare/arc_ai2_train_top_200_FULL.csv',
)

# Both frames are expected to have the same number of rows.
print(arc_label_train.shape, arc_or_train.shape)

(195, 18) (195, 7)


In [6]:
arc_label_train['rating'].value_counts()

rating
2       82
1       62
BOTH    42
SKIP     9
Name: count, dtype: int64

In [7]:
# Sanity check: every row of the comparison frame must carry the same English
# source text as the matching row of the labeled frame. The check is enforced
# so that "Run All" cannot continue on misaligned data; the flag is still
# displayed (expected: False).
arc_train_text_mismatch = (arc_or_train['original'] != arc_label_train['text_column']).any()
assert not arc_train_text_mismatch, "arc_or_train['original'] and arc_label_train['text_column'] differ"
arc_train_text_mismatch

False

In [8]:
# Copy the annotators' corrected text into the comparison frame. Assigning a
# Series aligns on the index, so this relies on both frames sharing one index.
arc_or_train['gold'] = arc_label_train['gold']

In [15]:
arc_label_test_1 = pd.read_csv('labeled_files/arc_ai_TEST_labeled_gradio.csv')
arc_label_test_2 = pd.read_csv('labeled_files/arc_ai_TEST_2_labeled_gradio.csv')

# Check that the last 51 rows of the first file are the first 51 rows of the
# second file (compared by source text, positionally via `.values`).
(arc_label_test_1.tail(51)['text_column'] == arc_label_test_2.head(51)['text_column'].values).all()

True

In [16]:
arc_label_test_1 = pd.read_csv('labeled_files/arc_ai_TEST_labeled_gradio.csv')
# The first 51 rows of the second file repeat the tail of the first file.
arc_label_test_2 = pd.read_csv('labeled_files/arc_ai_TEST_2_labeled_gradio.csv').iloc[51:]

arc_label_test = pd.concat([arc_label_test_1, arc_label_test_2], ignore_index=True)

arc_label_test = arc_label_test.fillna('')
# Ratings read as floats become '1.0' / '2.0' after str(); normalise them.
arc_label_test['rating'] = arc_label_test['rating'].apply(str).replace('1.0', '1').replace('2.0', '2')

print(f"Throw away {(arc_label_test['rating'] == 'SKIP').sum()} 'SKIP' samples")
# .copy() makes the filtered frame independent, so the .loc assignment below
# writes to a real frame instead of a slice (no SettingWithCopyWarning).
arc_label_test = arc_label_test[arc_label_test['rating'] != 'SKIP'].copy()

# Snapshot taken before back-filling 'gold'; later cells compare against it.
old = arc_label_test.copy()
# Rows without a manual correction take the text from 'new_text_column'.
missing_gold = arc_label_test['gold'] == ''
arc_label_test.loc[missing_gold, 'gold'] = arc_label_test.loc[missing_gold, 'new_text_column'].values

arc_label_test.shape

Throw away 45 'SKIP' samples


(802, 16)

In [17]:
(old['gold'] != '').sum()

255

In [18]:
# Sanity check: back-filling empty 'gold' cells must not have touched the rows
# that already had a manual 'gold' value. Enforced with an assert and still
# displayed (expected: True).
had_gold = old['gold'] != ''
arc_gold_preserved = (old.loc[had_gold, 'gold'] == arc_label_test.loc[had_gold, 'gold']).all()
assert arc_gold_preserved, "existing 'gold' values were modified by the back-fill"
arc_gold_preserved

True

___

In [19]:
# Parsed gold translations plus the matching English source texts.
arc_gold_train, original_txt_train = get_gold(arc_or_train, return_original_sample=True)

# Reshape each parsed sample into the benchmark layout: query + 4 choices.
arc_he_lst_train = []
for parsed in arc_gold_train:
    arc_he_lst_train.append({
        'query': parsed['question'],
        'choices': [parsed[f'option {k}'] for k in (1, 2, 3, 4)],
    })

# All three numbers should agree: items, source texts, non-SKIP labeled rows.
n_train_non_skip = arc_label_train.shape[0] - (arc_label_train['rating'] == 'SKIP').sum()
len(arc_he_lst_train), len(original_txt_train), n_train_non_skip

(186, 186, 186)

In [20]:
# Build the Hebrew ARC test items and keep the English source text of each
# one so it can be matched back to the original dataset later.
arc_he_lst_test, original_txt_test = [], []

for _, labeled_row in arc_label_test.iterrows():
    parsed = query_to_sample(labeled_row['gold'])
    item = {'query': parsed['question']}
    item['choices'] = [parsed[f'option {k}'] for k in (1, 2, 3, 4)]
    arc_he_lst_test.append(item)

    original_txt_test.append(labeled_row['text_column'])

len(arc_he_lst_test), len(original_txt_test), arc_label_test.shape[0]

(802, 802, 802)

In [21]:
# Translate an answer key into a 0-based choice index. Both letter keys
# ('A'..'D') and 1-based digit strings ('1'..'4') are accepted.
MAP_LABEL_TO_ANSWER_INDEX = {
    **{letter: position for position, letter in enumerate('ABCD')},
    **{str(position + 1): position for position in range(4)},
}

In [22]:
# Map every Hebrew item to the position of its English source sample.
# The search cursor only moves forward, which relies on the labeled items
# appearing in the same order as the dataset.
arc_train_en = arc_en['arc_challenge_train']
map_indcs_train = {}
cursor = 0
for i in range(len(arc_he_lst_train)):
    wanted_txt = original_txt_train[i]
    while dict_to_prompt(arc_ai.arc_sample_to_dict(arc_train_en[cursor])) != wanted_txt:
        cursor += 1
    map_indcs_train[i] = cursor

arc_test_en = arc_en['arc_challenge_test']
map_indcs_test = {}
cursor = 0
for i in range(len(arc_he_lst_test)):
    wanted_txt = original_txt_test[i]
    while True:
        candidate = arc_test_en[cursor]
        # Test samples with fewer than 4 choice labels are never candidates.
        if len(candidate['choices']['label']) >= 4 and dict_to_prompt(arc_ai.arc_sample_to_dict(candidate)) == wanted_txt:
            break
        cursor += 1
    map_indcs_test[i] = cursor

In [23]:
# Attach answer index, dataset id and split name to every Hebrew item, taken
# from the English sample it was mapped to. A mapping that points at a
# different source text raises immediately: continuing would leave the
# remaining items without an answer and they would still be written to disk.
arc_train_en = arc_en['arc_challenge_train']
for i in range(len(arc_he_lst_train)):
    en_sample = arc_train_en[map_indcs_train[i]]
    if dict_to_prompt(arc_ai.arc_sample_to_dict(en_sample)) != original_txt_train[i]:
        raise ValueError(f"ARC train item {i} is mapped to a different English sample")
    arc_he_lst_train[i]['answer_index'] = MAP_LABEL_TO_ANSWER_INDEX[en_sample['answerKey']]
    arc_he_lst_train[i]['id'] = en_sample['id']
    arc_he_lst_train[i]['from_arc_split'] = 'train'

arc_test_en = arc_en['arc_challenge_test']
for i in range(len(arc_he_lst_test)):
    en_sample = arc_test_en[map_indcs_test[i]]
    if dict_to_prompt(arc_ai.arc_sample_to_dict(en_sample)) != original_txt_test[i]:
        raise ValueError(f"ARC test item {i} is mapped to a different English sample")
    arc_he_lst_test[i]['answer_index'] = MAP_LABEL_TO_ANSWER_INDEX[en_sample['answerKey']]
    arc_he_lst_test[i]['id'] = en_sample['id']
    arc_he_lst_test[i]['from_arc_split'] = 'test'

len(arc_he_lst_train), len(arc_he_lst_test), len(arc_he_lst_train) + len(arc_he_lst_test)

(186, 802, 988)

___

In [25]:
output_filename = "final_hebrew_bnch/arc_ai2_chall_heb.jsonl"

# JSONL output: one JSON object per line, train items first, then test items.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII text is written
# as \uXXXX escapes.
with open(output_filename, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in arc_he_lst_train)
    jsonl_file.writelines(json.dumps(item) + '\n' for item in arc_he_lst_test)

print(f"Successfully saved data to {output_filename}")

Successfully saved data to final_hebrew_bnch/arc_ai2_chall_heb.jsonl


# GSM8K

In [57]:
# Load the English GSM8K data through the project helper and display the
# available splits.
gsm_en = gsm8k.get_gsm8k_datasets()
gsm_en

{'gsm8k_test': Dataset({
     features: ['question', 'answer'],
     num_rows: 1319
 })}

In [58]:
# Parse the labeled GSM8K file together with its comparison CSV.
# NOTE(review): the schema of the returned frames is defined in
# src.parse_labeling.parse_from_gradio — confirm there.
gsm_label, gsm_or = parse_from_gradio(
    'labeled_files/gsm8k_labeled_gradio.csv',
    'manual_compare/gsm8k_169_FULL.csv',
)

# Both frames are expected to have the same number of rows.
print(gsm_label.shape, gsm_or.shape)

(169, 18) (169, 8)


In [59]:
# Sanity check: the comparison frame and the labeled frame must hold the same
# English source text row by row. Enforced with an assert and still displayed
# (expected: False).
gsm_text_mismatch = (gsm_or['original'] != gsm_label['text_column']).any()
assert not gsm_text_mismatch, "gsm_or['original'] and gsm_label['text_column'] differ"
gsm_text_mismatch

False

In [60]:
# Copy the annotators' corrected text into the comparison frame. Assigning a
# Series aligns on the index, so this relies on both frames sharing one index.
gsm_or['gold'] = gsm_label['gold']

___

In [61]:
# Parsed gold translations for the first labeled GSM8K batch.
gsm_gold_parsed = get_gold(gsm_or)

# Keep only the two fields the benchmark needs: the question and its answer.
gsm_he_lst = []
for parsed in gsm_gold_parsed:
    gsm_he_lst.append({
        'query': parsed['question'],
        'gold': parsed['answer'],
    })

len(gsm_he_lst)

169

In [62]:
gsm_label_1 = pd.read_csv('labeled_files/gsm_TEST_labeled_gradio.csv')
gsm_label_2 = pd.read_csv('labeled_files/gsm_TEST_2_labeled_gradio.csv')

# Check that the last 50 rows of the first file are the first 50 rows of the
# second file (compared by source text, positionally via `.values`).
(gsm_label_1.tail(50)['text_column'] == gsm_label_2.head(50)['text_column'].values).all()

True

In [63]:
gsm_label_1 = pd.read_csv('labeled_files/gsm_TEST_labeled_gradio.csv')
# Skip the first 50 rows, which repeat the end of the first file.
# NOTE(review): the upper bound 215 is not explained anywhere visible —
# presumably labeling stopped there; confirm.
gsm_label_2 = pd.read_csv('labeled_files/gsm_TEST_2_labeled_gradio.csv').iloc[50:215]

def clean(df):
    """Normalise a labeled frame and fill in missing gold translations.

    Steps: replace NaN with '', turn the 'rating' column into strings
    ('1.0' -> '1', '2.0' -> '2'), drop rows rated 'SKIP' (printing how many),
    and copy 'new_text_column' into 'gold' wherever 'gold' is empty.

    Args:
        df: DataFrame with at least the columns 'rating', 'gold' and
            'new_text_column'. It is not modified.

    Returns:
        A new, independent DataFrame holding the kept rows (original index
        labels preserved).
    """
    df = df.fillna('')
    df['rating'] = df['rating'].apply(str).replace({'1.0': '1', '2.0': '2'})

    is_skip = df['rating'] == 'SKIP'
    print(f"Throw away {is_skip.sum()} 'SKIP' samples")
    # .copy() so the .loc assignment below targets a real frame, not a slice
    # of the unfiltered one (avoids SettingWithCopyWarning).
    df = df[~is_skip].copy()

    missing_gold = df['gold'] == ''
    df.loc[missing_gold, 'gold'] = df.loc[missing_gold, 'new_text_column']

    return df

# Normalise both labeled GSM8K test frames (each call prints how many 'SKIP'
# rows it dropped).
gsm_label_1 = clean(gsm_label_1)
gsm_label_2 = clean(gsm_label_2)

gsm_label_1.shape, gsm_label_2.shape

Throw away 105 'SKIP' samples
Throw away 0 'SKIP' samples


((495, 16), (165, 16))

In [64]:
# Turn every gold text of the first test file into a benchmark item.
gsm_he_1 = []
for gold_text in gsm_label_1['gold']:
    parsed = query_to_sample(gold_text)
    gsm_he_1.append({
        'query': parsed['question'],
        'gold': parsed['answer'],
    })

# Same for the second file, except rows whose gold text is the annotator's
# placeholder 'דילגתי' (Hebrew for "I skipped"), which are left out.
gsm_he_2 = []
for gold_text in gsm_label_2['gold']:
    if gold_text == 'דילגתי':
        continue
    parsed = query_to_sample(gold_text)
    gsm_he_2.append({
        'query': parsed['question'],
        'gold': parsed['answer'],
    })

len(gsm_he_1), len(gsm_he_2)

(495, 164)

In [65]:
len(gsm_he_lst)

169

In [66]:
# Append both test batches to the items built from the first labeled file.
# NOTE(review): this cell is not idempotent — gsm_he_lst is extended from
# itself, so running the cell twice duplicates gsm_he_1 and gsm_he_2. Re-run
# the cell that builds gsm_he_lst from get_gold before re-running this one.
gsm_he_lst = gsm_he_lst + gsm_he_1 + gsm_he_2

len(gsm_he_lst)

828

___

In [67]:
output_filename = "final_hebrew_bnch/gsm8k_heb.jsonl"

# JSONL output: one JSON object per line. json.dumps keeps its default
# ensure_ascii=True, so non-ASCII text is written as \uXXXX escapes.
with open(output_filename, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in gsm_he_lst)

print(f"Successfully saved data to {output_filename}")

Successfully saved data to final_hebrew_bnch/gsm8k_heb.jsonl


# MMLU

In [4]:
# NOTE(review): this import lives in the middle of the notebook; it belongs in
# the imports cell at the top.
from datasets import load_dataset

# The project helper's displayed result holds 'mmlu_test'; the auxiliary-train
# and validation splits are added here because part of the labeled data is
# matched against the validation split further down.
mmlu_en = mmlu.get_mmlu_datasets()
mmlu_en['mmlu_train'] = load_dataset("cais/mmlu", "all", split='auxiliary_train')
mmlu_en['mmlu_val'] = load_dataset("cais/mmlu", "all", split='validation')
mmlu_en

{'mmlu_test': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 14042
 }),
 'mmlu_train': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 99842
 }),
 'mmlu_val': Dataset({
     features: ['question', 'subject', 'choices', 'answer'],
     num_rows: 1531
 })}

In [5]:
mmlu_labels = parse_single_file_gradio('labeled_files/mmlu_test_labeled_gradio.csv')

# Manual fixes for individual rows of this specific labeled file. The row
# labels are hard-coded and only valid for this exact CSV.
mmlu_labels.loc[145, 'rating'] = '1'  # rating was missing
mmlu_labels.loc[35, 'rating'] = '2'  # rating was missing
mmlu_labels.loc[35, 'rating model'] =  mmlu_labels.loc[35, 'model 2'] # keep 'rating model' consistent with the rating '2' set above
mmlu_labels.loc[68, 'gold'] = ''  # the gold is '\n' instead of ''

mmlu_labels.shape

(177, 22)

In [6]:
mmlu_labels['rating'].value_counts()

rating
2       58
1       52
SKIP    38
BOTH    29
Name: count, dtype: int64

In [7]:
# Report and drop the rows the annotators marked as 'SKIP'.
# NOTE(review): the filtered frame is a slice of the original; the later
# column assignment on it may emit SettingWithCopyWarning — consider .copy().
print(f"Throw away {(mmlu_labels['rating'] == 'SKIP').sum()} 'SKIP' samples")
mmlu_labels = mmlu_labels[mmlu_labels['rating'] != 'SKIP']

Throw away 38 'SKIP' samples


In [8]:
mmlu_labels.shape

(139, 22)

___

In [9]:
# Row-wise call of the project helper map_rating_to_model; its result replaces
# the 'rating' column.
# NOTE(review): get_gold reads 'rating model', not 'rating' — confirm that
# overwriting 'rating' (rather than 'rating model') is intended here.
mmlu_labels['rating'] = mmlu_labels.apply(lambda x: map_rating_to_model(x, mmlu_labels), axis=1)

In [10]:
# Parsed gold translations plus the matching English source texts.
mmlu_gold_parsed, mmlu_original_txt = get_gold(mmlu_labels, return_original_sample=True)

# Reshape each parsed sample into the benchmark layout: query + 4 choices.
mmlu_he_lst = []
for parsed in mmlu_gold_parsed:
    mmlu_he_lst.append({
        'query': parsed['question'],
        'choices': [parsed[f'choice_{letter}'] for letter in ('a', 'b', 'c', 'd')],
    })

# All three numbers should agree: items, source texts, non-SKIP labeled rows.
n_mmlu_non_skip = mmlu_labels.shape[0] - (mmlu_labels['rating'] == 'SKIP').sum()
len(mmlu_he_lst), len(mmlu_original_txt), n_mmlu_non_skip

(139, 139, 139)

In [26]:
# Ad-hoc check: boolean mask of 'SKIP' ratings in the second MMLU file after
# its first 34 rows.
# NOTE(review): scratch cell with a throwaway name and out-of-order execution
# counts; nothing visible below depends on `a`.
a = pd.read_csv('labeled_files/mmlu_main_sub_TEST_2_labeled_gradio.csv')['rating'].iloc[34:] == 'SKIP'

In [28]:
a[a].shape

(8,)

In [11]:
# Additional labeled MMLU files, keyed by a short name.
more_mmlu = {}
more_mmlu['mmlu_main_1'] = pd.read_csv('labeled_files/mmlu_main_sub_TEST_labeled_gradio.csv')
more_mmlu['mmlu_main_2'] = pd.read_csv('labeled_files/mmlu_main_sub_TEST_2_labeled_gradio.csv')
more_mmlu['mmlu_prob_1'] = pd.read_csv('labeled_files/mmlu_prob_TEST_gradio.csv')
more_mmlu['mmlu_prob_2'] = pd.read_csv('labeled_files/mmlu_prob_TEST_2_gradio.csv').head(64)

# overlap: the first 34 rows of main_2 repeat every 15th row among the first
# 500 rows of main_1, so they are dropped from main_2.
assert (more_mmlu['mmlu_main_1']['text_column'].iloc[:500:15] == more_mmlu['mmlu_main_2']['text_column'].head(34).values).all(), 'nonono!'
# .copy() makes the slice an independent frame, so the manual .loc fix below
# does not write into a view of the full CSV frame (SettingWithCopyWarning).
more_mmlu['mmlu_main_2'] = more_mmlu['mmlu_main_2'].iloc[34:].copy()

# manual fix: this gold text lost its leading '<'
if more_mmlu['mmlu_main_1'].loc[326, 'gold'].startswith('question>'):
    more_mmlu['mmlu_main_1'].loc[326, 'gold'] = '<' + more_mmlu['mmlu_main_1'].loc[326, 'gold']

# manual fix: force this row to be dropped by the SKIP filter
more_mmlu['mmlu_main_2'].loc[212, 'rating'] = 'SKIP'


def clean(df):
    """Normalise a labeled frame and fill in missing gold translations.

    Steps: replace NaN with '', turn the 'rating' column into strings
    ('1.0' -> '1', '2.0' -> '2'), drop rows rated 'SKIP' (printing how many),
    and copy 'new_text_column' into 'gold' wherever 'gold' is empty.

    Args:
        df: DataFrame with at least the columns 'rating', 'gold' and
            'new_text_column'. It is not modified.

    Returns:
        A new, independent DataFrame holding the kept rows (original index
        labels preserved).
    """
    df = df.fillna('')
    df['rating'] = df['rating'].apply(str).replace({'1.0': '1', '2.0': '2'})

    is_skip = df['rating'] == 'SKIP'
    print(f"Throw away {is_skip.sum()} 'SKIP' samples")
    # .copy() so the .loc assignment below targets a real frame, not a slice
    # of the unfiltered one (avoids SettingWithCopyWarning).
    df = df[~is_skip].copy()

    missing_gold = df['gold'] == ''
    df.loc[missing_gold, 'gold'] = df.loc[missing_gold, 'new_text_column']

    return df


# Per-file containers: Hebrew benchmark items and their English source texts.
more_mmlu_he_lst = {name: [] for name in more_mmlu}
more_mmlu_original_txt = {name: [] for name in more_mmlu}

for name in more_mmlu:
    # Replace the raw frame with its cleaned version, then parse every row.
    more_mmlu[name] = clean(more_mmlu[name])
    he_items = more_mmlu_he_lst[name]
    en_texts = more_mmlu_original_txt[name]
    for _, labeled_row in more_mmlu[name].iterrows():
        parsed = query_to_sample(labeled_row['gold'])
        item = {'query': parsed['question']}
        item['choices'] = [parsed[f'choice_{letter}'] for letter in ('a', 'b', 'c', 'd')]
        he_items.append(item)

        en_texts.append(labeled_row['text_column'])

# Row count of each cleaned frame next to the number of collected texts.
for name, frame in more_mmlu.items():
    print(name, frame.shape, len(more_mmlu_original_txt[name]))

Throw away 111 'SKIP' samples
Throw away 9 'SKIP' samples
Throw away 13 'SKIP' samples
Throw away 4 'SKIP' samples
mmlu_main_1 (580, 16) 580
mmlu_main_2 (185, 16) 185
mmlu_prob_1 (68, 16) 68
mmlu_prob_2 (60, 16) 60


In [12]:
# Translate an answer label into a 0-based choice index. Accepted forms:
# letters 'A'..'D', 1-based digit strings '1'..'4', and integers 0..3 (which
# map to themselves).
MAP_LABEL_TO_ANSWER_INDEX = {
    **{letter: position for position, letter in enumerate('ABCD')},
    **{str(position + 1): position for position in range(4)},
    **{position: position for position in range(4)},
}

In [13]:
# Map every Hebrew item to the position of its English source sample. The
# search cursor only moves forward, which relies on the labeled items being
# in dataset order.
mmlu_test_en = mmlu_en['mmlu_test']
n_test_rows = mmlu_test_en.num_rows

map_indcs = {}
cnt = 0
# The labeled MMLU file ends with samples taken from the validation split
# (the original note says 22; the recorded run maps 18). The first item not
# found in the test split marks where the validation search takes over.
# Default: every item was found in the test split, so nothing is left for the
# validation search (this also covers an empty item list).
start_again_from = len(mmlu_he_lst)
for i in range(len(mmlu_he_lst)):
    while dict_to_prompt(mmlu.mmlu_sample_to_dict(mmlu_test_en[cnt])) != mmlu_original_txt[i]:
        cnt += 1
        if cnt >= n_test_rows:
            break
    if cnt >= n_test_rows:
        # Ran off the end of the test split: item i belongs to validation.
        start_again_from = i
        break
    map_indcs[i] = cnt

print(start_again_from)

# Remaining items: same forward search, now in the validation split.
map_indcs_val = {}
cnt = 0
for i in range(start_again_from, len(mmlu_he_lst)):
    while dict_to_prompt(mmlu.mmlu_sample_to_dict(mmlu_en['mmlu_val'][cnt])) != mmlu_original_txt[i]:
        cnt += 1
    map_indcs_val[i] = cnt

print('----')

# The additional labeled files are all searched in the test split.
more_mmlu_map_indcs = {k: {} for k in more_mmlu_he_lst}
for k in more_mmlu_he_lst:
    cnt = 0
    for i in range(len(more_mmlu_he_lst[k])):
        while dict_to_prompt(mmlu.mmlu_sample_to_dict(mmlu_test_en[cnt])) != more_mmlu_original_txt[k][i]:
            cnt += 1
        more_mmlu_map_indcs[k][i] = cnt

# Every item must have received exactly one mapping.
print(len(map_indcs), len(map_indcs_val), len(mmlu_he_lst), len(map_indcs) + len(map_indcs_val))
for k in more_mmlu_he_lst:
    print(len(more_mmlu_he_lst[k]), len(more_mmlu_map_indcs[k]))

121
----
121 18 139 139
580 580
185 185
68 68
60 60


In [14]:
# Attach answer index, source split and subject to every Hebrew item, taken
# from the English sample it was mapped to. A mapping that points at a
# different source text raises immediately: continuing would leave the
# remaining items without an answer and they would still be written to disk.
mmlu_test_en = mmlu_en['mmlu_test']
mmlu_val_en = mmlu_en['mmlu_val']

# Items found in the test split.
for i in range(start_again_from):
    en_sample = mmlu_test_en[map_indcs[i]]
    if dict_to_prompt(mmlu.mmlu_sample_to_dict(en_sample)) != mmlu_original_txt[i]:
        raise ValueError(f"MMLU item {i} is mapped to a different English test sample")
    mmlu_he_lst[i]['answer_index'] = MAP_LABEL_TO_ANSWER_INDEX[en_sample['answer']]
    mmlu_he_lst[i]['from_mmlu_split'] = 'test'
    mmlu_he_lst[i]['subject'] = en_sample['subject']

# Items found in the validation split.
for i in range(start_again_from, len(mmlu_he_lst)):
    en_sample = mmlu_val_en[map_indcs_val[i]]
    if dict_to_prompt(mmlu.mmlu_sample_to_dict(en_sample)) != mmlu_original_txt[i]:
        raise ValueError(f"MMLU item {i} is mapped to a different English validation sample")
    mmlu_he_lst[i]['answer_index'] = MAP_LABEL_TO_ANSWER_INDEX[en_sample['answer']]
    mmlu_he_lst[i]['from_mmlu_split'] = 'validation'
    mmlu_he_lst[i]['subject'] = en_sample['subject']

# Items from the additional labeled files (all mapped to the test split).
for k in more_mmlu_he_lst:
    for i in range(len(more_mmlu_he_lst[k])):
        en_sample = mmlu_test_en[more_mmlu_map_indcs[k][i]]
        if dict_to_prompt(mmlu.mmlu_sample_to_dict(en_sample)) != more_mmlu_original_txt[k][i]:
            raise ValueError(f"MMLU item {i} of {k!r} is mapped to a different English test sample")
        more_mmlu_he_lst[k][i]['answer_index'] = MAP_LABEL_TO_ANSWER_INDEX[en_sample['answer']]
        more_mmlu_he_lst[k][i]['from_mmlu_split'] = 'test'
        more_mmlu_he_lst[k][i]['subject'] = en_sample['subject']

print(len(mmlu_he_lst))
for k in more_mmlu_he_lst:
    print(len(more_mmlu_he_lst[k]))

139
580
185
68
60


___

In [15]:
output_filename = "final_hebrew_bnch/MMLU_heb_2.jsonl"

# JSONL output: one JSON object per line. The items of the first labeled file
# come first, then the items of each additional file in dict order.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII text is written
# as \uXXXX escapes.
with open(output_filename, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in mmlu_he_lst)

    for file_items in more_mmlu_he_lst.values():
        jsonl_file.writelines(json.dumps(item) + '\n' for item in file_items)

print(f"Successfully saved data to {output_filename}")

Successfully saved data to final_hebrew_bnch/MMLU_heb_2.jsonl


___

In [176]:
# Read the JSONL file back in chunks of 1000 lines and stitch ALL chunks
# together. Keeping only the loop variable of a `for chunk in ...` loop would
# retain just the last chunk and silently drop every earlier row.
# NOTE: `output_filename` is whatever the most recently executed write cell
# set it to; in notebook order that is the MMLU file.
chunk_iterator = pd.read_json(output_filename, lines=True, chunksize=1000)

mmlu_df = pd.concat(list(chunk_iterator), ignore_index=True)

In [182]:
mmlu_df['subject'].value_counts()

subject
abstract_algebra                       22
elementary_mathematics                 22
college_mathematics                    21
conceptual_physics                     20
formal_logic                           20
medical_genetics                       20
philosophy                             20
machine_learning                       20
high_school_statistics                 20
international_law                      20
high_school_computer_science           20
high_school_geography                  19
human_sexuality                        19
virology                               19
high_school_world_history              19
professional_medicine                  19
professional_psychology                19
nutrition                              19
astronomy                              19
high_school_microeconomics             19
high_school_psychology                 19
high_school_biology                    19
high_school_macroeconomics             19
high_school_chemistry     

# HellaSwag

In [8]:
# NOTE(review): load_dataset is imported here but not used in this cell; the
# import belongs in the imports cell at the top if it is needed at all.
from datasets import load_dataset

hellaswag_en = hellaswag.get_hellaswag_datasets()
# Every 30th sample among the first 1000 validation samples ...
hellaswag_en = hellaswag_en['hellaswag_val'].select(range(0, 1_000, 30))
# ... of which the first 20 are kept.
hellaswag_en = hellaswag_en.take(20)
hellaswag_en

Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label'],
    num_rows: 20
})

In [10]:
# Translated HellaSwag samples; the cells below read the 'gemini' and
# 'answer_label' columns of this file.
hellaswag_labels = pd.read_csv('compare_csv/hellaswag/hellaswag_test_20_samples.csv')

hellaswag_labels.shape

(20, 3)

In [11]:
hellaswag_labels

Unnamed: 0,original,gemini,answer_label
0,<activity_label>Roof shingle removal</activity...,<activity_label>הסרת רעפים מהגג</activity_labe...,3
1,<activity_label>Gargling mouthwash</activity_l...,<activity_label>גרגור מי פה</activity_label>\n...,1
2,<activity_label>Kayaking</activity_label>\n<ct...,<activity_label>שייט בקיאק</activity_label>\n<...,1
3,<activity_label>Surfing</activity_label>\n<ctx...,<activity_label>גלישת גלים</activity_label>\n<...,1
4,<activity_label>Playing violin</activity_label...,<activity_label>נגינה בכינור</activity_label>\...,2
5,<activity_label>Layup drill in basketball</act...,<activity_label>תרגול ליי-אפ בכדורסל</activity...,1
6,<activity_label>Ice fishing</activity_label>\n...,<activity_label>דיג בקרח</activity_label>\n<ct...,2
7,<activity_label>Sharpening knives</activity_la...,<activity_label>השחזת סכינים</activity_label>\...,2
8,<activity_label>Doing motocross</activity_labe...,<activity_label>רכיבת מוטוקרוס</activity_label...,3
9,<activity_label>High jump</activity_label>\n<c...,<activity_label>קפיצה לגובה</activity_label>\n...,2


In [12]:
query_to_sample(hellaswag_labels.loc[0, 'gemini'])

{'activity_label': 'הסרת רעפים מהגג',
 'ctx_a': 'גבר יושב על גג.',
 'ctx_b': 'הוא',
 'ctx': 'גבר יושב על גג. הוא',
 'ending 1': 'משתמש בניילון נצמד כדי לעטוף זוג מגלשי סקי.',
 'ending 2': 'תולש אריחים ישרים.',
 'ending 3': 'מחזיק קובייה הונגרית.',
 'ending 4': 'מתחיל לתלוש את הרעפים מהגג.'}

In [17]:
# Build the Hebrew HellaSwag items: the context is the query, the four endings
# are the choices, and the answer label is copied unchanged from the CSV.
hellaswag_he_lst = []
for _, labeled_row in hellaswag_labels.iterrows():
    parsed = query_to_sample(labeled_row['gemini'])
    item = {'query': parsed['ctx']}
    item['choices'] = [parsed[f'ending {n}'] for n in ('1', '2', '3', '4')]
    item['answer_index'] = labeled_row['answer_label']
    hellaswag_he_lst.append(item)

len(hellaswag_he_lst)

20

In [19]:
hellaswag_he_lst[:2]

[{'qwery': 'גבר יושב על גג. הוא',
  'choices': ['משתמש בניילון נצמד כדי לעטוף זוג מגלשי סקי.',
   'תולש אריחים ישרים.',
   'מחזיק קובייה הונגרית.',
   'מתחיל לתלוש את הרעפים מהגג.'],
  'answer_index': 3},
 {'qwery': 'היא מביאה להם קצת מים לגרגר בפה. הילד והילדה מתחילים לשחק בכיור. האישה',
  'choices': ['מנידה בראשה בחוסר אמון ומנופפת לה לשלום.',
   'צוחקת על הילדים שמטפטפים מים.',
   'חוזרת ומדברת עם הבנים.',
   'מוציאה אוכל מהמקרר והם ממשיכים לשחק יחד.'],
  'answer_index': 1}]

___

In [20]:
output_filename = "final_hebrew_bnch/hellaswag_20_samples.jsonl"

# JSONL output: one JSON object per line. json.dumps keeps its default
# ensure_ascii=True, so non-ASCII text is written as \uXXXX escapes.
with open(output_filename, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in hellaswag_he_lst)

print(f"Successfully saved data to {output_filename}")

Successfully saved data to final_hebrew_bnch/hellaswag_20_samples.jsonl


# COPA

In [22]:
# Translated COPA samples; the cells below read the 'gemini_pro_think_v1' and
# 'answer_label' columns of this file.
copa_labels = pd.read_csv('compare_csv/copa/copa_train_20_samples.csv')

copa_labels.shape

(20, 3)

In [23]:
copa_labels.head(3)

Unnamed: 0,original,gemini_pro_think_v1,answer_label
0,<premise>My body cast a shadow over the grass....,<premise>גופי הטיל צל על הדשא.</premise>\n<cho...,1
1,<premise>The woman tolerated her friend's diff...,<premise>האישה סבלה את ההתנהגות הקשה של חברתה....,0
2,<premise>The women met for coffee.</premise>\n...,<premise>הנשים נפגשו לקפה.</premise>\n<choice1...,1


In [25]:
copa_labels['gemini_pro_think_v1'].iloc[0]

'<premise>גופי הטיל צל על הדשא.</premise>\n<choice1>הדשא היה מכוסח.</choice1>\n<choice2>השמש זרחה.</choice2>'

In [26]:
# Build the Hebrew COPA items: the premise is the query, the two alternatives
# are the choices, and the answer label is copied unchanged from the CSV.
copa_he_lst = []
for _, labeled_row in copa_labels.iterrows():
    parsed = query_to_sample(labeled_row['gemini_pro_think_v1'])
    item = {'query': parsed['premise']}
    item['choices'] = [parsed[f'choice{n}'] for n in ('1', '2')]
    item['answer_index'] = labeled_row['answer_label']
    copa_he_lst.append(item)

len(copa_he_lst)

20

In [29]:
copa_he_lst[:2]

[{'qwery': 'גופי הטיל צל על הדשא.',
  'choices': ['הדשא היה מכוסח.', 'השמש זרחה.'],
  'answer_index': 1},
 {'qwery': 'האישה סבלה את ההתנהגות הקשה של חברתה.',
  'choices': ['האישה ידעה שחברתה עוברת תקופה קשה.',
   'האישה הרגישה שחברתה ניצלה את טוב ליבה.'],
  'answer_index': 0}]

___

In [30]:
output_filename = "final_hebrew_bnch/copa_20_samples.jsonl"

# JSONL output: one JSON object per line. json.dumps keeps its default
# ensure_ascii=True, so non-ASCII text is written as \uXXXX escapes.
with open(output_filename, 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in copa_he_lst)

print(f"Successfully saved data to {output_filename}")

Successfully saved data to final_hebrew_bnch/copa_20_samples.jsonl
