In [1]:
!pip install datasets
!pip install openpyxl

#For Gemini
!pip install -q -U google-genai

# For the labeling interface
!pip install gradio

# For the Qwen-MT
!pip install -U openai

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import sys
from datasets import concatenate_datasets

# Call models
from src.call_models import bedrock_connect, call_claude_bedrock
from src.call_models import google_connect, call_gemini, all_string_gemini_config, all_int_gemini_config
from src.translate_func import claude_translation, gemini_translation

# Datasets
from src.benchmarks_code import arc_ai
from src.benchmarks_code import gsm8k
from src.benchmarks_code import copa
from prompts import arc_prompts
from prompts import gsm_prompts
from prompts import copa_prompts
#from prompts import hellaswag_prompts

# Access keys
from my_access_keys import google_access_key, aws_access_key, aws_secret_key

# .csv utils
from src.save_utils import add_dataset_to_csv

# Remove annoying warning
from IPython.core.display_functions import display

In [4]:
from my_access_keys import google_project_id

In [3]:
# Build the two API clients used by the cells below: `bedrock_client` is passed
# to the Claude translation calls and `google_client` to the Gemini calls.
# The credentials come from the local `my_access_keys` module.
bedrock_client = bedrock_connect(aws_access_key, aws_secret_key)
google_client = google_connect(google_access_key)

# Check Model Calls

In [16]:
# Smoke test: send one small request to Gemini and check the parsed reply.
print('Gemini:')
# Config requesting the fields 'recipe' and 'ingredients'. The second argument
# looks like an instruction prompt and think_bud like a thinking budget —
# TODO confirm against all_string_gemini_config.
generate_content_config = all_string_gemini_config(['recipe', 'ingredients'], 'ALWAYS THINK BEFORE ANSWERING!', think_bud=200)
response = call_gemini(google_client, "List a popular cookie recipe, and include the amounts of ingredients.", generate_content_config)
# Raw text of the reply.
print(response.text)
print('---')
# Parsed form of the same reply; its keys should be the requested field names.
my_recipes = response.parsed
print(my_recipes.keys())

Gemini:
{"recipe": "Classic Chocolate Chip Cookies", "ingredients": "1 cup (2 sticks) unsalted butter, softened; 3/4 cup granulated sugar; 3/4 cup packed light brown sugar; 2 large eggs; 1 teaspoon vanilla extract; 2 1/4 cups all-purpose flour; 1 teaspoon baking soda; 1/2 teaspoon salt; 12 ounces chocolate chips (semi-sweet or milk chocolate)"}
---
dict_keys(['recipe', 'ingredients'])


# ARC_AI2

## Get Dataset

In [8]:
# Load the ARC datasets and trim each split to the slice used in this run.
arc_dataset = arc_ai.get_arc_ai2_datasets()
# Train split: skip the first 5 rows, then keep 20.
arc_dataset['arc_challenge_train'] = arc_dataset['arc_challenge_train'].skip(5).take(20)
# arc_dataset['arc_challenge_train'] = arc_dataset['arc_challenge_train'].skip(5).take(90)

# Remove sample number 121 - with only 3 optional answers....
# arc_dataset['arc_challenge_test'] = concatenate_datasets([arc_dataset['arc_challenge_test'].take(120), arc_dataset['arc_challenge_test'].skip(122).take(80)])
# Test split: skip the first 502 rows, then keep 350.
arc_dataset['arc_challenge_test'] = arc_dataset['arc_challenge_test'].skip(502).take(350)

# Comparison CSV for this slice; later cells add one column per experiment.
file_name = 'compare_csv/arc_ai2_chall_test_503-853.csv'

print(file_name)
arc_dataset

compare_csv/arc_ai2_chall_test_503-853.csv


{'arc_challenge_train': Dataset({
     features: ['id', 'question', 'choices', 'answerKey'],
     num_rows: 20
 }),
 'arc_challenge_test': Dataset({
     features: ['id', 'question', 'choices', 'answerKey'],
     num_rows: 350
 })}

In [9]:
# Keep only the test questions that offer exactly four answer choices.
arc_test = arc_dataset['arc_challenge_test']
use_indcs = [
    row_idx
    for row_idx, sample in enumerate(arc_test)
    if len(sample['choices']['label']) == 4
]

# Sanity check: print the position of any kept sample that still lacks four choices.
arc_test_kept = arc_test.select(use_indcs)
for row_idx, sample in enumerate(arc_test_kept):
    if len(sample['choices']['label']) != 4:
        print(row_idx)

arc_dataset['arc_challenge_test'] = arc_test_kept
arc_dataset['arc_challenge_test']

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 349
})

In [12]:
# Reload the comparison CSV and its companion raw-text CSV. The companion path
# is file_name with the trailing '.csv' replaced by '-text.csv'.
df = pd.read_csv(file_name)
text_df = pd.read_csv(file_name[:-4] + '-text.csv')
# Both frames should have the same number of rows.
print(df.shape, text_df.shape)
display(df.head(2))
display(text_df.head(2))

(349, 1) (349, 1)


Unnamed: 0,original
0,<question>When a mixture of oxygen and hydroge...
1,<question>Which device most likely provided te...


Unnamed: 0,original
0,<question>When a mixture of oxygen and hydroge...
1,<question>Which device most likely provided te...


## Run Translation

### Claude

In [22]:
%%time

# exp_name becomes the column name under which this run is saved in the CSVs.
# exp_name = 'claude_3-7_v7_thinking'
exp_name = 'claude_4_opus_v7_thinking'

# Translate arc_dataset with Claude via the Bedrock client. Both return values
# are indexed by split name in the cells below: hebrew_datasets holds the
# translated samples and text_output the raw model text.
# NOTE(review): the meaning of if_four is defined in src.translate_func — confirm.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    arc_dataset,
    # small,
    arc_prompts.ARC_INSTRUCT_V7_CLAUDE,
    arc_prompts.ARC_FEW_SHOTS,
    arc_prompts.ARC_FORMAT,
    arc_ai.arc_sample_to_dict,
    arc_ai.arc_dict_to_sample,
    if_four=False,
)

Translating arc_challenge_train...


  0%|          | 0/20 [00:00<?, ?it/s]

CPU times: user 164 ms, sys: 20.1 ms, total: 184 ms
Wall time: 3min 53s


In [23]:
def len_mat(text):
    """Count the ``<tag>...</tag>`` pairs found in ``text``.

    A pair is an opening ``<name>`` followed by the nearest ``</name>`` with
    the same name; the enclosed content may span several lines. Tags named
    ``response_format`` are not counted. Matches do not overlap, so a pair
    nested inside another pair is counted once (as the outer pair).

    Returns the number of pairs as an int.
    """
    tag_pair = re.compile(r"<(?!response_format\b)([^>]+)>(.*?)</\1>", re.DOTALL)
    return sum(1 for _ in tag_pair.finditer(text))

In [24]:
# `re` is imported here; len_mat (defined in the previous cell) needs it at call time.
import re
# How many raw outputs contain each number of tag pairs, for the test split.
pd.Series([len_mat(i) for i in text_output['arc_challenge_test']]).value_counts()

5    20
Name: count, dtype: int64

In [26]:
# Add this experiment's translations as a column named exp_name.
# NOTE(review): add_dataset_to_csv presumably also writes file_name — confirm in src.save_utils.
df = add_dataset_to_csv(file_name, exp_name, hebrew_datasets['arc_challenge_test'], arc_ai.arc_sample_to_dict)
# Store the raw model text next to it and save the companion '-text' CSV.
text_df[exp_name + ' text'] = text_output['arc_challenge_test']
text_df.to_csv(file_name[:-4] + '-text.csv', index=False)
display(df.head(2))
display(text_df.head(2))

Unnamed: 0,original,claude_3-7_v7_thinking,claude_4_opus_v7_thinking
0,<question>Which land form is the result of the...,<question>איזו צורת נוף היא תוצאה של הכוח הבונ...,<question>איזו צורת קרקע היא תוצאה של כוח בונה...
1,<question>Which statement best compares single...,<question>איזו אמירה משווה בצורה הטובה ביותר ב...,<question>איזו אמירה משווה בצורה הטובה ביותר ב...


Unnamed: 0,original,claude_3-7_v7_thinking text,claude_4_opus_v7_thinking text
0,<question>Which land form is the result of the...,Thinking:\nI need to translate the question an...,Thinking:\nLet me translate this question abou...
1,<question>Which statement best compares single...,Thinking:\nI need to translate the given Engli...,"Thinking:\nLet me translate this carefully, ke..."


### Gemini

In [13]:
# Dict holding only the test split, so that only this split is passed to the
# Gemini translation call below.
small = {}
small['arc_challenge_test'] = arc_dataset['arc_challenge_test']
small

{'arc_challenge_test': Dataset({
     features: ['id', 'question', 'choices', 'answerKey'],
     num_rows: 349
 })}

In [14]:
# exp_name becomes the column name under which this run is saved in the CSVs.
exp_name = 'gemini_pro_think_v2'

# Translate the splits in `small` with Gemini. Both return values are indexed
# by split name in the cells below: hebrew_datasets holds the translated
# samples and text_output the raw model text.
# NOTE(review): if_pro presumably selects the Pro model and think_bud the
# thinking budget — confirm in src.translate_func.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    # arc_dataset,
    small,
    arc_prompts.ARC_INSTRUCT_V2_GEMINI,
    arc_prompts.ARC_FEW_SHOTS,
    arc_ai.arc_sample_to_dict,
    arc_ai.arc_dict_to_sample,
    if_pro=True,
    # if_pro=False,
    think_bud=20_000
    # think_bud=2_000
)

Translating arc_challenge_test...


  0%|          | 0/349 [00:00<?, ?it/s]

Sleeping.... Done!                                

In [58]:
# Inspect one translated sample. In the recorded output its
# translation_status is 'Failed to translate!' and the text is still English.
hebrew_datasets['arc_challenge_test'][262]

{'id': 'Mercury_402539',
 'question': 'During an experiment, a class heated a balloon that had an initial circumference of 25 cm. The circumference increased to 27 cm. Which is the best conclusion that can be drawn?',
 'choices': {'label': ['A', 'B', 'C', 'D'],
  'text': ['The molecules inside the balloon lost energy to the outside.',
   'The molecules inside the balloon gained energy from the heat.',
   'The energy of the molecules inside the balloon remained the same.',
   'The molecules inside the balloon were escaping outside.']},
 'answerKey': 'B',
 'translation_status': 'Failed to translate!'}

In [15]:
# Show the translated splits returned by gemini_translation.
hebrew_datasets

{'arc_challenge_test': Dataset({
     features: ['id', 'question', 'choices', 'answerKey', 'translation_status'],
     num_rows: 349
 })}

In [16]:
# Add this experiment's translations as a column named exp_name.
# NOTE(review): add_dataset_to_csv presumably also writes file_name — confirm in src.save_utils.
df = add_dataset_to_csv(file_name, exp_name, hebrew_datasets['arc_challenge_test'], arc_ai.arc_sample_to_dict)
# Store the raw model text next to it and save the companion '-text' CSV.
text_df[exp_name + ' text'] = text_output['arc_challenge_test']
text_df.to_csv(file_name[:-4] + '-text.csv', index=False)
display(df.head(2))
display(text_df.head(2))

Unnamed: 0,original,gemini_pro_think_v2
0,<question>When a mixture of oxygen and hydroge...,<question>כאשר מציתים תערובת של חמצן ומימן במב...
1,<question>Which device most likely provided te...,<question>איזה מכשיר ככל הנראה סיפק את הטכנולו...


Unnamed: 0,original,gemini_pro_think_v2 text
0,<question>When a mixture of oxygen and hydroge...,**Reflecting on the Translation Process**\n\nO...
1,<question>Which device most likely provided te...,**Hebrew Translation: A Swift and Accurate Ada...


In [17]:
# The two frames should have the same number of rows.
df.shape, text_df.shape

((349, 2), (349, 2))

### Multi-options Translation - Gemini

### Claude vs Gemini (using Gemini as judge)

# GSM8K

## Get Dataset

In [24]:
# Load GSM8K and trim the test split to the slice used in this run.
gsm_dataset = gsm8k.get_gsm8k_datasets()
# Skip the first 1,000 test rows, then keep relative positions 0-66 and 68-149.
# Position 67 is left out (the reason is not recorded here), giving 149 rows.
gsm_dataset['gsm8k_test'] = gsm_dataset['gsm8k_test'].skip(1_000).select(list(range(0, 67)) + list(range(68, 150)))
# gsm_file_name = 'compare_csv/gsm8k_main_test_top_200.csv'
# Comparison CSV for this slice; later cells add one column per experiment.
gsm_file_name = 'compare_csv/gsm8k_main_test_1001-1150.csv'

print(gsm_file_name)
gsm_dataset

compare_csv/gsm8k_main_test_1001-1150.csv


{'gsm8k_test': Dataset({
     features: ['question', 'answer'],
     num_rows: 149
 })}

In [25]:
# Seed both the comparison CSV and its companion '-text' CSV with an 'original'
# column built from the untranslated test split.
df = add_dataset_to_csv(gsm_file_name, 'original', gsm_dataset['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
text_df = add_dataset_to_csv(gsm_file_name[:-4] + '-text.csv', 'original', gsm_dataset['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
display(df.head(2))
display(text_df.head(2))

Unnamed: 0,original
0,<question>Doctor Jones is scheduling his time ...
1,<question>Jordan wanted to surprise her mom wi...


Unnamed: 0,original
0,<question>Doctor Jones is scheduling his time ...
1,<question>Jordan wanted to surprise her mom wi...


## Run Translation

### Claude

In [14]:
%%time

# exp_name becomes the column name under which this run is saved in the CSVs.
# exp_name = 'claude_3-7_v7_thinking'
exp_name = 'claude_4_opus_v7_thinking'

# Translate gsm_dataset with Claude via the Bedrock client. Both return values
# are indexed by split name in the cells below: hebrew_datasets holds the
# translated samples and text_output the raw model text.
# NOTE(review): the meaning of if_four is defined in src.translate_func — confirm.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    # small,
    gsm_dataset,
    gsm_prompts.GSM_INSTRUCT_CLAUDE_V2,
    gsm_prompts.GSM_FEW_SHOTS,
    gsm_prompts.GSM_FORMAT,
    # gsm_prompts.GSM_FORMAT_REFINE,
    gsm8k.gsm8k_sample_to_dict,
    gsm8k.gsm8k_dict_to_sample,
    if_four=False,
)

Translating gsm8k_test...


  0%|          | 0/20 [00:00<?, ?it/s]

CPU times: user 216 ms, sys: 34.5 ms, total: 250 ms
Wall time: 4min 20s


In [15]:
def len_mat(text):
    """Count the ``<tag>...</tag>`` pairs found in ``text``.

    A pair is an opening ``<name>`` followed by the nearest ``</name>`` with
    the same name; the enclosed content may span several lines. Tags named
    ``response_format`` are not counted. Matches do not overlap, so a pair
    nested inside another pair is counted once (as the outer pair).

    Returns the number of pairs as an int.

    Note: a function with this name is also defined in the ARC section of
    this notebook; this definition replaces it when the cell runs.
    """
    tag_pair = re.compile(r"<(?!response_format\b)([^>]+)>(.*?)</\1>", re.DOTALL)
    return sum(1 for _ in tag_pair.finditer(text))

In [16]:
# `re` is imported here; len_mat (defined in the previous cell) needs it at call time.
import re
# How many raw outputs contain each number of tag pairs, for the GSM8K test split.
pd.Series([len_mat(i) for i in text_output['gsm8k_test']]).value_counts()

2    20
Name: count, dtype: int64

In [20]:
# Print the full raw model text for one sample to read it by eye.
print(text_output['gsm8k_test'][3])

Thinking:
I need to translate this English math question into Hebrew following the guidelines:

1. The question is about James running sprints, which is a common activity that doesn't need cultural adaptation
2. I need to use an Israeli name instead of James - I'll use יעקב (Ya'akov) which is a common Israeli name
3. The measurements are in meters which is already used in Israel, so no conversion needed
4. The structure and numbers remain the same

Let me translate:

Question: James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?

Answer: He sprints 3*3=<<3*3=9>>9 times
So he runs 9*60=<<9*60=540>>540 meters
#### 540

Hebrew translation:
Question: יעקב מחליט לרוץ 3 ספרינטים 3 פעמים בשבוע. הוא רץ 60 מטרים בכל ספרינט. כמה מטרים סך הכל הוא רץ בשבוע?

Answer: הוא עושה ספרינטים 3*3=<<3*3=9>>9 פעמים
אז הוא רץ 9*60=<<9*60=540>>540 מטרים
#### 540

Text:
<question>יעקב מחליט לרוץ 3 ספרינטים 3 פעמים בשבוע. הוא רץ 60 מטרים בכל ספרי

In [18]:
# Add this experiment's translations as a column named exp_name.
# NOTE(review): add_dataset_to_csv presumably also writes gsm_file_name — confirm in src.save_utils.
df = add_dataset_to_csv(gsm_file_name, exp_name, hebrew_datasets['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
df.head(2)

Unnamed: 0,original,claude_3-7_v7_thinking,claude_4_opus_v7_thinking
0,<question>Janet’s ducks lay 16 eggs per day. S...,<question>הברווזים של יעל מטילים 16 ביצים כל י...,<question>ברווזות של ג'נט מטילות 16 ביצים ביום...
1,<question>A robe takes 2 bolts of blue fiber a...,<question>גלימה דורשת 2 גלילים של בד כחול וחצי...,<question>חלוק דורש 2 גלילי סיבים כחולים וחצי ...


In [19]:
# Store the raw model text under '<exp_name> text' and save the companion CSV.
text_df[exp_name + ' text'] = text_output['gsm8k_test']
text_df.to_csv(gsm_file_name[:-4] + '-text.csv', index=False)
text_df.head(2)

Unnamed: 0,original,claude_3-7_v7_thinking text,claude_4_opus_v7_thinking text
0,<question>Janet’s ducks lay 16 eggs per day. S...,Thinking:\nLet's translate this question from ...,Thinking:\nLet me translate this step by step:...
1,<question>A robe takes 2 bolts of blue fiber a...,Thinking:\nLet me translate this math problem ...,Thinking:\nI need to translate this question a...


In [73]:
# Print the first 10 raw outputs of the 'claude_v2_refine' experiment, with a
# dashed separator after each one.
# NOTE(review): no visible cell creates the 'claude_v2_refine text' column, so
# it presumably comes from a previously saved CSV — confirm it exists before running.
for i in range(10):
    print(text_df['claude_v2_refine text'].iloc[i])
    print('\n - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n')

First translation attempt:
<question>התרנגולות של ינט מטילות 16 ביצים ביום. היא אוכלת שלוש לארוחת בוקר כל בוקר ואופה מאפינס לחבריה כל יום עם ארבע. היא מוכרת את השאר בשוק האיכרים מדי יום ב-2$ לביצת ברווז טרייה. כמה דולרים היא מרוויחה כל יום בשוק האיכרים?</question>
<answer>ינט מוכרת 16 - 3 - 4 = <<16-3-4=9>>9 ביצי ברווז ביום.
היא מרוויחה 9 * 2 = $<<9*2=18>>18 כל יום בשוק האיכרים.
#### 18</answer>

<explain>
The translation needs several improvements:
1. "Janet" should be changed to a more common Israeli name like "יעל"
2. The original mentions "ducks" but I mistakenly translated to "תרנגולות" (chickens) in the first sentence
3. Need to change dollars ($) to shekels (₪)
4. "Farmers' market" should be translated to the more common Israeli term "שוק איכרים" 
5. Need to ensure consistency with "duck eggs" throughout the translation
6. The formatting and structure should be preserved but with Hebrew right-to-left orientation
</explain>

Improved translation:
<question>הברווזים של יעל מטילים 

### Gemini

In [15]:
# Dict holding the part of the GSM8K test split that still needs translating.
#
# gsm_dataset['gsm8k_test'] was already reduced to 149 rows when it was loaded
# (position 67 removed), so its valid positions are 0-148. Re-applying
# range(50, 67) + range(68, 150) to it would index past the end and drop a
# second sample. The first 50 rows are covered by the saved checkpoint that a
# later cell loads and prepends, so only the rows from position 50 onward are
# selected here.
small = {}
small['gsm8k_test'] = gsm_dataset['gsm8k_test'].skip(50)
small

{'gsm8k_test': Dataset({
     features: ['question', 'answer'],
     num_rows: 99
 })}

In [16]:
# exp_name becomes the column name under which this run is saved in the CSVs.
exp_name = 'gemini_pro_think_v2'

# Translate the splits in `small` with Gemini. Both return values are indexed
# by split name in the cells below: hebrew_datasets holds the translated
# samples and text_output the raw model text.
# NOTE(review): if_pro presumably selects the Pro model and think_bud the
# thinking budget — confirm in src.translate_func.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    small,
    # gsm_dataset,
    gsm_prompts.GSM_INSTRUCT_GEMINI_V2,
    gsm_prompts.GSM_FEW_SHOTS,
    gsm8k.gsm8k_sample_to_dict,
    gsm8k.gsm8k_dict_to_sample,
    if_pro=True,
    think_bud=8192,
)

Translating gsm8k_test...


  0%|          | 0/99 [00:00<?, ?it/s]

Sleeping in the "While" because of 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}.... Done!

In [19]:
import pickle
from datasets import Dataset, concatenate_datasets

# Load two pickled lists from the gemini_cp folder: checkpointed samples and
# their raw text. Presumably written by an earlier, interrupted Gemini run —
# verify against src.translate_func.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced.
with open('gemini_cp/ck - gemini_gsm8k_test_50.pkl', 'rb') as f:
    lst_1 = pickle.load(f)
with open('gemini_cp/ck - gemini_gsm8k_test_50_text.pkl', 'rb') as f:
    lst_1_text = pickle.load(f)
# The two lists should have the same length.
print(len(lst_1), len(lst_1_text))

# Convert the list of samples to a Dataset so it can be concatenated below.
lst_1 = Dataset.from_list(lst_1)
lst_1

50 50


Dataset({
    features: ['question', 'answer', 'translation_status'],
    num_rows: 50
})

In [21]:
# Prepend the checkpointed samples and raw text to the results of the run above.
# WARNING: this cell is not idempotent — running it again prepends the
# checkpoint a second time.
hebrew_datasets['gsm8k_test'] = concatenate_datasets([lst_1, hebrew_datasets['gsm8k_test']])
text_output['gsm8k_test'] = lst_1_text + text_output['gsm8k_test']

In [26]:
# Number of raw outputs after prepending the checkpoint.
len(text_output['gsm8k_test'])

149

In [27]:
# Show the merged translated split; its row count should match the line above.
hebrew_datasets

{'gsm8k_test': Dataset({
     features: ['question', 'answer', 'translation_status'],
     num_rows: 149
 })}

In [28]:
# Add this experiment's translations as a column named exp_name.
# NOTE(review): add_dataset_to_csv presumably also writes gsm_file_name — confirm in src.save_utils.
df = add_dataset_to_csv(gsm_file_name, exp_name, hebrew_datasets['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
# Store the raw model text next to it and save the companion '-text' CSV.
text_df[exp_name + ' text'] = text_output['gsm8k_test']
text_df.to_csv(gsm_file_name[:-4] + '-text.csv', index=False)
print(df.shape, text_df.shape)
display(df.head(2))
display(text_df.head(2))

(149, 2) (149, 2)


Unnamed: 0,original,gemini_pro_think_v2
0,<question>Doctor Jones is scheduling his time ...,<question>דוקטור כהן מתכנן את זמנו ליום שני. ה...
1,<question>Jordan wanted to surprise her mom wi...,<question>ירדן רצתה להפתיע את אמא שלה עם עוגת ...


Unnamed: 0,original,gemini_pro_think_v2 text
0,<question>Doctor Jones is scheduling his time ...,**My Thought Process: Translating a Math Probl...
1,<question>Jordan wanted to surprise her mom wi...,**My Translation Process for the Math Problem*...


# COPA

## Get Dataset

In [31]:
copa_dataset = copa.get_copa_datasets()

# Drop the mirrored samples from the train split.
not_mirrored = pd.Series(copa_dataset['copa_train']['mirrored']).eq(False)
copa_train_indices = not_mirrored.loc[not_mirrored].index
copa_dataset['copa_train'] = copa_dataset['copa_train'].select(copa_train_indices)

copa_split = 'train'
question_type = 'cause'

# Discard the unwanted split; in the wanted one keep only rows whose
# 'question' field equals question_type.
for split in ['train', 'test']:
    split_key = 'copa_' + split
    if split == copa_split:
        look_on = pd.DataFrame(copa_dataset[split_key])
        use_indices = look_on.loc[look_on['question'] == question_type].index
        copa_dataset[split_key] = copa_dataset[split_key].select(use_indices)
    else:
        del copa_dataset[split_key]

# Comparison CSV for the chosen split and question type.
copa_file_name = f'compare_csv/copa/copa_{copa_split}_{question_type}.csv'

print(copa_file_name)
copa_dataset

compare_csv/copa/copa_train_cause.csv


{'copa_train': Dataset({
     features: ['label', 'id', 'premise', 'question', 'choice1', 'choice2', 'mirrored'],
     num_rows: 250
 })}

In [None]:
# Seed both the comparison CSV and its companion '-text' CSV with an 'original'
# column built from the untranslated samples of the chosen split.
df = add_dataset_to_csv(copa_file_name, 'original', copa_dataset['copa_' + copa_split], copa.copa_sample_to_dict)
text_df = add_dataset_to_csv(copa_file_name[:-4] + '-text.csv', 'original', copa_dataset['copa_' + copa_split], copa.copa_sample_to_dict)
display(df.head(2))
display(text_df.head(2))

In [None]:
# Reload the comparison CSV and its companion '-text' CSV from disk.
df = pd.read_csv(copa_file_name)
text_df = pd.read_csv(copa_file_name[:-4] + '-text.csv')
# Both frames should have the same number of rows.
print(df.shape, text_df.shape)
display(df.head(2))
display(text_df.head(2))

In [None]:
# The two frames should have the same number of rows.
df.shape, text_df.shape

## Run Gemini

In [14]:
# exp_name becomes the column name under which this run is saved in the CSVs.
exp_name = 'gemini_pro_think_v1'

# Translate copa_dataset with Gemini. Both return values are indexed by split
# name in the cells below: hebrew_datasets holds the translated samples and
# text_output the raw model text.
# NOTE(review): if_pro presumably selects the Pro model and think_bud the
# thinking budget — confirm in src.translate_func.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    copa_dataset,
    copa_prompts.COPA_INSTRUCT_V1_GEMINI,
    copa_prompts.COPA_FEW_SHOTS,
    copa.copa_sample_to_dict,
    copa.copa_dict_to_sample,
    if_pro=True,
    think_bud=128,
)

Translating copa_train...


  0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
# Show the translated splits returned by gemini_translation.
hebrew_datasets

{'copa_train': Dataset({
     features: ['label', 'id', 'premise', 'question', 'choice1', 'choice2', 'mirrored', 'translation_status'],
     num_rows: 20
 })}

In [16]:
# Inspect the first translated sample, including its translation_status field.
hebrew_datasets['copa_train'][0]

{'label': 1,
 'id': 1,
 'premise': 'גופי הטיל צל על הדשא.',
 'question': 'סיבה',
 'choice1': 'הדשא היה מכוסח.',
 'choice2': 'השמש זרחה.',
 'mirrored': False,
 'translation_status': 'Success'}

In [17]:
# Add this experiment's translations as a column named exp_name.
# NOTE(review): add_dataset_to_csv presumably also writes copa_file_name — confirm in src.save_utils.
df = add_dataset_to_csv(copa_file_name, exp_name, hebrew_datasets['copa_' + copa_split], copa.copa_sample_to_dict)
# Store the raw model text next to it and save the companion '-text' CSV.
text_df[exp_name + ' text'] = text_output['copa_' + copa_split]
text_df.to_csv(copa_file_name[:-4] + '-text.csv', index=False)
print(df.shape, text_df.shape)
display(df.head(2))
display(text_df.head(2))

(20, 2) (20, 2)


Unnamed: 0,original,gemini_pro_think_v1
0,<premise>My body cast a shadow over the grass....,<premise>גופי הטיל צל על הדשא.</premise>\n<cho...
1,<premise>The woman tolerated her friend's diff...,<premise>האישה סבלה את ההתנהגות הקשה של חברתה....


Unnamed: 0,original,gemini_pro_think_v1 text
0,<premise>My body cast a shadow over the grass....,**Breaking Down the Translation Task**\n\nOkay...
1,<premise>The woman tolerated her friend's diff...,**Breaking Down the Translation Challenge**\n\...


In [20]:
# Attach each sample's 'label' to the comparison frame. Because the values are
# wrapped in a Series, they are aligned to df by index, not by position.
df['answer_label'] = pd.Series(hebrew_datasets['copa_' + copa_split]['label'])

In [21]:
# Preview the frame with the new answer_label column.
df.head()

Unnamed: 0,original,gemini_pro_think_v1,answer_label
0,<premise>My body cast a shadow over the grass....,<premise>גופי הטיל צל על הדשא.</premise>\n<cho...,1
1,<premise>The woman tolerated her friend's diff...,<premise>האישה סבלה את ההתנהגות הקשה של חברתה....,0
2,<premise>The women met for coffee.</premise>\n...,<premise>הנשים נפגשו לקפה.</premise>\n<choice1...,1
3,<premise>The runner wore shorts.</premise>\n<c...,<premise>הרץ לבש מכנסיים קצרים.</premise>\n<ch...,1
4,<premise>The guests of the party hid behind th...,<premise>אורחי המסיבה התחבאו מאחורי הספה.</pre...,0


In [23]:
# Row-by-row check that the stored answer_label matches the dataset's labels;
# every entry should be True.
df['answer_label'] == pd.Series(hebrew_datasets['copa_' + copa_split]['label'])

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
dtype: bool

In [22]:
# Write df, including answer_label, back to the comparison CSV without the index.
df.to_csv(copa_file_name, index=False)