# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import sys
from datasets import concatenate_datasets

# Call models
from src.call_models import bedrock_connect, call_claude_bedrock
from src.call_models import google_connect, call_gemini, all_string_gemini_config, all_int_gemini_config
from src.translate_func import claude_translation, gemini_translation

# Datasets
from src.benchmarks_code import arc_ai
from src.benchmarks_code import gsm8k
from src.benchmarks_code import copa
from prompts import arc_prompts
from prompts import gsm_prompts
from prompts import copa_prompts
#from prompts import hellaswag_prompts

# Access keys
from my_access_keys import google_access_key, aws_access_key, aws_secret_key

# .csv utils
from src.save_utils import add_dataset_to_csv

# Remove annoying warning
from IPython.core.display_functions import display

In [None]:
from my_access_keys import google_project_id

In [None]:
# Build the two provider handles from the credentials imported from
# my_access_keys. `google_client` is passed to call_gemini / gemini_translation
# and `bedrock_client` to claude_translation in the cells below.
bedrock_client = bedrock_connect(aws_access_key, aws_secret_key)
google_client = google_connect(google_access_key)

# Check Model Calls

In [None]:
# Smoke test for the Gemini call path: request a structured answer with the
# string fields 'recipe' and 'ingredients', then show the raw text and the
# keys of the parsed result.
print('Gemini:')
generate_content_config = all_string_gemini_config(
    ['recipe', 'ingredients'],
    'ALWAYS THINK BEFORE ANSWERING!',
    think_bud=200,
)
response = call_gemini(
    google_client,
    "List a popular cookie recipe, and include the amounts of ingredients.",
    generate_content_config,
)
# Raw text first, then a separator line.
print(response.text, '---', sep='\n')
my_recipes = response.parsed
print(my_recipes.keys())

# ARC_AI2

## Get Dataset

In [None]:
# Load the ARC splits and narrow the test split to the slice being translated.
arc_dataset = arc_ai.get_arc_ai2_datasets()
# Earlier train-split slices, kept for reference:
# arc_dataset['arc_challenge_train'] = arc_dataset['arc_challenge_train'].skip(5).take(20)
# arc_dataset['arc_challenge_train'] = arc_dataset['arc_challenge_train'].skip(5).take(90)

# Earlier test-split slice, kept for reference:
# Remove sample number 121 - with only 3 optional answers....
# arc_dataset['arc_challenge_test'] = concatenate_datasets([arc_dataset['arc_challenge_test'].take(120), arc_dataset['arc_challenge_test'].skip(122).take(80)])
# Current slice: skip the first 900 test samples and keep the next 30.
arc_dataset['arc_challenge_test'] = arc_dataset['arc_challenge_test'].skip(900).take(30)

# Results CSV for this slice; the companion text CSV is derived from this name
# in later cells by replacing the trailing '.csv' with '-text.csv'.
file_name = 'compare_csv/arc_ai2/arc_ai2_chall_test_901-930.csv'

print(file_name)
# Bare expression: shown as the cell output.
arc_dataset

In [None]:
# Keep only the test samples that have exactly four answer labels.
# The kept positions are built in one pass rather than by removing entries
# from a pre-filled list, which rescans the list on every removal.
arc_test = arc_dataset['arc_challenge_test']
use_indcs = [i for i, s in enumerate(arc_test) if len(s['choices']['label']) == 4]
arc_dataset['arc_challenge_test'] = arc_test.select(use_indcs)

# Sanity check on the filtered split: prints the position of any sample that
# still does not have four labels (expected to print nothing).
for i, s in enumerate(arc_dataset['arc_challenge_test']):
    if len(s['choices']['label']) != 4:
        print(i)

# Bare expression: shown as the cell output.
arc_dataset['arc_challenge_test']

In [None]:
# Load the existing results CSV and its companion text CSV.
df = pd.read_csv(file_name)
# file_name[:-4] drops the trailing '.csv' before appending '-text.csv'.
text_df = pd.read_csv(file_name[:-4] + '-text.csv')
print(df.shape, text_df.shape)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

In [None]:
# Show the (rows, columns) of both tables as the cell output.
df.shape, text_df.shape

## Run Translation

### Claude

In [None]:
%%time

# Experiment label; the save cell below uses it as the name passed to
# add_dataset_to_csv and as the prefix of the '<exp_name> text' column.
# Previous label, kept for reference:
# exp_name = 'claude_3-7_v7_thinking'
exp_name = 'claude_4_opus_v7_thinking'

# Run the Claude translation over every split in arc_dataset.
# Returns two objects that later cells index by split name:
# `hebrew_datasets` and `text_output`.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    arc_dataset,
    # Alternative input; note that `small` is only defined in the Gemini section below.
    # small,
    arc_prompts.ARC_INSTRUCT_V7_CLAUDE,
    arc_prompts.ARC_FEW_SHOTS,
    arc_prompts.ARC_FORMAT,
    arc_ai.arc_sample_to_dict,
    arc_ai.arc_dict_to_sample,
    # NOTE(review): meaning of `if_four` is defined in src.translate_func — confirm there.
    if_four=False,
)

In [None]:
def len_mat(text):
    """Count the non-overlapping ``<tag>...</tag>`` pairs in ``text``.

    A pair is an opening ``<tag>`` followed later by a closing ``</tag>``
    whose content repeats the opening tag exactly. Matching is non-overlapping
    and left to right, so a pair nested inside an already matched pair is not
    counted separately. An opening tag that starts with the word
    ``response_format`` is never used as the start of a pair, but pairs found
    inside it are still counted. The body of a pair may span several lines.

    Args:
        text: The string to scan.

    Returns:
        The number of matched tag pairs (0 when there are none).
    """
    # Imported locally so the function works even if the cell that runs
    # `import re` has not been executed yet.
    import re

    pattern = r"<(?!response_format\b)([^>]+)>(.*?)</\1>"
    # re.DOTALL lets `.` cross newlines, so multi-line bodies still match.
    return len(re.findall(pattern, text, re.DOTALL))

In [None]:
# `re` is the module len_mat relies on.
import re
# How many entries of text_output['arc_challenge_test'] contain 0, 1, 2, ...
# matched tag pairs (as counted by len_mat).
pd.Series([len_mat(i) for i in text_output['arc_challenge_test']]).value_counts()

In [None]:
# Record this experiment's translated split under `exp_name`.
# NOTE(review): presumably add_dataset_to_csv updates the CSV at file_name and
# returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(file_name, exp_name, hebrew_datasets['arc_challenge_test'], arc_ai.arc_sample_to_dict)
# Store the text outputs in a '<exp_name> text' column and rewrite the text CSV.
text_df[exp_name + ' text'] = text_output['arc_challenge_test']
text_df.to_csv(file_name[:-4] + '-text.csv', index=False)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

### Gemini

In [None]:
# Single-split mapping holding only the ARC test split; this is the input
# given to the Gemini translation call in the next cell.
small = {'arc_challenge_test': arc_dataset['arc_challenge_test']}
small

In [None]:
# Experiment label used by the save cell below (CSV name argument and the
# '<exp_name> text' column).
exp_name = 'gemini'

# Run the Gemini translation over the splits in `small`.
# Returns `hebrew_datasets` and `text_output`, both indexed by split name
# in later cells.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    # Alternative input: all loaded ARC splits.
    # arc_dataset,
    small,
    arc_prompts.ARC_INSTRUCT_V2_GEMINI,
    arc_prompts.ARC_FEW_SHOTS,
    arc_ai.arc_sample_to_dict,
    arc_ai.arc_dict_to_sample,
    # NOTE(review): `if_pro` and `think_bud` are interpreted by
    # src.translate_func — confirm their meaning there.
    if_pro=True,
    # if_pro=False,
    think_bud=4096,
    # think_bud=2_000
)

In [None]:
# Show the object returned by the translation call as the cell output.
hebrew_datasets

In [None]:
# Show the first translated sample of the ARC test split.
hebrew_datasets['arc_challenge_test'][0]

In [None]:
# Record this experiment's translated split under `exp_name`.
# NOTE(review): presumably add_dataset_to_csv updates the CSV at file_name and
# returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(file_name, exp_name, hebrew_datasets['arc_challenge_test'], arc_ai.arc_sample_to_dict)
# Store the text outputs in a '<exp_name> text' column and rewrite the text CSV.
text_df[exp_name + ' text'] = text_output['arc_challenge_test']
text_df.to_csv(file_name[:-4] + '-text.csv', index=False)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

In [None]:
# Show the (rows, columns) of both tables after saving.
df.shape, text_df.shape

### Multi-options Translation - Gemini

### Claude vs Gemini (using Gemini as judge)

# GSM8K

## Get Dataset

In [None]:
# Load the GSM8K splits and narrow the test split to the slice being translated.
gsm_dataset = gsm8k.get_gsm8k_datasets()
# Skip the first 1000 test samples, then keep offsets 0-149 except offset 67,
# which leaves 149 samples.
gsm_dataset['gsm8k_test'] = gsm_dataset['gsm8k_test'].skip(1_000).select(list(range(0, 67)) + list(range(68, 150)))
# Previous output file, kept for reference:
# gsm_file_name = 'compare_csv/gsm8k_main_test_top_200.csv'
# Results CSV for this slice; the companion text CSV is derived from this name
# in later cells by replacing the trailing '.csv' with '-text.csv'.
gsm_file_name = 'compare_csv/gsm8k_main_test_1001-1150.csv'

print(gsm_file_name)
# Bare expression: shown as the cell output.
gsm_dataset

In [None]:
# Register the untranslated split under the name 'original' in both the
# results CSV and the companion '-text.csv' file.
# NOTE(review): presumably add_dataset_to_csv creates/updates the CSV and
# returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(gsm_file_name, 'original', gsm_dataset['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
text_df = add_dataset_to_csv(gsm_file_name[:-4] + '-text.csv', 'original', gsm_dataset['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

## Run Translation

### Claude

In [None]:
%%time

# Experiment label; the save cells below use it as the name passed to
# add_dataset_to_csv and as the prefix of the '<exp_name> text' column.
# Previous label, kept for reference:
# exp_name = 'claude_3-7_v7_thinking'
exp_name = 'claude_4_opus_v7_thinking'

# Run the Claude translation over every split in gsm_dataset.
# Returns `hebrew_datasets` and `text_output`, both indexed by split name
# in later cells.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    # Alternative input: a reduced mapping of splits.
    # small,
    gsm_dataset,
    gsm_prompts.GSM_INSTRUCT_CLAUDE_V2,
    gsm_prompts.GSM_FEW_SHOTS,
    gsm_prompts.GSM_FORMAT,
    # Alternative format prompt:
    # gsm_prompts.GSM_FORMAT_REFINE,
    gsm8k.gsm8k_sample_to_dict,
    gsm8k.gsm8k_dict_to_sample,
    # NOTE(review): meaning of `if_four` is defined in src.translate_func — confirm there.
    if_four=False,
)

In [None]:
def len_mat(text):
    """Count the non-overlapping ``<tag>...</tag>`` pairs in ``text``.

    A pair is an opening ``<tag>`` followed later by a closing ``</tag>``
    whose content repeats the opening tag exactly. Matching is non-overlapping
    and left to right, so a pair nested inside an already matched pair is not
    counted separately. An opening tag that starts with the word
    ``response_format`` is never used as the start of a pair, but pairs found
    inside it are still counted. The body of a pair may span several lines.

    Args:
        text: The string to scan.

    Returns:
        The number of matched tag pairs (0 when there are none).
    """
    # Imported locally so the function works even if the cell that runs
    # `import re` has not been executed yet.
    import re

    pattern = r"<(?!response_format\b)([^>]+)>(.*?)</\1>"
    # re.DOTALL lets `.` cross newlines, so multi-line bodies still match.
    return len(re.findall(pattern, text, re.DOTALL))

In [None]:
# `re` is the module len_mat relies on.
import re
# How many entries of text_output['gsm8k_test'] contain 0, 1, 2, ...
# matched tag pairs (as counted by len_mat).
pd.Series([len_mat(i) for i in text_output['gsm8k_test']]).value_counts()

In [None]:
# Print one text output (position 3) for manual inspection.
print(text_output['gsm8k_test'][3])

In [None]:
# Record this experiment's translated split under `exp_name`.
# NOTE(review): presumably add_dataset_to_csv updates the CSV at gsm_file_name
# and returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(gsm_file_name, exp_name, hebrew_datasets['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
# Preview the first two rows as the cell output.
df.head(2)

In [None]:
# Store the text outputs in a '<exp_name> text' column and rewrite the
# companion text CSV (gsm_file_name with '.csv' replaced by '-text.csv').
text_df[exp_name + ' text'] = text_output['gsm8k_test']
text_df.to_csv(gsm_file_name[:-4] + '-text.csv', index=False)
# Preview the first two rows as the cell output.
text_df.head(2)

In [None]:
# Print the first ten stored texts of the 'claude_v2_refine text' column,
# each followed by a dashed separator. Iterating over .head(10) avoids an
# IndexError when the table has fewer than ten rows.
# NOTE(review): the column name is hard-coded and is not derived from the
# current `exp_name`; confirm that this column exists in the text CSV.
for sample_text in text_df['claude_v2_refine text'].head(10):
    print(sample_text)
    print('\n - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n')

### Gemini

In [None]:
# Subset still to be translated by Gemini: everything from position 50 on.
# The results for the first 50 samples are restored from the checkpoint files
# under gemini_cp/ and prepended in the cells below.
#
# `gsm_dataset['gsm8k_test']` was already filtered when it was loaded (the
# sample at offset 67 was dropped there, leaving 149 rows), so positions here
# refer to the filtered split. Re-applying the unfiltered offsets
# (50-66, 68-149) would drop a second sample and request position 149, which
# is out of range for a 149-row split.
small = {}
gsm_test = gsm_dataset['gsm8k_test']
small['gsm8k_test'] = gsm_test.select(list(range(50, gsm_test.num_rows)))
small

In [None]:
# Experiment label used by the save cell below (CSV name argument and the
# '<exp_name> text' column).
exp_name = 'gemini_pro_think_v2'

# Run the Gemini translation over the splits in `small`.
# Returns `hebrew_datasets` and `text_output`, both indexed by split name
# in later cells.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    small,
    # Alternative input: the full GSM8K mapping.
    # gsm_dataset,
    gsm_prompts.GSM_INSTRUCT_GEMINI_V2,
    gsm_prompts.GSM_FEW_SHOTS,
    gsm8k.gsm8k_sample_to_dict,
    gsm8k.gsm8k_dict_to_sample,
    # NOTE(review): `if_pro` and `think_bud` are interpreted by
    # src.translate_func — confirm their meaning there.
    if_pro=True,
    think_bud=8192,
)

In [None]:
# Restore previously saved results from the checkpoint files under gemini_cp/.
import pickle
from datasets import Dataset, concatenate_datasets

# NOTE: pickle.load executes code embedded in the file; only load checkpoint
# files that were produced locally and are trusted.
with open('gemini_cp/ck - gemini_gsm8k_test_50.pkl', 'rb') as f:
    lst_1 = pickle.load(f)
with open('gemini_cp/ck - gemini_gsm8k_test_50_text.pkl', 'rb') as f:
    lst_1_text = pickle.load(f)
# Both lengths are printed so they can be compared by eye.
print(len(lst_1), len(lst_1_text))

# Convert the restored list into a Dataset so it can be concatenated with the
# freshly translated split in the next cell.
lst_1 = Dataset.from_list(lst_1)
lst_1

In [None]:
# Prepend the checkpointed results to the freshly translated ones, for both
# the dataset and the list of text outputs (checkpoint entries come first).
hebrew_datasets['gsm8k_test'] = concatenate_datasets([lst_1, hebrew_datasets['gsm8k_test']])
text_output['gsm8k_test'] = lst_1_text + text_output['gsm8k_test']

In [None]:
# Number of text outputs after merging, shown as the cell output.
len(text_output['gsm8k_test'])

In [None]:
# Show the merged result object as the cell output.
hebrew_datasets

In [None]:
# Record this experiment's translated split under `exp_name`.
# NOTE(review): presumably add_dataset_to_csv updates the CSV at gsm_file_name
# and returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(gsm_file_name, exp_name, hebrew_datasets['gsm8k_test'], gsm8k.gsm8k_sample_to_dict)
# Store the text outputs in a '<exp_name> text' column and rewrite the text CSV.
text_df[exp_name + ' text'] = text_output['gsm8k_test']
text_df.to_csv(gsm_file_name[:-4] + '-text.csv', index=False)
print(df.shape, text_df.shape)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

# COPA

## Get Dataset

In [None]:
copa_dataset = copa.get_copa_datasets()

# Drop the train samples whose 'mirrored' flag is not False.
mirrored_is_false = pd.Series(copa_dataset['copa_train']['mirrored']) == False
copa_train_indices = mirrored_is_false[mirrored_is_false].index
copa_dataset['copa_train'] = copa_dataset['copa_train'].select(copa_train_indices)

# Which split and question type this run works on; 'cause' selects the cause
# few-shot prompt, any other value selects the effect one.
copa_split = 'test'
question_type = 'effect'
copa_few_shots_prompt = (
    copa_prompts.COPA_FEW_SHOTS_CAUSE
    if question_type == 'cause'
    else copa_prompts.COPA_FEW_SHOTS_EFFECT
)

# Discard the split that is not wanted; in the wanted split keep only the rows
# whose 'question' equals question_type.
for split in ['train', 'test']:
    split_key = 'copa_' + split
    if split == copa_split:
        look_on = pd.DataFrame(copa_dataset[split_key])
        use_indices = look_on[look_on['question'] == question_type].index
        copa_dataset[split_key] = copa_dataset[split_key].select(use_indices)
    else:
        del copa_dataset[split_key]

copa_file_name = f'compare_csv/copa/copa_{copa_split}_{question_type}.csv'

print(copa_file_name)
copa_dataset

In [None]:
# Register the untranslated split under the name 'original' in both the
# results CSV and the companion '-text.csv' file.
# NOTE(review): presumably add_dataset_to_csv creates/updates the CSV and
# returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(copa_file_name, 'original', copa_dataset['copa_' + copa_split], copa.copa_sample_to_dict)
text_df = add_dataset_to_csv(copa_file_name[:-4] + '-text.csv', 'original', copa_dataset['copa_' + copa_split], copa.copa_sample_to_dict)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

In [None]:
# Reload both COPA tables from disk.
df = pd.read_csv(copa_file_name)
# copa_file_name[:-4] drops the trailing '.csv' before appending '-text.csv'.
text_df = pd.read_csv(copa_file_name[:-4] + '-text.csv')
print(df.shape, text_df.shape)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

In [None]:
# Show the (rows, columns) of both tables as the cell output.
df.shape, text_df.shape

## Run Gemini

In [None]:
# Pre-run check: print the output file and which few-shot prompt is currently
# selected, so the two can be checked against each other before translating.
print(copa_file_name)
print('cause?   ', copa_few_shots_prompt == copa_prompts.COPA_FEW_SHOTS_CAUSE)
print('effect?  ', copa_few_shots_prompt == copa_prompts.COPA_FEW_SHOTS_EFFECT)

In [None]:
# Experiment label used by the save cell below (CSV name argument and the
# '<exp_name> text' column).
exp_name = 'gemini'

# Run the Gemini translation over the splits left in copa_dataset, with the
# few-shot prompt chosen for the current question type.
# Returns `hebrew_datasets` and `text_output`, both indexed by split name
# in later cells.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    copa_dataset,
    copa_prompts.COPA_INSTRUCT_V1_GEMINI,
    copa_few_shots_prompt,
    copa.copa_sample_to_dict,
    copa.copa_dict_to_sample,
    # NOTE(review): `if_pro` and `think_bud` are interpreted by
    # src.translate_func — confirm their meaning there.
    if_pro=True,
    think_bud=2048,
)

In [None]:
# Show the object returned by the translation call as the cell output.
hebrew_datasets

In [None]:
# Show the first translated sample of the active split. The key is built from
# copa_split, as in the other COPA cells, so this keeps working when the
# split is switched to 'train'.
hebrew_datasets['copa_' + copa_split][0]

In [None]:
# Record this experiment's translated split under `exp_name`.
# NOTE(review): presumably add_dataset_to_csv updates the CSV at copa_file_name
# and returns the resulting table — confirm in src.save_utils.
df = add_dataset_to_csv(copa_file_name, exp_name, hebrew_datasets['copa_' + copa_split], copa.copa_sample_to_dict)
# Store the text outputs in a '<exp_name> text' column and rewrite the text CSV.
text_df[exp_name + ' text'] = text_output['copa_' + copa_split]
text_df.to_csv(copa_file_name[:-4] + '-text.csv', index=False)
print(df.shape, text_df.shape)
# Preview the first two rows of each table.
display(df.head(2))
display(text_df.head(2))

In [None]:
# Add the 'label' column of the translated split to df as 'answer_label'.
# The Series is aligned to df by its default 0..n-1 index.
df['answer_label'] = pd.Series(hebrew_datasets['copa_' + copa_split]['label'])

In [None]:
# Preview the first rows, including the new 'answer_label' column.
df.head()

In [None]:
# Row-by-row comparison of df['answer_label'] with the split's 'label' column:
# print whether every row matches, then show the per-row result as cell output.
labels_match = df['answer_label'] == pd.Series(hebrew_datasets['copa_' + copa_split]['label'])
print(labels_match.all())
labels_match

In [None]:
# Write df (now including 'answer_label') back to the COPA results CSV,
# without the index column.
df.to_csv(copa_file_name, index=False)