# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import pickle
import re
import sys

import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets

# Call models
from src.call_models import bedrock_connect, call_claude_bedrock
from src.call_models import google_connect, call_gemini, all_string_gemini_config, all_int_gemini_config
from src.translate_func import claude_translation, gemini_translation

# Datasets
from prompts import mmlu_prompts
from src.benchmarks_code import mmlu
# Access keys
from my_access_keys import google_access_key, aws_access_key, aws_secret_key

# .csv utils
from src.save_utils import add_dataset_to_csv

# Remove annoying warning
from IPython.core.display_functions import display

In [None]:
# Build the two API clients from the imported access keys.
# bedrock_client is passed to claude_translation and google_client to gemini_translation further down.
bedrock_client = bedrock_connect(aws_access_key, aws_secret_key)
google_client = google_connect(google_access_key)

# Get Dataset

In [None]:
# Stride of the evenly spaced subsample: every 90th row of the test split.
SAMPLE_STEP = 90

mmlu_dataset = mmlu.get_mmlu_datasets()
size = mmlu_dataset['mmlu_test'].num_rows
# Row positions of the subsample; computed once and reused for both the select and the index.
part_indices = np.arange(0, size, SAMPLE_STEP)

# Full test split as a DataFrame, plus the strided subsample of it.
df_mmlu_full = pd.DataFrame(mmlu_dataset['mmlu_test'])
df_mmlu_part = pd.DataFrame(mmlu_dataset['mmlu_test'].select(part_indices))
# Label the subsample with its original row positions so it can be matched
# against df_mmlu_full.index later on.
df_mmlu_part.index = part_indices

print(df_mmlu_full.shape, df_mmlu_part.shape)

mmlu_dataset

## How many samples are needed from each subject?

In [None]:
# Load the labelled ratings file; its 'rating' column is compared against 'SKIP' in the next cell.
df_mmlu_labeled = pd.read_csv('labeled_files/mmlu_test_labeled_gradio.csv')
df_mmlu_labeled.shape

In [None]:
# Start from a quota of 20 questions for every subject in the full test split.
need_from_each = pd.Series(index=df_mmlu_full['subject'].unique(), data=20)

# Rows of the subsample whose label is anything other than 'SKIP'. The mask is
# applied positionally: the first len(df_mmlu_part) labelled rows are matched
# to the rows of df_mmlu_part in order.
n_part_rows = df_mmlu_part.shape[0]
not_skipped = (df_mmlu_labeled.iloc[:n_part_rows]['rating'] != 'SKIP').values

# Per-subject count of those rows, with 0 for subjects that have none.
remove = (
    df_mmlu_part[not_skipped]['subject']
    .value_counts()
    .reindex(need_from_each.index, fill_value=0)
)

# Remaining quota per subject, largest first.
need_from_each = (need_from_each - remove).sort_values(ascending=False)
display(need_from_each.head(3))
print()
display(need_from_each.tail(3))

In [None]:
# Two hand-maintained groups of MMLU subject names. Both are used below to
# filter `need_from_each` by subject; `side_subjects` is also used to pick the
# rows that are kept for translation.
# NOTE(review): the criterion for putting a subject in one list or the other
# is not recorded here — confirm with the author.
use_subjects = [
    "professional_psychology",
    "high_school_psychology",
    "high_school_macroeconomics",
    "elementary_mathematics",
    "prehistory",
    "philosophy",
    "high_school_biology",
    "nutrition",
    "professional_accounting",
    "professional_medicine",
    "high_school_mathematics",
    "clinical_knowledge",
    "security_studies",
    "high_school_microeconomics",
    "high_school_world_history",
    "conceptual_physics",
    "marketing",
    "human_aging",
    "high_school_statistics",
    "high_school_chemistry",
    "sociology",
    "high_school_geography",
    "college_medicine",
    "world_religions",
    "virology",
    "high_school_european_history",
    "logical_fallacies",
    "astronomy",
    "high_school_physics",
    "electrical_engineering",
    "college_biology",
    "anatomy",
    "human_sexuality",
    "formal_logic",
    "international_law",
    "econometrics",
    "machine_learning",
    "public_relations",
    "management",
    "college_physics",
    "college_computer_science",
    "college_mathematics",
    "global_facts",
    "high_school_computer_science",
    "computer_security",
    "abstract_algebra",
    "business_ethics",
    "college_chemistry",
    "medical_genetics",
]

# Second group of subjects (see the note above the first list).
side_subjects = [
    "professional_law",
    "moral_scenarios",
    "miscellaneous",
    "moral_disputes",
    "high_school_us_history",
    "high_school_government_and_politics",
    "jurisprudence",
    "us_foreign_policy",
]

In [None]:
# Total remaining quota, summed separately over each subject group.
(
    need_from_each[need_from_each.index.isin(use_subjects)].sum(),
    need_from_each[need_from_each.index.isin(side_subjects)].sum(),
)

In [None]:
# Drop the rows that are already part of the strided subsample (df_mmlu_part).
# .copy() makes the result an independent frame, so adding columns to it later
# does not operate on a filtered slice of the original (SettingWithCopyWarning).
df_mmlu_full = df_mmlu_full[~df_mmlu_full.index.isin(df_mmlu_part.index)].copy()
df_mmlu_full.shape

In [None]:
# Remaining quota restricted to the subjects listed in `side_subjects`.
# NOTE(review): the variable name says "use_only" but the filter is
# `side_subjects` — confirm which group is intended.
needed_use_only = need_from_each[need_from_each.index.isin(side_subjects)]
print(needed_use_only.shape)

# Number of extra rows taken per subject on top of its remaining quota.
extra_per_subject = 5

# subject_cnt = running position (0, 1, 2, ...) of each row within its subject.
# assign() returns a new frame instead of mutating a possibly filtered slice.
df_mmlu_full = df_mmlu_full.assign(subject_cnt=df_mmlu_full.groupby('subject').cumcount())
take_indices = df_mmlu_full[df_mmlu_full['subject'].isin(side_subjects)]

# Keep the first (quota + extra_per_subject) rows of every subject. The limit is
# looked up per row with a vectorized map instead of a row-wise apply.
per_subject_limit = take_indices['subject'].map(needed_use_only) + extra_per_subject
take_indices = take_indices[take_indices['subject_cnt'] < per_subject_limit]
print(take_indices.shape)

# The index labels of take_indices are passed as row positions of the test split.
# NOTE: this overwrites mmlu_dataset['mmlu_test'], so re-running the cell selects
# from the already reduced split.
mmlu_dataset['mmlu_test'] = mmlu_dataset['mmlu_test'].select(take_indices.index)
mmlu_dataset

In [None]:
# Number of selected rows per subject.
take_indices['subject'].value_counts()

In [None]:
# Compare the per-subject row counts of the selected split with needed_use_only.
# NOTE(review): the selection above keeps `needed + 5` rows per subject, so this
# equality looks like it can only hold if that offset is removed — confirm intent.
(pd.DataFrame(mmlu_dataset['mmlu_test'])['subject'].value_counts().sort_index() == needed_use_only.sort_index()).all()

## Create save file

In [None]:
# Show the current contents of the dataset container before saving.
mmlu_dataset

In [None]:
# CSV that collects the original samples and one set of columns per translation run.
# A companion '-text.csv' file (same stem) stores the raw model outputs.
mmlu_file_name = 'compare_csv/mmlu/mmlu_test_prob_sub.csv'

In [None]:
# NOTE(review): self-assignment — this statement leaves mmlu_dataset unchanged.
# It looks like a leftover from an edited-out `.select(...)`; the cell can be deleted.
mmlu_dataset['mmlu_test'] = mmlu_dataset['mmlu_test']

In [None]:
# Keep positions 0..163 of the current test split, leaving out positions 24 and 61.
skipped_positions = (24, 61)
mmlu_dataset['mmlu_test'] = mmlu_dataset['mmlu_test'].select(
    [position for position in range(164) if position not in skipped_positions]
)
mmlu_dataset['mmlu_test']

In [None]:
# Register the untranslated samples under the name 'original' in both output
# files: the main CSV and its '-text.csv' companion.
df = add_dataset_to_csv(
    mmlu_file_name,
    'original',
    mmlu_dataset['mmlu_test'],
    mmlu.mmlu_sample_to_dict,
)
text_df = add_dataset_to_csv(
    mmlu_file_name[:-4] + '-text.csv',
    'original',
    mmlu_dataset['mmlu_test'],
    mmlu.mmlu_sample_to_dict,
)
display(df.head(2), text_df.head(2))

In [None]:
# Re-read both output files from disk (main CSV and '-text.csv' companion).
df, text_df = (
    pd.read_csv(mmlu_file_name),
    pd.read_csv(mmlu_file_name[:-4] + '-text.csv'),
)
display(df.head(2), text_df.head(2))

In [None]:
# Shapes of the main and text frames, shown side by side.
df.shape, text_df.shape

# Run Translation

## Claude

In [None]:
%%time

# Name of this run; it is used below as the experiment name when saving results.
# Previous run name, kept for reference: 'claude_3-7_v7_thinking'
exp_name = 'claude_thinking_v2'

# Translate the dataset with Claude through the Bedrock client.
# Returns two objects: the translated datasets and the raw text outputs; both
# are indexed with the key 'mmlu_test' in the cells below.
hebrew_datasets, text_output = claude_translation(
    bedrock_client,
    mmlu_dataset,
    # alternative input: `small` (a subset that is only built in the Gemini section)
    mmlu_prompts.MMLU_INSTRUCT_CLAUDE_V2,
    mmlu_prompts.MMLU_FEW_SHOTS,
    mmlu_prompts.MMLU_FORMAT,
    mmlu.mmlu_sample_to_dict,
    mmlu.mmlu_dict_to_sample,
    # NOTE(review): meaning of if_four is defined in src.translate_func — confirm there.
    if_four=False,
)

In [None]:
# Matches a complete <tag>...</tag> pair whose closing tag repeats the opening
# tag text; tags starting with the word `response_format` are excluded.
# Compiled once at definition time, with DOTALL so the content may span lines.
_TAG_PAIR_RE = re.compile(r"<(?!response_format\b)([^>]+)>(.*?)</\1>", re.DOTALL)


def len_mat(text: str) -> int:
    """Count the non-overlapping ``<tag>...</tag>`` pairs found in ``text``.

    Pairs are found left to right; a pair nested inside an already matched pair
    is not counted separately. ``<response_format>`` openings are never matched.

    Args:
        text: The string to scan.

    Returns:
        The number of matched tag pairs (0 when there are none).
    """
    return sum(1 for _ in _TAG_PAIR_RE.finditer(text))

In [None]:
# How many raw model outputs contain 0, 1, 2, ... matched tag pairs.
# (`re` is imported with the other imports at the top of the notebook.)
pd.Series([len_mat(raw_output) for raw_output in text_output['mmlu_test']]).value_counts()

In [None]:
# Same distribution, but counting only in the part of each output that follows
# the last 'Text:' marker (the whole output when the marker is absent).
pd.Series(
    [
        len_mat(raw_output.rsplit('Text:', 1)[-1])
        for raw_output in text_output['mmlu_test']
    ]
).value_counts()

In [None]:
# Print one raw model output (position 3) for manual inspection.
print(text_output['mmlu_test'][3])

In [None]:
# Add the translated test split to the main CSV under the current experiment name.
df = add_dataset_to_csv(mmlu_file_name, exp_name, hebrew_datasets['mmlu_test'], mmlu.mmlu_sample_to_dict)
df.head(2)

In [None]:
# Store the raw model outputs in a '<exp_name> text' column and rewrite the
# companion '-text.csv' file (without the index column).
text_df[exp_name + ' text'] = text_output['mmlu_test']
text_df.to_csv(mmlu_file_name[:-4] + '-text.csv', index=False)
text_df.head(2)

## Gemini

In [None]:
# Subset holding positions 62..end of the current test split.
# NOTE(review): presumably the earlier positions are covered by the pickled
# checkpoints loaded further down — verify. The original open question here was
# whether positions 24 and 61 were jumped over.
small = {
    'mmlu_test': mmlu_dataset['mmlu_test'].select(
        list(range(62, mmlu_dataset['mmlu_test'].num_rows))
    ),
}
small

In [None]:
# Name of this run; it is used below as the experiment name when saving results.
exp_name = 'gemini'

# Translate with Gemini through the Google client. Only the `small` subset is
# sent here, not the whole mmlu_dataset.
# Returns the translated datasets and the raw text outputs, both indexed with
# the key 'mmlu_test' in the cells below.
hebrew_datasets, text_output = gemini_translation(
    google_client,
    # alternative input: mmlu_dataset (the full selection)
    small,
    mmlu_prompts.MMLU_INSTRUCT_V1_GEMINI,
    mmlu_prompts.MMLU_FEW_SHOTS,
    mmlu.mmlu_sample_to_dict,
    mmlu.mmlu_dict_to_sample,
    # NOTE(review): if_pro / think_bud are defined in src.translate_func —
    # presumably model tier and thinking-token budget; confirm there.
    if_pro=True,
    think_bud=4_096,
)

In [None]:
# Show what the Gemini run returned.
hebrew_datasets

In [None]:
# Checkpoint files, in the order they are concatenated. Each pair is
# (pickled samples, pickled raw text outputs).
# `pickle`, `Dataset` and `concatenate_datasets` are imported at the top of the notebook.
checkpoint_files = [
    ('gemini_cp/ck1 - gemini_mmlu_test_15.pkl', 'gemini_cp/ck1 - gemini_mmlu_test_15_text.pkl'),
    ('gemini_cp/ck2 - gemini_mmlu_test_45.pkl', 'gemini_cp/ck2 - gemini_mmlu_test_45_text.pkl'),
]

lst_1, lst_1_text = [], []
for samples_path, texts_path in checkpoint_files:
    # NOTE: pickle.load can execute arbitrary code — only load checkpoint files
    # that were produced locally by this project.
    with open(samples_path, 'rb') as f:
        lst_1 += pickle.load(f)
    with open(texts_path, 'rb') as f:
        lst_1_text += pickle.load(f)

# Both lengths are shown so they can be compared by eye.
len(lst_1), len(lst_1_text)

In [None]:
# Convert the checkpointed samples to a Dataset under a new name, so `lst_1`
# stays a plain list and this cell can be re-run without changing its input.
checkpoint_dataset = Dataset.from_list(lst_1)

# 'mmlu_test_fixed' = checkpointed results followed by the results of the run
# above, for both the datasets and the raw text outputs.
hebrew_datasets['mmlu_test_fixed'] = concatenate_datasets([checkpoint_dataset, hebrew_datasets['mmlu_test']])
text_output['mmlu_test_fixed'] = lst_1_text + text_output['mmlu_test']

In [None]:
# Show the merged (checkpoints + latest run) dataset.
hebrew_datasets['mmlu_test_fixed']

In [None]:
# Add the merged Gemini results to the main CSV under the current experiment name.
df = add_dataset_to_csv(
    mmlu_file_name,
    exp_name,
    hebrew_datasets['mmlu_test_fixed'],
    mmlu.mmlu_sample_to_dict,
)

# Store the raw outputs in a '<exp_name> text' column and rewrite the companion file.
text_df[exp_name + ' text'] = text_output['mmlu_test_fixed']
text_df.to_csv(mmlu_file_name[:-4] + '-text.csv', index=False)

display(df.head(2), df.tail(2), text_df.head(2))

In [None]:
# Attach the answer and subject of every merged sample to the main frame.
# The Series are aligned to df by index label when they are assigned.
answer_labels = pd.Series(hebrew_datasets['mmlu_test_fixed']['answer'])
df['answer_label'] = answer_labels
df['subject'] = pd.Series(hebrew_datasets['mmlu_test_fixed']['subject'])

# Sanity check: the stored column equals the source answers.
print((df['answer_label'] == answer_labels).all())

# Rewrite the main CSV (without the index column).
df.to_csv(mmlu_file_name, index=False)

In [None]:
# First rows of the final frame.
df.head()

In [None]:
# Last rows of the final frame.
df.tail()