## Imports

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import json
import re
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset

# Create Data

## Read relevant data

In [None]:
def read_jsonl(file_name):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        file_name: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list holding one parsed object per valid line, in file order.

    Blank (empty or whitespace-only) lines are skipped silently. A line that is
    not valid JSON is reported on stdout and skipped, so a single corrupt
    record does not abort the whole load.
    """
    data = []
    with open(file_name, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # A blank line carries no record; do not report it as a decode error.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line.strip()}. Error: {e}")
    return data

def clean_copa(x):
    """Remove one leading 'Cause:' or 'Effect:' header (and its blank line) from *x*.

    Only the first matching header is stripped; text that starts with neither
    header is returned unchanged.
    """
    for header in ('Cause:\n\n', 'Effect:\n\n'):
        if x.startswith(header):
            return x[len(header):]
    return x

def into_arc(row):
    """Render *row* as tagged ARC text: a <question> line plus four <option N> lines.

    Reads row['qwery'] for the question and the first four entries of
    row['choices'] for the options, numbered from 1.
    NOTE(review): exactly four options are emitted; confirm no record has a
    different number of choices.
    """
    pieces = ['<question>' + row['qwery'] + '</question>']
    for number in range(1, 5):
        choice = row["choices"][number - 1]
        pieces.append(f'<option {number}>{choice}</option {number}>')
    return '\n'.join(pieces)

def into_mmlu(row):
    """Render *row* as tagged MMLU text: a <question> line plus <choice_a>..<choice_d> lines.

    Reads row['qwery'] for the question and the first four entries of
    row['choices'] for the lettered choices.
    """
    lines = ['<question>' + row['qwery'] + '</question>']
    for position, letter in enumerate('abcd'):
        text = row["choices"][position]
        lines.append(f'<choice_{letter}>{text}</choice_{letter}>')
    return '\n'.join(lines)

In [None]:
# Root folder that holds every input file referenced in this cell.
prefix = '/home/ec2-user/SageMaker/qwen-hebrew-finetuning - translation/translation/'

# Benchmark name -> labeled CSV path; each path is swapped for its DataFrame below.
use_files = dict(
    arc=prefix + 'labeled_files/arc_ai_TEST_labeled_gradio.csv',
    gsm=prefix + 'labeled_files/gsm_TEST_labeled_gradio.csv',
    mmlu=prefix + 'labeled_files/mmlu_main_sub_TEST_labeled_gradio.csv',
    copa=prefix + 'labeled_files/copa_TRAIN_gradio.csv',
)

# Benchmark name -> Hebrew benchmark JSONL path; also swapped for DataFrames below.
hebrew_final = dict(
    arc=prefix + 'final_hebrew_bnch/arc_ai2_chall_heb.jsonl',
    mmlu=prefix + 'final_hebrew_bnch/MMLU_heb_2.jsonl',
)

for k in use_files:
    frame = pd.read_csv(use_files[k]).fillna('')
    # Rows rated 'SKIP' are left out of the sample.
    frame = frame.loc[frame['rating'] != 'SKIP']
    # 500 evenly spaced row positions; positions repeat when fewer than 500 rows
    # remain, and the repeated rows are removed again by drop_duplicates().
    positions = np.linspace(0, frame.shape[0] - 1, 500, dtype='int')
    frame = frame.iloc[positions].drop_duplicates()
    # Where no gold text was entered, fall back to 'new_text_column'.
    gold_missing = frame['gold'] == ''
    frame.loc[gold_missing, 'gold'] = frame.loc[gold_missing, 'new_text_column']
    use_files[k] = frame

    print(frame.shape, k)

# Load the Hebrew benchmarks and rebuild each record's tagged text as 'gold',
# which serves as the join key against the labeled files.
for k, build_gold in (('arc', into_arc), ('mmlu', into_mmlu)):
    hebrew_frame = pd.DataFrame(read_jsonl(hebrew_final[k]))
    hebrew_frame['gold'] = hebrew_frame.apply(build_gold, axis=1)
    hebrew_final[k] = hebrew_frame

# Attach the answer index by matching on the gold text, then keep and rename
# only the columns used downstream.
label_columns = {
    'text_column': 'Eng',
    'gold': 'Heb',
    'answer_index': 'label',
}
for k in ('arc', 'mmlu'):
    merged = use_files[k].merge(hebrew_final[k], on='gold')
    use_files[k] = merged[list(label_columns)].rename(columns=label_columns)

# COPA: drop the leading Cause:/Effect: header from the English text first.
copa = use_files['copa']
copa['text_column'] = copa['text_column'].apply(clean_copa)
use_files['copa'] = copa[['text_column', 'gold', 'answer_label']].rename(columns={
    'text_column': 'Eng',
    'gold': 'Heb',
    'answer_label': 'label',
})

# GSM keeps only the two text columns.
use_files['gsm'] = use_files['gsm'][['text_column', 'gold']].rename(columns={
    'text_column': 'Eng',
    'gold': 'Heb',
})

print([(name, table.shape) for name, table in use_files.items()])

In [None]:
# Matches <tag>content</tag> spans whose closing tag repeats the opening tag
# verbatim. An opening tag that begins with the word "response_format" is never
# matched. DOTALL lets the content run across line breaks.
_TAGGED_SPAN = re.compile(r"<(?!response_format\b)([^>]+)>(.*?)</\1>", re.DOTALL)


def _parse_tagged(text):
    """Return {tag: stripped content} for every tagged span found in *text*.

    When the same tag occurs more than once, the last occurrence wins.
    """
    return {tag: content.strip() for tag, content in _TAGGED_SPAN.findall(text)}


def _build_prompt_pair(row, head_tag, tail_tag):
    """Return the (English, Hebrew) prompt pair built from two tagged fields.

    For row['Eng'] and then row['Heb'], the contents of *head_tag* and
    *tail_tag* are joined by a single newline.

    Raises:
        KeyError: if either tag is absent from one of the two texts.
    """
    texts = []
    for column in ('Eng', 'Heb'):
        fields = _parse_tagged(row[column])
        texts.append(fields[head_tag] + '\n' + fields[tail_tag])
    return tuple(texts)


def into_final_text_arc(row):
    """Return (eng, heb) as question + correct option for an ARC row.

    row['label'] is the zero-based index of the correct option; the option
    tags in the text are numbered from 1.
    """
    return _build_prompt_pair(row, 'question', f'option {row["label"] + 1}')


def into_final_text_mmlu(row):
    """Return (eng, heb) as question + correct choice for an MMLU row.

    row['label'] is a zero-based index mapped to the choice letter a-d.
    """
    labels_map = ['a', 'b', 'c', 'd']
    return _build_prompt_pair(row, 'question', f'choice_{labels_map[row["label"]]}')


def into_final_text_copa(row):
    """Return (eng, heb) as premise + correct choice for a COPA row.

    row['label'] is the zero-based index of the correct choice; the choice
    tags in the text are numbered from 1.
    """
    return _build_prompt_pair(row, 'premise', f'choice{row["label"] + 1}')


def into_final_text_gsm(row):
    """Return (eng, heb) as question + answer for a GSM row."""
    return _build_prompt_pair(row, 'question', 'answer')

In [None]:
# Each benchmark has its own builder returning an (English, Hebrew) pair per row;
# result_type='expand' spreads that pair over the two new prompt columns.
final_text_builders = {
    'arc': into_final_text_arc,
    'mmlu': into_final_text_mmlu,
    'copa': into_final_text_copa,
    'gsm': into_final_text_gsm,
}
for benchmark, build_pair in final_text_builders.items():
    use_files[benchmark][['en_prompt', 'he_prompt']] = use_files[benchmark].apply(
        build_pair, result_type='expand', axis=1
    )

In [None]:
# Show the (rows, columns) shape of every benchmark frame after the prompt columns were added.
print([(name, table.shape) for name, table in use_files.items()])

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing one CSV per benchmark.
os.makedirs('moe_analysis_data', exist_ok=True)
for k in use_files:
    use_files[k].to_csv(f'moe_analysis_data/{k}_en_he_500.csv', index=False)

In [None]:
for k in use_files:
    print(k)
    # Mean word count of each prompt column. str.split() without an argument
    # splits on any run of whitespace, so the newline inside a prompt separates
    # words and repeated spaces do not produce empty tokens.
    display(use_files[k][['en_prompt', 'he_prompt']].map(lambda x: len(x.split())).mean(axis=0))
    print()

In [None]:
# Print the first English/Hebrew prompt pair of every benchmark as a spot check.
for k in use_files:
    first_row = use_files[k].iloc[0]
    print(k)
    print(first_row['en_prompt'])
    print(first_row['he_prompt'])
    print()

## Find more data on the internet

In [None]:
# 1. Open the HebNLI 'train' split in streaming mode, so examples are read
#    lazily instead of downloading the whole dataset first
streamed_dataset = load_dataset("HebArabNlpProject/HebNLI", split="train", streaming=True)

# 2. Define a function to filter examples on the fly
def filter_long_sentences(example, min_chars=450):
    """Return True when example['sentence1'] has more than *min_chars* characters.

    Args:
        example: Mapping with a 'sentence1' string.
        min_chars: Exclusive lower bound on the character count. Defaults to
            450, the threshold this notebook filters with.
    """
    return len(example['sentence1']) > min_chars

# 3. Lazily drop every streamed example that the length filter rejects
filtered_stream = streamed_dataset.filter(filter_long_sentences)

# 4. Limit the stream to the first 500 examples that pass the filter
final_dataset_iterator = filtered_stream.take(500)

# Materialise the examples: the stream is consumed here until 500 matches are
# found, and the collected rows are wrapped in an in-memory Dataset
final_examples = Dataset.from_list([*final_dataset_iterator])

final_examples

In [None]:
# Keep the English sentence and its Hebrew translation, renamed to the prompt
# column names used for the other benchmark files.
nli_df = (
    pd.DataFrame(final_examples)[['sentence1', 'translation1']]
    .rename(columns={'sentence1': 'en_prompt', 'translation1': 'he_prompt'})
)
print(nli_df.shape)
nli_df.head(3)

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing.
# NOTE(review): the file name says 460, while this notebook filters sentences
# longer than 450 characters and takes 500 examples - confirm the name is intended.
os.makedirs('moe_analysis_data', exist_ok=True)
nli_df.to_csv('moe_analysis_data/nli_en_he_460.csv', index=False)

In [None]:
# Average whitespace-separated word count of each prompt column.
word_counts = nli_df.map(lambda text: len(text.split()))
word_counts.mean()

In [None]:
# Show the first English sentence and its Hebrew translation, separated by a blank line.
first_example = final_examples[0]
print(first_example['sentence1'], first_example['translation1'], sep='\n\n')

___

In [None]:
import json

# Finished chunks, plus the sentences collected for the chunk being built.
out = []
buffer_en, buffer_he = [], []

# Minimum number of whitespace-separated words each side of a chunk must have.
length = 250

def flush_chunk():
    """Emit the buffered sentences as one parallel chunk once both sides are long enough.

    Joins ``buffer_en`` and ``buffer_he`` with single spaces. When the English
    and the Hebrew text each contain at least ``length`` words, a record with
    both texts and their word counts is appended to ``out`` and both buffers
    are emptied. Otherwise nothing is emitted and the buffers are left as they
    are, so the caller keeps accumulating sentences until the shorter side
    catches up instead of throwing the partial chunk away.
    """
    global buffer_en, buffer_he
    en_text = " ".join(buffer_en).strip()
    he_text = " ".join(buffer_he).strip()
    en_words, he_words = len(en_text.split()), len(he_text.split())
    if en_words < length or he_words < length:
        # Not a full chunk yet: keep what has been buffered so far.
        return
    out.append({"en_prompt": en_text, "he_prompt": he_text,
                "en_words": en_words,
                "he_words": he_words})
    buffer_en, buffer_he = [], []

# Downloaded parallel corpora as (English file, Hebrew file) pairs, read line by line in step.
ted_file_pairs = [
    ("moe_analysis_data/NeuLab-TedTalks.en-he.en", "moe_analysis_data/NeuLab-TedTalks.en-he.he"),
    ("moe_analysis_data/TED2020.en-he.en", "moe_analysis_data/TED2020.en-he.he"),
]

# The buffers are not cleared between files, so a chunk may continue from the
# first corpus into the second.
for en_file, he_file in ted_file_pairs:

    with open(en_file, encoding="utf-8") as f_en, open(he_file, encoding="utf-8") as f_he:
        # zip stops at the end of the shorter file
        for raw_en, raw_he in zip(f_en, f_he):
            en_line = raw_en.strip()
            he_line = raw_he.strip()
            # A pair with an empty side is dropped entirely
            if en_line and he_line:
                buffer_en.append(en_line)
                buffer_he.append(he_line)
                # Call flush_chunk once the buffered English text has `length` words
                english_words = " ".join(buffer_en).split()
                if len(english_words) >= length:
                    flush_chunk()

# Final call for whatever is still buffered after the last file
flush_chunk()

# Report how many chunks were collected
print(f"Built {len(out)} parallel samples (≥{length} words each).")

# One row per chunk: both texts plus their word counts
df = pd.DataFrame(out)
print(df.shape)   # (rows, columns)
preview = df.head()
display(preview)

# Write the chunks to CSV without the index column
df.to_csv("moe_analysis_data/ted_he_en_chunks.csv", index=False)

In [None]:
# Number of chunks whose English and Hebrew sides both have at least 300 words.
# NOTE(review): 300 is hard-coded here, while the chunks were built with
# `length` (250) - confirm this is the intended threshold.
both_long = (df[['en_words', 'he_words']] >= 300).all(axis=1)
both_long.sum()