# In [1]:
import json
import random

from transformers import AutoTokenizer
import datasets

# Single seed reused for Python's `random` module and for every
# `datasets` shuffle in this script.
seed = 42

# Seed the global `random` generator; the `random.sample` calls further down
# draw from this stream, so their order determines which rows are selected.
random.seed(seed)

# NOTE(review): the dataset identifier is an empty string in this file and has
# to be filled in before the script can run — the trailing comment says the
# data is the "aihub dataset"; confirm the actual path/name.
dataset = datasets.load_dataset("").shuffle(seed=seed) # aihub dataset
# Tokenizer used only to measure sentence lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained('yanolja/KoSOLAR-10.7B-v0.2')

def tokenization(example):
    """Tokenize the ``"ko"`` and ``"en"`` columns into token-id columns.

    The module-level ``tokenizer`` is called with
    ``add_special_tokens=False``, i.e. it is asked not to add special
    tokens (such as BOS/EOS) to the returned ids.

    Args:
        example: Mapping with ``"ko"`` and ``"en"`` entries. This function is
            mapped with ``batched=True`` below, so each entry is presumably a
            list of strings — verify against the dataset schema.

    Returns:
        Dict with ``"ko_input_ids"`` and ``"en_input_ids"`` holding the
        tokenizer's ``input_ids`` for the respective column.
    """
    def encode(column):
        # Only the ids are kept; the other tokenizer outputs are discarded.
        return tokenizer(example[column], add_special_tokens=False)["input_ids"]

    return {
        "ko_input_ids": encode("ko"),
        "en_input_ids": encode("en"),
    }

def get_length(example):
    """Return the token counts of one tokenized sentence pair.

    Both sides are counted in full. The ids in ``ko_input_ids`` and
    ``en_input_ids`` are produced with ``add_special_tokens=False``, so there
    is no trailing EOS token to strip; slicing the Korean ids with ``[:-1]``
    (as was done previously) dropped a real token and made ``ko_len`` one
    smaller than the true length while ``en_len`` was left untouched.

    Args:
        example: Mapping with ``"ko_input_ids"`` and ``"en_input_ids"``
            sequences of token ids.

    Returns:
        Dict with ``"ko_len"``, ``"en_len"`` and their sum ``"len"``.
    """
    ko_len = len(example['ko_input_ids'])
    en_len = len(example['en_input_ids'])
    return {'len': ko_len + en_len, 'ko_len': ko_len, 'en_len': en_len}

# Tokenize in batches, then annotate every row with its token counts
# ('len', 'ko_len', 'en_len') in a second, per-row pass.
dataset = dataset.map(tokenization, batched=True).map(get_length)

# Longest pairs first: index 0 of each split is the row with the largest 'len'.
sorted_dataset = dataset.sort("len", reverse=True)

# Build the train / validation / test subsets from the length-sorted splits.
#
# Because `sorted_dataset` is sorted by 'len' in descending order, low indices
# are the longest sentence pairs. Each split is sampled from two index ranges:
# a "head" range (the longest rows) and the remaining "tail" range.
#
# NOTE: every `random.sample` call consumes the global RNG seeded above, so
# the exact order of these statements determines the selected rows.
# NOTE(review): 8333207 and 1041651 are hard-coded upper bounds — presumably
# the row counts of the train and validation/test splits; confirm against the
# loaded dataset (`select` fails on out-of-range indices).

# Train: 80000 rows from indices [50000, 8333207) plus 20000 rows from the
# 50000 longest, then drop rows whose 'source' equals 71265.
sample_idx = random.sample(range(50000,8333207),80000) + random.sample(range(0,50000),20000)
train_dataset = sorted_dataset['train'].select(sample_idx).filter(lambda example: example['source'] != 71265)
# Sampling len(train_dataset) indices out of len(train_dataset) yields a random
# permutation of all remaining rows.
final_idx = random.sample(range(0,len(train_dataset)),len(train_dataset))
# First 50000 permuted rows form one training set; everything after that
# forms the second one.
train_dataset_1 = train_dataset.select(final_idx[:50000]).shuffle(seed=seed)
train_dataset_2 = train_dataset.select(final_idx[50000:]).shuffle(seed=seed) # for ALMA-R

# Validation: 10000 rows from indices [6250, 1041651) plus 2500 rows from the
# 6250 longest, same 'source' filter, then 4000 of the survivors are kept.
# `random.sample` raises ValueError if fewer than 4000 rows remain.
sample_idx = random.sample(range(6250,1041651),10000) + random.sample(range(0,6250),2500)
val_dataset = sorted_dataset['validation'].select(sample_idx).filter(lambda example: example['source'] != 71265)
final_idx = random.sample(range(0,len(val_dataset)),4000)
val_dataset = val_dataset.select(final_idx).shuffle(seed=seed)

# Test: 20000 rows from indices [6250, 1041651) plus 5000 rows from the 6250
# longest, same 'source' filter, then 8000 survivors split into two disjoint
# halves of 4000 rows each.
sample_idx = random.sample(range(6250,1041651),20000) + random.sample(range(0,6250),5000)
test_dataset = sorted_dataset['test'].select(sample_idx).filter(lambda example: example['source'] != 71265)
final_idx = random.sample(range(0,len(test_dataset)),8000)
test_dataset_1 = test_dataset.select(final_idx[:4000]).shuffle(seed=seed)
test_dataset_2 = test_dataset.select(final_idx[4000:]).shuffle(seed=seed)

def _translation_records(rows, first_key, second_key):
    """Wrap each row as ``{"translation": {first_key: ..., second_key: ...}}``.

    The key order is preserved in the serialized JSON, so it encodes the
    translation direction of the output file.
    """
    return [
        {"translation": {first_key: row[first_key], second_key: row[second_key]}}
        for row in rows
    ]


def _write_json_lines(path, records):
    """Write ``records`` to ``path`` as one JSON object per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )


def _write_json_array(path, records):
    """Write ``records`` to ``path`` as a single JSON array (no trailing newline)."""
    with open(path, "w", encoding="utf-8") as f:
        # One write of the whole document; calling writelines() on a str would
        # iterate it and emit the file one character at a time.
        f.write(json.dumps(records, ensure_ascii=False))


# Training data is stored as JSON lines.
_write_json_lines(
    "./human_written_data/koen/train.ko-en.json",
    _translation_records(train_dataset_1, "ko", "en"),
)

# Validation and test data are each stored as one JSON array.
_write_json_array(
    "./human_written_data/koen/valid.ko-en.json",
    _translation_records(val_dataset, "ko", "en"),
)
_write_json_array(
    "./human_written_data/koen/test.ko-en.json",
    _translation_records(test_dataset_1, "ko", "en"),
)

# The second test half is written with "en" first for the en-ko file.
_write_json_array(
    "./human_written_data/koen/test.en-ko.json",
    _translation_records(test_dataset_2, "en", "ko"),
)

# Captured notebook output (source lines quoted by warnings, truncated):
#   from .autonotebook import tqdm as notebook_tqdm
#   table = cls._concat_blocks(blocks, axis=0)
