In [1]:
# TODO - sentencepiece tokenizer

## Preprocess WMT14 Dataset
- step 1. Download Raw Data(Parquet) and Save it as txt
- step 2. Train tokenizer(BPE)
- step 3. Preprocess(tokenize) the data

#### 1. Download WMT14-(en, de)
- The file format is parquet
- Download it first and Save it line by line on local

In [1]:
# Download WMT14(english-german) and convert parquet to text

from datasets import load_dataset
from tqdm.auto import tqdm
import os


def save_split(data, save_dir, split_name):
    en_file = os.path.join(save_dir, f"{split_name}.en")
    de_file = os.path.join(save_dir, f"{split_name}.de")

    with open(en_file, "w", encoding="utf-8") as f_en, open(de_file, "w", encoding="utf-8") as f_de:
        for example in tqdm(data, desc=f"Saving {split_name}"):
            en_text = example["translation"]["en"]
            de_text = example["translation"]["de"]

            f_en.write(en_text + "\n")
            f_de.write(de_text + "\n")

    print(f"Saved to {save_dir} - {split_name}.en and {split_name}.de")

print("Download WMT14 English-German dataset as parquet")
dataset = load_dataset("wmt14", "de-en")


train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]
print(f"Train Samples: {len(train_data)}")
print(f"Validation Samples: {len(val_data)}")
print(f"Test Samples: {len(test_data)}")

data_dir = "data"
save_dir = f"{data_dir}/wmt14"
os.makedirs(save_dir, exist_ok=True)
# save_split(train_data, save_dir, "train")
# save_split(val_data, save_dir, "valid")
# save_split(test_data, save_dir, "test")

Download WMT14 English-German dataset as parquet
Train Samples: 4508785
Validation Samples: 3000
Test Samples: 3003


## 2. Train BPE Tokenizer
- CPU core수를 충분히 사용!(속도 차이가 심함)
- Source and Target에 따라 2개의 Tokenizer 필요

In [4]:
# Train sentencepiece tokenizer

import sentencepiece as spm

def train_sentencepiece_tokenizer(input_files, model_prefix, vocab_size):
    """Train BPE tokenizer"""
    input_str = ",".join(input_files)
    spm.SentencePieceTrainer.train(
        input = input_str,
        model_prefix = model_prefix,
        vocab_size = vocab_size,
        character_coverage = 1.0,
        model_type = "bpe",
        pad_id = 0,
        unk_id = 1,
        bos_id = 2,
        eos_id = 3,
        pad_piece = "<pad>",
        unk_piece = "<unk>",
        bos_piece = "<s>",
        eos_piece = "</s>",
        train_extremely_large_corpus=True,  # 대용량 데이터용
        num_threads=os.cpu_count(),  # 모든 CPU 코어 사용
        minloglevel=1  # 0: INFO, 1: WARNING, 2: ERROR
    )
    print(f"Tokenizer save : {model_prefix}.model, {model_prefix}.vocab")

vocab_size = 32000
os.makedirs(f"{data_dir}/tokenizers", exist_ok=True)


print("Train Enlgish Tokenizer")
train_sentencepiece_tokenizer(
    input_files=[f"{save_dir}/train.en"],
    model_prefix=f"{data_dir}/tokenizers/en_tokenizer",
    vocab_size=vocab_size,
)
print("Train German Tokenizer")
train_sentencepiece_tokenizer(
    input_files=[f"{save_dir}/train.de"],
    model_prefix=f"{data_dir}/tokenizers/de_tokenizer",
    vocab_size=vocab_size,
)

Train Enlgish Tokenizer
Tokenizer save : data/tokenizers/en_tokenizer.model, data/tokenizers/en_tokenizer.vocab
Train German Tokenizer
Tokenizer save : data/tokenizers/de_tokenizer.model, data/tokenizers/de_tokenizer.vocab




## 3. Tokenize source and target data
- 학습완료한 en-tokenizer를 이용하여 Source(en) 파일 전처리
- 학습완료한 de-tokenizer를 이용하여 Target(de) 파일 전처리

In [21]:
# Preprocess data with tokenizer
def tokenize_data(
    src_file,
    tgt_file,
    src_tokenizer,
    tgt_tokenizer,
    output_prefix,
    max_len = 100,
):
    print("Load tokenizers")
    sp_src = spm.SentencePieceProcessor()
    sp_tgt = spm.SentencePieceProcessor()
    sp_src.load(src_tokenizer)
    sp_tgt.load(tgt_tokenizer)
    
    print("Read and Tokenize and Save")
    print(f"Source File: {src_file}")
    print(f"Target File: {tgt_file}")
    with open(src_file, "r", encoding="utf-8") as f_src, \
        open(tgt_file, "r", encoding="utf-8") as f_tgt, \
        open(f"{output_prefix}.src", "w", encoding="utf-8") as f_out_src, \
        open(f"{output_prefix}.tgt", "w", encoding="utf-8") as f_out_tgt:
            total = 0
            filtered = 0 # over max_len
    
            for src_line, tgt_line in zip(f_src, f_tgt):
                total += 1
    
                # Tokenize
                src_tokens = sp_src.encode_as_pieces(src_line.strip())
                tgt_tokens = sp_tgt.encode_as_pieces(tgt_line.strip())
    
                if len(src_tokens) <= max_len and len(tgt_tokens) <= max_len:
                    f_out_src.write(" ".join(src_tokens) + "\n")
                    f_out_tgt.write(" ".join(tgt_tokens) + "\n")
                else:
                    filtered += 1
                    
    print(f"\n\nTotal: {total}")
    print(f"Filtered: {filtered}")
    print(f"Kept: {total - filtered}")
    print("\n\n")

os.makedirs(f"{data_dir}/processed", exist_ok=True)

for split_name in ["train", "valid", "test"]:
    tokenize_data(
        src_file=f"{data_dir}/wmt14/{split_name}.en",
        tgt_file=f"{data_dir}/wmt14/{split_name}.de",
        src_tokenizer=f"{data_dir}/tokenizers/en_tokenizer.model",
        tgt_tokenizer=f"{data_dir}/tokenizers/de_tokenizer.model",
        output_prefix=f"{data_dir}/processed/{split_name}",
        max_len=100,
    )

Load tokenizers
Read and Tokenize and Save
Source File: data/wmt14/train.en
Target File: data/wmt14/train.de


Tokenizing: 0it [00:00, ?it/s]



Total: 4509342
Filtered: 37872
Kept: 4471470



Load tokenizers
Read and Tokenize and Save
Source File: data/wmt14/valid.en
Target File: data/wmt14/valid.de


Tokenizing: 0it [00:00, ?it/s]



Total: 3000
Filtered: 7
Kept: 2993



Load tokenizers
Read and Tokenize and Save
Source File: data/wmt14/test.en
Target File: data/wmt14/test.de


Tokenizing: 0it [00:00, ?it/s]



Total: 3003
Filtered: 1
Kept: 3002





In [29]:
# 토큰 예시
tokenizer = spm.SentencePieceProcessor()
tokenizer.load("data/tokenizers/en_tokenizer.model")
with open(f"data/processed/train.src", "r", encoding="utf-8") as f:
    for i, tokenized_line in enumerate(f):
        print(f"Token:\n{tokenized_line}")
        print(f"Encoded Tokens:\n{tokenizer.encode_as_ids(tokenized_line)}", )
        print("\n\n")
        if i == 10:
            break

Token:
▁Res umption ▁of ▁the ▁session

Encoded Tokens:
[1965, 9905, 1180, 25, 9, 4826]



Token:
▁I ▁declare ▁resumed ▁the ▁session ▁of ▁the ▁European ▁Parliament ▁adjourned ▁on ▁Friday ▁17 ▁December ▁1999, ▁and ▁I ▁would ▁like ▁once ▁again ▁to ▁wish ▁you ▁a ▁happy ▁new ▁year ▁in ▁the ▁hope ▁that ▁you ▁enjoyed ▁a ▁pleasant ▁festive ▁period .

Encoded Tokens:
[57, 8816, 11284, 9, 4826, 25, 9, 218, 466, 18978, 64, 6453, 2263, 2973, 9778, 34, 57, 295, 382, 1646, 548, 32, 1509, 106, 5, 3395, 342, 393, 28, 9, 1232, 65, 106, 6918, 5, 4290, 22668, 1495, 514]



Token:
▁Although , ▁as ▁you ▁will ▁have ▁seen , ▁the ▁dread ed ▁' mill ennium ▁bug ' ▁failed ▁to ▁material ise , ▁still ▁the ▁people ▁in ▁a ▁number ▁of ▁countries ▁suffered ▁a ▁series ▁of ▁natural ▁disasters ▁that ▁truly ▁were ▁dreadful .

Encoded Tokens:
[3275, 536, 103, 106, 164, 146, 2312, 536, 9, 16376, 1111, 736, 4166, 173, 28810, 1198, 8436, 736, 4765, 32, 1579, 53, 28805, 536, 907, 9, 474, 28, 5, 891, 25, 516, 7016, 5, 2728, 25,