# set up

In [None]:
%mkdir /kaggle/working/dataset
%mkdir /kaggle/working/dataset/tokenizer
%mkdir /kaggle/working/dataset/statistic

In [None]:
# Fetch the nlp_data_processing repository into the working directory;
# the import cell below loads its utils modules from this checkout.
!git clone https://github.com/pnthi1604/nlp_data_processing

In [None]:
# Delete every aiohttp* entry from the Python 3.10 site-packages, then force-reinstall
# aiohttp pinned to 3.9.1 without touching its dependencies (--no-deps).
# NOTE(review): presumably a workaround for a broken preinstalled aiohttp on the
# Kaggle image — confirm it is still needed; the python3.10 path is hard-coded.
!rm -rf /opt/conda/lib/python3.10/site-packages/aiohttp*
!pip install --force-reinstall --no-deps aiohttp==3.9.1

In [None]:
# Install extra third-party packages.
# NOTE(review): none of these is imported directly in this notebook; presumably they
# are required by the nlp_data_processing helpers — confirm.
!pip install contractions
!pip install bs4
!pip install underthesea

# import

In [None]:
from nlp_data_processing.utils.mapping import (
    separate_text,
    separate_word,
    normalize_punctuation_spacing,
    contraction,
    word_tokenize_vn,
    separate_text_with_min_max_len,
)

from nlp_data_processing.utils.statistic import (
    draw_graph,
    draw_hist_graph,
    get_length_tokens,
)

from nlp_data_processing.utils.save import (
    write_file,
)

from nlp_data_processing.utils.filter import (
    condition_length_with_tokenizer,
    condition_non_number_character,
    condition_min_max_length,
)

from nlp_data_processing.utils.tokenizers import (
    ApiTokenizerHuggingFace,
    read_tokenizer,
    BPE_TOKEN,
    WORDPIECE_TOKEN,
    WORDLEVEL_TOKEN,
)

from nlp_data_processing.utils.seed import set_seed

from nlp_data_processing.utils.create_noise import (
    token_masking,
    token_deletion,
    document_rotation,
    text_infilling,
    sentence_permutation,
    TOKEN_MASKING,
    TOKEN_DELETION,
    DOCUMENT_ROTATION,
    TEXT_INFILLING,
    SENTENCE_PERMUTATION,
)

import pandas as pd
import torch
import os

# config

In [None]:
# All notebook settings live in one dict; later cells read from it and some
# (vocab sizes, min/max token length) overwrite entries with measured values.
config = {
    # Row caps: "max_sample" limits the rows kept from the raw CSV,
    # "max_get_sample" is the budget used for per-range quotas and the final dataset size.
    "max_sample": 10_000_000_000,
    "max_get_sample": 10_000_000,
    # Tiny values for a quick trial run:
    # "max_sample": 10,
    # "max_get_sample": 25,

    # Input / output CSV locations.
    "raw_data_path": "/kaggle/input/dataset/bart/pretrain/raw_data/dataset.csv",
    "train_data_path": "/kaggle/working/dataset/train.csv",

    # Length bounds used by the length filters, and the split ranges.
    "min_len_token": 6,
    "max_len_token": 196,
    "ranges": [(6, 100, 0.2), (101, 196, 0.8)],  # (min, max, ratio)

    # Column names: noised source text and clean target text.
    "lang_src": "noise_vi",
    "lang_tgt": "vi",

    # Tokenizer training settings.
    "vocab_size_src": 35000,
    "vocab_size_tgt": 35000,
    "min_frequency": 2,
    "seed": 42,
    "special_tokens": [
        "<s>",
        "</s>",
        "<pad>",
        "<unk>",
        "<mask>",
        "<cls>",
        "<sep>",
    ],
    "type_token_src": WORDPIECE_TOKEN,
    "type_token_tgt": WORDPIECE_TOKEN,

    # Noise pipeline: "type_noise" and "ratio" are parallel lists, applied in order.
    # Single-noise presets that were used before (type -> ratio):
    #   TOKEN_MASKING        -> 0.3
    #   TOKEN_DELETION       -> 0.15
    #   DOCUMENT_ROTATION    -> 1
    #   TEXT_INFILLING       -> 0.2
    #   SENTENCE_PERMUTATION -> 1
    # NOTE: the noise cell indexes both entries as lists, so a single preset
    # has to be written as one-element lists, e.g. [TOKEN_MASKING] / [0.3].
    "type_noise": [TEXT_INFILLING, SENTENCE_PERMUTATION],
    "ratio": [0.3, 1],

    # Tokenizer training switch and where the tokenizer JSON files are stored.
    "train_tokenizer": True,
    "tokenizer_src_path": "/kaggle/working/dataset/tokenizer/tokenizer_src.json",
    "tokenizer_tgt_path": "/kaggle/working/dataset/tokenizer/tokenizer_tgt.json",

    # Output files for the token-length histograms.
    "graph_len_token_src_path": "/kaggle/working/dataset/statistic/len_token_src.png",
    "graph_len_token_tgt_path": "/kaggle/working/dataset/statistic/len_token_tgt.png",

    # Text file that receives the run summary.
    "desc_path": "/kaggle/working/dataset/desc.txt",
}

# Dispatch table: noise-type constant -> function that applies that noise.
GET_NOISE_FN = {
    TOKEN_MASKING: token_masking,
    TOKEN_DELETION: token_deletion,
    DOCUMENT_ROTATION: document_rotation,
    TEXT_INFILLING: text_infilling,
    SENTENCE_PERMUTATION: sentence_permutation,
}

In [None]:
# Seed via the project's set_seed helper, using the configured value.
# NOTE(review): which random generators it seeds is defined in nlp_data_processing — confirm.
set_seed(seed=config["seed"])

# read raw data

In [None]:
# Load at most config["max_sample"] rows of the raw CSV. Passing the cap as nrows lets
# pandas stop parsing at the limit instead of loading the whole file and slicing afterwards.
raw_data = pd.read_csv(config["raw_data_path"], nrows=config["max_sample"])

# Drop rows with any missing value, then exact duplicate rows.
raw_data = raw_data.dropna()
raw_data = raw_data.drop_duplicates()
raw_data

In [None]:
# Keep only the text column, renamed from "Contents" to the configured source-column
# name, then remove duplicate and missing rows once more.
source_column = config["lang_src"]
raw_data = (
    raw_data.rename(columns={"Contents": source_column})[[source_column]]
    .drop_duplicates()
    .dropna()
)
raw_data

# split sentence

In [None]:
# Split every document once per configured (min_len, max_len, ratio) range, keep at most
# that range's share of the sample budget, then stack the per-range results.
src_col = config["lang_src"]
range_frames = []
for min_len, max_len, ratio in config["ranges"]:
    tmp_raw_data = raw_data.copy()
    # Each cell becomes the collection of pieces returned by the splitter
    # (explode below implies a list-like return value).
    tmp_raw_data[src_col] = tmp_raw_data[src_col].apply(
        lambda text: separate_text_with_min_max_len(
            text=text,
            min_len=min_len,
            max_len=max_len,
        )
    )
    # explode() yields one row per piece but turns an empty result into a NaN row.
    # Drop those before truncating so they neither use up the range's quota nor
    # reach the string operations in later cells.
    tmp_raw_data = tmp_raw_data.explode(src_col).dropna(subset=[src_col])
    max_sample = int(config["max_get_sample"] * ratio)
    range_frames.append(tmp_raw_data[:max_sample])

# concat always returns a fresh frame with a clean 0..n-1 index, so the column
# assignment below never writes into a slice of another frame.
raw_data = pd.concat(range_frames, ignore_index=True)
# The target column starts as an exact copy of the source column.
raw_data[config["lang_tgt"]] = raw_data[src_col]
raw_data

# filter number character

In [None]:
# raw_data = raw_data[raw_data.apply(
#     lambda text: condition_non_number_character(
#         text=text[config["lang_src"]],
#     ) and condition_non_number_character(
#         text=text[config["lang_tgt"]],
#     ),
#     axis=1,
# )]
# raw_data.reset_index(drop=True, inplace=True)
# raw_data = raw_data[:int(config["max_get_sample"] * 1.1)]
# raw_data

# filter by number of words

In [None]:
def _row_within_length_limits(row):
    """Evaluate the min/max length check on the source text and then on the target text of one row."""
    return condition_min_max_length(
        text=row[config["lang_src"]],
        min_len=config["min_len_token"],
        max_len=config["max_len_token"],
    ) and condition_min_max_length(
        text=row[config["lang_tgt"]],
        min_len=config["min_len_token"],
        max_len=config["max_len_token"],
    )


# Keep the rows that pass both checks, renumber them, and cap the result at 110 % of
# the sample budget.
length_mask = raw_data.apply(_row_within_length_limits, axis=1)
raw_data = raw_data[length_mask].reset_index(drop=True)
raw_data = raw_data[:int(config["max_get_sample"] * 1.1)]
raw_data

# normalize data

In [None]:
def mapping_item(item):
    """Lower-case ``item``, pass it through ``normalize_punctuation_spacing`` and strip outer whitespace."""
    return normalize_punctuation_spacing(item.lower()).strip()


# Number of noised rows to produce: 110 % of the sample budget.
target_rows = int(config["max_get_sample"] * 1.1)

# .copy() makes the truncated frame independent, so the column assignments below
# do not write into a slice of the previous cell's frame.
raw_data = raw_data[:target_rows].copy()

# Normalise both columns; raw_data stays the clean reference for every noise pass.
raw_data[config["lang_src"]] = raw_data[config["lang_src"]].map(mapping_item)
raw_data[config["lang_tgt"]] = raw_data[config["lang_tgt"]].map(mapping_item)

# With no rows the loop below could never reach its target and would spin forever.
if target_rows > 0 and raw_data.empty:
    raise ValueError("raw_data is empty after filtering; cannot build noised samples")

sum_item = 0
raw_datas = []
type_noises = config["type_noise"]
ratios = config["ratio"]
while sum_item < target_rows:
    # Noise a fresh copy on every pass. Without the copy, each pass would add noise on
    # top of the previous pass and every list entry would be the very same frame.
    noised_data = raw_data.copy()
    for idx, type_noise in enumerate(type_noises):
        ratio = ratios[idx]
        noise_fn = GET_NOISE_FN[type_noise]
        # Only the source column is noised; the target column keeps the clean text.
        noised_data[config["lang_src"]] = noised_data[config["lang_src"]].map(
            lambda item: noise_fn(
                text=item,
                ratio=ratio,
            )
        )
    raw_datas.append(noised_data)
    sum_item += len(noised_data)

# Clean each noised copy: drop duplicate rows, drop rows with missing values, renumber.
raw_datas = [
    frame.drop_duplicates().dropna().reset_index(drop=True)
    for frame in raw_datas
]

# build tokenizer for dataset

In [None]:
# Take the first noised copy as `dataset`; the tokenizer-training cell below reads
# its source and target columns.
dataset = raw_datas[0]

In [None]:
def _make_tokenizer_trainer(lang_key, vocab_size_key, type_token_key):
    """Build an ApiTokenizerHuggingFace for the dataset column and settings named by the given config keys."""
    return ApiTokenizerHuggingFace(
        dataset=dataset[config[lang_key]],
        vocab_size=config[vocab_size_key],
        min_frequency=config["min_frequency"],
        special_tokens=config["special_tokens"],
        type_token=config[type_token_key],
    )


# Train and store one tokenizer per side, only when training is switched on.
if config["train_tokenizer"]:
    trainer_tokenizer_src = _make_tokenizer_trainer(
        "lang_src", "vocab_size_src", "type_token_src"
    )
    trainer_tokenizer_tgt = _make_tokenizer_trainer(
        "lang_tgt", "vocab_size_tgt", "type_token_tgt"
    )

    # Train source first, then target.
    tokenzier_src = trainer_tokenizer_src.train()
    tokenzier_tgt = trainer_tokenizer_tgt.train()

    # Write each trained tokenizer to its configured path.
    tokenzier_src.save(config["tokenizer_src_path"])
    tokenzier_tgt.save(config["tokenizer_tgt_path"])

In [None]:
# Load both tokenizers from the configured files.
tokenizer_paths = {
    "tokenizer_src_path": config["tokenizer_src_path"],
    "tokenizer_tgt_path": config["tokenizer_tgt_path"],
}
tokenizer_src, tokenizer_tgt = read_tokenizer(**tokenizer_paths)

# Replace the requested vocabulary sizes with the sizes the loaded tokenizers report.
config.update(
    vocab_size_src=tokenizer_src.get_vocab_size(),
    vocab_size_tgt=tokenizer_tgt.get_vocab_size(),
)

for label, size_key in (
    ("Vocab size src: ", "vocab_size_src"),
    ("Vocab size tgt: ", "vocab_size_tgt"),
):
    print(label, config[size_key])

# filter length tokens

In [None]:
def _row_within_token_limits(row):
    """Evaluate the tokenizer-based length check on the source text and then on the target text of one row."""
    return condition_length_with_tokenizer(
        tokenizer=tokenizer_src,
        text=row[config["lang_src"]],
        min_len_token=config["min_len_token"],
        max_len_token=config["max_len_token"],
    ) and condition_length_with_tokenizer(
        tokenizer=tokenizer_tgt,
        text=row[config["lang_tgt"]],
        min_len_token=config["min_len_token"],
        max_len_token=config["max_len_token"],
    )


# Filter every noised copy, replacing the entries of the existing list in place.
raw_datas[:] = [
    frame[frame.apply(_row_within_token_limits, axis=1)]
    for frame in raw_datas
]
print("Success\n")

In [None]:
# Stack all noised copies, keep the first config["max_get_sample"] rows and renumber them.
row_cap = config["max_get_sample"]
combined = pd.concat(raw_datas, ignore_index=True)
dataset = combined[:row_cap].reset_index(drop=True)

# Write the training CSV (without the index column) ...
output_path = config["train_data_path"]
dataset.to_csv(output_path, index=False)

# ... and read it straight back, so `dataset` now holds the file's contents.
dataset = pd.read_csv(output_path)

In [None]:
# Show the dataset as read back from the saved CSV.
dataset

In [None]:
# Token lengths for the source and target columns, each measured with its own tokenizer.
lenght_data_src = get_length_tokens(
    tokenizer=tokenizer_src,
    dataset=dataset[config["lang_src"]],
)

lenght_data_tgt = get_length_tokens(
    tokenizer=tokenizer_tgt,
    dataset=dataset[config["lang_tgt"]],
)

# Record the observed extremes over both sides. Combining the per-side min/max avoids
# building two concatenated copies of the length data, and does not depend on `+`
# meaning concatenation (for array-like results `+` would add element-wise).
config["min_len_token"] = min(min(lenght_data_src), min(lenght_data_tgt))
config["max_len_token"] = max(max(lenght_data_src), max(lenght_data_tgt))

# One histogram per side, saved to the configured image paths.
for length_data, graph_path in (
    (lenght_data_src, config["graph_len_token_src_path"]),
    (lenght_data_tgt, config["graph_len_token_tgt_path"]),
):
    draw_hist_graph(
        title="Histogram length tokens",
        xlabel="Length tokens",
        ylabel="Frequency",
        data=length_data,
        save_path=graph_path,
    )

# save desc

In [None]:
# Record the final row count, then write a plain-text summary of the run settings.
config["length_dataset"] = len(dataset)

# (label, config key) pairs in the order they appear in the summary file.
# Labels are written to the file verbatim, so their spelling is kept unchanged.
desc_fields = (
    ("Vocab size src", "vocab_size_src"),
    ("Vocab size tgt", "vocab_size_tgt"),
    ("Min frequency", "min_frequency"),
    ("Min len token", "min_len_token"),
    ("Max len token", "max_len_token"),
    ("Type token src", "type_token_src"),
    ("Type token tgt", "type_token_tgt"),
    ("Special tokens", "special_tokens"),
    ("Length dataset", "length_dataset"),
    ("Type nosie", "type_noise"),
    ("Ratio", "ratio"),
    ("Ranges", "ranges"),
)
config["desc"] = "\n".join(
    f"{label}: {config[key]}" for label, key in desc_fields
)

write_file(
    file_name=config["desc_path"],
    content=config["desc"],
)