In [17]:
import os
# Limit this process to GPUs 2 and 3. This runs in its own cell ahead of the
# torch/transformers imports so the setting is in place before they load.
os.environ["CUDA_VISIBLE_DEVICES"]= "2, 3"

In [178]:
import ast
import json
import random
from glob import glob

import nltk.translate.bleu_score as bleu
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [19]:
# Both splits live in the same dataset directory.
DATASET_DIR = '/home/kyoungmin_temp/laboratory/kor2kor/dataset'

train_csv = pd.read_csv(f'{DATASET_DIR}/circum_01_train_form.csv')
valid_csv = pd.read_csv(f'{DATASET_DIR}/circum_01_valid_form.csv')

In [20]:
# Peek at the first raw `translation` value to see how each row is stored.
train_csv['translation'][0]

"{'standard': '아빠 먼저 죽어버려서 우리 어머니가 빨리 늙었지요', 'jeju_dialect': '아방 모녀 죽어부난 우리 어멍 질레 늙어쭈마씸'}"

In [21]:
# Wrap the training DataFrame in a `Dataset`; the bare name displays its summary.
train_ds = Dataset.from_pandas(train_csv)
train_ds

Dataset({
    features: ['id', 'translation'],
    num_rows: 29565
})

In [22]:
# Wrap the validation DataFrame in a `Dataset`; the bare name displays its summary.
valid_ds = Dataset.from_pandas(valid_csv)
valid_ds

Dataset({
    features: ['id', 'translation'],
    num_rows: 6422
})

In [23]:
# Bundle both splits under the 'train' / 'validation' keys used further down.
trainval_ds = DatasetDict({'train': train_ds, 'validation': valid_ds})
trainval_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 29565
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 6422
    })
})

In [24]:
# Check one training row: `translation` is the string form of a dict, not a dict.
trainval_ds["train"][1]["translation"]

"{'standard': '우리 오라버니 온갖 거 다 알아 우리 오라버니에게 물어 봐', 'jeju_dialect': '우리 오라방 하간 거 다 알메 우리 오라방신디 들어 봐'}"

In [25]:
# Checkpoint name and cache location shared by the tokenizer and the model.
CHECKPOINT = 'gogamza/kobart-base-v2'
HF_CACHE_DIR = '/home/kyoungmin_temp/HF_CACHE'

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, cache_dir=HF_CACHE_DIR)

model = AutoModelForSeq2SeqLM.from_pretrained(
    CHECKPOINT,
    cache_dir=HF_CACHE_DIR,
    torch_dtype='auto',
    low_cpu_mem_usage=True,
)
# Move the loaded weights onto the GPU.
model = model.to(device='cuda', non_blocking=True)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [26]:
# Collator built from the tokenizer and model; it is handed to the trainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [27]:
# Keys of each translation record: the model reads the dialect and writes the standard form.
source_lang = "jeju_dialect"
target_lang = "standard"


def preprocess_function(examples):
    """Tokenize one batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``. Its
            ``"translation"`` entry is a list whose items are either dicts or
            the string form of a dict, each holding the ``source_lang`` and
            ``target_lang`` keys.

    Returns:
        Whatever ``tokenizer`` returns for the source sentences with the target
        sentences passed as ``text_target`` (``max_length=40``, truncation on).

    Raises:
        ValueError, SyntaxError: If a string row is not a valid Python literal.
        KeyError: If a parsed row lacks ``source_lang`` or ``target_lang``.
    """
    # Rows come from a CSV file, so treat them as data: ast.literal_eval only
    # accepts Python literals, whereas eval() would execute any expression
    # stored in the file. Each row is parsed once and reused for both sides.
    pairs = [
        ast.literal_eval(row) if isinstance(row, str) else row
        for row in examples["translation"]
    ]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return tokenizer(inputs, text_target=targets, max_length=40, truncation=True)

In [28]:
# Tokenize both splits in batches and drop the original columns so that only
# the fields returned by preprocess_function remain.
tokenized_data = trainval_ds.map(preprocess_function, batched=True, remove_columns=trainval_ds["train"].column_names)

Map: 100%|███████████████████████████████████████████████| 29565/29565 [00:02<00:00, 14108.67 examples/s]
Map: 100%|█████████████████████████████████████████████████| 6422/6422 [00:00<00:00, 12575.38 examples/s]


In [29]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a flat list of stripped predictions and a
        list in which every stripped reference is wrapped in its own
        one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Decode predictions and labels, then report the metric score and mean generated length.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        Dict with ``"bleu"`` (the ``"score"`` entry returned by ``metric.compute``)
        and ``"gen_len"`` (mean count of non-pad tokens per prediction), both
        rounded to 4 decimals.
    """
    # NOTE(review): `metric` is not defined in the visible cells; it must exist
    # as a global exposing compute(predictions=..., references=...) before use.
    generated_ids, label_ids = eval_preds
    if isinstance(generated_ids, tuple):
        generated_ids = generated_ids[0]

    pad_id = tokenizer.pad_token_id
    pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # -100 marks ignored label positions; swap it for the pad id so decoding works.
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    pred_texts, label_texts = postprocess_text(pred_texts, label_texts)

    score = metric.compute(predictions=pred_texts, references=label_texts)["score"]
    mean_len = np.mean([np.count_nonzero(row != pad_id) for row in generated_ids])

    return {"bleu": round(score, 4), "gen_len": round(mean_len, 4)}

In [30]:
# Show the tokenized splits and the columns they now contain.
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 29565
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6422
    })
})

In [31]:
from transformers import Trainer, TrainingArguments

# Name shared by the output directory and the experiment-tracking run.
MODEL_NAME = "KoBART_base_v2-trial2"
# Evaluation, logging and checkpointing all use the same step interval.
STEP_INTERVAL = 50
# Per-device batch size for both training and evaluation.
BATCH_SIZE = 32

args = Seq2SeqTrainingArguments(
    # Where to write and what to call the run.
    output_dir="./" + MODEL_NAME,
    run_name=MODEL_NAME,
    report_to=["wandb"],
    push_to_hub=True,
    # Batch sizes and precision.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=20,
    weight_decay=0.1,
    # Evaluate, log and checkpoint on the same cadence.
    evaluation_strategy="steps",
    eval_steps=STEP_INTERVAL,
    logging_steps=STEP_INTERVAL,
    save_steps=STEP_INTERVAL,
    load_best_model_at_end=True,
)

# NOTE(review): compute_metrics is not passed to the trainer here.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable

In [32]:
# Start fine-tuning with the arguments configured on `trainer`.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,2.3889,0.542521
100,0.5339,0.432828
150,0.4609,0.418016
200,0.4631,0.416714
250,0.4065,0.377481
300,0.3898,0.353903
350,0.3637,0.338873
400,0.3347,0.327471
450,0.3428,0.308735
500,0.2871,0.318907


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2310, training_loss=0.2511038686289932, metrics={'train_runtime': 976.8619, 'train_samples_per_second': 151.326, 'train_steps_per_second': 2.365, 'total_flos': 2862858555648000.0, 'train_loss': 0.2511038686289932, 'epoch': 5.0})

In [33]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

model.safetensors: 100%|██████████████████████████████████████████████| 496M/496M [01:04<00:00, 7.70MB/s]


'https://huggingface.co/Seoulsky/KoBART_base_v2-trial2/tree/main/'

In [34]:
import torch
from transformers import pipeline

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Translation pipeline built from MODEL_NAME, with max_length set to 40.
pipe = pipeline("translation", model=MODEL_NAME, device=device, max_length=40)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [35]:
# Collect the test JSON files; sorting keeps the order (and therefore any index
# into this list) the same between runs.
test_txt_path_lst = sorted(glob('/home/kyoungmin_temp/laboratory/kor2kor/dataset/aihub_older_jeju/test_circum_01/*.json'))
len(test_txt_path_lst)

4498

In [303]:
def _drop_repeated_tail(tokens, width):
    """Remove, in place, trailing ``width``-token groups that exactly repeat the group before them."""
    span = 2 * width
    # The length guard stops the loop once fewer than two full groups remain,
    # so the slices never compare against a missing group.
    while len(tokens) >= span and tokens[-width:] == tokens[-span:-width]:
        del tokens[-width:]


def _drop_char_equivalent_tail(tokens, width):
    """Remove, in place, trailing ``width``-token groups that are character-redundant.

    The last group is dropped while the character set of either of the last two
    groups already contains the character set of the other.
    """
    span = 2 * width
    while len(tokens) >= span:
        tail_chars = set(''.join(tokens[-width:]))
        prev_chars = set(''.join(tokens[-span:-width]))
        # Equivalent to "one group's characters equal the characters of both
        # groups combined".
        if not (prev_chars <= tail_chars or tail_chars <= prev_chars):
            break
        del tokens[-width:]


def output_processing(result_txt):
    """Clean a raw generation by cutting trailing junk and repeated endings.

    Steps, in order:
      1. Strip surrounding spaces, delete newlines and split on single spaces.
      2. Keep only the tokens before the first empty token (i.e. before the
         first run of two or more spaces).
      3. Collapse a repeated final token.
      4. For groups of two, then three tokens: drop exact repetitions of the
         final group, then drop final groups that are character-redundant with
         the group before them.

    Args:
        result_txt: Raw text produced by the model.

    Returns:
        The cleaned text with tokens joined by single spaces; ``''`` for empty
        or all-space input.
    """
    tokens = result_txt.strip(' ').replace('\n', '').split(' ')

    # Explicit membership test instead of a bare `except` around list.index().
    if '' in tokens:
        tokens = tokens[:tokens.index('')]

    _drop_repeated_tail(tokens, 1)
    for width in (2, 3):
        _drop_repeated_tail(tokens, width)
        _drop_char_equivalent_tail(tokens, width)

    return ' '.join(tokens)

In [305]:
# Inspect one test file end to end: load it, translate the dialect text, clean
# the generation with output_processing and score it against the reference.
# random_idx = random.randrange(len(test_txt_path_lst))  # randint(0, len(...)) includes len(...) itself
random_idx = 4217 # 744 # 542 # 3286
# 4334
sample_path = test_txt_path_lst[random_idx]
print(f'Set random index: {random_idx}')
print(f'The random path: {sample_path}')
# The transcripts contain Korean text, so decode explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(sample_path, encoding='utf-8') as f:
    sample_json = json.load(f)

segments = sample_json['transcription']['segments']
dialect_txt = ' '.join(x['dialect'] for x in segments)
# Segments without a standard form keep their dialect text in the reference.
ground_truth = ' '.join(x['dialect'] if x['standard'] is None else x['standard'] for x in segments)
model_result = pipe(dialect_txt, num_return_sequences=1, pad_token_id=0)[0]['translation_text']
post_process_txt = output_processing(model_result)

# sentence_bleu takes a list of tokenized references and one tokenized hypothesis.
reference = [ground_truth.split()]
model_output = post_process_txt.split()

print(f"dialect: {dialect_txt}")
print(f"standard: {ground_truth}")
print(f"translated: {model_result}")
print(f"post processing: {post_process_txt}")
print(f'BLEU Score: {bleu.sentence_bleu(reference, model_output)}')

Set random index: 4217
The random path: /home/kyoungmin_temp/laboratory/kor2kor/dataset/aihub_older_jeju/test_circum_01/st_set1_collectorjj67_speakerjj1025_55_3.json
dialect: 무슨 걱정이시냐
standard: 무슨 걱정이 있니
translated: 무슨 걱정이있니 무슨 걱정이있니  걱정이있니   있니  걱정이있니                  
post processing: 무슨 걱정이있니
BLEU Score: 9.291879812217675e-232


In [306]:
import nltk.translate.bleu_score as bleu

In [307]:
# Re-score the most recently processed sample: whitespace-tokenized reference
# (wrapped in a list) against the whitespace-tokenized cleaned prediction.
reference = [ground_truth.split()]
model_output = post_process_txt.split()
# rouge = Rouge()

print(f'BLEU Score: {bleu.sentence_bleu(reference, model_output)}')
# print(f'Rouge Score: {rouge.get_scores(model_output, reference)}')

BLEU Score: 9.291879812217675e-232


In [308]:
# Column-wise accumulator: one entry per test file in every list.
bleu_result = {'path': [], 'bleu_score': [], 'dialect': [], 'standard': [], 'predict': []}

for sample_path in tqdm(test_txt_path_lst):
    # The transcripts contain Korean text, so decode explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(sample_path, encoding='utf-8') as f:
        sample_json = json.load(f)

    segments = sample_json['transcription']['segments']
    dialect_txt = ' '.join(x['dialect'] for x in segments)
    # Segments without a standard form keep their dialect text in the reference.
    ground_truth = ' '.join(x['dialect'] if x['standard'] is None else x['standard'] for x in segments)
    model_result = pipe(dialect_txt, num_return_sequences=1, pad_token_id=0)[0]['translation_text']
    post_process_txt = output_processing(model_result)

    # Sentence-level BLEU on whitespace tokens of the cleaned prediction.
    reference = [ground_truth.split()]
    model_output = post_process_txt.split()
    bleu_score = bleu.sentence_bleu(reference, model_output)

    bleu_result['path'].append(os.path.basename(sample_path))
    bleu_result['bleu_score'].append(bleu_score)
    bleu_result['dialect'].append(dialect_txt)
    bleu_result['standard'].append(ground_truth)
    bleu_result['predict'].append(post_process_txt)

100%|████████████████████████████████████████████████████████████████| 4498/4498 [16:04<00:00,  4.66it/s]


In [310]:
# Sum of the per-sentence BLEU scores collected in `bleu_result`.
sum(bleu_result['bleu_score'])

3232.568386814992

In [313]:
# Mean sentence-level BLEU: the sum divided by the number of evaluated files.
sum(bleu_result['bleu_score']) / len(bleu_result['path'])

0.7186679383759431

In [315]:
# `predict` already holds plain strings (the value returned by
# output_processing), and ' '.join() on a str inserts a space between every
# character. Join only entries that are still token sequences; this also keeps
# the cell safe to run more than once.
bleu_result['predict'] = [
    text if isinstance(text, str) else ' '.join(text)
    for text in bleu_result['predict']
]

In [317]:
# Persist the per-file results (path, score, dialect, reference, prediction) as UTF-8 CSV.
pd.DataFrame(bleu_result).to_csv('./jeju_bleu_score_result.csv', encoding='utf-8')

In [115]:
# Raw pipeline output for the current `dialect_txt`, before any post-processing.
pipe(dialect_txt, num_return_sequences=1, pad_token_id=0)

[{'translation_text': '가서 보니 형은 아무렇지도 않게 마루에 앉았더라    였더라                  '}]

In [53]:
# Split the raw translation on single spaces; runs of consecutive spaces in the
# generation show up as '' entries in the resulting list.
post_process = pipe(dialect_txt, num_return_sequences=1, pad_token_id=0)[0]["translation_text"].split(' ')
post_process



['옛날은',
 '거름에',
 '보리',
 '씨를',
 '섞고',
 '그',
 '거름을',
 '집어',
 '넣으면서',
 '보리를',
 '갈았었어',
 '',
 '쌌었어',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [54]:
# Keep only the tokens before the first empty string and re-join them.
# NOTE: list.index raises ValueError when the list contains no '' entry.
' '.join(post_process[:post_process.index('')])

'옛날은 거름에 보리 씨를 섞고 그 거름을 집어 넣으면서 보리를 갈았었어'