In [1]:
# Mount Google Drive into the Colab filesystem so files under it can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the libraries this notebook imports.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel, and the
# extras are quoted so the shell never treats `[...]` as a glob pattern.
# NOTE(review): versions are not pinned — consider pinning for reproducible runs.
%pip install "transformers[torch]" sentencepiece datasets evaluate "sacrebleu[ko]"



In [1]:
import pandas as pd
import json
import numpy as np
import random

import logging
from tqdm import tqdm
import os

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

import evaluate

from datetime import datetime
from pytz import timezone

import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Timestamp (Asia/Seoul wall-clock time) identifying this run; used to name the output directory.
_run_started_at = datetime.now(tz=timezone('Asia/Seoul'))
train_serial = _run_started_at.strftime("%Y%m%d_%H%M%S")

In [2]:
# Base folder on the mounted Drive; the run's output directory and test.csv are resolved against it.
cwd = '/content/drive/MyDrive/2023_이어드림/모의3차/'

# Run configuration read by the cells below.
# The original literal listed 'max_length' twice (the second entry silently overrode the
# first); it is now declared exactly once, with the same value and key order.
cfg = {
    'seed': 42,
    # Not read by any code visible in this notebook.
    'gpu': 3,
    ## data / model
    'max_length': 64,
    # Despite the name, this is a CSV file path handed directly to pd.read_csv.
    'data_dir': 'train.csv',
    'val_ratio': 0.2,
    'model_name': 'Helsinki-NLP/opus-mt-tc-big-en-ko',
    ## training
    'train_batch_size': 16,
    'learning_rate': 2e-4,
    'weight_decay': 0.01,
    'num_epochs': 2,
    ## evaluation / output
    'eval_batch_size': 8,
    'output_dir': cwd + str(train_serial),
}

In [3]:
# set random seed
# Seed NumPy, torch and the stdlib RNG (in that order) with the configured value.
seed_value = cfg['seed']
for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
    seed_rng(seed_value)

# Use CUDA when torch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# set logger
# The run's output directory must exist before the log file is created inside it.
run_dir = cfg['output_dir']
os.makedirs(run_dir, exist_ok=True)

# Root logger: handlers and level are configured on it further down in this cell.
logger = logging.getLogger()
class kstFormatter(logging.Formatter):
    """Log formatter that renders record timestamps in the Asia/Seoul time zone."""

    def converter(self, timestamp):
        """Convert a POSIX timestamp to a timezone-aware Asia/Seoul datetime.

        The previous implementation ignored ``timestamp`` and returned the current
        time, so a record was stamped with the moment it was formatted rather than
        the moment it was created.
        """
        return datetime.fromtimestamp(timestamp, tz=timezone('Asia/Seoul'))

    def formatTime(self, record, datefmt=None):
        """Return the creation time of ``record`` as text.

        Args:
            record: The log record; its ``created`` attribute supplies the timestamp.
            datefmt: Optional ``strftime`` pattern. When omitted, an ISO-8601 string
                with millisecond precision is returned.
        """
        dt = self.converter(record.created)
        if datefmt:
            return dt.strftime(datefmt)
        return dt.isoformat(timespec='milliseconds')

# Log file lives inside the run's output directory. cfg['output_dir'] already contains
# `cwd`, so it is used directly instead of being joined onto `cwd` a second time.
log_path = os.path.join(cfg['output_dir'], 'log.txt')

# Reuse a handler already attached for this file so that re-running the cell does not
# register a second one (which would write every message twice).
file_handler = next(
    (handler for handler in logger.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename == os.path.abspath(log_path)),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(log_path)
    logger.addHandler(file_handler)

file_handler.setFormatter(kstFormatter('%(asctime)s | %(levelname)s | %(message)s',
                                        datefmt='%Y-%m-%d %H:%M:%S'))
logger.setLevel(logging.INFO)
logger.info('initialize logger')

INFO:root:initialize logger


In [5]:
# Load the tokenizer and seq2seq model named in the config, then move the model to `device`.
logger.info(f'load model: {cfg["model_name"]}')
checkpoint = cfg['model_name']
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

INFO:root:load model: Helsinki-NLP/opus-mt-tc-big-en-ko


In [6]:
# Number of leading CSV rows used for this run.
max_rows = 10000

# Let pandas stop after `max_rows` rows instead of parsing the whole file and slicing
# afterwards; the single-argument os.path.join around the path was a no-op and is gone.
train_frame = pd.read_csv(cfg['data_dir'], nrows=max_rows)
dataset = Dataset.from_pandas(train_frame)

# Hold out cfg['val_ratio'] of the rows as the 'test' split; the seed keeps the split repeatable.
dataset = dataset.train_test_split(test_size=cfg['val_ratio'], seed=cfg['seed'])
logger.info(f'dataset loaded from {cfg["data_dir"]}\n{dataset}')

INFO:root:dataset loaded from train.csv
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'mt', 'target'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['id', 'text', 'mt', 'target'],
        num_rows: 2000
    })
})


In [7]:
def preprocess_fn(batch, tokenizer = tokenizer):
    """Tokenize one batch of source/target sentence pairs.

    Args:
        batch: Mapping with a 'text' entry (source sentences) and a 'target' entry
            (reference translations).
        tokenizer: Tokenizer to apply; defaults to the tokenizer bound when this
            function was defined.

    Returns:
        The tokenizer's output for the batch, with both sides truncated to
        cfg['max_length'] tokens.
    """
    return tokenizer(
        text=batch['text'],
        text_target=batch['target'],
        truncation=True,
        max_length=cfg['max_length'],
    )

# Tokenize both splits in batches, dropping the original columns so that only the
# tokenizer's outputs remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_fn,
    batched=True,
    remove_columns=raw_columns,
)
logger.info(f'finished tokenization')
tokenized_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

INFO:root:finished tokenization


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [8]:
# Show the token ids of the first training example: source side first, then target side.
first_train_example = tokenized_dataset['train'][0]
for field in ('input_ids', 'labels'):
    print(first_train_example[field])

[0, 49, 4, 191, 30420, 0, 2379, 4246, 0, 218, 0, 7124, 7029, 13187, 400, 13229, 3, 2]
[5651, 4, 18, 8738, 7, 268, 12220, 5256, 311, 18924, 147, 421, 2716, 3, 2]


In [9]:
# Collator used by the trainer to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [10]:
# evaluate function
# Metric object used by compute_metrics to score translations.
metric = evaluate.load(path='sacrebleu')

def compute_metrics(eval_preds):
    """Decode predictions and references, then score them with the loaded metric.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict ``{'bleu': <score>}`` taken from the metric's ``'score'`` field.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking placeholder, not a vocabulary id, so it cannot be decoded.
    # The original replaced it only in `labels`; predictions padded with -100 would
    # reach batch_decode unchanged. Both arrays are now sanitized the same way.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    predictions = [text.strip() for text in decoded_preds]
    # The metric is called with a list of references per prediction, hence the nesting.
    references = [[text.strip()] for text in decoded_labels]

    # NOTE(review): the install cell pulls in sacrebleu[ko], but no `tokenize=` argument
    # is passed here — confirm the metric's default tokenization is intended for Korean.
    result = metric.compute(predictions=predictions, references=references)
    return {'bleu': result['score']}

In [11]:
# Training configuration for the Seq2SeqTrainer; values come from `cfg` where one exists.
training_args = Seq2SeqTrainingArguments(
    output_dir=cfg['output_dir'],
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`
    # — confirm against the installed version.
    evaluation_strategy='epoch',
    learning_rate=cfg['learning_rate'],
    weight_decay=cfg['weight_decay'],
    per_device_train_batch_size=cfg['train_batch_size'],
    per_device_eval_batch_size=cfg['eval_batch_size'],
    num_train_epochs=cfg['num_epochs'],
    save_total_limit=2,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # Mixed precision was hard-coded to True although the notebook falls back to the CPU
    # when CUDA is absent; enable it only when a CUDA device is actually available.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    logging_dir=os.path.join(cfg['output_dir'],'logs'),
    logging_steps=100,
    logging_first_step=True,
    overwrite_output_dir=True,
    seed=cfg['seed'],
    disable_tqdm=False,
    report_to='none'
)

In [12]:
# Assemble the trainer from the pieces built above; the 'test' split is the evaluation set.
train_split = tokenized_dataset['train']
eval_split = tokenized_dataset['test']

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Record the full set of resolved training arguments in the run log.
logger.info(f'trainer built : {trainer.args}')

INFO:root:trainer built : Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=1,
gradient_checkpointing=Fa

In [13]:
# Run fine-tuning; the returned summary is displayed as the cell's output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Bleu
1,4.1657,4.010334,1.861269
2,2.6542,3.864245,2.788052


TrainOutput(global_step=1000, training_loss=3.606754327774048, metrics={'train_runtime': 924.1916, 'train_samples_per_second': 17.312, 'train_steps_per_second': 1.082, 'total_flos': 734555277361152.0, 'train_loss': 3.606754327774048, 'epoch': 2.0})

In [14]:
# Persist the fine-tuned model into the run's output directory (reloaded from there for inference).
trainer.save_model(output_dir=cfg['output_dir'])

In [15]:
def _generate_for(sample_text):
    """Tokenize one sentence, move it to the training device and return the generated token ids."""
    encoded = tokenizer(sample_text, return_tensors='pt').to(training_args.device)
    return model.generate(**encoded)

# Sanity-check the fine-tuned model on the third example of each split.
output_ex_train = _generate_for(dataset['train'][2]['text'])
output_ex_test = _generate_for(dataset['test'][2]['text'])

In [20]:
# Display the raw token ids returned by model.generate for the training example.
output_ex_train

tensor([[32000,  4446,     7,  6403,  1968,    43,  5398,  7603,   393,     4,
           165, 19935,    11,   296,  1090,  2871,     8,    67,  5184,    12,
          8643,    42,    19,     3,     2]], device='cuda:0')

In [16]:
# Log source sentence, model translation and reference for one train and one test example.
train_sample = dataset['train'][2]
test_sample = dataset['test'][2]
train_translation = tokenizer.decode(output_ex_train[0], skip_special_tokens=True)
test_translation = tokenizer.decode(output_ex_test[0], skip_special_tokens=True)

logger.info(''.join([
    '[example-train] ', train_sample['text'], '\n-> ', train_translation,
    '\nlabel : ', train_sample['target'], '\n',
    '[example-test] ', test_sample['text'], '\n-> ', test_translation,
    '\nlabel : ', test_sample['target'],
]))


INFO:root:[example-train] In the meantime, the world is spending its intellectual, financial and technical capital on digging up resources that we cannot burn, otherwise we ourselves will burn.
-> 미래에는 세상이 점점 더 복잡해지고 있고, 우리가 감당할 수 없는 에너지 소비에 대한 부담이 줄어들고 있습니다.
label : 그 사이에 세계는 태울 수 없는 자원을 캐는 데 지적, 재정적, 기술적 자본을 쏟아붓고 있다. 우리 인류가 타 버리지 않도록 말이다.
[example-test] All I ask is your respect.
-> 내가 아는 모든 것은 당신의 마음입니다.
label : 내가 요구하는 것은 당신의 존경입니다.


In [None]:
# Drop the example outputs and the notebook-level model name, then ask torch to release
# cached GPU memory before the saved checkpoint is reloaded.
# NOTE(review): `trainer` and `data_collator` were constructed with this model and are not
# deleted here, so the weights may remain referenced — confirm memory is actually freed.
del output_ex_train
del output_ex_test
del model
torch.cuda.empty_cache()

# inference

In [46]:
# Reload tokenizer and model from the directory the trainer saved into, then move the model to `device`.
saved_dir = cfg['output_dir']
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(saved_dir)
model = model.to(device)

In [50]:
from torch.utils.data import DataLoader
import gc

# Number of test sentences translated per generate() call.
INFERENCE_BATCH_SIZE = 32

test_dataset = Dataset.from_pandas(pd.read_csv(os.path.join(cwd, 'test.csv')))
testloader = DataLoader(test_dataset, batch_size=INFERENCE_BATCH_SIZE)

# Collect results in plain lists and build the DataFrame once at the end. The previous
# per-batch pd.concat re-copied all earlier rows on every iteration and left repeated
# index labels in the result.
pred_ids, pred_texts = [], []
with torch.no_grad():
    for i, data in enumerate(testloader):
        encoding = tokenizer(data['text'], padding=True, return_tensors='pt').to(device)
        pred = model.generate(**encoding)

        batch_ids = data['id']
        # Keep plain Python values in the frame if the loader collated ids into a tensor.
        if torch.is_tensor(batch_ids):
            batch_ids = batch_ids.tolist()
        pred_ids.extend(batch_ids)
        pred_texts.extend(tokenizer.batch_decode(pred, skip_special_tokens=True))

        # `i` is a batch index, so progress is reported in batches (the old message
        # labelled it as a sample count).
        if i%10 == 0:
            print(f"{i + 1} batches completed")

        # Release per-batch tensors eagerly to limit GPU memory growth.
        del encoding
        gc.collect()
        torch.cuda.empty_cache()

preds = pd.DataFrame({'id': pred_ids, 'pred': pred_texts}, columns=['id', 'pred'])

# GPU RAM 5.4~14.3 GB

preds.head()

0 samples completed
10 samples completed
20 samples completed
30 samples completed
40 samples completed
50 samples completed
60 samples completed
70 samples completed
80 samples completed
90 samples completed
100 samples completed
110 samples completed
120 samples completed
130 samples completed
140 samples completed
150 samples completed
160 samples completed
170 samples completed
180 samples completed
190 samples completed
200 samples completed
210 samples completed
220 samples completed
230 samples completed
240 samples completed
250 samples completed
260 samples completed
270 samples completed
280 samples completed
290 samples completed
300 samples completed
310 samples completed
320 samples completed
330 samples completed


Unnamed: 0,id,pred
0,mcY54CiViEeLTeJ,그 예술가는 사진 촬영을 중단하고 사진 촬영을 중단했습니다.
1,aaUE07DPd3D4yU8,당신은 음악에 대해 이야기하고 있습니다.
2,ZmhbGbc4fuL6wWX,"내가 그것을 시도할 때, 내가 상상하는 것보다, 이 한국판은 좋습니다."
3,cuSNoRiXZm9ewP6,그것은 일종의 신호입니다.
4,tqa1wfPjLzhDnE3,"그것은 거의 불가능해 보이지만, 최근 연구에 따르면, 그 해저에는 고대의 화석보다 ..."


In [52]:
# Write the predictions next to the input data, without the DataFrame index column.
prediction_path = os.path.join(cwd, 'prediction.csv')
preds.to_csv(prediction_path, index=False)