In [None]:
!pip install torch transformers konlpy tqdm sacrebleu

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12=

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast, AdamW, get_scheduler
from tqdm.auto import tqdm
import sacrebleu

# Helper that loads a text file as a list of whitespace-stripped lines
def read_text_file(file_path):
    """Read ``file_path`` as UTF-8 and return its lines.

    Args:
        file_path: Path of the text file to open.

    Returns:
        A list with one string per line of the file, each with leading and
        trailing whitespace (including the newline) removed. Blank lines are
        kept as empty strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return stripped_lines

# Directory that holds the corpus files
directory_path = '/content/drive/MyDrive/archive'

# Corpus file names and, index-aligned, the notebook variable each one feeds
file_names = ['je.train', 'ko.train', 'je.dev', 'ko.dev']
variable_names = ['je_train', 'ko_train', 'je_val', 'ko_val']

# Read every file once, keyed by the name of the variable it belongs to
corpus = {}
for file_name, variable_name in zip(file_names, variable_names):
    file_path = os.path.join(directory_path, file_name)
    corpus[variable_name] = read_text_file(file_path)

# Bind the four splits explicitly instead of writing into globals(), so the
# definitions are visible to readers, linters and "go to definition".
je_train = corpus['je_train']
ko_train = corpus['ko_train']
je_val = corpus['je_val']
ko_val = corpus['ko_val']

In [None]:
# Keep only the first 10,000 entries of each training list
je_train, ko_train = je_train[:10000], ko_train[:10000]

In [None]:
def _prepend_tag(tag, sentences):
    """Return a new list in which ``tag`` is prepended to every sentence."""
    return [tag + sentence for sentence in sentences]


# Mark each source sentence with a direction tag: `ko` sentences (whose
# targets are the `je` sentences) get "<2je> ", `je` sentences get "<2ko> ".
tagged_ko_train = _prepend_tag("<2je> ", ko_train)
tagged_je_train = _prepend_tag("<2ko> ", je_train)

tagged_ko_test = _prepend_tag("<2je> ", ko_val)
tagged_je_test = _prepend_tag("<2ko> ", je_val)

# Concatenate both directions; sources and targets stay index-aligned
train_src_texts = [*tagged_ko_train, *tagged_je_train]
train_tgt_texts = [*je_train, *ko_train]

test_src_texts = [*tagged_ko_test, *tagged_je_test]
test_tgt_texts = [*je_val, *ko_val]

In [None]:
# Load the pretrained KoBART checkpoint and its matching tokenizer
model_name = 'hyunwoongko/kobart'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)

class TranslationDataset(Dataset):
    """Map-style dataset of (source, target) sentence pairs for seq2seq training.

    Each item is a dict holding the tokenizer's encoding of the source
    sentence (every key the tokenizer returns) plus ``labels`` with the target
    token ids. Both sides are truncated/padded to ``max_length``. Padded label
    positions are replaced by ``IGNORE_INDEX`` so the loss is not computed on
    padding.

    Args:
        src_texts: Source sentences.
        tgt_texts: Target sentences, index-aligned with ``src_texts``.
        tokenizer: Callable Hugging Face tokenizer used for both sides.
        max_length: Fixed sequence length after truncation and padding.

    Raises:
        ValueError: If ``src_texts`` and ``tgt_texts`` differ in length.
    """

    # Label value that Hugging Face seq2seq models exclude from the loss
    IGNORE_INDEX = -100

    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=128):
        if len(src_texts) != len(tgt_texts):
            raise ValueError(
                f"src_texts and tgt_texts must have the same length, "
                f"got {len(src_texts)} and {len(tgt_texts)}"
            )
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_texts)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return model inputs plus masked ``labels``."""
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        inputs = self.tokenizer(
            src_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # `text_target=` is the replacement for the deprecated
        # `as_target_tokenizer()` context manager.
        targets = self.tokenizer(
            text_target=tgt_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer is called on a single string with return_tensors="pt";
        # drop only the leading batch dimension (a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1).
        item = {key: val.squeeze(0) for key, val in inputs.items()}

        label_ids = targets['input_ids'].squeeze(0)
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            # Without this, the loss is also computed on the padding tokens.
            label_ids = label_ids.masked_fill(label_ids == pad_id, self.IGNORE_INDEX)
        item['labels'] = label_ids

        return item

# Wrap the sentence pairs in PyTorch datasets
train_dataset = TranslationDataset(train_src_texts, train_tgt_texts, tokenizer)
test_dataset = TranslationDataset(test_src_texts, test_tgt_texts, tokenizer)

# Batch the data; only the training split is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# Training setup.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. `eps` and
# `weight_decay` are pinned to the defaults of the transformers implementation
# (1e-6 and 0.0) so the hyper-parameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warm-up, stepped once per optimizer step
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Train on the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Train the model: one progress-bar tick per optimizer step
progress_bar = tqdm(range(num_training_steps))

model.train()
train_losses = []  # mean training loss of each epoch

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        epoch_loss += loss.item()
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    train_losses.append(avg_epoch_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss}")

# Finalize the progress bar so it does not stay open after training
progress_bar.close()

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


  0%|          | 0/12500 [00:00<?, ?it/s]

Epoch [1/5], Loss: 0.305398824133724
Epoch [2/5], Loss: 0.12220019324049354
Epoch [3/5], Loss: 0.08807375194672495
Epoch [4/5], Loss: 0.06481485862331465
Epoch [5/5], Loss: 0.048827404301962814


In [None]:
# Save the fine-tuned model and its tokenizer into the same directory
save_dir = '/content/drive/MyDrive/kobart_translation_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Non-default generation parameters: {'forced_eos_token_id': 1}


('/content/drive/MyDrive/kobart_translation_model/tokenizer_config.json',
 '/content/drive/MyDrive/kobart_translation_model/special_tokens_map.json',
 '/content/drive/MyDrive/kobart_translation_model/tokenizer.json')