In [None]:
!pip install torch transformers konlpy tqdm sacrebleu

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12=

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast, AdamW, get_scheduler
from tqdm.auto import tqdm
import sacrebleu

# Helper that loads a UTF-8 text file as a list of whitespace-trimmed lines
def read_text_file(file_path):
    """Read ``file_path`` as UTF-8 and return its lines, stripped of surrounding whitespace.

    Blank lines are kept as empty strings, so the result has one entry per
    line of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

# Directory that holds the corpus files
directory_path = '/content/drive/MyDrive/archive'

# Corpus file names and the notebook variable each one is loaded into
file_names = ['je.train', 'ko.train', 'je.dev', 'ko.dev', 'ko.test', 'je.test']
variable_names = ['je_train', 'ko_train', 'je_val', 'ko_val', 'ko_test', 'je_test']

# Read every file into a dict keyed by its variable name.
corpora = {}
for file_name, variable_name in zip(file_names, variable_names):
    file_path = os.path.join(directory_path, file_name)
    corpora[variable_name] = read_text_file(file_path)

# Bind the names explicitly instead of writing into globals(): the variables
# are visible to readers and tooling, and a misspelled key fails right here.
je_train, ko_train = corpora['je_train'], corpora['ko_train']
je_val, ko_val = corpora['je_val'], corpora['ko_val']
je_test, ko_test = corpora['je_test'], corpora['ko_test']

# Keep only the first N sentences of each split
TRAIN_LIMIT = 10000
TEST_LIMIT = 1000

je_train = je_train[:TRAIN_LIMIT]
ko_train = ko_train[:TRAIN_LIMIT]
ko_test = ko_test[:TEST_LIMIT]
je_test = je_test[:TEST_LIMIT]

# The je/ko lists are paired by position. If the two sides differ in length,
# the combined source/target lists below would silently pair sentences with
# the wrong reference, so fail early instead.
if len(je_train) != len(ko_train):
    raise ValueError(
        f"Train split is not parallel: {len(je_train)} je lines vs {len(ko_train)} ko lines"
    )
if len(je_test) != len(ko_test):
    raise ValueError(
        f"Test split is not parallel: {len(je_test)} je lines vs {len(ko_test)} ko lines"
    )

# Prefix every source sentence with a tag naming the language to translate into
tagged_ko_train = [f"<2je> {sentence}" for sentence in ko_train]
tagged_je_train = [f"<2ko> {sentence}" for sentence in je_train]

tagged_ko_test = [f"<2je> {sentence}" for sentence in ko_test]
tagged_je_test = [f"<2ko> {sentence}" for sentence in je_test]

# Combine both directions: ko->je pairs first, then je->ko pairs
train_src_texts = tagged_ko_train + tagged_je_train
train_tgt_texts = je_train + ko_train

test_src_texts = tagged_ko_test + tagged_je_test
test_tgt_texts = je_test + ko_test

In [None]:
import torch
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from torch.utils.data import DataLoader, Dataset
import sacrebleu

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved tokenizer and model, then place the model on the chosen device
model_path = '/content/drive/MyDrive/kobart_translation_model'
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
model.to(device)

class TranslationDataset(Dataset):
    """Map-style dataset of tokenized (source, target) sentence pairs.

    Each item is a dict of 1-D tensors: the tokenizer's encoding of the source
    text plus ``labels``, the token ids of the target text. Both sides are
    encoded with ``truncation=True`` and ``padding="max_length"``.

    Args:
        src_texts: Source sentences, indexable by position.
        tgt_texts: Target sentences; ``tgt_texts[i]`` is paired with ``src_texts[i]``.
        tokenizer: Hugging Face tokenizer whose ``__call__`` accepts ``text_target``.
        max_length: ``max_length`` passed to the tokenizer for both sides.

    Raises:
        ValueError: If ``src_texts`` and ``tgt_texts`` differ in length.
    """

    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=128):
        # Pairs are matched by index, so unequal lengths can only mean bad input.
        if len(src_texts) != len(tgt_texts):
            raise ValueError(
                f"src_texts and tgt_texts must have the same length "
                f"(got {len(src_texts)} and {len(tgt_texts)})"
            )
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_texts)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return its tensors without the batch axis."""
        # A single call encodes both sides: `text_target` supersedes the
        # deprecated `as_target_tokenizer()` context manager and stores the
        # target token ids under the "labels" key.
        encoded = self.tokenizer(
            self.src_texts[idx],
            text_target=self.tgt_texts[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # NOTE(review): padded label positions keep the tokenizer's padding
        # ids (they are not masked), which lets callers decode the labels
        # back to text; a training loop would presumably want them masked.

        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever max_length == 1.
        return {key: val.squeeze(0) for key, val in encoded.items()}

# Wrap the tagged test pairs in a PyTorch dataset
test_dataset = TranslationDataset(
    src_texts=test_src_texts,
    tgt_texts=test_tgt_texts,
    tokenizer=tokenizer,
)

# Serve the test set in its original order, 8 pairs per batch
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

# BLEU evaluation helper
def compute_bleu_score(model, dataloader, tokenizer, device, max_length=128):
    """Generate a translation for every example and return the corpus BLEU score.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        dataloader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        tokenizer: Tokenizer used to decode generated ids and label ids to text.
        device: Device the model inputs are moved to before generation.
        max_length: ``max_length`` forwarded to ``model.generate``. Defaults to
            128, the value that used to be hard-coded here.

    Returns:
        The ``score`` attribute of ``sacrebleu.corpus_bleu``, computed with the
        decoded predictions as hypotheses and the decoded labels as the single
        reference stream.

    Note:
        ``model.eval()`` is called and the mode is not restored afterwards.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    for batch in dataloader:
        # Only the generation inputs have to be on `device`; the labels are
        # merely decoded back to text, so they are not copied over.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
            )
        all_predictions.extend(
            tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        )
        all_labels.extend(
            tokenizer.batch_decode(batch['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        )

    # corpus_bleu takes a list of reference streams; here there is exactly one.
    bleu = sacrebleu.corpus_bleu(all_predictions, [all_labels])
    return bleu.score

# Translation helper for spot-checking examples
def translate_texts(model, texts, tokenizer, device, max_length=128):
    """Translate ``texts`` with ``model`` and return the decoded strings.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        texts: Input text(s) handed to the tokenizer as a single batch.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        device: Device the encoded inputs are moved to before generation.
        max_length: Truncation length for the inputs and ``max_length`` for
            ``model.generate``.

    Returns:
        list[str]: One decoded string per generated sequence; an empty list
        when ``texts`` is an empty list or tuple.

    Note:
        ``model.eval()`` is called and the mode is not restored afterwards.
    """
    model.eval()
    # An empty batch has nothing to translate; return before tokenizing.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Only the two tensors that generate() consumes are moved to the device.
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
# Spot-check one example per translation direction.
# The evaluated inputs (test_src_texts) carry a "<2je> " / "<2ko> " prefix that
# names the target language, so the examples are given the same prefix before
# being sent to the model; the untagged text is what gets printed.
# NOTE(review): assumes the saved model was fine-tuned on inputs tagged this way — confirm.
example_texts = ["안녕하세요", "밥 먹언?"]
example_tags = ["<2je>", "<2ko>"]
tagged_example_texts = [f"{tag} {text}" for tag, text in zip(example_tags, example_texts)]

translations = translate_texts(model, tagged_example_texts, tokenizer, device)
for text, translation in zip(example_texts, translations):
    print(f"Original: {text}")
    print(f"Translated: {translation}")

Original: 안녕하세요
Translated: 예허지 .
Original: 밥 먹언?
Translated: 먹 먹었어 ?


In [None]:
# Score the model on the test dataloader and report the corpus BLEU
test_bleu_score = compute_bleu_score(
    model=model,
    dataloader=test_dataloader,
    tokenizer=tokenizer,
    device=device,
)
print(f"Test BLEU score: {test_bleu_score}")



Test BLEU score: 46.054439728747866
