<a href="https://colab.research.google.com/github/strongeryoung/Oracle_Bio_Service-development/blob/main/KE_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torch transformers nltk

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
train_data = [
    ("안녕하세요.", "Hello."),
    ("오늘 날씨가 좋습니다.", "The weather is nice today."),
    ("이것은 테스트 문장입니다.", "This is a test sentence."),
    ("나는 인공지능을 좋아합니다.", "I like artificial intelligence."),
    ("이 모델은 성능이 좋습니다.", "This model has good performance."),
    ("저는 매일 아침 커피를 마십니다.", "I drink coffee every morning."),
    ("지금 몇 시입니까?", "What time is it now?"),
    ("내일은 비가 올 것입니다.", "It will rain tomorrow."),
    ("서울은 한국의 수도입니다.", "Seoul is the capital of Korea."),
    ("기계학습은 매우 흥미롭습니다.", "Machine learning is very interesting."),
    ("오늘 저녁에 영화 보러 갈까요?", "Shall we go watch a movie tonight?"),
    ("당신은 어떤 음식을 좋아합니까?", "What kind of food do you like?"),
    ("한국어를 배우는 것은 쉽지 않습니다.", "Learning Korean is not easy."),
    ("그는 매일 아침 운동을 합니다.", "He exercises every morning."),
    ("그녀는 책을 읽고 있습니다.", "She is reading a book."),
    ("이 컴퓨터는 너무 느립니다.", "This computer is too slow."),
    ("새로운 프로젝트는 잘 진행되고 있습니다.", "The new project is going well."),
    ("학교가 끝난 후에 무엇을 합니까?", "What do you do after school?"),
    ("휴가 때 어디로 갈 계획입니까?", "Where do you plan to go on vacation?"),
    ("그 회사는 신제품을 출시했습니다.", "The company has released a new product."),
]

test_data = [
    ("안녕히 가세요.", "Goodbye."),
    ("당신의 이름은 무엇입니까?", "What is your name?"),
    ("식사는 하셨습니까?", "Have you eaten?"),
    ("여기에서 지하철역까지 얼마나 걸립니까?", "How long does it take to the subway station from here?"),
    ("저는 여행을 좋아합니다.", "I like traveling."),
    ("도와주셔서 감사합니다.", "Thank you for your help."),
    ("내일 아침에 일찍 일어나야 합니다.", "I have to get up early tomorrow morning."),
    ("회의는 언제 시작합니까?", "When does the meeting start?"),
    ("지금 배가 고픕니다.", "I'm hungry now."),
    ("이 문제를 해결할 방법이 있습니까?", "Is there a way to solve this problem?")
]


In [3]:
import torch
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.translate.bleu_score import corpus_bleu

nltk.download('punkt_tab')

# 모델 및 토크나이저 로딩
model_name = "Helsinki-NLP/opus-mt-ko-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# 평가 함수 정의
def evaluate_bleu(model, tokenizer, dataset):
    references, hypotheses = [], []
    model.eval()

    with torch.no_grad():
        for ko, en in dataset:
            inputs = tokenizer(ko, return_tensors="pt")
            outputs = model.generate(**inputs)
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

            references.append([nltk.word_tokenize(en.lower())])
            hypotheses.append(nltk.word_tokenize(pred.lower()))

    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score

# 미세학습 함수 정의
def fine_tune(model, tokenizer, dataset, epochs=10):
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for ko, en in dataset:
            inputs = tokenizer(ko, return_tensors="pt")
            labels = tokenizer(en, return_tensors="pt").input_ids
            labels[labels == tokenizer.pad_token_id] = -100

            loss = model(**inputs, labels=labels).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataset):.4f}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [4]:
# Fine-tuning 이전 BLEU 점수 평가
bleu_before = evaluate_bleu(model, tokenizer, test_data)
print(f"Fine-tuning 이전 BLEU 점수: {bleu_before:.4f}")

# 미세학습 실행
fine_tune(model, tokenizer, train_data, epochs=10)

# Fine-tuning 이후 BLEU 점수 평가
bleu_after = evaluate_bleu(model, tokenizer, test_data)
print(f"Fine-tuning 이후 BLEU 점수: {bleu_after:.4f}")

Fine-tuning 이전 BLEU 점수: 0.5459
Epoch 1/10, Loss: 6.3899
Epoch 2/10, Loss: 5.2023
Epoch 3/10, Loss: 4.5418
Epoch 4/10, Loss: 3.8736
Epoch 5/10, Loss: 3.2674
Epoch 6/10, Loss: 2.8779
Epoch 7/10, Loss: 2.3304
Epoch 8/10, Loss: 1.9777
Epoch 9/10, Loss: 1.6875
Epoch 10/10, Loss: 1.3767
Fine-tuning 이후 BLEU 점수: 0.5937


In [5]:
# 번역할 문장 리스트
sentences_ko = [
    "오늘 정말 피곤하네요.",
    "주말에 영화 보러 가실래요?",
    "서울에는 맛있는 식당이 많습니다.",
    "내일 오전에 회의가 있습니다.",
    "이 컴퓨터는 속도가 매우 빠릅니다."
]

# 번역 및 출력 함수
def translate_sentences(model, tokenizer, sentences):
    model.eval()
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt")
            translated = model.generate(**inputs)
            translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
            print(f"원문: {sentence}")
            print(f"번역: {translated_text}\n")

# 번역 실행
translate_sentences(model, tokenizer, sentences_ko)

원문: 오늘 정말 피곤하네요.
번역: I'm really tired today.

원문: 주말에 영화 보러 가실래요?
번역: Would you like to go see a movie on the weekend?

원문: 서울에는 맛있는 식당이 많습니다.
번역: There are a lot of delicious restaurants in Seoul.

원문: 내일 오전에 회의가 있습니다.
번역: There is a meeting in the morning.

원문: 이 컴퓨터는 속도가 매우 빠릅니다.
번역: The computer is very fast.

