In [4]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [5]:
!pip install torch transformers tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [6]:
import math
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
from tqdm.auto import tqdm
import json
import os

# Seq2Seq 모델 정의
class Seq2SeqModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_layers=2, dropout=0.1):
        super(Seq2SeqModel, self).__init__()
        self.encoder = nn.LSTM(d_model, d_model, num_layers, dropout=dropout, batch_first=True)
        self.decoder = nn.LSTM(d_model, d_model, num_layers, dropout=dropout, batch_first=True)
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt):
        src_emb = self.dropout(self.src_tok_emb(src))
        tgt_emb = self.dropout(self.tgt_tok_emb(tgt))
        _, (hidden, cell) = self.encoder(src_emb)
        output, _ = self.decoder(tgt_emb, (hidden, cell))
        return self.fc_out(output)

# KonlpyTokenizer 클래스 정의
class KonlpyTokenizer:
    def __init__(self):
        self.okt = Okt()
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0

    def fit(self, sentences):
        word_set = set(word for sentence in sentences for word in self.okt.morphs(sentence))
        self.word2idx = {word: idx+2 for idx, word in enumerate(word_set)}  # 2부터 시작
        self.word2idx['<pad>'] = 0
        self.word2idx['<unk>'] = 1
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

    def encode(self, sentence):
        return [self.word2idx.get(word, self.word2idx['<unk>']) for word in self.okt.morphs(sentence)]

    def decode(self, tokens):
        return ' '.join([self.idx2word[token] for token in tokens if token != 0])

    def save(self, file_path):
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump({'word2idx': self.word2idx, 'idx2word': self.idx2word}, f, ensure_ascii=False, indent=4)

    def load(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            self.word2idx = data['word2idx']
            self.idx2word = {int(k): v for k, v in data['idx2word'].items()}
            self.vocab_size = len(self.word2idx)

In [7]:
import os

# 데이터 준비
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()
    return [line.strip() for line in content]

# 파일 경로 설정 및 데이터 읽기
directory_path = '/content/drive/MyDrive/구름/Week4/archive'
file_names = ['je.train', 'ko.train', 'je.dev', 'ko.dev']
variable_names = ['je_train', 'ko_train', 'je_dev', 'ko_dev']

for file_name, variable_name in zip(file_names, variable_names):
    file_path = os.path.join(directory_path, file_name)
    content = read_text_file(file_path)
    globals()[variable_name] = content

# 학습 데이터를 일부만 사용
je_train = je_train[0:10000]
ko_train = ko_train[0:10000]

# 문장에 태그 추가
tagged_ko_train = ["<2je> " + sentence for sentence in ko_train]
tagged_je_train = ["<2ko> " + sentence for sentence in je_train]

tagged_ko_test = ["<2je> " + sentence for sentence in ko_dev]
tagged_je_test = ["<2ko> " + sentence for sentence in je_dev]

# 학습 및 테스트 데이터 설정
train_src_texts = tagged_ko_train + tagged_je_train
train_tgt_texts = je_train + ko_train

test_src_texts = tagged_ko_test + tagged_je_test
test_tgt_texts = je_dev + ko_dev

# 토크나이저 학습
tokenizer = KonlpyTokenizer()
tokenizer.fit(train_src_texts + train_tgt_texts)

In [8]:
# 데이터셋 정의
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=128):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]
        src_ids = self.tokenizer.encode(src_text)[:self.max_length]
        tgt_ids = self.tokenizer.encode(tgt_text)[:self.max_length]
        src_ids = src_ids + [0] * (self.max_length - len(src_ids))
        tgt_ids = tgt_ids + [0] * (self.max_length - len(tgt_ids))
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

# 데이터 로더 설정
train_dataset = TranslationDataset(train_src_texts, train_tgt_texts, tokenizer)
test_dataset = TranslationDataset(test_src_texts, test_tgt_texts, tokenizer)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size

# 모델 정의
model_ko_to_je = Seq2SeqModel(src_vocab_size, tgt_vocab_size)
model_je_to_ko = Seq2SeqModel(tgt_vocab_size, src_vocab_size)

# 훈련 설정
optimizer_ko_to_je = Adam(model_ko_to_je.parameters(), lr=5e-5)
optimizer_je_to_ko = Adam(model_je_to_ko.parameters(), lr=5e-5)
num_epochs = 20
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_ko_to_je.to(device)
model_je_to_ko.to(device)

# Early stopping 설정
patience = 2
best_loss_ko_to_je = float('inf')
best_loss_je_to_ko = float('inf')
trigger_times_ko_to_je = 0
trigger_times_je_to_ko = 0

# 검증 함수 정의
def validate(model, dataloader, criterion):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
            val_loss += loss.item()
    return val_loss / len(dataloader)

criterion = nn.CrossEntropyLoss()

# 모델 훈련
progress_bar = tqdm(range(num_epochs * len(train_dataloader)))
train_losses_ko_to_je = []
train_losses_je_to_ko = []

for epoch in range(num_epochs):
    model_ko_to_je.train()
    model_je_to_ko.train()
    epoch_loss_ko_to_je = 0
    epoch_loss_je_to_ko = 0
    for src, tgt in train_dataloader:
        # ko_to_je 훈련
        src, tgt = src.to(device), tgt.to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        optimizer_ko_to_je.zero_grad()
        output = model_ko_to_je(src, tgt_input)
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        loss.backward()
        optimizer_ko_to_je.step()
        epoch_loss_ko_to_je += loss.item()

        # je_to_ko 훈련
        tgt, src = tgt.to(device), src.to(device)  # 소스와 타겟을 바꿔서 훈련
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        optimizer_je_to_ko.zero_grad()
        output = model_je_to_ko(src, tgt_input)
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        loss.backward()
        optimizer_je_to_ko.step()
        epoch_loss_je_to_ko += loss.item()

        progress_bar.update(1)

    avg_epoch_loss_ko_to_je = epoch_loss_ko_to_je / len(train_dataloader)
    avg_epoch_loss_je_to_ko = epoch_loss_je_to_ko / len(train_dataloader)
    train_losses_ko_to_je.append(avg_epoch_loss_ko_to_je)
    train_losses_je_to_ko.append(avg_epoch_loss_je_to_ko)
    print(f"Epoch [{epoch+1}/{num_epochs}], ko_to_je Loss: {avg_epoch_loss_ko_to_je}, je_to_ko Loss: {avg_epoch_loss_je_to_ko}")

    val_loss_ko_to_je = validate(model_ko_to_je, test_dataloader, criterion)
    val_loss_je_to_ko = validate(model_je_to_ko, test_dataloader, criterion)
    print(f"Validation Loss ko_to_je: {val_loss_ko_to_je}, je_to_ko: {val_loss_je_to_ko}")

    if val_loss_ko_to_je < best_loss_ko_to_je:
        best_loss_ko_to_je = val_loss_ko_to_je
        trigger_times_ko_to_je = 0
        # 모델 저장
        torch.save(model_ko_to_je.state_dict(), '/content/drive/MyDrive/transformer_translation_model_ko_to_je.pth')
    else:
        trigger_times_ko_to_je += 1
        if trigger_times_ko_to_je >= patience:
            print("Early stopping triggered for ko_to_je")
            break

    if val_loss_je_to_ko < best_loss_je_to_ko:
        best_loss_je_to_ko = val_loss_je_to_ko
        trigger_times_je_to_ko = 0
        # 모델 저장
        torch.save(model_je_to_ko.state_dict(), '/content/drive/MyDrive/transformer_translation_model_je_to_ko.pth')
    else:
        trigger_times_je_to_ko += 1
        if trigger_times_je_to_ko >= patience:
            print("Early stopping triggered for je_to_ko")
            break

# 모델 및 토크나이저 저장
tokenizer.save('/content/drive/MyDrive/tokenizer.json')

  0%|          | 0/50000 [00:00<?, ?it/s]

Epoch [1/20], ko_to_je Loss: 0.6383936259239912, je_to_ko Loss: 0.6447704630792142
Validation Loss ko_to_je: 0.8583626879930496, je_to_ko: 0.8582002779960632
Epoch [2/20], ko_to_je Loss: 0.49188225446045397, je_to_ko Loss: 0.4942382970750332
Validation Loss ko_to_je: 0.8437200621366501, je_to_ko: 0.8432101244211196
Epoch [3/20], ko_to_je Loss: 0.47635041212923823, je_to_ko Loss: 0.47644041297510265
Validation Loss ko_to_je: 0.836827794599533, je_to_ko: 0.8349805600404739
Epoch [4/20], ko_to_je Loss: 0.4611087682157755, je_to_ko Loss: 0.46122181408405305
Validation Loss ko_to_je: 0.8245039383888244, je_to_ko: 0.8244213079214096
Epoch [5/20], ko_to_je Loss: 0.445674948105216, je_to_ko Loss: 0.44554142490923404
Validation Loss ko_to_je: 0.8138091276884079, je_to_ko: 0.8146917772054673
Epoch [6/20], ko_to_je Loss: 0.4308459319755435, je_to_ko Loss: 0.4307683040589094
Validation Loss ko_to_je: 0.8047707894325257, je_to_ko: 0.8023029254674912
Epoch [7/20], ko_to_je Loss: 0.418573568482697, j