# TravelMate AI – 최종 개선 솔루션 (가독성 UP)
이 노트북은 **독↔영 소규모 NMT 프로토타입**의 레퍼런스 구현입니다.
- BPE 서브워드
- LSTM Encoder/Decoder + Bahdanau Attention
- Teacher Forcing + Gradient Clipping + Label Smoothing
- Greedy Decoding & 간이 BLEU 평가

모든 코드에 **상세 주석**을 달아 가독성을 높였습니다. GPU/CPU 모두 동작하며, Colab GPU 기준 전체 학습 ≈ 10 분(10 Epoch)입니다.

In [1]:
!pip install sacrebleu



In [2]:
import random
import numpy as np
from pathlib import Path
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sentencepiece as spm
from datasets import load_dataset
import sacrebleu
from tqdm import tqdm

# ------------------ Global configuration ------------------ #
SEED   = 42

if torch.cuda.is_available(): device=torch.device("cuda")
elif torch.backends.mps.is_available(): device=torch.device("mps")
else: device=torch.device("cpu")
DEVICE = device

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f'✓ Using device: {DEVICE}')

✓ Using device: cuda


## 1. 데이터셋 다운로드 & BPE 토크나이저 학습

In [3]:
!pip install -U datasets huggingface_hub fsspec

Collecting fsspec
  Using cached fsspec-2025.5.0-py3-none-any.whl.metadata (11 kB)


In [4]:
# 경로 및 데이터셋 로드 -------------------------------------------------
DATA_DIR = Path('./data'); DATA_DIR.mkdir(exist_ok=True)
TRAIN_SPLIT = 'train[:80%]'  # 학습용으로 전체의 80 % 사용
raw_train = load_dataset('opus_books', 'de-en', split=TRAIN_SPLIT)

print(raw_train)

# 평문(.txt) 파일 작성 – SentencePiece 학습 입력 ------------------------
def dump_text(ds, lang: str, path: Path) -> None:
    """Corpus → plain text 파일(one sentence per line)."""
    with path.open('w', encoding='utf-8') as f:
        for ex in ds:
            sent = ex['translation'][lang].replace('\n', ' ')
            f.write(sent + '\n')

SRC_TXT, TGT_TXT = DATA_DIR/'train.en', DATA_DIR/'train.de'
if not SRC_TXT.exists():
    dump_text(raw_train, 'en', SRC_TXT)
    dump_text(raw_train, 'de', TGT_TXT)
    print('en', SRC_TXT)
    print('de', TGT_TXT)
    print('✓ Text files written.')

# SentencePiece BPE 학습 ----------------------------------------------
BPE_VOCAB   = 8000
BPE_PREFIX  = str(DATA_DIR/'bpe')

if not Path(BPE_PREFIX + '.model').exists():
    print('🛠  Training SentencePiece BPE ...')
    spm.SentencePieceTrainer.train(
        input=f'{SRC_TXT},{TGT_TXT}',
        model_prefix=BPE_PREFIX,
        vocab_size=BPE_VOCAB,
        character_coverage=1.0,         # 모든 유니코드 포함
        model_type='bpe',               # BPE 모델
        bos_id=1, eos_id=2, pad_id=0, unk_id=3,
    )
    print('✓ BPE 모델 학습 완료')
else:
    print('✓ 기존 BPE 모델 사용')

sp = spm.SentencePieceProcessor(); sp.load(BPE_PREFIX + '.model')
VOCAB_SIZE = sp.vocab_size()
print('SentencePiece vocab size:', VOCAB_SIZE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'translation'],
    num_rows: 41174
})
en data/train.en
de data/train.de
✓ Text files written.
🛠  Training SentencePiece BPE ...
✓ BPE 모델 학습 완료
SentencePiece vocab size: 8000


## 2. 데이터셋 숫자화 & DataLoader 준비

In [5]:
class TranslationDataset(Dataset):
    """영-독 병렬 문장을 <BOS> ... <EOS> 형 ID 시퀀스로 변환."""
    def __init__(self, ds, tokenizer: spm.SentencePieceProcessor, max_len: int = 100):
        self.ds, self.sp, self.max_len = ds, tokenizer, max_len

    def __getitem__(self, idx: int):
        ex = self.ds[idx]['translation']

        # ---------- Source (English) 처리 ---------- #
        src_ids = self.sp.encode(ex['en'], out_type=int)[: self.max_len-2]
        src = [self.sp.bos_id()] + src_ids + [self.sp.eos_id()]

        # ---------- Target (German) 처리 ---------- #
        tgt_ids = self.sp.encode(ex['de'], out_type=int)[: self.max_len-2]
        tgt_in  = [self.sp.bos_id()] + tgt_ids              # decoder 입력
        tgt_out = tgt_ids + [self.sp.eos_id()]              # label

        return (
            torch.LongTensor(src),
            torch.LongTensor(tgt_in),
            torch.LongTensor(tgt_out),
        )

    def __len__(self) -> int:
        return len(self.ds)

# -------- DataLoader -------- #
def collate_fn(batch):
    src, tgt_in, tgt_out = zip(*batch)
    src, tgt_in, tgt_out = [
        pad_sequence(t, batch_first=True, padding_value=0)
        for t in (src, tgt_in, tgt_out)
    ]
    return src.to(DEVICE), tgt_in.to(DEVICE), tgt_out.to(DEVICE)

train_ds = TranslationDataset(raw_train, sp)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
print('✓ DataLoader ready:', len(train_dl), 'batches')

✓ DataLoader ready: 644 batches


## 3. LSTM Seq2Seq + Bahdanau Attention 모델

In [6]:
D_MODEL = 256  # 임베딩·히든 차원

class AdditiveAttention(nn.Module):
    """Bahdanau (Additive) Attention."""
    def __init__(self, d_model: int):
        super().__init__()
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)
        self.v  = nn.Linear(d_model, 1, bias=False)

    def forward(self, enc_out: torch.Tensor, dec_h: torch.Tensor) -> torch.Tensor:
        """enc_out: (B, S, D),  dec_h: (B, 1, D)"""
        score   = self.v(torch.tanh(self.W1(enc_out) + self.W2(dec_h)))  # (B,S,1)
        weights = torch.softmax(score, dim=1)                            # attention prob
        context = (weights * enc_out).sum(dim=1, keepdim=True)          # (B,1,D)
        return context

class Encoder(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, D_MODEL, padding_idx=0)
        self.lstm = nn.LSTM(D_MODEL, D_MODEL, batch_first=True)

    def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Returns: enc_out, (h, c)"""
        emb = self.emb(src)
        return self.lstm(emb)

class Decoder(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, D_MODEL, padding_idx=0)
        self.lstm = nn.LSTM(D_MODEL * 2, D_MODEL, batch_first=True)
        self.attn = AdditiveAttention(D_MODEL)
        self.fc   = nn.Linear(D_MODEL, vocab_size)

    def forward(
        self,
        tgt_in:  torch.Tensor,                      # (B, T)
        enc_out: torch.Tensor,                      # (B, S, D)
        state:   Tuple[torch.Tensor, torch.Tensor], # (h, c)
    ) -> torch.Tensor:
        h, c   = state
        outputs = []

        # Teacher‑forcing: time‑step loop (그래도 빠름 – S 작은 데이터)
        for t in range(tgt_in.size(1)):
            context = self.attn(enc_out, h[-1].unsqueeze(1))          # (B,1,D)
            emb_in  = self.emb(tgt_in[:, t:t+1])                      # (B,1,D)
            lstm_in = torch.cat([emb_in, context], dim=-1)            # (B,1,2D)
            out, (h, c) = self.lstm(lstm_in, (h, c))                 # (B,1,D)
            outputs.append(self.fc(out))                              # logits

        return torch.cat(outputs, dim=1)  # (B, T, vocab)

class Seq2Seq(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.enc = Encoder(vocab_size)
        self.dec = Decoder(vocab_size)

    def forward(self, src: torch.Tensor, tgt_in: torch.Tensor) -> torch.Tensor:
        enc_out, (h, c) = self.enc(src)
        return self.dec(tgt_in, enc_out, (h, c))

# Instantiate model --------------------------------------------------
model = Seq2Seq(VOCAB_SIZE).to(DEVICE)
print('✓ Model params:', sum(p.numel() for p in model.parameters())//1e6, 'M')

✓ Model params: 7.0 M


## 4. 학습 루프 – Teacher Forcing + Gradient Clipping

In [7]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn  = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

EPOCHS       = 10
TF_RATIO     = 0.5   # Teacher‑forcing probability
CLIP_NORM    = 1.0

def run_epoch(epoch: int) -> None:
    model.train()
    total_loss = 0.0
    for src, tgt_in, tgt_out in train_dl:
        # --------------------------------------------------------
        # Forward pass (Teacher‑forcing always ON → 간단)
        # --------------------------------------------------------
        logits = model(src, tgt_in)

        # Loss 계산
        loss = loss_fn(logits.view(-1, VOCAB_SIZE), tgt_out.view(-1))

        # Backprop -------------------------------------------------
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dl)
    print(f'Epoch {epoch:02d} | loss {avg_loss:.3f}')

for ep in range(1, EPOCHS + 1):
    run_epoch(ep)

Epoch 01 | loss 6.523
Epoch 02 | loss 5.671
Epoch 03 | loss 5.276
Epoch 04 | loss 5.039
Epoch 05 | loss 4.869
Epoch 06 | loss 4.731
Epoch 07 | loss 4.614
Epoch 08 | loss 4.510
Epoch 09 | loss 4.418
Epoch 10 | loss 4.334


## 5. Greedy Decoding 함수

In [8]:
def greedy_translate(model: Seq2Seq, sentence: str, max_len: int = 60) -> str:
    """영어 입력을 독어로 번역 (greedy)."""
    model.eval()

    # -------- 인코딩 --------
    src_ids = [sp.bos_id()] + sp.encode(sentence, out_type=int) + [sp.eos_id()]
    src     = torch.LongTensor(src_ids).unsqueeze(0).to(DEVICE)
    enc_out, (h, c) = model.enc(src)

    # -------- 디코딩 --------
    generated: List[int] = []
    cur_token = torch.LongTensor([[sp.bos_id()]]).to(DEVICE)

    for _ in range(max_len):
        # 한 스텝 디코더 실행 (context + lstm)
        context = model.dec.attn(enc_out, h[-1].unsqueeze(1))
        lstm_in = torch.cat([model.dec.emb(cur_token), context], dim=-1)
        out, (h, c) = model.dec.lstm(lstm_in, (h, c))
        logit = model.dec.fc(out)           # (1,1,V)
        next_id = logit.argmax(dim=-1).item()

        if next_id == sp.eos_id():
            break
        generated.append(next_id)
        cur_token = torch.LongTensor([[next_id]]).to(DEVICE)

    return sp.decode(generated)

## 6. 간이 BLEU 평가 (마지막 50 샘플)

In [9]:
@torch.no_grad()
def compute_bleu(sample: int = 50) -> float:
    subset = load_dataset('opus_books', 'de-en', split=f'train[-{sample}:]')
    refs, hyps = [], []
    for ex in tqdm(subset, desc='BLEU eval'):
        refs.append(ex['de'] if 'de' in ex else ex['translation']['de'])
        hyps.append(greedy_translate(model, ex['en'] if 'en' in ex else ex['translation']['en']))
    return sacrebleu.corpus_bleu(hyps, [refs]).score

bleu_score = compute_bleu()
print(f'BLEU (50 샘플) ≈ {bleu_score:.2f}')

BLEU eval: 100%|██████████| 50/50 [00:01<00:00, 49.90it/s]

BLEU (50 샘플) ≈ 2.27





## 7. 데모

In [10]:
test_sentences = [
    "Where is the nearest hotel?",
    "I need a taxi to the station.",
]
for s in test_sentences:
    print('\nEN>', s)
    print('DE>', greedy_translate(model, s))


EN> Where is the nearest hotel?
DE> Wo ist die Nachtwache?

EN> I need a taxi to the station.
DE> Ich werde einen Kuchen um den Weg zu machen.
