In [1]:
!pip install --quiet torch torchvision transformers sentencepiece
import torch, torch.nn as nn, itertools, math, random
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ device:", device)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Pig Latin conversion helpers (single word and whole sentence)
def to_pig_latin(word):
    """Convert a single word to Pig Latin.

    Rules:
      * an empty string is returned unchanged;
      * a word that starts with a vowel gets "yay" appended;
      * otherwise everything before the first vowel is moved to the end and
        "ay" is appended;
      * a word containing no vowel at all just gets "ay" appended.

    Vowel detection is case-insensitive; the characters themselves are not
    re-cased.

    Args:
        word: the word to convert (a ``str`` without whitespace).

    Returns:
        The Pig Latin form of ``word``.
    """
    if not word:
        # Indexing word[0] on "" would raise IndexError.
        return word

    vowels = "aeiou"
    # Index of the first vowel, or None when the word has no vowel.
    first_vowel = next(
        (i for i, ch in enumerate(word) if ch.lower() in vowels),
        None,
    )
    if first_vowel is None:
        return word + "ay"
    if first_vowel == 0:
        return word + "yay"
    return word[first_vowel:] + word[:first_vowel] + "ay"

def eng2pig(sentence):
    """Translate a sentence to Pig Latin, word by word.

    The sentence is lower-cased and split on whitespace; each word is passed
    through ``to_pig_latin`` and the results are joined with single spaces.
    """
    words = sentence.lower().split()
    converted = [to_pig_latin(word) for word in words]
    return " ".join(converted)

# Seed sentences for the toy English -> Pig Latin corpus.
base_sentences = [
    "hello world",
    "i love machine learning",
    "language models are fun",
    "space tourism is booming",
    "beam me up scotty",
    "the future is bright",
]
# Repeat the seed sentences so one epoch contains more than a single batch.
sentences = base_sentences * 15
# Shuffle with a dedicated fixed-seed RNG: the ordering (and the sample
# printed below) is then identical on every run, and the global `random`
# state is left untouched.
random.Random(42).shuffle(sentences)

# (english, pig_latin) training pairs.
pairs = [(s, eng2pig(s)) for s in sentences]
print("샘플:", pairs[1])


샘플: ('space tourism is booming', 'acespay ourismtay isyay oomingbay')


In [21]:
class Vocab:
    """Whitespace-token vocabulary with reserved special tokens.

    Special tokens take the first ids in the order given; the remaining ids
    go to the distinct corpus tokens in sorted order.

    Attributes:
        stoi: dict mapping token string -> integer id.
        itos: dict mapping integer id -> token string (inverse of ``stoi``).
    """

    def __init__(self, sents, specials=("<pad>", "<sos>", "<eos>", "<unk>")):
        """Build the vocabulary.

        Args:
            sents: iterable of sentences; each is split on whitespace.
            specials: reserved tokens placed at the start of the id space.
                Any sequence works (list or tuple); the default is a tuple so
                it cannot be mutated between calls.
        """
        specials = list(specials)
        tokens = itertools.chain.from_iterable(s.split() for s in sents)
        # Exclude corpus tokens that collide with a special token; otherwise
        # the token would be listed twice, `stoi` would keep only the later
        # index and `itos` would be missing the reserved id.
        uniq = specials + sorted(set(tokens) - set(specials))
        self.stoi = {t: i for i, t in enumerate(uniq)}
        self.itos = {i: t for t, i in self.stoi.items()}

    def encode(self, sent):
        """Map a sentence to a list of ids; unknown tokens map to ``<unk>``."""
        return [self.stoi.get(tok, self.stoi["<unk>"]) for tok in sent.split()]

    def decode(self, ids):
        """Map ids back to a space-joined string.

        ``<pad>``, ``<sos>`` and ``<eos>`` are dropped from the result.
        Raises ``KeyError`` for an id that is not in the vocabulary.
        """
        skipped = {"<pad>", "<sos>", "<eos>"}
        tokens = (self.itos[i] for i in ids)
        return " ".join(tok for tok in tokens if tok not in skipped)


# Separate vocabularies for the English (source) and Pig Latin (target) sides.
src_vocab = Vocab([english for english, _ in pairs])
tgt_vocab = Vocab([pig for _, pig in pairs])
# Longest source sentence in words, plus two slots for <sos> and <eos>.
MAX_LEN = 2 + max(len(english.split()) for english, _ in pairs)

class PigDataset(Dataset):
    """Dataset over the module-level ``pairs`` list.

    Each item is a triple of 1-D integer tensors of length ``MAX_LEN``:
    padded source ids, padded target ids, and a source mask holding 1 at
    non-pad positions and 0 at pad positions.
    """

    def __len__(self):
        return len(pairs)

    def __getitem__(self, idx):
        english, pig = pairs[idx]

        def wrap_and_pad(vocab, text):
            # <sos> tokens... <eos>, then right-pad with <pad> up to MAX_LEN.
            ids = [vocab.stoi["<sos>"], *vocab.encode(text), vocab.stoi["<eos>"]]
            ids.extend([vocab.stoi["<pad>"]] * (MAX_LEN - len(ids)))
            return ids

        src_ids = wrap_and_pad(src_vocab, english)
        tgt_ids = wrap_and_pad(tgt_vocab, pig)

        # 1 marks a real token, 0 marks padding.
        pad_id = src_vocab.stoi["<pad>"]
        src_mask = [int(tok != pad_id) for tok in src_ids]

        return torch.tensor(src_ids), torch.tensor(tgt_ids), torch.tensor(src_mask)


loader = DataLoader(PigDataset(), batch_size=32, shuffle=True)
print("어휘 크기:", len(src_vocab.stoi))

# Peek at the first batch only, to confirm the tensor shapes.
for src, tgt, mask in loader:
    for label, batch in ((" src:", src), (" tgt:", tgt), (" mask:", mask)):
        print(label, batch.shape)
    break


어휘 크기: 25
 src: torch.Size([32, 6])
 tgt: torch.Size([32, 6])
 mask: torch.Size([32, 6])


In [22]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first tensor.

    Even feature columns hold sines and odd feature columns hold cosines of
    ``position * div_term``. The table is precomputed once for ``max_len``
    positions and stored as a buffer (moves with the module, not trained).

    Args:
        d_model: feature width of the tensors passed to ``forward``. Both
            even and odd widths are supported.
        max_len: number of positions to precompute; ``forward`` inputs must
            not be longer than this along dimension 1.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)  # even columns
        # With an odd d_model there is one fewer odd column than even
        # columns, so trim the angle table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd columns
        # Leading singleton dimension lets the table broadcast over the batch:
        # shape (1, max_len, d_model).
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, feature).
        """
        return x + self.pe[:, :x.size(1), :]


class Translator(nn.Module):
    """Encoder-decoder Transformer producing target-vocabulary logits.

    Source and target token ids are embedded, given sinusoidal position
    encodings, passed through ``nn.Transformer`` (batch-first), and projected
    to the target vocabulary.

    Args:
        sv: source vocabulary size.
        tv: target vocabulary size.
        d: model/embedding width.
        nhead: number of attention heads.
        nl: number of encoder layers and of decoder layers.
        src_pad_idx: id of the source padding token. ``None`` (default) looks
            up ``"<pad>"`` in the module-level ``src_vocab`` on every call.
        tgt_pad_idx: id of the target padding token. ``None`` (default) looks
            up ``"<pad>"`` in the module-level ``tgt_vocab`` on every call.
    """

    def __init__(self, sv, tv, d=128, nhead=4, nl=2, src_pad_idx=None, tgt_pad_idx=None):
        super().__init__()
        self.se = nn.Embedding(sv, d)  # source token embedding
        self.te = nn.Embedding(tv, d)  # target token embedding
        self.pe = PositionalEncoding(d)
        self.tr = nn.Transformer(
            d_model=d,
            nhead=nhead,
            num_encoder_layers=nl,
            num_decoder_layers=nl,
            dim_feedforward=512,
            batch_first=True
        )
        self.fc = nn.Linear(d, tv)  # projection to target-vocabulary logits
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx

    def pad_mask(self, seq, pad):
        """Boolean mask that is True where ``seq`` equals the pad id."""
        return seq.eq(pad)

    def forward(self, src, tgt):
        """Compute logits for every target position.

        Args:
            src: source token ids, indexed (batch, src_len).
            tgt: decoder-input token ids, indexed (batch, tgt_len).

        Returns:
            Tensor indexed (batch, tgt_len, tv) of unnormalised scores.
        """
        src_pad_idx = src_vocab.stoi["<pad>"] if self.src_pad_idx is None else self.src_pad_idx
        tgt_pad_idx = tgt_vocab.stoi["<pad>"] if self.tgt_pad_idx is None else self.tgt_pad_idx
        src_pad = self.pad_mask(src, src_pad_idx)
        tgt_pad = self.pad_mask(tgt, tgt_pad_idx)

        # Causal mask: True above the diagonal blocks attention to later
        # positions. Built on the input's own device so the module does not
        # depend on the global `device` matching where the tensors live.
        tgt_len = tgt.size(1)
        tgt_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, device=tgt.device), diagonal=1
        ).bool()

        src = self.pe(self.se(src))
        tgt = self.pe(self.te(tgt))

        out = self.tr(
            src, tgt,
            src_key_padding_mask=src_pad,
            tgt_key_padding_mask=tgt_pad,
            memory_key_padding_mask=src_pad,
            tgt_mask=tgt_mask
        )
        return self.fc(out)


# Seed PyTorch's global RNG so weight initialisation and the loader's
# shuffling repeat across runs (GPU kernels may still add small
# nondeterminism).
torch.manual_seed(42)

model = Translator(len(src_vocab.stoi), len(tgt_vocab.stoi)).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
# Padding positions in the target do not contribute to the loss.
crit = nn.CrossEntropyLoss(ignore_index=tgt_vocab.stoi["<pad>"])

# Shape smoke test on one batch; no gradients are needed for this check.
src, tgt, mask = next(iter(loader))
with torch.no_grad():
    out = model(src.to(device), tgt.to(device))
print("출력 shape:", out.shape)  # (batch_size, seq_len, vocab_size)

출력 shape: torch.Size([32, 6, 25])


In [23]:
def train_epoch():
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Uses teacher forcing: the decoder receives the target without its last
    position and is scored against the target without its first position.
    Relies on the module-level ``model``, ``opt``, ``crit``, ``loader`` and
    ``device``.
    """
    model.train()
    running_loss = 0

    # The third element of each batch (the source mask) is not needed here.
    for batch_src, batch_tgt, _ in loader:
        batch_src = batch_src.to(device)
        batch_tgt = batch_tgt.to(device)

        # Shift by one: input drops the final token, labels drop the first.
        decoder_input = batch_tgt[:, :-1]
        expected = batch_tgt[:, 1:]

        scores = model(batch_src, decoder_input)
        vocab_size = scores.size(-1)
        # Flatten batch and time so the criterion sees (N, vocab) vs (N,).
        loss = crit(scores.reshape(-1, vocab_size), expected.reshape(-1))

        opt.zero_grad()
        loss.backward()
        opt.step()

        running_loss += loss.item()

    return running_loss / len(loader)


# Train for three epochs, reporting the mean batch loss after each one.
for epoch in range(1, 4):
    epoch_loss = train_epoch()
    print(f"Epoch {epoch} | loss {epoch_loss:.3f}")

Epoch 1 | loss 3.061
Epoch 2 | loss 2.026
Epoch 3 | loss 1.219


In [24]:
def translate(sentence):
    """Greedy-decode the Pig Latin translation of ``sentence``.

    The sentence is lower-cased, encoded with ``src_vocab``, wrapped in
    <sos>/<eos> and padded to ``MAX_LEN``. Starting from <sos>, the most
    likely next token is appended one step at a time until <eos> is predicted
    or ``MAX_LEN`` steps have run. Returns the decoded target string.
    Relies on the module-level ``model``, vocabularies and ``device``.
    """
    model.eval()
    with torch.no_grad():
        # Encoder input: a single-row batch of shape (1, MAX_LEN).
        encoder_ids = [
            src_vocab.stoi["<sos>"],
            *src_vocab.encode(sentence.lower()),
            src_vocab.stoi["<eos>"],
        ]
        encoder_ids.extend([src_vocab.stoi["<pad>"]] * (MAX_LEN - len(encoder_ids)))
        src = torch.tensor([encoder_ids]).to(device)

        # The decoder starts from <sos> alone and grows by one token per step.
        generated = [tgt_vocab.stoi["<sos>"]]

        for _ in range(MAX_LEN):
            decoder_input = torch.tensor([generated]).to(device)  # (1, cur_len)
            step_logits = model(src, decoder_input)  # (1, cur_len, vocab_size)
            # Only the scores at the last position predict the next token.
            predicted = step_logits[0, -1].argmax().item()

            if predicted == tgt_vocab.stoi["<eos>"]:
                break
            generated.append(predicted)

        # Drop the leading <sos> before decoding.
        return tgt_vocab.decode(generated[1:])

# Quick check on a sentence from the training corpus.
translation = translate("hello world")
print(translation)


ellohay orldway


In [30]:
from transformers import pipeline

# Sentiment-analysis pipeline backed by a multilingual BERT checkpoint.
sentiment = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Sentences to classify.
samples = [
    "I love this space travel experience!",
    "She's my darling wife.",
    "fucking service",
    "soso"
]

# Classify one sentence at a time and print it next to its prediction.
for text in samples:
    prediction = sentiment(text)
    print(text, prediction)


Device set to use cuda:0


I love this space travel experience! [{'label': '5 stars', 'score': 0.8984273672103882}]
She's my darling wife. [{'label': '5 stars', 'score': 0.7538031339645386}]
fucking service [{'label': '1 star', 'score': 0.5812363624572754}]
soso [{'label': '3 stars', 'score': 0.28597524762153625}]


In [33]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
# Bound to its own name on purpose: assigning this to `model` would overwrite
# the Translator that `train_epoch` and `translate` read from the global
# scope, breaking those cells if they are re-run after this one.
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

prompts = [
    "Experience the wonders of interstellar travel with GalactoMail: ",
    "Introducing a revolutionary space communication service: ",
    "Imagine sending emails across galaxies with: "
]

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Sampling-based generation of up to 50 new tokens per prompt.
    outputs = gpt2_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        repetition_penalty=1.3,
        pad_token_id=tokenizer.eos_token_id
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the first line of the generated text.
    result = result.strip().split('\n')[0]
    print(result)


Experience the wonders of interstellar travel with GalactoMail:  a simple, easy to use email client.
Introducing a revolutionary space communication service:  Skype.
Imagine sending emails across galaxies with:  (a) an atomic cloud, (b)"
