<a href="https://colab.research.google.com/github/sbbaik/small_bert/blob/main/predict_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
from tokenizers import BertWordPieceTokenizer
import os
import math

# Mount Google Drive so the saved vocabulary and checkpoint are reachable
from google.colab import drive
drive.mount('/content/drive')

# Directory on Drive that holds the saved artifacts (vocab.txt, model weights)
SAVE_PATH = '/content/drive/MyDrive/small_bert_korean'

# Hyperparameters (must be set to the same values used at training time)
d_model = 768
n_head = 12
num_layers = 6
dim_feedforward = d_model * 4
max_len = 50
dropout = 0.1

# Load the WordPiece tokenizer from the saved vocabulary file
tokenizer = BertWordPieceTokenizer.from_file(
    os.path.join(SAVE_PATH, 'vocab.txt'),
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)
# Vocabulary size drives the embedding table and output layer dimensions below
vocab_size = tokenizer.get_vocab_size()

# Model class definitions (the SmallBERT class used during training must be redefined here).
# The previously provided SmallBERT class code is pasted here unchanged.
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position encodings to an input tensor.

    The table is precomputed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``(1, max_len, d_model)``; it is not a trainable
    parameter. Even feature columns hold sines and odd columns hold cosines.

    Args:
        d_model: Size of the feature dimension. Both even and odd values are
            supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=50):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angle matrix; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading singleton dim lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        The sequence axis is dim 1. ``x.size(1)`` must not exceed ``max_len``;
        the table is only sliced, never extended.
        """
        return x + self.pe[:, :x.size(1)]

class SmallBERT(torch.nn.Module):
    """Transformer encoder that maps token ids to per-position vocabulary logits.

    Pipeline: token embedding -> sinusoidal positional encoding ->
    ``num_layers`` batch-first ``TransformerEncoderLayer``s -> linear
    projection back to ``vocab_size``.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        d_model: Embedding / hidden width.
        n_head: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Width of each layer's feed-forward sublayer.
        max_len: Number of positions precomputed by the positional encoding.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, vocab_size, d_model, n_head, num_layers, dim_feedforward, max_len, dropout):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_head, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_layer = torch.nn.Linear(d_model, vocab_size)

    def forward(self, src):
        """Return logits for every position of ``src``.

        Args:
            src: Integer token ids, batch-first (``(batch, seq_len)``).

        Returns:
            Tensor of logits with a trailing dimension of ``vocab_size``.
            No attention or padding mask is applied.
        """
        # Scale by sqrt of the model's own embedding width rather than a
        # module-level global, so the instance is self-contained and always
        # consistent with the d_model it was constructed with.
        x = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoder(x)
        output = self.transformer_encoder(x)
        output = self.output_layer(output)
        return output

# Instantiate the model and load the trained weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SmallBERT(vocab_size, d_model, n_head, num_layers, dim_feedforward, max_len, dropout).to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
# The result is fed to load_state_dict, so passing weights_only=True is presumably
# safe here — confirm against the installed torch version.
model.load_state_dict(torch.load(os.path.join(SAVE_PATH, 'final_small_bert.pt'), map_location=device))
model.eval()  # switch to inference mode (disables dropout)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


SmallBERT(
  (embedding): Embedding(114, 768)
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (linear1): Linear(in_features=768, out_features=3072, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=3072, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output_layer): Linear(in_features=768, out_features=114, bias=True)
)

In [3]:
def predict_masked_token(text, model, tokenizer, device, top_k=5):
    """Print the most likely tokens for the first ``[MASK]`` in ``text``.

    Args:
        text: Input sentence containing a ``[MASK]`` placeholder. If several
            are present, only the first one is predicted.
        model: Callable that takes a ``(1, seq_len)`` long tensor of token ids
            and returns logits indexable as ``[batch, position]`` with the
            vocabulary on the last axis.
        tokenizer: Object providing ``encode(text).ids``, ``token_to_id`` and
            ``id_to_token``.
        device: Device on which the input tensor is created.
        top_k: Number of candidates to print. Clamped to the vocabulary size
            so a small vocabulary does not make ``torch.topk`` fail.

    Returns:
        None. The ranked candidates are written to stdout.

    Raises:
        ValueError: If the tokenizer has no ``[MASK]`` token, or the encoded
            text does not contain one.
    """
    # Tokenize the input text
    input_ids = tokenizer.encode(text).ids

    # Locate the (first) [MASK] position, failing with an explicit message
    mask_token_id = tokenizer.token_to_id("[MASK]")
    if mask_token_id is None:
        raise ValueError("tokenizer vocabulary does not contain a [MASK] token")
    if mask_token_id not in input_ids:
        raise ValueError(f"no [MASK] token found in input text: {text!r}")
    mask_idx = input_ids.index(mask_token_id)

    # Batch of one sequence
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        outputs = model(input_tensor)

    # Keep only the logits at the masked position
    mask_logits = outputs[0, mask_idx]

    # Softmax is computed once here instead of once per printed row.
    probabilities = torch.softmax(mask_logits, dim=-1)

    # Rank on the logits (same ordering as the probabilities) and never ask
    # for more candidates than the vocabulary holds.
    k = min(top_k, mask_logits.size(-1))
    _, top_k_indices = torch.topk(mask_logits, k)

    token_ids = top_k_indices.tolist()
    predicted_tokens = [tokenizer.id_to_token(token_id) for token_id in token_ids]
    predicted_probs = probabilities[top_k_indices].tolist()

    print(f"원문: {text}")
    print(f"[MASK] 토큰 예측 결과 (Top {k}):")
    for rank, (token, prob) in enumerate(zip(predicted_tokens, predicted_probs), start=1):
        print(f"{rank}. {token} (확률: {prob:.4f})")
    print("-" * 50)

In [4]:
# Prediction example 1
text_to_predict_1 = "나는 [MASK]을 먹고싶다."
predict_masked_token(text_to_predict_1, model, tokenizer, device)

# Prediction example 2
text_to_predict_2 = "콜랩 환경에서 딥러닝 모델을 [MASK]하는 방법."
predict_masked_token(text_to_predict_2, model, tokenizer, device)

# Prediction example 3
text_to_predict_3 = "오늘 날씨는 [MASK] 맑다."
predict_masked_token(text_to_predict_3, model, tokenizer, device)

원문: 나는 [MASK]을 먹고싶다.
[MASK] 토큰 예측 결과 (Top 5):
1. 저는 (확률: 0.2456)
2. 처리는 (확률: 0.1202)
3. 학습을 (확률: 0.1196)
4. 모델을 (확률: 0.0691)
5. 정말 (확률: 0.0577)
--------------------------------------------------
원문: 콜랩 환경에서 딥러닝 모델을 [MASK]하는 방법.
[MASK] 토큰 예측 결과 (Top 5):
1. 저는 (확률: 0.1686)
2. bert (확률: 0.1508)
3. 학습을 (확률: 0.0984)
4. 안녕하세요 (확률: 0.0746)
5. 있습니다 (확률: 0.0603)
--------------------------------------------------
원문: 오늘 날씨는 [MASK] 맑다.
[MASK] 토큰 예측 결과 (Top 5):
1. 저는 (확률: 0.3069)
2. 학습을 (확률: 0.0938)
3. 처리는 (확률: 0.0797)
4. 사전 (확률: 0.0706)
5. 모델을 (확률: 0.0568)
--------------------------------------------------
