In [1]:
from nltk.util import ngrams
from collections import Counter

# Sample sentence used to demonstrate n-gram extraction.
text = "자연어 처리는 재미있다."

# Tokenise on whitespace, then slide a window of width 2 over the tokens.
# Each bigram is a tuple of two adjacent tokens.
tokens = text.split()
bigrams = [*ngrams(tokens, 2)]

# Tally how many times each distinct bigram occurs.
bigram_freq = Counter(bigrams)

print("2-gram:", bigrams)
print("2-gram 빈도:", bigram_freq)

2-gram: [('자연어', '처리는'), ('처리는', '재미있다.')]
2-gram 빈도: Counter({('자연어', '처리는'): 1, ('처리는', '재미있다.'): 1})


In [2]:
import math

# Example per-token probabilities assigned by a (hypothetical) language model.
probs = [0.1, 0.2, 0.3, 0.4]

# Perplexity = 2 ** ( -(1/N) * sum(log2 p_i) ), i.e. the inverse geometric
# mean of the probabilities.
log2_total = sum(map(math.log2, probs))
perplexity = math.pow(2, -log2_total / len(probs))
print("퍼플렉서티:", perplexity)

퍼플렉서티: 4.518010018049224


In [None]:
import torch
import torch.nn as nn

# Embedding -> hidden layer -> output layer: predicts the next word.
class NNLM(nn.Module):
    """Feed-forward neural language model applied to token IDs.

    Each token ID is embedded, passed through one ReLU hidden layer, and
    projected to a vector of ``vocab_size`` logits.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output logit vector.
        embed_size: Dimensionality of each embedding vector.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        # token ID -> embedding vector
        self.embed = nn.Embedding(vocab_size, embed_size)
        # embedding -> hidden representation
        self.fc1 = nn.Linear(embed_size, hidden_size)
        # hidden representation -> logits over the vocabulary
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return pre-softmax logits for every token ID in ``x``.

        The embedding lookup appends an ``embed_size`` axis to ``x``; the two
        linear layers then act on that trailing axis, so the result has the
        shape of ``x`` plus a final axis of length ``vocab_size``.
        """
        embedded = self.embed(x)
        hidden = self.fc1(embedded).relu()
        logits = self.fc2(hidden)
        return logits

# Build an NNLM with a 5000-word vocabulary, 300-d embeddings and a
# 128-unit hidden layer, then print its layer summary.
nnlm_kwargs = {"vocab_size": 5000, "embed_size": 300, "hidden_size": 128}
model = NNLM(**nnlm_kwargs)
print(model)

NNLM(
  (embed): Embedding(5000, 300)
  (fc1): Linear(in_features=300, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=5000, bias=True)
)


In [None]:
class Seq2Seq(nn.Module):
    """Minimal LSTM encoder-decoder.

    The encoder LSTM reads the whole input sequence; its final hidden state
    is then fed to the decoder LSTM as a single-step input.

    Args:
        input_size: Feature size of each input time step.
        hidden_size: Hidden size of the encoder (and input size of the decoder).
        output_size: Hidden size of the decoder, i.e. the size of the output
            feature vector.

    Note:
        The output is the raw decoder LSTM hidden output; no linear
        projection is applied on top of it.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Encoder LSTM: input -> hidden
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Decoder LSTM: hidden -> output
        self.decoder = nn.LSTM(hidden_size, output_size, batch_first=True)

    def forward(self, x):
        """Encode ``x`` and decode one step from the encoder's final hidden state.

        Args:
            x: Batched input of shape ``(B, T, input_size)`` or unbatched
                input of shape ``(T, input_size)``.

        Returns:
            Tensor of shape ``(1, B, output_size)`` for batched input (same
            layout as the encoder's final hidden state), or
            ``(1, output_size)`` for unbatched input.
        """
        # Final hidden state of the encoder summarises the input sequence.
        _, (hidden, _) = self.encoder(x)

        if hidden.dim() == 2:
            # Unbatched input: hidden is (1, hidden_size), which the decoder
            # already interprets as a sequence of length 1.
            out, _ = self.decoder(hidden)
            return out

        # nn.LSTM always returns h_n as (num_layers, B, H), even with
        # batch_first=True, while the batch-first decoder expects (B, T, H).
        # Passing h_n directly would make the decoder treat the batch axis as
        # the time axis and leak state from one sample into the next, so swap
        # the axes to get a length-1 sequence per sample.
        context = hidden.transpose(0, 1)          # (B, 1, H)
        out, _ = self.decoder(context)            # (B, 1, output_size)

        # Restore the (1, B, output_size) layout callers already receive.
        return out.transpose(0, 1)