<a href="https://colab.research.google.com/github/rickiepark/the-lm-book/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div style="display: flex; justify-content: center;">
    <div style="background-color: #f4f6f7; padding: 15px; width: 80%;">
        <table style="width: 100%">
            <tr>
                <td style="vertical-align: middle;">
                    <span style="font-size: 14px;">
                        <a href="https://tensorflow.blog/the-lm-book" target="_blank" rel="noopener"><대규모 언어 모델, 핵심만 빠르게!>(인사이트, 2025)</a>의 주피터 노트북<br><br>
                        코드 저장소: <a href="https://github.com/rickiepark/the-lm-book" target="_blank" rel="noopener">https://github.com/rickiepark/the-lm-book</a>
                    </span>
                </td>
                <td style="vertical-align: middle;">
                    <a href="https://www.thelmbook.com" target="_blank" rel="noopener">
                        <img src="https://tensorflow.blog/wp-content/uploads/2025/10/cover-the-lm-book.jpg" width="80px" alt="대규모 언어 모델, 핵심만 빠르게!" border="1">
                    </a>
                </td>
            </tr>
        </table>
    </div>
</div>

## BoW

a --> `[0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]`

and --> `[0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.]`

## Word2Vec CBOW vs Skip-gram

<img src='https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Continuous_Bag_of_Words_model_%28CBOW%29.svg/2560px-Continuous_Bag_of_Words_model_%28CBOW%29.svg.png' width=400>

<img src='https://upload.wikimedia.org/wikipedia/commons/thumb/1/1c/Skip-gram.svg/2560px-Skip-gram.svg.png' width=400>

In [None]:
import torch.nn as nn

class SkipGram(nn.Module):
    """Skip-gram word2vec model.

    Keeps two embedding tables of shape (vocab_size, embedding_dim):
    `in_embed` for center words (the table used as the word embeddings)
    and `out_embed` for context words. The forward pass scores every
    vocabulary word as a context candidate for each center word.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the center-word and context-word embedding tables.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Length of each embedding vector.
        """
        super().__init__()
        # Center-word embeddings (these are the ones used as word embeddings).
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        # Context-word embeddings.
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words):
        """Return log-probabilities over the vocabulary for each center word.

        Args:
            center_words: Integer tensor of word indices, shape (batch_size,).

        Returns:
            Tensor of shape (batch_size, vocab_size); each row is a
            log-softmax over the vocabulary.
        """
        center_embeds = self.in_embed(center_words)  # (batch_size, embedding_dim)
        # Matrix product of the center embeddings with all context embeddings.
        # The `@` operator and `nn.functional` are used on purpose: this cell
        # only imports `torch.nn as nn`, so bare `torch` / `F` are undefined.
        scores = center_embeds @ self.out_embed.weight.t()  # (batch_size, vocab_size)
        return nn.functional.log_softmax(scores, dim=1)

사전 훈련된 단어 임베딩을 사용할 수 있는 방법: `gensim`

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import gensim.downloader as api

# Show the names of the pretrained models listed in gensim's downloader catalog.
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors via gensim's downloader.
# NOTE(review): presumably downloads and caches the data on first use, which
# may take a while — confirm before running on a metered connection.
model = api.load('word2vec-google-news-300')



In [None]:
# Look up the embedding for "king", then print its length followed by
# its first ten components (each on its own line).
king_vector = model["king"]
print(len(king_vector), king_vector[:10], sep="\n")

300
[ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]


In [None]:
# Similarity between the vectors for "king" and "queen", a closely related pair.
model.similarity("king", "queen")

np.float32(0.6510957)

In [None]:
# Similarity between the vectors for "king" and "sea", an unrelated pair for contrast.
model.similarity("king", "sea")

np.float32(0.13782355)

In [None]:
# The five vocabulary entries whose vectors are most similar to "king".
model.most_similar("king", topn=5)

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474)]

"king" + "women" - "man"

In [None]:
# Word analogy: combine "king" and "woman" as positive terms and "man" as a
# negative term, then list the words most similar to the result.
model.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]