# 4. word2vec 속도 개선

### 4.1.2 Embedding 계층 구현

In [None]:
import numpy as np

# Demo: selecting rows of a weight matrix by integer index.
# W is a 7x3 matrix holding the values 0..20 in row-major order.
W = np.arange(21).reshape(7, 3)
print(W)
# A single integer index selects one row.
print(W[2])
print(W[5])

# An integer array index gathers several rows at once; a repeated index
# (0 appears twice here) yields the same row twice.
idx = np.array([1, 0, 3, 0])
print(W[idx])


In [None]:
class Embedding:
    """Row-lookup layer over a weight matrix.

    ``params`` holds the weight matrix and ``grads`` a zero-initialised
    array of the same shape and dtype; both are single-element lists.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        # Indices used by the most recent forward pass (needed in backward).
        self.idx = None

    def forward(self, idx):
        """Return the rows of the weight matrix selected by ``idx``."""
        [weight] = self.params
        self.idx = idx
        return weight[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows used in ``forward``.

        The gradient buffer is cleared first, then each row of ``dout`` is
        added to the gradient row given by the stored index.  ``np.add.at``
        is unbuffered, so an index that occurs several times receives the
        sum of all its contributions.  Always returns ``None``.
        """
        [grad] = self.grads
        grad[...] = 0
        np.add.at(grad, self.idx, dout)
        return None


### 4.2.4 다중 분류에서 이진 분류로 (구현)

In [None]:
class EmbeddingDot:
    """Embedding lookup followed by a row-wise dot product.

    Wraps an ``Embedding`` layer and shares its ``params`` / ``grads`` lists.
    """

    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        # (h, looked-up rows) saved by forward for use in backward.
        self.cache = None

    def forward(self, h, idx):
        """Return, per row, the dot product of ``h`` with the row ``idx`` selects."""
        rows = self.embed.forward(idx)
        scores = np.sum(rows * h, axis=1)

        self.cache = (h, rows)
        return scores

    def backward(self, dout):
        """Propagate ``dout`` to the embedding weights and return the gradient w.r.t. ``h``."""
        h, rows = self.cache
        # Turn dout into a column so it broadcasts across each row.
        dout_col = dout.reshape(len(dout), 1)

        grad_h = dout_col * rows
        self.embed.backward(dout_col * h)
        return grad_h


### fig.4-14 Embedding Dot layer의 각 변수의 구체적인 값

In [None]:
# Demo: compute by hand what EmbeddingDot.forward does.
# W is a 7x3 matrix of 0..20; idx picks rows 0, 3 and 1.
W = np.arange(21).reshape([7, 3])
idx = np.array([0, 3, 1])

# h is a 3x3 matrix of 0..8, one row per entry of idx.
h = np.arange(9).reshape([3, 3])

# Look up the rows of W selected by idx.
embed = Embedding(W)
target_W = embed.forward(idx)
print(target_W)

# Element-wise product summed along axis 1, i.e. a dot product per row.
out = np.sum(target_W * h, axis=1)
print(out)


### 4.2.6 네거티브 샘플링의 샘플링 기법

In [None]:
import numpy as np

# Demo of np.random.choice; the printed values differ on every run.
# An integer argument draws from range(10).
print(np.random.choice(10))
words = ["you", "say", "goodbye", "I", "hello", "."]
# Draw a single element from the list.
print(np.random.choice(words))
# Draw 5 elements (duplicates are possible).
print(np.random.choice(words, size=5))
# Draw 5 elements without replacement (no duplicates).
print(np.random.choice(words, size=5, replace=False))
# Draw one element according to the probability vector p (one entry per word).
p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]
print(np.random.choice(words, p=p))


In [None]:
# Demo: raise each probability to the power 0.75 and renormalise so the
# result sums to 1 again.  An exponent below 1 shrinks large probabilities
# proportionally more than small ones, so the smallest entry gains share.
p = np.array([0.7, 0.29, 0.01])
new_p = p**0.75
new_p /= np.sum(new_p)
print(new_p)

In [None]:
import collections


class UnigramSampler:
    """Sample negative word ids from a smoothed unigram distribution.

    Occurrences of every word id in ``corpus`` are counted, each count is
    raised to ``power`` and the result is normalised into the probability
    vector ``word_p`` (indexed by word id).

    Args:
        corpus: 1-D sequence of non-negative integer word ids.
        power: Exponent applied to the raw counts before normalising.
        sample_size: Number of negative ids drawn for each target.

    Attributes:
        sample_size: As passed in.
        vocab_size: Largest word id in ``corpus`` plus one.
        word_p: Float array of length ``vocab_size`` that sums to 1.

    Raises:
        ValueError: If ``corpus`` is empty.
    """

    def __init__(self, corpus, power, sample_size):
        self.sample_size = sample_size
        self.vocab_size = None
        self.word_p = None

        word_ids = np.asarray(corpus)
        if word_ids.size == 0:
            raise ValueError("corpus must contain at least one word id")

        # bincount indexes by word id, so ids that never occur keep count 0
        # instead of shifting the ids that follow them.
        counts = np.bincount(word_ids)
        self.vocab_size = len(counts)

        # Apply the exponent only to ids that were seen, so unseen ids keep
        # probability 0 whatever the value of ``power``.
        seen = counts > 0
        word_p = np.zeros(self.vocab_size)
        word_p[seen] = np.power(counts[seen].astype(np.float64), power)
        self.word_p = word_p / np.sum(word_p)

    def get_negative_sample(self, target):
        """Draw ``sample_size`` negative word ids for every entry of ``target``.

        For each target the target's own probability is set to 0 and the
        remaining probabilities are renormalised, so a negative sample never
        equals the positive word of its row.  Ids are drawn with replacement,
        so one row may contain the same negative id more than once.

        Args:
            target: 1-D array of target word ids (needs ``.shape`` and
                integer indexing).

        Returns:
            Integer array of shape ``(len(target), sample_size)``.

        Raises:
            ValueError: If no word other than the target has non-zero
                probability.
        """
        batch_size = target.shape[0]
        negative_sample = np.empty((batch_size, self.sample_size), dtype=int)

        for row in range(batch_size):
            target_id = int(target[row])
            p = self.word_p.copy()
            # An id outside the distribution can never be drawn, so there is
            # nothing to mask for it.
            if 0 <= target_id < self.vocab_size:
                p[target_id] = 0.0
            remaining = p.sum()
            if remaining <= 0:
                raise ValueError(
                    "cannot draw a negative sample: no word other than the "
                    "target has non-zero probability"
                )
            p /= remaining
            negative_sample[row] = np.random.choice(
                self.vocab_size,
                size=self.sample_size,
                replace=True,
                p=p,
            )
        return negative_sample

# Demo: build a sampler from a tiny corpus of word ids and draw
# sample_size negative ids for each of three targets.
corpus = np.array([ 0, 1, 2, 3, 4, 1, 2, 3])
power = 0.75
sample_size=2

sampler = UnigramSampler(corpus,power,sample_size)
target = np.array([1,3,0])
negative_sample = sampler.get_negative_sample(target)
# One row per target, sample_size columns; values differ on every run.
print(negative_sample)
                  


### 4.2.7 네거티브 샘플링 구현

In [None]:
import sys

sys.path.append("..")
from common.layers import Embedding, SigmoidWithLoss
import collections


class NegativeSamplingLoss:
    """Loss over one positive target and ``sample_size`` sampled negatives.

    Holds ``sample_size + 1`` pairs of (EmbeddingDot, SigmoidWithLoss)
    layers: pair 0 scores the positive target with label 1, and the other
    pairs score the sampled negatives with label 0.  ``params`` and
    ``grads`` concatenate those of every EmbeddingDot layer.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)

        n_pairs = sample_size + 1
        self.loss_layers = [SigmoidWithLoss() for _ in range(n_pairs)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(n_pairs)]

        self.params = []
        self.grads = []
        for dot_layer in self.embed_dot_layers:
            self.params.extend(dot_layer.params)
            self.grads.extend(dot_layer.grads)

    def forward(self, h, target):
        """Return the summed loss of the positive and all negative examples."""
        n_rows = target.shape[0]
        # Sample first so the random stream is consumed before any scoring.
        negatives = self.sampler.get_negative_sample(target)

        # Positive example: layer pair 0, label 1.
        ones = np.ones(n_rows, dtype=np.int32)
        positive_score = self.embed_dot_layers[0].forward(h, target)
        total = self.loss_layers[0].forward(positive_score, ones)

        # Negative examples: layer pairs 1..sample_size, label 0.
        zeros = np.zeros(n_rows, dtype=np.int32)
        for k in range(self.sample_size):
            dot_layer = self.embed_dot_layers[k + 1]
            loss_layer = self.loss_layers[k + 1]
            negative_score = dot_layer.forward(h, negatives[:, k])
            total += loss_layer.forward(negative_score, zeros)

        return total

    def backward(self, dout=1):
        """Back-propagate through every layer pair and return the summed gradient w.r.t. ``h``."""
        grad_h = 0
        for loss_layer, dot_layer in zip(self.loss_layers, self.embed_dot_layers):
            grad_h += dot_layer.backward(loss_layer.backward(dout))

        return grad_h


## 4.3 개선판 word2vec 학습

In [None]:
import sys

sys.path.append("..")
import numpy as np
from common.layers import Embedding
from ch04.negative_sampling_layer import NegativeSamplingLoss


class CBOW:
    """CBOW model built from Embedding input layers and a negative-sampling loss.

    ``2 * window_size`` Embedding layers share one input weight matrix
    ``W_in``; their outputs are averaged and passed to a
    ``NegativeSamplingLoss`` built on ``W_out``.  ``word_vecs`` refers to
    ``W_in``.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        shape = (vocab_size, hidden_size)

        # Small random float32 initial weights (W_in is drawn before W_out).
        W_in = 0.01 * np.random.randn(*shape).astype("f")
        W_out = 0.01 * np.random.randn(*shape).astype("f")

        # One input layer per context position, all sharing W_in.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(
            W_out, corpus, power=0.75, sample_size=5
        )

        # Collect every layer's parameters and gradients into flat lists.
        self.params = []
        self.grads = []
        for layer in [*self.in_layers, self.ns_loss]:
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Average the embeddings of each context column and return the loss."""
        hidden = 0
        for position, embed in enumerate(self.in_layers):
            hidden += embed.forward(contexts[:, position])
        hidden *= 1 / len(self.in_layers)
        return self.ns_loss.forward(hidden, target)

    def backward(self, dout=1):
        """Back-propagate through the loss and every input layer; returns ``None``."""
        grad = self.ns_loss.backward(dout)
        # Undo the averaging in forward: every input layer gets an equal share.
        grad *= 1 / len(self.in_layers)
        for embed in self.in_layers:
            embed.backward(grad)
        return None


### 4.3.2 CBOW 모델 학습 코드

In [None]:
# coding: utf-8
import sys

sys.path.append("..")
import numpy as np
from common import config

# GPU에서 실행하려면 아래 주석을 해제하세요(CuPy 필요).
# ===============================================
# config.GPU = True
# ===============================================
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb


# Hyperparameters
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the data
corpus, word_to_id, id_to_word = ptb.load_data("train")
vocab_size = len(word_to_id)

# Build (contexts, target) training pairs from the corpus.
contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Build the model, optimizer and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
# model = SkipGram(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save the data needed for later use
word_vecs = model.word_vecs
if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
# The word vectors are stored as float16.
params["word_vecs"] = word_vecs.astype(np.float16)
params["word_to_id"] = word_to_id
params["id_to_word"] = id_to_word
pkl_file = "cbow_params.pkl"  # or 'skipgram_params.pkl'
with open(pkl_file, "wb") as f:
    # Protocol -1 selects the highest pickle protocol available.
    pickle.dump(params, f, -1)


In [None]:
# coding: utf-8
import sys

sys.path.append("..")
from common.util import most_similar
import pickle


# Load the saved parameters and print the 5 most similar words per query.
pkl_file = "cbow_params.pkl"
# pkl_file = 'skipgram_params.pkl'

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(pkl_file, "rb") as f:
    params = pickle.load(f)
word_vecs = params["word_vecs"]
word_to_id = params["word_to_id"]
id_to_word = params["id_to_word"]

querys = ["you", "year", "car", "toyota"]
separator = "=" * 20
# One separator before the first query, then one after every result.
print(separator)
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
    print(separator)


### Vector space에서의 관계성

In [None]:
from common.util import analogy

# Run four word-analogy queries against the loaded word vectors.
# Each call passes three words plus the vocabulary mappings and vectors to
# the project's `analogy` helper (presumably it prints the best matches;
# confirm in common.util).
print("-" * 50)
analogy("king", "man", "queen", word_to_id, id_to_word, word_vecs)
analogy("take", "took", "go", word_to_id, id_to_word, word_vecs)
analogy("car", "cars", "child", word_to_id, id_to_word, word_vecs)
analogy("good", "better", "bad", word_to_id, id_to_word, word_vecs)
