## 4.2.4 多分类到二分类的实现

In [2]:
import numpy as np

In [3]:
class Embedding:
    """Embedding layer: selects rows of a weight matrix by index.

    Attributes:
        params: One-element list holding the weight matrix ``W``.
        grads: One-element list holding the gradient buffer for ``W``
            (same shape as ``W``, updated in place by ``backward``).
        idx: Indices passed to the most recent ``forward`` call.
    """

    def __init__(self, W):
        """Store the weight matrix and allocate a zeroed gradient buffer."""
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        """Return ``W[idx]`` and remember ``idx`` for the backward pass.

        Args:
            idx: Index (or array/list of indices) into the first axis of W.

        Returns:
            The selected rows of ``W``.
        """
        W, = self.params
        self.idx = idx
        return W[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows selected in ``forward``.

        The gradient buffer is reset and then written in place, so any
        other object holding a reference to ``self.grads[0]`` sees the
        update.

        Args:
            dout: Upstream gradient, shaped like the output of ``forward``.

        Returns:
            None. Nothing is propagated further back from this layer.
        """
        dW, = self.grads
        dW[...] = 0  # in-place reset keeps the shared buffer object alive

        # np.add.at performs an unbuffered add, so an index that appears
        # several times receives every contribution. A plain
        # ``dW[idx] += dout`` would keep only one of them.
        np.add.at(dW, self.idx, dout)
        return None

In [4]:
class EmbeddingDot:
    """Embedding lookup followed by a row-wise dot product with ``h``.

    Attributes:
        embed: The wrapped ``Embedding`` layer built from ``W``.
        params: The ``params`` list of the wrapped embedding layer.
        grads: The ``grads`` list of the wrapped embedding layer.
        gards: Alias of ``grads``, kept because earlier code read the
            attribute under this misspelled name.
        cache: ``(h, target_W)`` saved by ``forward`` for ``backward``.
    """

    def __init__(self, W):
        """Wrap ``W`` in an Embedding layer and expose its params/grads."""
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        # Backward-compatible alias for the historical misspelling.
        self.gards = self.grads
        self.cache = None

    def forward(self, h, idx):
        """Compute the dot product of each row of ``h`` with ``W[idx]``.

        Args:
            h: Array multiplied element-wise with the selected rows; the
                sum over axis 1 implies it is at least 2-D.
            idx: Indices selecting one row of ``W`` per row of ``h``.

        Returns:
            ``np.sum(W[idx] * h, axis=1)``.
        """
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1)

        self.cache = (h, target_W)
        return out

    def backward(self, dout):
        """Backpropagate through the dot product and the embedding lookup.

        Args:
            dout: Upstream gradient with one entry per row of the cached
                ``h``.

        Returns:
            Gradient with respect to ``h`` (``dout * target_W``).
        """
        h, target_W = self.cache
        # Turn dout into a column so it broadcasts across each row.
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh

In [None]:
"""
EmbeddingDot 类共有 4 个成员变量：embed、params、grads 和 cache。根据本书的代码规范，params 保存参数，grads 保存梯度。另外，作为缓存，embed 保存 Embedding 层，cache 保存正向传播时的计算结果

正向传播的 forward(h, idx) 方法的参数接收中间层的神经元（h）和单词 ID 的 NumPy 数组（idx）。这里，idx 是单词 ID 列表，这是因为我们假定了对数据进行 mini-batch 处理
"""

## 4.2.6 负采样的采样方法

In [5]:
# Pick one number at random from 0 through 9
np.random.choice(10)

5

In [6]:
# Same call again; the recorded output below differs from the previous cell
np.random.choice(10)

1

In [7]:
# Word list from which the following cells pick elements at random
words = ['you', 'say', 'goodbye', 'I', 'hello', '.']

In [8]:
# Pick a single element at random from the words list
np.random.choice(words)

'.'

In [9]:
# Sample 5 times with replacement (the same word may appear more than once)
np.random.choice(words, size=5)

array(['.', 'you', 'you', 'goodbye', 'say'], dtype='<U7')

In [10]:
# Sample 5 times without replacement (no word is repeated)
np.random.choice(words, size=5, replace=False)

array(['goodbye', 'hello', '.', 'I', 'say'], dtype='<U7')

In [11]:
# Sample according to a probability distribution:
# p[i] is the probability of drawing words[i]
p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]
np.random.choice(words, p=p)

'I'

In [None]:
"""
如上所示，np.random.choice() 可以用于随机抽样。如果指定 size 参数，将执行多次采样。如果指定 replace=False，将进行无放回采样。通过给参数 p 指定表示概率分布的列表，将进行基于概率分布的采样。剩下的就是使用这个函数抽取负例
"""

In [None]:
"""
word2vec 中提出的负采样对刚才的概率分布增加了一个步骤：对原来的概率分布取 0.75 次方。
通过取 0.75 次方，低频单词的概率将稍微变高。
"""

In [12]:
# Example distribution with one high-probability and one very low-probability entry
p = [0.7, 0.29, 0.01]

In [13]:
# Raise every probability to the power 0.75
new_p = np.power(p, 0.75)

In [14]:
# Divide by the total so the values sum to 1 again
new_p /= np.sum(new_p)

In [15]:
# Show the adjusted distribution; the smallest entry has grown relative to p
print(new_p)

[0.64196878 0.33150408 0.02652714]


In [16]:
from negative_sampling_layer import UnigramSampler

In [None]:
"""
在进行初始化时，UnigramSampler 类取 3 个参数，分别是单词 ID 列表格式的 corpus、对概率分布取的次方值 power（默认值是 0.75）和负例的采样个数 sample_size。UnigramSampler 类有 get_negative_sample(target) 方法，该方法以参数 target 指定的单词 ID 为正例，对其他的单词 ID 进行采样。
"""

In [17]:
# Toy corpus given as an array of word IDs
corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])
# Exponent applied to the word probability distribution
power = 0.75
# Number of negative examples to draw per target
sample_size = 2

In [18]:
# Build the sampler from the toy corpus
sampler = UnigramSampler(corpus, power, sample_size)
# Mini-batch of three positive (target) word IDs
target = np.array([1, 3, 0])
# Draw negative examples for each target in the batch
negative_sample = sampler.get_negative_sample(target)

In [19]:
# One row per target, one column per drawn negative example
print(negative_sample)

[[4 2]
 [1 4]
 [1 3]]


## 4.2.7 负采样的实现

In [20]:
from common.layers import Embedding, SigmoidWithLoss

In [21]:
class NegativeSamplingLoss:
    """Loss layer that scores one positive and several sampled negative words.

    Layer 0 of ``loss_layers`` / ``embed_dot_layers`` handles the positive
    example; layers ``1 .. sample_size`` handle the negative examples.

    Attributes:
        sample_size: Number of negative examples drawn per target.
        simple_size: Same value under the constructor's parameter name,
            kept for backward compatibility.
        sampler: ``UnigramSampler`` used to draw the negative examples.
        loss_layers: ``sample_size + 1`` ``SigmoidWithLoss`` layers.
        embed_dot_layers: ``sample_size + 1`` ``EmbeddingDot`` layers, all
            constructed from the same ``W``.
        params: Concatenation of the ``params`` of every EmbeddingDot layer.
        grads: Concatenation of the gradient lists of every EmbeddingDot
            layer.
    """

    def __init__(self, W, corpus, power=0.75, simple_size=5):
        """Build the sampler and the per-example layers.

        Args:
            W: Weight matrix handed to every ``EmbeddingDot`` layer.
            corpus: Corpus forwarded to ``UnigramSampler``.
            power: Exponent forwarded to ``UnigramSampler``.
            simple_size: Number of negative examples per target. The
                (misspelled) name is kept so keyword callers keep working.
        """
        # Always use the constructor argument. An unrelated module-level
        # ``sample_size`` must not leak in and desynchronise the layer counts.
        self.sample_size = simple_size
        self.simple_size = simple_size
        self.sampler = UnigramSampler(corpus, power, simple_size)
        self.loss_layers = [SigmoidWithLoss() for _ in range(simple_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(simple_size + 1)]

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params += layer.params
            # Prefer the conventional ``grads`` attribute; fall back to the
            # historical ``gards`` spelling if that is all the layer exposes.
            layer_grads = getattr(layer, "grads", None)
            if layer_grads is None:
                layer_grads = layer.gards
            self.grads += layer_grads

    def forward(self, h, target):
        """Return the summed loss over the positive and negative examples.

        Args:
            h: Hidden-layer activations passed to each EmbeddingDot layer.
            target: Array of positive word IDs; ``target.shape[0]`` is used
                as the batch size.

        Returns:
            The positive-example loss plus the loss of each negative example.
        """
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Forward pass for the positive example (label 1).
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # Forward pass for the negative examples (label 0), one column of
        # the sampled IDs per layer.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[1 + i].forward(h, negative_target)
            loss += self.loss_layers[1 + i].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        """Backpropagate through every loss / embed-dot pair.

        Args:
            dout: Upstream gradient of the loss (defaults to 1).

        Returns:
            Sum of the gradients with respect to ``h`` from all layers.
        """
        dh = 0
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            dscore = l0.backward(dout)
            dh += l1.backward(dscore)
        return dh