## 4.3.1 CBOW模型的实现

In [2]:
import sys

sys.path.append('..')
import numpy as np
from common.layers import Embedding
from negative_sampling_layer import NegativeSamplingLoss

In [None]:
"""
这个初始化方法有 4 个参数。vocab_size 是词汇量，hidden_size 是中间层的神经元个数，corpus 是单词 ID 列表。另外，通过 window_size 指定上下文的大小，即上下文包含多少个周围单词。如果 window_size 是 2，则目标词的左右 2 个单词（共 4 个单词）将成为上下文
"""

In [3]:
class CBOW:
    """Continuous bag-of-words (CBOW) model trained with a negative-sampling loss.

    One ``Embedding`` layer is created for each of the ``2 * window_size``
    context positions, and every one of them is built on the same input
    weight matrix. The averaged context embedding is handed to a
    ``NegativeSamplingLoss`` layer, which produces the loss.

    Attributes:
        in_layers: list holding the ``2 * window_size`` input ``Embedding`` layers.
        ns_loss: the ``NegativeSamplingLoss`` output layer.
        params: every layer's ``params`` concatenated into one list.
        grads: every layer's ``grads`` concatenated into one list.
        word_vecs: the input weight matrix, exposed as the word vectors.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        """Create the weights and layers of the model.

        Args:
            vocab_size: vocabulary size; number of rows of each weight matrix.
            hidden_size: number of hidden-layer units; number of columns of
                each weight matrix.
            window_size: number of context words taken on each side of the
                target, so ``2 * window_size`` input layers are created.
            corpus: list of word IDs, forwarded to ``NegativeSamplingLoss``.
        """
        weight_shape = (vocab_size, hidden_size)

        # Weight initialisation: small Gaussian values cast to float32.
        # W_in is drawn before W_out so the random stream is consumed in a fixed order.
        W_in = 0.01 * np.random.randn(*weight_shape).astype('f')
        W_out = 0.01 * np.random.randn(*weight_shape).astype('f')

        # Layer construction: one Embedding per context position, all built on W_in.
        context_count = 2 * window_size
        self.in_layers = [Embedding(W_in) for _ in range(context_count)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75,
                                            sample_size=5)

        # Gather every layer's weights and gradients into flat lists.
        self.params, self.grads = [], []
        for layer in self.in_layers + [self.ns_loss]:
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

        # Expose the distributed word representations as a member variable.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Compute the loss for a batch.

        Args:
            contexts: indexable as ``contexts[:, i]``; column ``i`` is passed
                to the ``i``-th input layer.
            target: passed unchanged to the loss layer together with the
                averaged hidden activation.

        Returns:
            Whatever ``self.ns_loss.forward`` returns for this batch.
        """
        hidden = 0
        for position, embed in enumerate(self.in_layers):
            hidden += embed.forward(contexts[:, position])
        # Turn the sum over context positions into a mean.
        hidden *= 1 / len(self.in_layers)
        return self.ns_loss.forward(hidden, target)

    def backward(self, dout=1):
        """Back-propagate through the loss layer and every input layer.

        Args:
            dout: upstream gradient handed to the loss layer (defaults to 1).

        Returns:
            None.
        """
        grad = self.ns_loss.backward(dout)
        # Scale in place by the same 1/N factor that forward applied for the mean.
        grad *= 1 / len(self.in_layers)
        for embed in self.in_layers:
            embed.backward(grad)

In [18]:
import sys

sys.path.append('..')
import numpy as np
from common import config

# GPU switch: to run on the GPU (requires cupy), set config.GPU = True below.
# It is currently set to False, so the GPU branches later in the notebook are skipped.
# ===============================================
config.GPU = False
# ===============================================
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

In [19]:
# Hyperparameters for this training run.
window_size = 5  # context words taken on each side of the target word
hidden_size = 100  # hidden-layer size handed to CBOW
batch_size = 100  # handed to trainer.fit
max_epoch = 10  # handed to trainer.fit

In [20]:
# Load the 'train' split: the corpus plus the word->ID and ID->word mappings
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)  # vocabulary size = number of entries in word_to_id

In [21]:
# Show the current value of the GPU switch before building the training data
print(config.GPU)

False


In [22]:
# Build the contexts and targets used for training from the corpus
contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    # Only when the GPU switch is on: convert both with to_gpu
    contexts, target = to_gpu(contexts), to_gpu(target)

In [23]:
# Create the model, the optimizer, and the trainer that ties them together
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

In [24]:
# Start training, then call trainer.plot() once fitting has finished.
# NOTE(review): in the recorded run below this cell was interrupted during epoch 1.
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

| epoch 1 |  iter 1 / 9295 | time 0[s] | loss 4.16
| epoch 1 |  iter 21 / 9295 | time 1[s] | loss 4.16
| epoch 1 |  iter 41 / 9295 | time 2[s] | loss 4.15
| epoch 1 |  iter 61 / 9295 | time 3[s] | loss 4.13
| epoch 1 |  iter 81 / 9295 | time 4[s] | loss 4.06
| epoch 1 |  iter 101 / 9295 | time 5[s] | loss 3.94
| epoch 1 |  iter 121 / 9295 | time 6[s] | loss 3.79
| epoch 1 |  iter 141 / 9295 | time 7[s] | loss 3.65
| epoch 1 |  iter 161 / 9295 | time 8[s] | loss 3.50
| epoch 1 |  iter 181 / 9295 | time 9[s] | loss 3.38
| epoch 1 |  iter 201 / 9295 | time 10[s] | loss 3.26
| epoch 1 |  iter 221 / 9295 | time 11[s] | loss 3.15
| epoch 1 |  iter 241 / 9295 | time 12[s] | loss 3.10
| epoch 1 |  iter 261 / 9295 | time 13[s] | loss 3.02
| epoch 1 |  iter 281 / 9295 | time 14[s] | loss 2.97
| epoch 1 |  iter 301 / 9295 | time 15[s] | loss 2.92
| epoch 1 |  iter 321 / 9295 | time 16[s] | loss 2.89
| epoch 1 |  iter 341 / 9295 | time 17[s] | loss 2.85
| epoch 1 |  iter 361 / 9295 | time 19[s] | 

KeyboardInterrupt: 

In [25]:
# Save the data needed later: the word vectors and both vocabulary mappings
word_vecs = model.word_vecs
if config.GPU:
    # Only when the GPU switch is on: convert the vectors with to_cpu before saving
    word_vecs = to_cpu(word_vecs)
params = {
    'word_vecs': word_vecs.astype(np.float16),  # stored as float16
    'word_to_id': word_to_id,
    'id_to_word': id_to_word,
}
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'wb') as f:
    # protocol -1 selects the highest pickle protocol available
    pickle.dump(params, f, protocol=-1)

In [27]:
from common.util import most_similar

In [28]:
# Reload the parameters written by the save cell above.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)

word_vecs, word_to_id, id_to_word = (
    params[key] for key in ('word_vecs', 'word_to_id', 'id_to_word')
)

# Print the 5 most similar words for each query word
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)


[query] you
 why: 0.974609375
 we: 0.97314453125
 i: 0.97021484375
 please: 0.95703125
 something: 0.94921875

[query] year
 month: 0.9365234375
 week: 0.8974609375
 earlier: 0.83154296875
 period: 0.82421875
 forecast: 0.81884765625

[query] car
 contest: 0.9453125
 injection: 0.94287109375
 sedan: 0.93798828125
 disk: 0.92724609375
 army: 0.92333984375

[query] toyota
 ind.: 0.9375
 penney: 0.93115234375
 northrop: 0.92919921875
 proposes: 0.92724609375
 kodak: 0.923828125


In [30]:
from common.util import analogy

In [31]:
# Analogy query "king : man = queen : ?" using the loaded word vectors
analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)


[analogy] king:man = queen:?
 share: 5.0078125
 thing: 4.81640625
 spokesman: 4.74609375
 way: 4.58203125
 lot: 4.54296875


In [32]:
# Analogy query "take : took = go : ?" using the loaded word vectors
analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs)


[analogy] take:took = go:?
 composite: 5.7265625
 york: 4.66796875
 exchange: 4.5234375
 quarter: 4.296875
 ended: 4.08984375


In [33]:
# Analogy query "car : cars = child : ?" using the loaded word vectors
analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs)


[analogy] car:cars = child:?
 yield: 4.88671875
 addition: 4.2421875
 come: 4.171875
 revenue: 4.08984375
 try: 3.91015625


In [34]:
# Analogy query "good : better = bad : ?" using the loaded word vectors
analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs)


[analogy] good:better = bad:?
 do: 4.0
 know: 3.912109375
 than: 3.8515625
 think: 3.833984375
 n't: 3.75


In [None]:
"""
训练在第 1 个 epoch 的第 361 / 9295 次迭代附近被手动中断（KeyboardInterrupt），模型训练远远不足，因此上面的相似词和类推结果正确率很低
"""