# word2vecの高速化

コーパスが大規模になると、入力層をone-hot表現のまま計算するには膨大なメモリが必要になってしまう。<br>
&rarr; Embeddingレイヤの導入

In [1]:
# Embeddingレイヤ
import numpy as np

class Embedding:
    """Embedding layer: selects rows of a weight matrix by index.

    Instead of multiplying a one-hot vector by ``W``, the layer simply picks
    the rows ``W[idx]``.

    Attributes:
        params: one-element list holding the weight matrix ``W``.
        grads: one-element list holding the gradient buffer for ``W``
            (same shape and dtype as ``W``).
        idx: the indices passed to the most recent ``forward`` call.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        """Return the rows of ``W`` selected by ``idx`` and remember ``idx``."""
        W, = self.params
        self.idx = idx
        return W[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows selected in ``forward``.

        The gradient buffer is reset to zero first, so it always reflects
        only the latest backward pass.

        Args:
            dout: upstream gradient; ``dout[i]`` belongs to the row selected
                by the i-th index given to ``forward``.

        Returns:
            None. Indices are not differentiable, so nothing is propagated.
        """
        dW, = self.grads
        dW[...] = 0  # reset in place so references held via `grads` stay valid
        # Unbuffered scatter-add: when the same id occurs more than once, every
        # contribution is summed (a plain `dW[idx] += dout` would keep only one).
        np.add.at(dW, self.idx, dout)
        return None

中間層以降の計算にも多くの計算が必要であり、Softmaxレイヤに関わる箇所でも、コーパスが大規模になるにつれて計算量が増加することが問題となる。<br>
&rarr; Negative Samplingという損失関数の導入(計算量を一定に抑える)

In [2]:
# EmbeddingDotレイヤの実装
class EmbeddingDot:
    """Embedding lookup followed by a row-wise dot product with ``h``.

    Attributes:
        embed: the wrapped ``Embedding`` layer holding the weight matrix.
        params: the wrapped layer's parameter list (shared, not copied).
        grads: the wrapped layer's gradient list (shared, not copied).
        cache: ``(h, target_W)`` saved by ``forward`` for use in ``backward``.
    """

    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h, idx):
        """Return, for each sample, the dot product of ``h`` and ``W[idx]``.

        Args:
            h: array multiplied element-wise with the selected weight rows and
                summed over axis 1 (assumed (batch, hidden) - TODO confirm).
            idx: indices of the weight rows to select, one per sample.

        Returns:
            The sum over axis 1 of ``W[idx] * h``.
        """
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1)

        self.cache = (h, target_W)
        return out

    def backward(self, dout):
        """Back-propagate through the dot product.

        Args:
            dout: upstream gradient with one value per sample; it is reshaped
                to a column so it broadcasts across the feature axis.

        Returns:
            Gradient with respect to ``h`` (``dout * target_W``).
        """
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        # Gradient w.r.t. the selected weight rows goes into the embedding.
        dtarget_W = dout * h
        self.embed.backward(dtarget_W)

        dh = dout * target_W
        return dh

In [6]:
# Negative Samplingの実装
import sys
sys.path.append('./samplecode')
from ch04.negative_sampling_layer import UnigramSampler
from common.layers import SigmoidWithLoss

class NegativeSamplingLoss:
    """Loss layer that scores one positive target plus sampled negatives.

    Layer index 0 handles the positive (correct) target; indices
    1..sample_size handle the negative samples. Every ``EmbeddingDot`` layer
    is built from the same weight matrix ``W``.

    Attributes:
        sample_size: number of negative samples per positive target.
        sampler: ``UnigramSampler`` used to draw the negative samples.
        loss_layers: ``sample_size + 1`` ``SigmoidWithLoss`` layers.
        embed_dot_layers: ``sample_size + 1`` ``EmbeddingDot`` layers.
        params, grads: concatenation of the ``EmbeddingDot`` layers'
            parameter and gradient lists.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, h, target):
        """Return the summed loss over the positive and negative samples.

        Args:
            h: hidden-layer output passed to every ``EmbeddingDot`` layer.
            target: array of positive target ids; ``target.shape[0]`` is used
                as the batch size.

        Returns:
            Loss for the positive sample plus the losses of all negatives.
        """
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example: label 1.
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # Negative examples: label 0, one column of `negative_sample` each.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[1 + i].forward(h, negative_target)
            loss += self.loss_layers[1 + i].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        """Back-propagate through every loss/dot pair and sum the gradients.

        Args:
            dout: upstream gradient handed to each loss layer.

        Returns:
            Sum of the gradients with respect to ``h`` from all layer pairs.
        """
        dh = 0
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            dscore = l0.backward(dout)
            dh += l1.backward(dscore)

        return dh

In [7]:
# CBOWモデルの実装
import sys
sys.path.append('./samplecode')
import numpy as np
from common.layers import Embedding
from ch04.negative_sampling_layer import NegativeSamplingLoss

class CBOW:
    """CBOW word2vec model using Embedding inputs and negative-sampling loss.

    Attributes:
        in_layers: ``2 * window_size`` ``Embedding`` layers, all built from
            the same input weight matrix.
        ns_loss: ``NegativeSamplingLoss`` built from the output weight matrix.
        params, grads: parameters and gradients of all layers gathered into
            flat lists (the shared input weights appear once per input layer).
        word_vecs: the input weight matrix, exposed as the word vectors.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        # Initialise the weights with small random float32 values.
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # One input layer per context position; they all share W_in.
        self.in_layers = []
        for i in range(2 * window_size):
            layer = Embedding(W_in)
            self.in_layers.append(layer)
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # Gather every weight and gradient into flat lists.
        layers = self.in_layers + [self.ns_loss]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # Expose the distributed word representations.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Compute the loss for a batch.

        Args:
            contexts: 2-D array of ids; column ``i`` is fed to input layer ``i``.
            target: target ids passed on to the negative-sampling loss.

        Returns:
            The loss returned by the negative-sampling loss layer.
        """
        h = 0
        for i, layer in enumerate(self.in_layers):
            h += layer.forward(contexts[:, i])
        h *= 1 / len(self.in_layers)  # average over the context positions
        loss = self.ns_loss.forward(h, target)
        return loss

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss and every input layer.

        Returns:
            None.
        """
        dout = self.ns_loss.backward(dout)
        dout *= 1 / len(self.in_layers)  # gradient of the averaging step
        for layer in self.in_layers:
            layer.backward(dout)
        return None

In [11]:
# CBOWモデルの学習コード(実行にとても時間がかかる)

import sys
sys.path.append('./samplecode')
import numpy as np
from common import config
# GPUで実行する場合は下記を使う(cupyが必要)
# config.GPU = True
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from ch04.cbow import CBOW
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

# Hyperparameters
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the 'train' split of the PTB dataset and derive the vocabulary size
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

# Build the (contexts, target) pairs; move them to the GPU when enabled
contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Model, optimizer and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Train, then plot the trainer's results
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save what later cells need: word vectors (as float16) and both vocab maps
word_vecs = to_cpu(model.word_vecs) if config.GPU else model.word_vecs
params = {
    'word_vecs': word_vecs.astype(np.float16),
    'word_to_id': word_to_id,
    'id_to_word': id_to_word,
}
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'wb') as f:
    # HIGHEST_PROTOCOL is what a negative protocol number selects
    pickle.dump(params, f, pickle.HIGHEST_PROTOCOL)

| epoch 1 |  iter 1 / 9295 | time 0[s] | loss 4.16
| epoch 1 |  iter 21 / 9295 | time 1[s] | loss 4.16
| epoch 1 |  iter 41 / 9295 | time 3[s] | loss 4.15
| epoch 1 |  iter 61 / 9295 | time 4[s] | loss 4.12
| epoch 1 |  iter 81 / 9295 | time 6[s] | loss 4.05
| epoch 1 |  iter 101 / 9295 | time 8[s] | loss 3.92
| epoch 1 |  iter 121 / 9295 | time 10[s] | loss 3.78
| epoch 1 |  iter 141 / 9295 | time 11[s] | loss 3.63
| epoch 1 |  iter 161 / 9295 | time 13[s] | loss 3.49
| epoch 1 |  iter 181 / 9295 | time 15[s] | loss 3.37
| epoch 1 |  iter 201 / 9295 | time 17[s] | loss 3.25
| epoch 1 |  iter 221 / 9295 | time 18[s] | loss 3.15
| epoch 1 |  iter 241 / 9295 | time 20[s] | loss 3.09
| epoch 1 |  iter 261 / 9295 | time 21[s] | loss 3.00
| epoch 1 |  iter 281 / 9295 | time 23[s] | loss 2.95
| epoch 1 |  iter 301 / 9295 | time 25[s] | loss 2.91
| epoch 1 |  iter 321 / 9295 | time 26[s] | loss 2.88
| epoch 1 |  iter 341 / 9295 | time 28[s] | loss 2.83
| epoch 1 |  iter 361 / 9295 | time 30[s

| epoch 1 |  iter 2981 / 9295 | time 245[s] | loss 2.43
| epoch 1 |  iter 3001 / 9295 | time 247[s] | loss 2.47
| epoch 1 |  iter 3021 / 9295 | time 249[s] | loss 2.46
| epoch 1 |  iter 3041 / 9295 | time 250[s] | loss 2.44
| epoch 1 |  iter 3061 / 9295 | time 252[s] | loss 2.47
| epoch 1 |  iter 3081 / 9295 | time 253[s] | loss 2.43
| epoch 1 |  iter 3101 / 9295 | time 255[s] | loss 2.44
| epoch 1 |  iter 3121 / 9295 | time 257[s] | loss 2.46
| epoch 1 |  iter 3141 / 9295 | time 258[s] | loss 2.47
| epoch 1 |  iter 3161 / 9295 | time 260[s] | loss 2.46
| epoch 1 |  iter 3181 / 9295 | time 262[s] | loss 2.44
| epoch 1 |  iter 3201 / 9295 | time 263[s] | loss 2.45
| epoch 1 |  iter 3221 / 9295 | time 265[s] | loss 2.44
| epoch 1 |  iter 3241 / 9295 | time 267[s] | loss 2.49
| epoch 1 |  iter 3261 / 9295 | time 268[s] | loss 2.43
| epoch 1 |  iter 3281 / 9295 | time 270[s] | loss 2.44
| epoch 1 |  iter 3301 / 9295 | time 272[s] | loss 2.45
| epoch 1 |  iter 3321 / 9295 | time 273[s] | lo

| epoch 1 |  iter 5921 / 9295 | time 500[s] | loss 2.34
| epoch 1 |  iter 5941 / 9295 | time 503[s] | loss 2.29
| epoch 1 |  iter 5961 / 9295 | time 505[s] | loss 2.31
| epoch 1 |  iter 5981 / 9295 | time 506[s] | loss 2.31
| epoch 1 |  iter 6001 / 9295 | time 509[s] | loss 2.37
| epoch 1 |  iter 6021 / 9295 | time 510[s] | loss 2.31
| epoch 1 |  iter 6041 / 9295 | time 512[s] | loss 2.33
| epoch 1 |  iter 6061 / 9295 | time 514[s] | loss 2.32
| epoch 1 |  iter 6081 / 9295 | time 516[s] | loss 2.31
| epoch 1 |  iter 6101 / 9295 | time 518[s] | loss 2.31
| epoch 1 |  iter 6121 / 9295 | time 519[s] | loss 2.28
| epoch 1 |  iter 6141 / 9295 | time 521[s] | loss 2.34
| epoch 1 |  iter 6161 / 9295 | time 523[s] | loss 2.32
| epoch 1 |  iter 6181 / 9295 | time 525[s] | loss 2.32
| epoch 1 |  iter 6201 / 9295 | time 527[s] | loss 2.30
| epoch 1 |  iter 6221 / 9295 | time 528[s] | loss 2.31
| epoch 1 |  iter 6241 / 9295 | time 530[s] | loss 2.31
| epoch 1 |  iter 6261 / 9295 | time 531[s] | lo

| epoch 1 |  iter 8861 / 9295 | time 742[s] | loss 2.26
| epoch 1 |  iter 8881 / 9295 | time 743[s] | loss 2.18
| epoch 1 |  iter 8901 / 9295 | time 745[s] | loss 2.28
| epoch 1 |  iter 8921 / 9295 | time 746[s] | loss 2.24
| epoch 1 |  iter 8941 / 9295 | time 748[s] | loss 2.22
| epoch 1 |  iter 8961 / 9295 | time 750[s] | loss 2.18
| epoch 1 |  iter 8981 / 9295 | time 751[s] | loss 2.24
| epoch 1 |  iter 9001 / 9295 | time 753[s] | loss 2.21
| epoch 1 |  iter 9021 / 9295 | time 754[s] | loss 2.25
| epoch 1 |  iter 9041 / 9295 | time 756[s] | loss 2.24
| epoch 1 |  iter 9061 / 9295 | time 758[s] | loss 2.21
| epoch 1 |  iter 9081 / 9295 | time 759[s] | loss 2.20
| epoch 1 |  iter 9101 / 9295 | time 761[s] | loss 2.19
| epoch 1 |  iter 9121 / 9295 | time 762[s] | loss 2.22
| epoch 1 |  iter 9141 / 9295 | time 764[s] | loss 2.20
| epoch 1 |  iter 9161 / 9295 | time 765[s] | loss 2.19
| epoch 1 |  iter 9181 / 9295 | time 767[s] | loss 2.21
| epoch 1 |  iter 9201 / 9295 | time 769[s] | lo

KeyboardInterrupt: 

In [14]:
import sys
sys.path.append('./samplecode')
from common.util import most_similar
import pickle

pkl_file = 'cbow_params.pkl'

# Load the pickled parameters written by the training cell
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)

# Unpack the word vectors and the two vocabulary mappings
word_vecs, word_to_id, id_to_word = (
    params[key] for key in ('word_vecs', 'word_to_id', 'id_to_word')
)

# Show the 5 most similar words for each query
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)


[query] you
 we: 0.6103515625
 someone: 0.59130859375
 i: 0.55419921875
 something: 0.48974609375
 anyone: 0.47314453125

[query] year
 month: 0.71875
 week: 0.65234375
 spring: 0.62744140625
 summer: 0.6259765625
 decade: 0.603515625

[query] car
 luxury: 0.497314453125
 arabia: 0.47802734375
 auto: 0.47119140625
 disk-drive: 0.450927734375
 travel: 0.4091796875

[query] toyota
 ford: 0.55078125
 instrumentation: 0.509765625
 mazda: 0.49365234375
 bethlehem: 0.47509765625
 nissan: 0.474853515625


In [16]:
from common.util import analogy

# Run each analogy "a : b = c : ?" and show the top 5 candidates
for _a, _b, _c in (
    ('man', 'king', 'woman'),
    ('take', 'took', 'go'),
    ('car', 'cars', 'child'),
    ('good', 'better', 'bad'),
):
    analogy(_a, _b, _c, word_to_id, id_to_word, word_vecs, top=5)


[analogy] man:king = woman:?
 she: 4.1796875
 moody: 4.1328125
 share: 4.05078125
 character: 3.966796875
 chain: 3.912109375

[analogy] take:took = go:?
 went: 4.55078125
 points: 4.25
 began: 4.09375
 comes: 3.98046875
 oct.: 3.90625

[analogy] car:cars = child:?
 children: 5.21875
 average: 4.7265625
 yield: 4.20703125
 cattle: 4.1875
 priced: 4.1796875

[analogy] good:better = bad:?
 more: 6.6484375
 less: 6.0625
 rather: 5.21875
 slower: 4.734375
 greater: 4.671875
