使用gensim训练垂直领域的字向量，并且在模型中进行简单使用

In [1]:
import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [15]:
import codecs

# Rewrite the raw corpus so that every character is followed by a space;
# LineSentence then treats each character as one token.
# The context managers guarantee that both files are flushed and closed,
# even if an error occurs while processing a line.
with codecs.open('simple.txt', 'r', encoding='utf8') as f, \
        codecs.open('simple_train.txt', 'w', encoding='utf8') as target:
    print('处理后的语料：\n', '-' * 10)
    for line in f:
        # `line` still carries its trailing newline, so joining its
        # characters with spaces keeps one sentence per output line.
        line_seg = ' '.join(line)
        target.write(line_seg)
        print(line_seg, end='')

处理后的语料：
 ----------
这 是 一 个 大 苹 果 
这 是 一 个 大 菠 萝 
这 是 一 个 大 香 蕉 
那 是 一 辆 小 汽 车 
那 是 一 架 大 飞 机 
那 是 一 艘 小 轮 船

In [16]:
# Train character vectors on the space-separated corpus file.
model = Word2Vec(
    LineSentence('simple_train.txt'),
    size=2,    # dimensionality of each character vector
    window=3,  # word2vec context window size
    min_count=1,
)

In [17]:
model.save('simple') # this file is binary and cannot be opened directly

In [19]:
# Export the vectors held in model.wv with binary=False.
model.wv.save_word2vec_format('simple_not_c', binary=False)
# this file can be opened directly

加载刚训练好的字向量

In [20]:
# Load the model from the file 'simple' written by model.save above.
simple_word2vec_model = Word2Vec.load('simple')

In [26]:
# Closest character to '苹' in the trained vector space.
# The query goes through the KeyedVectors object (`.wv`): calling
# most_similar directly on the Word2Vec model is deprecated and emits a
# DeprecationWarning.
simple_word2vec_model.wv.most_similar('苹', topn=1)

  """Entry point for launching an IPython kernel.


[('机', 0.9983408451080322)]

In [31]:
# Fetch the vector of a single character directly
simple_word2vec_model.wv['苹']

array([ 0.05518538, -0.21129605], dtype=float32)

In [60]:
# All characters known to the trained model, in the vocabulary dict's order.
vocab_list = list(simple_word2vec_model.wv.vocab.keys())

In [63]:
# character -> row index; row 0 is reserved for " " and stays all-zero
word_index = {" ": 0}
# character -> vector
word_vector = {}
# One row per character plus the reserved row 0.
embedding_matrix = np.zeros((len(vocab_list) + 1, simple_word2vec_model.vector_size))

for i, word in enumerate(vocab_list, start=1):
    vector = simple_word2vec_model.wv[word]  # look the vector up once
    word_index[word] = i          # character: index
    word_vector[word] = vector    # character: vector
    embedding_matrix[i] = vector  # embedding matrix row

In [96]:
# embedding_matrix is the character-vector matrix and
# word_index is the "character -> index" dictionary.
embedding_matrix.shape
# Reverse mapping: index -> character.
number_dict = dict((index, char) for char, index in word_index.items())

将训练好的字向量用于 nnlm 模型当中

值得注意的是：必须按照 word_index 来构造输入和输出的 batch，这样每个字的索引才能与 embedding_matrix 中对应的行一致。

In [51]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [91]:
def make_batch(sentences):
    """Convert sentences into index sequences for next-character prediction.

    Each sentence is split into characters (whitespace is dropped). All
    characters except the last form the input; the last one is the target.
    Characters are mapped through the module-level ``word_index``; any
    character missing from it, whether in the input or as the target, maps
    to index 0.

    Args:
        sentences: iterable of strings.

    Returns:
        (input_batch, target_batch): a list of index lists and a list of
        target indices, aligned with ``sentences``.

    Raises:
        IndexError: if a sentence contains no non-whitespace character.
    """
    input_batch = []
    target_batch = []

    for sen in sentences:
        # Joining with spaces and splitting yields one token per character.
        chars = ' '.join(sen).split()
        input_batch.append([word_index.get(ch, 0) for ch in chars[:-1]])
        target_batch.append(word_index.get(chars[-1], 0))

    return input_batch, target_batch
# The input/output batches must be built through word_index.
sentences = [ "这有一个大苹果", "那有一艘小轮船", "那是一架大飞机"]
input_batch, target_batch = make_batch(sentences)
# Plain long tensors are sufficient: the Variable wrapper is deprecated.
# as_tensor accepts both lists and tensors, so re-running the last two
# lines on already-converted batches is harmless.
input_batch = torch.as_tensor(input_batch, dtype=torch.long)
target_batch = torch.as_tensor(target_batch, dtype=torch.long)

In [120]:
n_step = 6  # context length: the 6 leading characters predict the 7th
n_hidden = 2
m = embedding_matrix.shape[1]  # embedding dimension, read from the pretrained matrix
input_size = n_step * m  # width of the flattened context embeddings
hidden_size = n_hidden  # number of hidden units
n_class = embedding_matrix.shape[0]  # one class per embedding row (vocabulary + reserved index 0)

class NNLM(nn.Module):
    """Feed-forward language model on top of pretrained character vectors.

    The embedding table is initialised from the module-level
    ``embedding_matrix`` and stays trainable. The embeddings of the context
    characters are flattened to ``input_size`` features, passed through a
    tanh hidden layer of ``hidden_size`` units and projected to ``n_class``
    scores.
    """

    def __init__(self, input_size, hidden_size, n_class):
        super().__init__()
        self.input_size, self.hidden_size, self.n_class = input_size, hidden_size, n_class

        # To train embeddings from scratch instead of using the pretrained
        # word2vec vectors, one would use: nn.Embedding(n_class, m)

        # Pretrained vectors; freeze=False leaves requires_grad enabled so
        # the vectors are fine-tuned during training.
        pretrained = torch.FloatTensor(embedding_matrix)
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_class)

    def forward(self, x):
        """Return the class scores for a batch of index sequences ``x``."""
        flat = self.embedding(x).view(-1, self.input_size)
        hidden = torch.tanh(self.fc1(flat))
        return self.fc2(hidden)

# Network, loss function and optimiser for the word2vec-initialised model.
model = NNLM(input_size=input_size, hidden_size=hidden_size, n_class=n_class)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

print(model)

NNLM(
  (embedding): Embedding(23, 2)
  (fc1): Linear(in_features=12, out_features=2, bias=True)
  (fc2): Linear(in_features=2, out_features=23, bias=True)
)


In [121]:
for epoch in range(1000):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    output = model(input_batch)
    loss = criterion(output, target_batch)

    # Report the loss on every 1000th epoch.
    if not (epoch + 1) % 1000:
        print('Epoch: {:04d} cost = {:.6f}'.format(epoch + 1, loss))

    loss.backward()
    optimizer.step()

Epoch: 1000 cost = 0.779972


In [125]:
# Prediction: index of the highest-scoring class for every sentence.
# no_grad() replaces the legacy `.data` access, so no autograd graph is built.
with torch.no_grad():
    predict = model(input_batch).max(1, keepdim=True)[1]
print(predict)

# squeeze(1) drops only the kept dimension, so a single-sentence batch still
# yields an iterable 1-D tensor.
print([sen[:-1] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze(1)])


tensor([[19],
        [22],
        [19]])
['这有一个大苹', '那有一艘小轮', '那是一架大飞'] -> ['机', '船', '机']


In [126]:
# The embedding layer's weight matrix after training.
embedding_after_train = model.embedding.weight

In [127]:
# Compare one character's vector after training with its pretrained value.
i = 2
trained_vec = embedding_after_train[i].tolist()
pretrained_vec = embedding_matrix[i]
print('针对 "%s" 的字向量为\n 训练后 %s \n 训练前 %s' % (number_dict[i], trained_vec, pretrained_vec))

针对 "是" 的字向量为
 训练后 [-0.2596454322338104, 0.33751505613327026] 
 训练前 [-0.14180711 -0.1044377 ]


## 使用搜狗的词向量

In [1]:
import torch
import numpy as np
# Load the pretrained embedding table from the .npz archive. The context
# manager closes the archive's file handle once the array has been copied out.
with np.load('embedding_SougouNews.npz') as archive:
    embedding_pretrained = torch.tensor(archive["embeddings"].astype('float32'))

In [14]:
import pickle as pkl
vocab_path = 'vocab.pkl'
# NOTE(review): unpickling can execute arbitrary code; only load vocabulary
# files that come from a trusted source.
# The context manager closes the file handle after loading.
with open(vocab_path, 'rb') as vocab_file:
    vocab = pkl.load(vocab_file)

In [22]:
# Number of entries in the loaded vocabulary.
len(vocab)

4762

In [33]:
# Reverse lookup: index -> token.
index_to_vocab = dict((index, token) for token, index in vocab.items())

In [18]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

def make_batch(sentences):
    """Turn sentences into index sequences using the module-level ``vocab``.

    Each sentence is split into characters (whitespace is dropped). Every
    character is mapped through ``vocab``; characters missing from it get
    the index stored under '<UNK>'. All indices but the last form the input,
    the last index is the target.

    Returns:
        (input_batch, target_batch), aligned with ``sentences``.
    """
    unk_id = vocab.get('<UNK>')
    input_batch, target_batch = [], []

    for sen in sentences:
        ids = [vocab.get(ch, unk_id) for ch in ' '.join(sen).split()]
        target_batch.append(ids[-1])
        input_batch.append(ids[:-1])

    return input_batch, target_batch
# Build the input/output batches with make_batch.
sentences = [
    "这有一个大苹果",
    "那有一艘小轮船",
    "那是一架大飞机",
]
input_batch, target_batch = make_batch(sentences)

In [28]:
# Plain long tensors are sufficient: the Variable wrapper is deprecated.
# as_tensor accepts both lists and tensors, so re-running this cell on
# already-converted batches is harmless.
input_batch = torch.as_tensor(input_batch, dtype=torch.long)
target_batch = torch.as_tensor(target_batch, dtype=torch.long)


n_step = 6  # context length: the 6 leading characters predict the 7th
n_hidden = 2
m = embedding_pretrained.shape[1]  # embedding dimension, read from the pretrained table
input_size = n_step * m  # width of the flattened context embeddings
hidden_size = n_hidden  # number of hidden units
n_class = len(vocab)  # one output class per vocabulary entry

class NNLM(nn.Module):
    """Feed-forward language model on top of frozen pretrained embeddings.

    The embedding table is taken from the module-level
    ``embedding_pretrained`` tensor and is not updated during training. The
    embeddings of the context characters are flattened to ``input_size``
    features, passed through a tanh hidden layer of ``hidden_size`` units
    and projected to ``n_class`` scores.
    """

    def __init__(self, input_size, hidden_size, n_class):
        super().__init__()
        self.input_size, self.hidden_size, self.n_class = input_size, hidden_size, n_class

        # To train embeddings from scratch instead of using pretrained
        # vectors, one would use: nn.Embedding(n_class, m)

        # Pretrained vectors; freeze=True keeps requires_grad disabled so
        # the vectors are not fine-tuned.
        self.embedding = nn.Embedding.from_pretrained(embedding_pretrained, freeze=True)

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_class)

    def forward(self, x):
        """Return the class scores for a batch of index sequences ``x``."""
        flat = self.embedding(x).view(-1, self.input_size)
        hidden = torch.tanh(self.fc1(flat))
        return self.fc2(hidden)

# Network, loss function and optimiser for the model with frozen embeddings.
model = NNLM(input_size=input_size, hidden_size=hidden_size, n_class=n_class)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

print(model)


for epoch in range(1000):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    output = model(input_batch)
    loss = criterion(output, target_batch)

    # Report the loss on every 1000th epoch.
    if not (epoch + 1) % 1000:
        print('Epoch: {:04d} cost = {:.6f}'.format(epoch + 1, loss))

    loss.backward()
    optimizer.step()
    
    
# Prediction: index of the highest-scoring class for every sentence.
# no_grad() replaces the legacy `.data` access, so no autograd graph is built.
with torch.no_grad():
    predict = model(input_batch).max(1, keepdim=True)[1]
print(predict)

NNLM(
  (embedding): Embedding(4762, 300)
  (fc1): Linear(in_features=1800, out_features=2, bias=True)
  (fc2): Linear(in_features=2, out_features=4762, bias=True)
)
Epoch: 1000 cost = 3.154887
tensor([[ 579],
        [1172],
        [  47]])


In [34]:
# Show each context next to its predicted character. squeeze(1) drops only
# the kept dimension, so a single-sentence batch still yields an iterable
# 1-D tensor.
print([sen[:-1] for sen in sentences], '->', [index_to_vocab[n.item()] for n in predict.squeeze(1)])

['这有一个大苹', '那有一艘小轮', '那是一架大飞'] -> ['果', '船', '机']
