In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
docs = [
        "新的数学方法和概念，常常比解决数学问题本身更重要。",
        "在数学中，我们发现真理的主要工具是归纳和模拟。",
        "数学方法渗透并支配着一切自然科学的理论分支。它愈来愈成为衡量科学成就的主要标志了。",
        "第一是数学，第二是数学，第三是数学。",
        "历史使人贤明，诗造成气质高雅的人，数学使人高尚，自然哲学使人深沉，道德使人稳重，而伦理学和修辞学则使人善于争论。"
       ]

#### 使用jieba 分词

In [3]:
import jieba
words_list = [list(jieba.cut(doc)) for doc in docs]
words_list

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\62669\AppData\Local\Temp\jieba.cache
Loading model cost 0.637 seconds.
Prefix dict has been built succesfully.


[['新',
  '的',
  '数学方法',
  '和',
  '概念',
  '，',
  '常常',
  '比',
  '解决',
  '数学',
  '问题',
  '本身',
  '更',
  '重要',
  '。'],
 ['在',
  '数学',
  '中',
  '，',
  '我们',
  '发现',
  '真理',
  '的',
  '主要',
  '工具',
  '是',
  '归纳',
  '和',
  '模拟',
  '。'],
 ['数学方法',
  '渗透',
  '并',
  '支配',
  '着',
  '一切',
  '自然科学',
  '的',
  '理论',
  '分支',
  '。',
  '它',
  '愈来愈',
  '成为',
  '衡量',
  '科学',
  '成就',
  '的',
  '主要',
  '标志',
  '了',
  '。'],
 ['第一', '是', '数学', '，', '第二', '是', '数学', '，', '第三', '是', '数学', '。'],
 ['历史',
  '使人',
  '贤明',
  '，',
  '诗',
  '造成',
  '气质高雅',
  '的',
  '人',
  '，',
  '数学',
  '使人',
  '高尚',
  '，',
  '自然哲学',
  '使人',
  '深沉',
  '，',
  '道德',
  '使人',
  '稳重',
  '，',
  '而',
  '伦理学',
  '和',
  '修辞学',
  '则',
  '使',
  '人',
  '善于',
  '争论',
  '。']]

#### 构建词典

In [4]:
vocb = set([word for words in words_list for word in words])
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

#### 使用nn.Embedding创建词嵌入单元

In [5]:
vocb_size = len(vocb)
embedding_size = 200
embeds = nn.Embedding(vocb_size, embedding_size) 
embeds

Embedding(63, 200)

#### 查找词嵌入，该方法对应字典中的id

In [6]:
word_to_idx["数学"]

28

In [7]:
look_up = torch.LongTensor([word_to_idx["数学"]])
embeds(look_up)

tensor([[-6.4583e-01,  7.7057e-01, -6.3130e-01,  5.4269e-01, -1.4782e+00,
         -1.2041e-01,  5.9758e-01,  1.5149e+00, -2.6339e-01, -6.7263e-01,
         -6.4217e-01,  1.0209e+00, -9.4901e-02, -3.6418e-01, -1.0050e-01,
         -1.4244e+00, -1.2991e+00, -1.0365e+00,  1.4462e-01, -1.2257e+00,
         -3.6916e-02, -4.1594e-01, -1.9010e-02, -4.3103e-01, -1.4997e+00,
          1.2723e+00,  7.0842e-01,  1.0682e+00,  3.6274e-01, -5.8121e-01,
          7.9005e-01, -1.2472e+00, -4.5186e-03, -3.3665e-02, -1.3081e+00,
         -1.6804e+00,  5.0088e-01,  1.9714e+00,  4.9155e-01,  6.5748e-01,
          1.3346e+00,  8.8960e-02,  6.9962e-01, -9.7362e-01,  5.2199e-02,
         -2.3520e+00,  1.1331e+00,  2.9872e-02,  2.8982e-01,  1.2950e+00,
         -7.4605e-01, -1.0734e+00,  8.5781e-01,  3.1512e-01,  1.2917e+00,
          1.0974e+00,  2.9043e-01, -4.8641e-01, -4.7770e-01,  5.4404e-01,
         -5.9924e-01, -6.6578e-01, -9.0113e-01,  1.2556e+00, -5.7889e-01,
         -8.8332e-01, -6.4160e-02, -6.

#### 加载外部训练好的词向量

In [9]:
import gensim
from gensim.models import Word2Vec
word2vec_model = gensim.models.KeyedVectors.load("news.word2vec")

In [10]:
embedding_shape = word2vec_model.wv.vectors.shape
embedding_shape

(155362, 200)

In [11]:
#初始化Embedding层
embed = nn.Embedding(embedding_shape[0], embedding_shape[1])
#拷贝训练好的词向量权重
embed.weight.data.copy_(torch.from_numpy(word2vec_model.wv.vectors))
#冻结权重，训练过程不更新
embed.weight.requires_grad = False

In [12]:
#找到数学的index
index = word2vec_model.wv.vocab["数学"].index
#查表
embed(torch.LongTensor([index]))

tensor([[ 0.9777,  0.3726,  0.3545, -0.3584,  0.9531,  0.3427, -0.4642,  1.2159,
         -0.9503, -0.5782, -2.4697,  1.5502, -0.9270,  2.3928, -0.2477, -0.6038,
         -0.4944,  1.3966,  0.7566, -0.8826, -1.1975, -0.0653, -0.1487,  0.7565,
         -0.6251, -1.6796, -0.3788, -1.0130,  1.9895, -0.4686, -0.1373,  2.0175,
         -0.2474, -1.5085,  0.7892,  0.3344, -1.1795, -3.1736,  0.9433, -1.4503,
         -1.5128,  0.2205,  0.9734, -1.2039, -1.7167,  1.3966, -1.7425, -0.2714,
          0.3819, -0.6154, -0.8171, -0.6028, -2.7886, -0.5618, -0.2610,  1.0492,
         -0.3627, -1.2233,  1.2634,  0.3334,  0.6682,  2.6612,  0.6975, -0.0708,
          1.5320, -0.1496, -0.2336,  1.2927, -0.7408, -1.0342, -0.9239,  0.0202,
          0.8004,  1.4688, -1.2386,  0.2145,  2.0381, -0.3545,  0.8606, -1.0852,
         -0.7719, -1.5213,  0.2656, -0.9081, -1.8508, -1.4375, -0.8160,  1.5982,
          0.8021,  0.5514,  0.4641,  0.4165,  0.2809, -0.8394, -0.1702, -0.4008,
          1.1085, -1.0770,  

In [13]:
index

5160

In [14]:
word2vec_model.wv.vectors[5160]

array([ 0.9777014 ,  0.37264398,  0.3544503 , -0.3584271 ,  0.9531079 ,
        0.34265763, -0.46419275,  1.2158922 , -0.95033014, -0.57818353,
       -2.4696522 ,  1.5502193 , -0.9270395 ,  2.3928015 , -0.24768908,
       -0.6038057 , -0.4943789 ,  1.3965534 ,  0.7565906 , -0.88257504,
       -1.1974952 , -0.06531978, -0.14872837,  0.7565044 , -0.625109  ,
       -1.6795636 , -0.37883943, -1.0129856 ,  1.9894767 , -0.46857628,
       -0.13727938,  2.0174541 , -0.24744098, -1.5085264 ,  0.7892023 ,
        0.33436   , -1.1795474 , -3.1735787 ,  0.94334424, -1.4502727 ,
       -1.512804  ,  0.22046994,  0.97340196, -1.2038714 , -1.7167488 ,
        1.3966297 , -1.74253   , -0.271398  ,  0.38193786, -0.6153631 ,
       -0.81710833, -0.6028143 , -2.7886477 , -0.56181955, -0.26096618,
        1.0491836 , -0.36269432, -1.2233461 ,  1.2633723 ,  0.33339623,
        0.6681777 ,  2.6611874 ,  0.6974885 , -0.07083179,  1.5320121 ,
       -0.14957856, -0.23364297,  1.2927059 , -0.740794  , -1.03

### Pytorch中对变成序列的处理

In [163]:
docs = [["历史","的","潮流","滚滚","向前"],["科技","工作者"]]

In [164]:
indexes = [[word2vec_model.wv.vocab[word].index for word in doc] for doc in docs]
indexes

[[440, 1, 5925, 18646, 4955], [341, 4238]]

#### look up for embedding

In [168]:
word_embedding = [[list(embed(torch.LongTensor([word2vec_model.wv.vocab[word].index])).squeeze(0).numpy()) for word in doc] for doc in docs]
#对于第一个句子，使用0进行填充
word_embedding[1].extend([list(np.zeros(200)) for i in range(3)])

In [169]:
batch_size = 2
max_length =5
embed_size = 200

#### 构建“参差不齐”的批次数据

In [170]:
batch_data = torch.Tensor(batch_size,max_length,embed_size)
batch_data.data.copy_(torch.from_numpy(np.array(word_embedding)))

tensor([[[ 1.9508, -1.4661,  2.0933,  ..., -1.7270, -0.1494, -0.0703],
         [-0.2667,  0.4490,  0.3256,  ..., -0.1530,  1.4263,  0.3550],
         [ 0.3754,  1.8401,  0.2334,  ..., -1.0909, -0.7132,  1.6995],
         [ 0.0494,  0.6918, -0.2538,  ...,  0.4563, -0.4146,  0.1772],
         [-0.7569,  0.7580, -0.5532,  ..., -0.2523, -0.4494, -1.6142]],

        [[ 0.4797,  1.4363,  1.2964,  ..., -0.5412, -0.1600,  0.3205],
         [ 0.3191, -0.8103, -0.5561,  ..., -0.3135, -1.0554,  0.3390],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])

#### 使用pack_padded_sequence对填充后的矩阵进行压缩

In [171]:
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence

In [176]:
packedSequence = pack_padded_sequence(batch_data,torch.Tensor([5,2]),batch_first=True)

In [178]:
packedSequence

PackedSequence(data=tensor([[ 1.9508, -1.4661,  2.0933,  ..., -1.7270, -0.1494, -0.0703],
        [ 0.4797,  1.4363,  1.2964,  ..., -0.5412, -0.1600,  0.3205],
        [-0.2667,  0.4490,  0.3256,  ..., -0.1530,  1.4263,  0.3550],
        ...,
        [ 0.3754,  1.8401,  0.2334,  ..., -1.0909, -0.7132,  1.6995],
        [ 0.0494,  0.6918, -0.2538,  ...,  0.4563, -0.4146,  0.1772],
        [-0.7569,  0.7580, -0.5532,  ..., -0.2523, -0.4494, -1.6142]]), batch_sizes=tensor([2, 2, 1, 1, 1]))

In [179]:
packedSequence.data.shape

torch.Size([7, 200])

In [186]:
tensor,sizes = pad_packed_sequence(packedSequence,batch_first=True,padding_value=0.0)
print("pad packed sequence shape:{}".format(tensor.shape))
print("pad packed sequences length:{}".format(sizes))

pad packed sequence shape:torch.Size([2, 5, 200])
pad packed sequences length:tensor([5, 2])


In [187]:
tensor

tensor([[[ 1.9508, -1.4661,  2.0933,  ..., -1.7270, -0.1494, -0.0703],
         [-0.2667,  0.4490,  0.3256,  ..., -0.1530,  1.4263,  0.3550],
         [ 0.3754,  1.8401,  0.2334,  ..., -1.0909, -0.7132,  1.6995],
         [ 0.0494,  0.6918, -0.2538,  ...,  0.4563, -0.4146,  0.1772],
         [-0.7569,  0.7580, -0.5532,  ..., -0.2523, -0.4494, -1.6142]],

        [[ 0.4797,  1.4363,  1.2964,  ..., -0.5412, -0.1600,  0.3205],
         [ 0.3191, -0.8103, -0.5561,  ..., -0.3135, -1.0554,  0.3390],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])

In [192]:
#pack
pack = pack_padded_sequence(batch_data, torch.Tensor([5,2]), batch_first=True)
rnn = nn.RNN(200,10, 2, batch_first=True)
h0 = torch.randn(2, 2, 10)
#forward
out, _ = rnn(pack, h0)
# unpack
unpacked =pad_packed_sequence(out)
print(unpacked)

(tensor([[[ 0.0737, -0.2264,  0.3659, -0.7315,  0.4314, -0.5522,  0.4508,
           0.4300,  0.6089,  0.4955],
         [-0.6927, -0.9158,  0.1014, -0.3152,  0.7671, -0.0811, -0.3895,
           0.5228,  0.0641,  0.1213]],

        [[-0.4387, -0.3381,  0.0609, -0.3746,  0.4941, -0.3338,  0.8053,
           0.2477, -0.2927,  0.6532],
         [ 0.0118, -0.4609,  0.1862, -0.5827,  0.5011, -0.1901,  0.2654,
          -0.3303,  0.5781,  0.5101]],

        [[-0.4990,  0.1583, -0.7873, -0.8562,  0.3908, -0.1197,  0.3253,
           0.5072, -0.7456,  0.1218],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000]],

        [[-0.2892,  0.5787, -0.4259, -0.2719, -0.5481, -0.8696,  0.1278,
          -0.3356, -0.6982, -0.3731],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000]],

        [[-0.8881, -0.3430, -0.1504,  0.0181,  0.2018, -0.7650,  0.1453,
           0.3667, -0.8491, -0

In [195]:
unpacked[0].shape

torch.Size([5, 2, 10])