In [2]:
import torch
import pandas as pd
import numpy as np
import torchtext
from torchtext import data,datasets
from torchtext.vocab import GloVe
from torchtext import data
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import spacy
import re
import codecs
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

PREMISE = data.Field(sequential=True, tokenize=tokenizer, lower=True,batch_first=True)
PREMISE_LENGTH = data.Field(sequential=False, batch_first=True,use_vocab=False)
HYPOTHESIS = data.Field(sequential=True, tokenize=tokenizer, lower=True,batch_first=True)
HYPOTHESIS_LENGTH = data.Field(sequential=False, batch_first=True,use_vocab=False)
LABEL = data.Field(sequential=False,batch_first=True, use_vocab=False)

In [5]:
parentheses_table = str.maketrans({"(": "", ")": ""})
fw = codecs.open("/home/xiyu/data/trainee/ZhuYanru/snli/snli_test_pre.tsv",'w',encoding='utf8')
fw.write("idx\tpremise\tpremise_length\thypothesis\thypothesis_length\tlable\r\n")
with open("/home/xiyu/data/trainee/ZhuYanru/snli/snli_test.txt", "r", encoding="utf8") as input_data:
    rowid=1
    for line in input_data:
        line = line.strip().split("\t")
        idx=str(rowid)
        #idx=line[7].translate(parentheses_table)
        lable=line[0]
        premise = re.sub(r'[^\w\s]','',line[5])
        premise_length=str(len([p for p in premise.split(' ')]))
        hypothesis = re.sub(r'[^\w\s]','',line[6])
        hypothesis_length=str(len([h for h in hypothesis.split(' ')]))
        #print([idx,lable,premise,hypothesis])
        fw.write(idx+'\t')
        fw.write(premise+'\t')
        fw.write(premise_length+'\t')
        fw.write(hypothesis+'\t')
        fw.write(hypothesis_length+'\t')
        if lable=='neutral':
            fw.write('0\n')
        elif lable=='entailment':
            fw.write('1\n')
        elif lable=='contradiction':
            fw.write('2\n')
        rowid+=1
        '''
        fw.write('{"idx":"'+idx+'",')
        fw.write('"premise":"'+premise+'",')
        fw.write('"hypothesis":"'+hypothesis+'",')
        fw.write('"lable":"'+lable+'"}')
        '''


In [6]:
with open("/home/xiyu/data/trainee/ZhuYanru/snli/snli_test_pre.tsv", "r", encoding="utf8") as test_data:
    for line in test_data:
        line = line.strip().split("\t")
        print(line)

['idx', 'premise', 'premise_length', 'hypothesis', 'hypothesis_length', 'lable']
['1', 'Two women are embracing while holding to go packages', '9', 'The sisters are hugging goodbye while holding to go packages after just eating lunch', '14', '0']
['2', 'Two women are embracing while holding to go packages', '9', 'Two woman are holding packages', '5', '1']
['3', 'Two women are embracing while holding to go packages', '9', 'The men are fighting outside a deli', '7', '2']


In [7]:
train_on_gpu=torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = data.TabularDataset(
        path='./snli_test_pre.tsv',format='tsv',
        skip_header=True,
        #fields = {"idx":("idx",None),"premise":("premise", PREMISE),"hypothesis":("hypothesis", HYPOTHESIS),"lable":("lable",LABEL)})
        fields =[('idx', None),('premise', PREMISE),('premise_length',PREMISE_LENGTH),('hypothesis', HYPOTHESIS),('hypothesis_length',HYPOTHESIS_LENGTH),('lable',LABEL)])


PREMISE.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
HYPOTHESIS.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
PREMISE_LENGTH.build_vocab()
HYPOTHESIS_LENGTH.build_vocab()
LABEL.build_vocab()
train_data, valid_data = train_data.split(random_state = random.seed(10))


Training on GPU.


In [8]:
BATCH_SIZE=16
train_iterator,valid_iterator = data.BucketIterator.splits(
    (train_data,valid_data), 
    sort_key=lambda x: len(x.premise),
    batch_size = BATCH_SIZE,
    device = device)
print(len(train_iterator))
print(len(valid_iterator))

1
1


In [11]:

for id, batch in enumerate(train_iterator):
    premise,premise_length,hypothesis,hypothesis_length,lable =batch.premise,batch.premise_length,batch.hypothesis,batch.hypothesis_length, batch.lable
    #print(batch.premise.shape,batch.premise_length.shape,batch.hypothesis.shape,batch.hypothesis_length.shape,batch.lable.shape)
    print(hypothesis)
    print(hypothesis_length)
for id, batch in enumerate(valid_iterator):
    premise,premise_length,hypothesis,hypothesis_length,lable =batch.premise,batch.premise_length,batch.hypothesis,batch.hypothesis_length, batch.lable
    #print(batch.premise.shape,batch.premise_length.shape,batch.hypothesis.shape,batch.hypothesis_length.shape,batch.lable.shape)
    print(hypothesis)
    print(hypothesis_length)

tensor([[ 5, 18,  2, 13, 12, 21,  3, 19, 11,  4,  7, 14,  9, 15],
        [ 5, 16,  2, 10, 17,  6,  8,  1,  1,  1,  1,  1,  1,  1]],
       device='cuda:0')
tensor([14,  7], device='cuda:0')
tensor([[20, 22,  2,  3,  4]], device='cuda:0')
tensor([5], device='cuda:0')


In [36]:
class EncodeLayerRNN(nn.Module):

    def __init__(self,  embedding_dim, hidden_dim , n_layers=1,bias=True,drop_prob=0.2,bidirectional=False):
        super(EncodeLayerRNN, self).__init__()
                 
        #self.vocab_size=vocab_size
        #self.input_size=input_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.bias = bias
        self.drop_prob = drop_prob
        self.bidirectional = bidirectional
        #随机生成Embedding
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) 
        # 若使用预训练的词向量，需在此处指定预训练的权重
        #使用预训练的文件Embedding 
        #self.word_embeddings = nn.Embedding.from_pretrained(PREMISE.vocab.vectors)
        self.word_embeddings = nn.Embedding.from_pretrained(HYPOTHESIS.vocab.vectors)
        #self.word_embeddings.weight.data.copy_(TEXT.vocab.vectors)
        self.word_embeddings.weight.data.requires_grad=False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,dropout=drop_prob,bidirectional=False)  
        
        #self.dropout = nn.Dropout(drop_prob)
        #self.decoder = nn.Linear(hidden_dim, output_size)

    def forward(self, sequences_batch,sequences_lengths):
        #print(sequences_batch.size())
        #print(sequences_lengths)
        embeds = self.word_embeddings(sequences_batch)
        #对batch中的句子长度从大到小排序sorted_batch，并记录原来的句子顺序restoration_index
        #print(embeds)
        #print(sequences_lengths)
        sorted_seq_lens, sorting_index =sequences_lengths.sort(0, descending=True)
        #print(sorted_seq_lens)
        #print(1/0)
        #print(sorting_index)
        sorted_batch = embeds.index_select(0, sorting_index)
        #print(sorted_batch.size())
        idx_range =sequences_lengths.new_tensor(torch.arange(0, len(sequences_lengths)))
        #idx_range =torch.arange(0, len(sequences_lengths)).cuda()
        #print(idx_range)
        _, reverse_mapping = sorting_index.sort(0, descending=False)
        #print(reverse_mapping)
        restoration_index = idx_range.index_select(0, reverse_mapping)
        #对句子进行压缩操作，避免最后输出为许多padding结果
        packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch,sorted_seq_lens,batch_first=True)
        #输入lstm
        outputs, _ = self.lstm(packed_batch, None)
        #print(outputs.size())
        #print(outputs)
        #恢复到padding状态
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs,batch_first=True)
        #print(outputs.size())
        #print(outputs)
        #恢复到原batch次序
        reordered_outputs = outputs.index_select(0, restoration_index)
  
        return reordered_outputs

In [68]:

def get_mask(sequences_batch, sequences_lengths):

    batch_size = sequences_batch.size()[0]
    max_length = torch.max(sequences_lengths)
    mask = torch.ones(batch_size, max_length, dtype=torch.float)
    mask[sequences_batch[:, :max_length] == 0] = 0.0
    return mask

def weighted_sum(tensor, weights, mask):

    weighted_sum = weights.bmm(tensor)

    while mask.dim() < weighted_sum.dim():
        mask = mask.unsqueeze(1)
    mask = mask.transpose(-1, -2)
    mask = mask.expand_as(weighted_sum).contiguous().float()

    return weighted_sum * mask

def masked_softmax(tensor, mask):
    tensor_shape = tensor.size()
    reshaped_tensor = tensor.view(-1, tensor_shape[-1])
    #print(mask.size())
    #print(mask)
    #print(tensor.size())
    # Reshape the mask so it matches the size of the input tensor.
    while mask.dim() < tensor.dim():
        mask = mask.unsqueeze(1)
    print(mask.size())
    mask = mask.expand_as(tensor).contiguous().float()
    reshaped_mask = mask.view(-1, mask.size()[-1])
    print(reshaped_mask.size())
    print(reshaped_tensor.size())
    #对每一行进行softmax
    #print(reshaped_tensor * reshaped_mask)
    result = nn.functional.softmax(reshaped_tensor * reshaped_mask.cuda(), dim=-1)
    #print('A:',result.size())
    #疑问：为何需要在经过softmax层之后再乘mask
    result = result * reshaped_mask.cuda()
    #print('B:',result.size())
    # 1e-13 is added to avoid divisions by zero.
    result = result / (result.sum(dim=-1, keepdim=True) + 1e-13)

    return result.view(*tensor_shape)

class SoftmaxAttention(nn.Module):

    def forward(self,
                premise_batch,
                premise_mask,
                hypothesis_batch,
                hypothesis_mask):

        # Dot product between premises and hypotheses in each sequence of
        # the batch.
        similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1)
                                                              .contiguous())
        #print(similarity_matrix.size())
        #print(similarity_matrix)
        # Softmax attention weights.
        prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask)
        hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2)
                                                        .contiguous(),
                                       premise_mask)

        # Weighted sums of the hypotheses for the the premises attention,
        # and vice-versa for the attention of the hypotheses.
        attended_premises = weighted_sum(hypothesis_batch,
                                         prem_hyp_attn,
                                         premise_mask.cuda())
        attended_hypotheses = weighted_sum(premise_batch,
                                           hyp_prem_attn,
                                           hypothesis_mask.cuda())

        return attended_premises, attended_hypotheses

In [72]:

#编码层
Encoded = EncodeLayerRNN(embedding_dim=300, hidden_dim=300, n_layers=1)
print(Encoded)

#Attention层
Attention = SoftmaxAttention()
train_on_gpu=torch.cuda.is_available()
if(train_on_gpu):
    Encoded.cuda()
    
Encoded.train()
losses=[]
# train for some number of epochs
for epoch, batch in enumerate(train_iterator):
    #optimizer.zero_grad()
    #start = time.time()
    #print('premise_length')
    #print(batch.premise_length)
    mask_premise=get_mask(batch.premise,batch.premise_length)
    mask_hypothesis=get_mask(batch.hypothesis,batch.hypothesis_length)
    print(mask_premise)
    print(mask_hypothesis)
    premise_encoded = Encoded(batch.premise,batch.premise_length)
    #print('hypothesis_length')
    #print(batch.hypothesis_length)
    hypothesis_encoded = Encoded(batch.hypothesis,batch.hypothesis_length)
    #print(premise_encoded.size())
    #print(hypothesis_encoded.size())
    attended_premises,attended_hypotheses=Attention(premise_encoded,mask_premise,hypothesis_encoded,mask_hypothesis)
    #print(attended_premises)
    #print(attended_hypotheses)
    enhanced_premises = torch.cat([premise_encoded,
                               attended_premises,
                               premise_encoded - attended_premises,
                               premise_encoded * attended_premises],
                              dim=-1)
    enhanced_hypotheses = torch.cat([hypothesis_encoded,
                                 attended_hypotheses,
                                 hypothesis_encoded - attended_hypotheses,
                                 hypothesis_encoded * attended_hypotheses],
                                dim=-1)
    print(enhanced_premises.size())
    print(enhanced_hypotheses.size())
    #loss = loss_funtion(predicted, batch.Sentiment)
    #loss.backward()
    #optimizer.step()
    #losses.append(loss.item())

EncodeLayerRNN(
  (word_embeddings): Embedding(23, 300)
  (lstm): LSTM(300, 300, dropout=0.2)
)
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
torch.Size([2, 9, 14])
tensor([[[ 2.4960e-02, -6.0764e-02, -1.5055e-01, -1.0399e-01, -3.0015e-02,
          -7.1610e-02, -1.2968e-01, -7.4623e-02, -2.0779e-02,  4.8772e-02,
           1.7189e-02,  1.0484e-03,  1.7993e-01,  4.1333e-01],
         [ 2.9397e-01,  2.7375e-01,  3.5641e-01,  2.2192e-01,  1.5435e-01,
           3.3914e-01,  3.6200e-01,  4.1390e-01,  4.6000e-01,  3.2771e-01,
           4.6404e-01,  4.8587e-01,  5.6592e-01,  4.8381e-01],
         [ 4.4390e-01,  5.5792e-01,  1.2346e+00,  7.2402e-01,  4.8772e-01,
           7.3505e-01,  7.3953e-01,  8.6767e-01,  9.1199e-01,  7.4952e-01,
           7.3738e-01,  8.5360e-01,  8.1665e-01,  5.4702e-01],
         [ 3.968

