# LSTM ensember beam search

In [1]:

import numpy as np
import torch, torchtext
import torch.nn as nn
import torch.nn.functional as F
import math, copy, sys, os
from torch.autograd import Variable
import pickle
import csv
import re, random, string, subprocess, time


TEXT_vi = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = True, tokenize= lambda t:t.split(),
                                        include_lengths=True)
TEXT_en = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = False, tokenize= lambda t:t.split(),
                              lower=True, init_token='<sos>', eos_token='<eos>',include_lengths=True)
train_vi_en = torchtext.data.TabularDataset('/home/ql819/text_data/train_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
validation_vi_en = torchtext.data.TabularDataset('/home/ql819/text_data/dev_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])


TEXT_vi.build_vocab(train_vi_en, min_freq=3)
TEXT_en.build_vocab(train_vi_en, min_freq=3)

train_vi_en_iter = torchtext.data.BucketIterator(train_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))


class GRU_Decoder_With_Attention(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size, hidden_size, dropout=0.1):
        super().__init__()
        self.num_vocab = num_vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        
        self.embedding_layer = torch.nn.Embedding(self.num_vocab, self.input_size)
        self.gru = torch.nn.GRU(hidden_size= self.hidden_size, input_size= self.input_size + 1 * self.hidden_size, 
                                  num_layers= self.num_layers)
        
        self.calcu_weight_1  = torch.nn.Linear(2*self.hidden_size, hidden_size)
        self.calcu_weight_2  = torch.nn.Linear(self.hidden_size, 1)
        self.init_weight = torch.nn.Linear(self.hidden_size, self.hidden_size)
        
        self.linear_vob = torch.nn.Linear(self.hidden_size, self.num_vocab)
        
    def forward(self, input_word_index, hidden_vector, encoder_memory, is_init = False):
        #input_word_index: [num]
        #hidden_vector: 1, 1, hidden_size
        #encoder_memory: source_sen_len , 1 * hidden_size
        
        if hidden_vector.shape[0] != self.num_layers or hidden_vector.shape[2] != self.hidden_size:
            raise ValueError('The size of hidden_vector is not correct, expect '+str((self.num_layers, self.hidden_size))\
                            + ', actually get ' + str(hidden_vector.shape))
        
        if is_init:
            hidden_vector = torch.tanh(self.init_weight(hidden_vector))
        
        
        n_hidden_vector = torch.stack([hidden_vector.squeeze()]*encoder_memory.shape[0],dim=0)
        com_n_h_memory = torch.cat([n_hidden_vector, encoder_memory], dim =1)
        com_n_h_temp = torch.tanh(self.calcu_weight_1(com_n_h_memory))
        
        
        weight_vector = self.calcu_weight_2(com_n_h_temp)
        weight_vector =  torch.nn.functional.softmax(weight_vector, dim=0)
        #weight_vector: source_sen_len * 1
        
        
        convect_vector = torch.mm(weight_vector.transpose(1,0), encoder_memory)
        #convect_vector: 1 , 2 * hidden_size
        
        
        input_vector = self.embedding_layer(input_word_index).view(1,1,-1)
        
        
        input_vector = torch.cat([convect_vector.unsqueeze(0), input_vector], dim=2)
        
        
        output, h_t = self.gru(input_vector,hidden_vector)
        output = output.view(1, self.hidden_size)
        
        
        prob = self.linear_vob(output)
        #prob 1, vob_size
        
        prob = torch.nn.functional.log_softmax(prob, dim=1)
        
        
        return prob, h_t


def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, src_embed, N):
        super(Encoder, self).__init__()
        self.src_embed = src_embed
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x):
        "Pass the input (and mask) through each layer in turn."
        x = self.src_embed(x)
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)
    
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x))
        return self.sublayer[1](x, self.feed_forward)
    

    
def attention(query, key, value, dropout=None):
    '''
    query: batch, seq1, d_k
    key: batch, seq2, d_k
    value: batch, seq2, embedding_size
    mask: batch, 1, seq_2
    '''
    
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, d_k, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        
        # We assume d_v always equals d_k
        self.d_k = d_k
        self.linears = clones(nn.Linear(d_model, d_k), 2)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value):
        "Implements Figure 2"
        
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key = [l(x) for l, x in zip(self.linears, (query, key))]
        #query, key = batch, seq, d_k
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value,
                                 dropout=self.dropout)
        #x: batch, seq_query, embedding_size
        
        
        return x
    
    
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(F.relu(self.w_1(x)))
    
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
    
    
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)
    

    
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_k=64, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(d_k, d_model)
    ff = PositionwiseFeedForward(d_model, dropout)
    position = PositionalEncoding(d_model, dropout)
    encoder = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), nn.Sequential(Embeddings(d_model, src_vocab), c(position)), N)
    decoder = GRU_Decoder_With_Attention(num_vocab = tgt_vocab, input_size = d_model, hidden_size = d_model)
    for p in encoder.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return encoder, decoder



def train(encoder, decoder, optimizer, data_iter, teacher_forcing_ratio, batch_size = 64):

    encoder.train()
    decoder.train()
    
    count = 0
    loss = 0
    
    
    for batch in data_iter:
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source_data)
        #all_output: 1, source_len, embedding_size

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        

        output = all_output[0,:]
    
        target_word_list = target_data.squeeze()
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,:]
        h_t = h_t.view([1,1,-1])

        is_init = True

        for word_index in range(1, target_len[0].item()):
            prob, h_t = decoder(target_word, h_t, output, is_init)
            is_init = False
            if use_teacher_forcing:
                target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, target_word)
            else:
                right_target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, right_target_word)
                predict_target_word_index = prob.topk(1)[1].item()

                if TEXT_en.vocab.stoi['<eos>'] == predict_target_word_index:
                    break
                else:
                    target_word = torch.tensor([predict_target_word_index]).cuda(0)
                    
        count += 1
        if count % batch_size == 0:
            
            loss = loss/batch_size
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            count = 0
            loss = 0
        
        
    if count % batch_size != 0:
        loss = loss/count
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        
        
class Bean_Search_Status_Record:
    
    def __init__(self, h_t, predict_word_index_list, sum_log_prob):
        self.h_t = h_t
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0
        
    

def test(encoder, decoder, data_iter, k=10):
    encoder.eval()
    decoder.eval()

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source_data)
        output = all_output[0,:]
        
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,:]
        h_t = h_t.view([1,1,-1])

        is_init = True


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(h_t, predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 100):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                h_t = record.h_t
                predict_word_index_list = record.predict_word_index_list
                sum_log_prob = record.sum_log_prob
                target_word = predict_word_index_list[-1]
                
                if TEXT_en.vocab.stoi['<eos>'] != target_word:
                
                    prob, h_t = decoder(torch.tensor([target_word]).cuda(0), h_t, output, is_init)

                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()
                    
                    
                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record(h_t, predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/(len(new_record.predict_word_index_list) - 1)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
            is_init = False
                        
            ordered = sorted(all_candidates, key = lambda r: r.sum_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
        final_record = sequences[0]
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)


def get_blue_score(s):
    a = re.search(r'13a\+version\.1\.2\.12 = ([0-9.]+)',s)
    return float(a.group(1))



def parameters_list_change_grad(encoder, decoder):
    para_list = []
    for name, data in list(encoder.named_parameters()):
        if 'src_embed' in name:
            data.requires_grad = False
        else:
            para_list.append(data)
            
    for name, data in list(decoder.named_parameters()):
        if 'embedding' in name:
            data.requires_grad = False
        else:
            para_list.append(data)
    return para_list        




encoder,decoder = make_model(src_vocab=len(TEXT_vi.vocab.stoi), tgt_vocab=len(TEXT_en.vocab.stoi), N=2, 
               d_model=512, d_k=64, dropout=0.1)

encoder = encoder.cuda(0)
decoder = decoder.cuda(0)

early_stop = 3
best_blue_score = -1
best_index = -1

save_model_dir_name = '../save_model/vi_to_en_'
teacher_forcing_ratio = 0.9

optimizer = torch.optim.Adam([{'params': encoder.parameters(), 'lr': 0.001},
                              {'params': decoder.parameters(), 'lr': 0.001}])


for index_unique in range(100):
    train(encoder, decoder, optimizer, train_vi_en_iter, teacher_forcing_ratio)
    blue_score = test(encoder, decoder, validation_vi_en_iter)
    print('epoch: ',index_unique, ' the blue score on validation dataset is : ', blue_score)
    sys.stdout.flush()
    if best_blue_score < blue_score:
        
        best_index = index_unique
        best_blue_score = blue_score
        torch.save(encoder, save_model_dir_name+'cnn_encode')
        torch.save(decoder, save_model_dir_name+'rnn_decoder')
        
    if index_unique - best_index >= early_stop:
        break




print('--------------------------------------')
sys.stdout.flush()


encoder = torch.load(save_model_dir_name+'cnn_encode')
decoder = torch.load(save_model_dir_name+'rnn_decoder')
        
        

para_list = parameters_list_change_grad(encoder, decoder)     
optimizer = torch.optim.Adam(para_list, lr = 0.001)  
save_model_dir_name = '../save_model/refined_vi_to_en_'

early_stop = 3
best_blue_score = -1
best_index = -1

for index_unique in range(100):
    train(encoder, decoder, optimizer, train_vi_en_iter, teacher_forcing_ratio)
    blue_score = test(encoder, decoder, validation_vi_en_iter)
    print('epoch: ',index_unique, ' the blue score on validation dataset is : ', blue_score)
    sys.stdout.flush()
    
    if best_blue_score < blue_score:
        
        best_index = index_unique
        best_blue_score = blue_score
        torch.save(encoder, save_model_dir_name+'cnn_encode_'+str(index_unique))
        torch.save(decoder, save_model_dir_name+'rnn_decoder_'+str(index_unique))
    if index_unique - best_index >= early_stop:
        break

torch.Size([1, 15, 512])
torch.Size([1, 15, 512]) torch.Size([15, 512])
torch.Size([1, 28, 512])
torch.Size([1, 28, 512]) torch.Size([28, 512])
torch.Size([1, 11, 512])
torch.Size([1, 11, 512]) torch.Size([11, 512])
torch.Size([1, 8, 512])
torch.Size([1, 8, 512]) torch.Size([8, 512])
torch.Size([1, 5, 512])
torch.Size([1, 5, 512]) torch.Size([5, 512])
torch.Size([1, 5, 512])
torch.Size([1, 5, 512]) torch.Size([5, 512])
torch.Size([1, 12, 512])
torch.Size([1, 12, 512]) torch.Size([12, 512])
torch.Size([1, 7, 512])
torch.Size([1, 7, 512]) torch.Size([7, 512])
torch.Size([1, 39, 512])
torch.Size([1, 39, 512]) torch.Size([39, 512])
torch.Size([1, 32, 512])
torch.Size([1, 32, 512]) torch.Size([32, 512])
torch.Size([1, 8, 512])
torch.Size([1, 8, 512]) torch.Size([8, 512])
torch.Size([1, 23, 512])
torch.Size([1, 23, 512]) torch.Size([23, 512])
torch.Size([1, 9, 512])
torch.Size([1, 9, 512]) torch.Size([9, 512])
torch.Size([1, 22, 512])
torch.Size([1, 22, 512]) torch.Size([22, 512])
torch.Size

KeyboardInterrupt: 

In [2]:
class Bi_Multi_Layer_LSTM_Encoder(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size = 512, hidden_size = 512, dropout = 0.15):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        
        self.dropout_layer = torch.nn.Dropout(self.dropout)
        
        self.bidirectional = True
        self.embedding_layer = torch.nn.Embedding(num_vocab, self.input_size)
        self.lstm = torch.nn.LSTM(input_size= self.input_size, hidden_size = self.hidden_size, batch_first = False,
                                 bidirectional = self.bidirectional, num_layers = self.num_layers)
        
        h_0 = torch.zeros(1, self.hidden_size)
        torch.nn.init.normal_(h_0, mean=0, std=0.0001)
        self.h_0 = torch.nn.Parameter(h_0,requires_grad=True)
        
        
        c_0 = torch.zeros(1, self.hidden_size)
        torch.nn.init.normal_(c_0, mean=0, std=0.0001)
        self.c_0 = torch.nn.Parameter(c_0,requires_grad=True)
        
        
        
        if self.bidirectional:
            h_1 = torch.zeros(1, self.hidden_size)
            torch.nn.init.normal_(h_1, mean=0, std=0.0001)
            self.h_1 = torch.nn.Parameter(h_1,requires_grad=True)
            
            
            c_1 = torch.zeros(1, self.hidden_size)
            torch.nn.init.normal_(c_1, mean=0, std=0.0001)
            self.c_1 = torch.nn.Parameter(c_1,requires_grad=True)
            
        
    def forward(self, X):
        
        X_data,X_len = X
        #X_data: source_len, 1, input_size    X_len:1,1
        
        X_data = self.embedding_layer(X_data)
        
        h_0 = torch.cat([self.h_0]*len(X_len), dim=0).unsqueeze(1)
        c_0 = torch.cat([self.c_0]*len(X_len), dim=0).unsqueeze(1)
        
        
        if self.bidirectional:
            h_1 = torch.cat([self.h_1]*len(X_len), dim=0).unsqueeze(1)
            c_1 = torch.cat([self.c_1]*len(X_len), dim=0).unsqueeze(1)
            
            h = torch.cat([h_0,h_1], dim=0)
            c = torch.cat([c_0,c_1], dim=0)   

        output, (h_n, c_n) = self.lstm(X_data, (h, c))
        #output: source_len, 1, 2*hidden_size
        h_n = h_n.view(self.num_layers, 2, len(X_len), self.hidden_size)
        c_n = c_n.view(self.num_layers, 2, len(X_len), self.hidden_size)
        
        
        return output, h_n, c_n
    
    def init_parameters(self):
        
        for name, matrix in self.lstm.named_parameters():
            if 'weight_hh_' in name:
                for i in range(0, matrix.size(0), self.hidden_size):
                    torch.nn.init.orthogonal_(matrix[i:i+self.hidden_size], gain=0.01)
            elif 'bias_' in name:
                l = len(matrix)
                matrix[l // 4: l //2].data.fill_(1.0)
                
                
class LSTM_Decoder_With_Attention(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size = 512, hidden_size = 512, dropout=0.15):
        super().__init__()
        self.num_vocab = num_vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        self.dropout_layer = torch.nn.Dropout(self.dropout)
        
        self.embedding_layer = torch.nn.Embedding(self.num_vocab, self.input_size)
        self.lstm = torch.nn.LSTM(hidden_size= self.hidden_size, input_size= self.input_size + 2 * self.hidden_size, 
                                  num_layers= self.num_layers)
        
        self.calcu_weight_1  = torch.nn.Linear(3*self.hidden_size, hidden_size)
        self.calcu_weight_2  = torch.nn.Linear(self.hidden_size, 1)
        
        self.init_weight_1 = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.init_weight_2 = torch.nn.Linear(self.hidden_size, self.hidden_size)
        
        self.linear_vob = torch.nn.Linear(self.hidden_size, self.num_vocab)
        
        
    def forward(self, input_word_index, hidden_vector, cell_vector, encoder_memory, is_init = False):
        #input_word_index: [num]
        #hidden_vector: 1, 1, hidden_size
        #cell_vector: 1, 1, hidden_size
        #encoder_memory: source_sen_len , 2 * hidden_size
        
        if hidden_vector.shape[0] != self.num_layers or hidden_vector.shape[2] != self.hidden_size:
            raise ValueError('The size of hidden_vector is not correct, expect '+str((self.num_layers, self.hidden_size))\
                            + ', actually get ' + str(hidden_vector.shape))
        
        if is_init:
            hidden_vector = torch.tanh(self.init_weight_1(hidden_vector))
            cell_vector = torch.tanh(self.init_weight_2(cell_vector))
            
        
        
        n_hidden_vector = torch.stack([hidden_vector.squeeze()]*encoder_memory.shape[0],dim=0)
        com_n_h_memory = torch.cat([n_hidden_vector, encoder_memory], dim =1)
        com_n_h_temp = torch.tanh(self.calcu_weight_1(com_n_h_memory))
        
        
        weight_vector = self.calcu_weight_2(com_n_h_temp)
        weight_vector =  torch.nn.functional.softmax(weight_vector, dim=0)
        #weight_vector: source_sen_len * 1
        
        
        convect_vector = torch.mm(weight_vector.transpose(1,0), encoder_memory)
        #convect_vector: 1 , 2 * hidden_size
        
        
        input_vector = self.embedding_layer(input_word_index).view(1,1,-1)
        input_vector = self.dropout_layer(input_vector)
        
        
        input_vector = torch.cat([convect_vector.unsqueeze(0), input_vector], dim=2)
        
        
        output, (h_t, c_t) = self.lstm(input_vector,(hidden_vector, cell_vector))
        output = output.view(1, self.hidden_size)
        
        
        prob = self.linear_vob(output)
        #prob 1, vob_size
        
        prob = torch.nn.functional.log_softmax(prob, dim=1)
        
        
        return prob, h_t, c_t
    
    def init_parameters(self):
        
        for name, matrix in self.lstm.named_parameters():
            if 'weight_hh_' in name:
                for i in range(0, matrix.size(0), self.hidden_size):
                    torch.nn.init.orthogonal_(matrix[i:i+self.hidden_size], gain=0.01)
            elif 'bias_' in name:
                l = len(matrix)
                matrix[l // 4: l //2].data.fill_(1.0)
    
encoder = torch.load('../../save_model/vi_to_en_encode_9')
decoder = torch.load('../../save_model/vi_to_en_decoder_9')

  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "


In [3]:
encoder_1 = torch.load('../../save_model/vi_to_en_encode_5')
decoder_1 = torch.load('../../save_model/vi_to_en_decoder_5')

In [4]:
encoder_list = [encoder, encoder_1]
decoder_list = [decoder, decoder_1]

In [5]:
class Bean_Search_Status_Record:
    
    def __init__(self, h_t_list, c_t_list, predict_word_index_list, sum_log_prob):
        self.h_t_list = h_t_list
        self.c_t_list = c_t_list
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0

def beam_search_ensembel_test(encoder_list, decoder_list, data_iter, k=10):
    
    assert len(encoder_list) == len(decoder_list), 'the num of encoders should be equal to the num of decoders'
    _ = [model.eval() for model in encoder_list]
    _ = [model.eval() for model in decoder_list]

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        h_t_list = []
        c_t_list = []
        output_list = []
        
        for encoder in encoder_list:
            all_output, h_n, c_n = encoder(source)
            output = all_output[:,0]
            h_t = h_n[:,1,:]
            c_t = c_n[:,1,:]
            h_t_list.append(h_t)
            c_t_list.append(c_t)
            output_list.append(output)
            

        target_word = TEXT_en.vocab.stoi['<sos>']


        is_init = False


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(h_t_list, c_t_list, predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 60):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                
                h_t_list = record.h_t_list
                c_t_list = record.c_t_list
                predict_word_index_list = record.predict_word_index_list
                sum_log_prob = record.sum_log_prob
                target_word = predict_word_index_list[-1]
                
                temp_h_t_list = []
                temp_c_t_list = []
                temp_prob = None
                
                if TEXT_en.vocab.stoi['<eos>'] != target_word:
                    for num_model in range(len(encoder_list)):
                        
                        decoder = decoder_list[num_model]
                        h_t = h_t_list[num_model]
                        c_t = c_t_list[num_model]
                        output = output_list[num_model]
                
                        prob, h_t, c_t = decoder(torch.tensor([target_word]).cuda(0), h_t, c_t, output, is_init)
                    
                        temp_h_t_list.append(h_t)
                        temp_c_t_list.append(c_t)
                        
                        if temp_prob is None:
                            temp_prob = prob
                        else:
                            temp_prob = torch.cat([temp_prob, prob], dim=0)
                            
                    
                            
                            
                    prob = temp_prob.mean(dim=0, keepdim=True)
                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()


                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record(temp_h_t_list, temp_c_t_list, predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/((4+len(new_record.predict_word_index_list))**0.6/(6)**0.6)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
            is_init = False
                        
            ordered = sorted(all_candidates, key = lambda r: r.avg_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
        final_record = sequences[0]
        
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)


    
def get_blue_score(s):
    a = re.search(r'13a\+version\.1\.2\.12 = ([0-9.]+)',s)
    return float(a.group(1))

In [7]:
r = beam_search_ensembel_test([encoder, encoder_1], [decoder, decoder_1], validation_vi_en_iter)



RuntimeError: CUDA error: out of memory

In [31]:

        
r = test([encoder, encoder_1], [decoder, decoder_1], validation_vi_en_iter)



CompletedProcess(args='cat ../eval/1543272655_7849529/predict.txt | sacrebleu ../eval/1543272655_7849529/target_file_name.txt', returncode=0, stdout=b'BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.2.12 = 18.6 53.1/25.7/14.3/8.6 (BP = 0.918 ratio = 0.921 hyp_len = 26043 ref_len = 28283)\n')


In [24]:
r.predict_word_index_list

[2,
 7,
 801,
 4,
 10,
 116,
 6,
 9,
 9,
 9,
 9,
 11,
 25,
 4,
 4,
 4,
 15,
 46,
 10,
 10,
 9,
 5,
 3]

In [15]:
validation_vi_en = torchtext.data.TabularDataset('../data/processed_data/test_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])

validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))

  
test(encoder, decoder, validation_vi_en_iter)



CompletedProcess(args='cat ../eval/1543265601_7490401/predict.txt | sacrebleu ../eval/1543265601_7490401/target_file_name.txt', returncode=0, stdout=b'BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.2.12 = 18.3 51.6/24.8/14.0/8.4 (BP = 0.929 ratio = 0.931 hyp_len = 27954 ref_len = 30020)\n')


18.3

# end

In [10]:
class Bean_Search_Status_Record:
    
    def __init__(self, h_t, c_t, predict_word_index_list, sum_log_prob):
        self.h_t = h_t
        self.c_t = c_t
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0
        
    

def test(encoder, decoder, data_iter, k=5):
    encoder.eval()
    decoder.eval()

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output, h_n, c_n = encoder(source)
        output = all_output[:,0]

        target_word = TEXT_en.vocab.stoi['<sos>']

        h_t = h_n[:,1,:]
        c_t = c_n[:,1,:]

        is_init = True


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(h_t, c_t, predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 100):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                h_t = record.h_t
                c_t = record.c_t
                predict_word_index_list = record.predict_word_index_list
                sum_log_prob = record.sum_log_prob
                target_word = predict_word_index_list[-1]
                
                if TEXT_en.vocab.stoi['<eos>'] != target_word:
                
                    prob, h_t, c_t = decoder(torch.tensor([target_word]).cuda(0), h_t, c_t, output, is_init)

                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()
                    
                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record(h_t, c_t, predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/(len(new_record.predict_word_index_list) - 1)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
            is_init = False
                        
            ordered = sorted(all_candidates, key = lambda r: r.sum_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
        final_record = sequences[0]
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)
        
        
        
        
        
        
        

In [2]:
import torch, torchtext
import pickle
import csv
import unicodedata
import re, random, time, string, subprocess
import os, sys


TEXT_vi = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = False, tokenize= lambda t:t.split(),
                                        include_lengths=True)
TEXT_en = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = False, tokenize= lambda t:t.split(),
                              lower=True, init_token='<sos>', eos_token='<eos>',include_lengths=True)
train_vi_en = torchtext.data.TabularDataset('../data/processed_data/train_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
validation_vi_en = torchtext.data.TabularDataset('../data/processed_data/dev_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])


TEXT_vi.build_vocab(train_vi_en)
TEXT_en.build_vocab(train_vi_en)


train_vi_en_iter = torchtext.data.BucketIterator(train_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))



class Bi_Multi_Layer_LSTM_Encoder(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size = 512, hidden_size = 512, dropout = 0.15):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        
        self.dropout_layer = torch.nn.Dropout(self.dropout)
        
        self.bidirectional = True
        self.embedding_layer = torch.nn.Embedding(num_vocab, self.input_size)
        self.lstm = torch.nn.LSTM(input_size= self.input_size, hidden_size = self.hidden_size, batch_first = False,
                                 bidirectional = self.bidirectional, num_layers = self.num_layers)
        
        h_0 = torch.zeros(1, self.hidden_size)
        torch.nn.init.normal_(h_0, mean=0, std=0.0001)
        self.h_0 = torch.nn.Parameter(h_0,requires_grad=True)
        
        
        c_0 = torch.zeros(1, self.hidden_size)
        torch.nn.init.normal_(c_0, mean=0, std=0.0001)
        self.c_0 = torch.nn.Parameter(c_0,requires_grad=True)
        
        
        
        if self.bidirectional:
            h_1 = torch.zeros(1, self.hidden_size)
            torch.nn.init.normal_(h_1, mean=0, std=0.0001)
            self.h_1 = torch.nn.Parameter(h_1,requires_grad=True)
            
            
            c_1 = torch.zeros(1, self.hidden_size)
            torch.nn.init.normal_(c_1, mean=0, std=0.0001)
            self.c_1 = torch.nn.Parameter(c_1,requires_grad=True)
            
        
    def forward(self, X):
        
        X_data,X_len = X
        #X_data: source_len, 1, input_size    X_len:1,1
        
        X_data = self.embedding_layer(X_data)
        
        h_0 = torch.cat([self.h_0]*len(X_len), dim=0).unsqueeze(1)
        c_0 = torch.cat([self.c_0]*len(X_len), dim=0).unsqueeze(1)
        
        
        if self.bidirectional:
            h_1 = torch.cat([self.h_1]*len(X_len), dim=0).unsqueeze(1)
            c_1 = torch.cat([self.c_1]*len(X_len), dim=0).unsqueeze(1)
            
            h = torch.cat([h_0,h_1], dim=0)
            c = torch.cat([c_0,c_1], dim=0)   

        output, (h_n, c_n) = self.lstm(X_data, (h, c))
        #output: source_len, 1, 2*hidden_size
        h_n = h_n.view(self.num_layers, 2, len(X_len), self.hidden_size)
        c_n = c_n.view(self.num_layers, 2, len(X_len), self.hidden_size)
        
        
        return output, h_n, c_n
    
    def init_parameters(self):
        
        for name, matrix in self.lstm.named_parameters():
            if 'weight_hh_' in name:
                for i in range(0, matrix.size(0), self.hidden_size):
                    torch.nn.init.orthogonal_(matrix[i:i+self.hidden_size], gain=0.01)
            elif 'bias_' in name:
                l = len(matrix)
                matrix[l // 4: l //2].data.fill_(1.0)
                
                
class LSTM_Decoder_With_Attention(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size = 512, hidden_size = 512, dropout=0.15):
        super().__init__()
        self.num_vocab = num_vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        self.dropout_layer = torch.nn.Dropout(self.dropout)
        
        self.embedding_layer = torch.nn.Embedding(self.num_vocab, self.input_size)
        self.lstm = torch.nn.LSTM(hidden_size= self.hidden_size, input_size= self.input_size + 2 * self.hidden_size, 
                                  num_layers= self.num_layers)
        
        self.calcu_weight_1  = torch.nn.Linear(3*self.hidden_size, hidden_size)
        self.calcu_weight_2  = torch.nn.Linear(self.hidden_size, 1)
        
        self.init_weight_1 = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.init_weight_2 = torch.nn.Linear(self.hidden_size, self.hidden_size)
        
        self.linear_vob = torch.nn.Linear(self.hidden_size, self.num_vocab)
        
        
    def forward(self, input_word_index, hidden_vector, cell_vector, encoder_memory, is_init = False):
        #input_word_index: [num]
        #hidden_vector: 1, 1, hidden_size
        #cell_vector: 1, 1, hidden_size
        #encoder_memory: source_sen_len , 2 * hidden_size
        
        if hidden_vector.shape[0] != self.num_layers or hidden_vector.shape[2] != self.hidden_size:
            raise ValueError('The size of hidden_vector is not correct, expect '+str((self.num_layers, self.hidden_size))\
                            + ', actually get ' + str(hidden_vector.shape))
        
        if is_init:
            hidden_vector = torch.tanh(self.init_weight_1(hidden_vector))
            cell_vector = torch.tanh(self.init_weight_2(cell_vector))
            
        
        
        n_hidden_vector = torch.stack([hidden_vector.squeeze()]*encoder_memory.shape[0],dim=0)
        com_n_h_memory = torch.cat([n_hidden_vector, encoder_memory], dim =1)
        com_n_h_temp = torch.tanh(self.calcu_weight_1(com_n_h_memory))
        
        
        weight_vector = self.calcu_weight_2(com_n_h_temp)
        weight_vector =  torch.nn.functional.softmax(weight_vector, dim=0)
        #weight_vector: source_sen_len * 1
        
        
        convect_vector = torch.mm(weight_vector.transpose(1,0), encoder_memory)
        #convect_vector: 1 , 2 * hidden_size
        
        
        input_vector = self.embedding_layer(input_word_index).view(1,1,-1)
        input_vector = self.dropout_layer(input_vector)
        
        
        input_vector = torch.cat([convect_vector.unsqueeze(0), input_vector], dim=2)
        
        
        output, (h_t, c_t) = self.lstm(input_vector,(hidden_vector, cell_vector))
        output = output.view(1, self.hidden_size)
        
        
        prob = self.linear_vob(output)
        #prob 1, vob_size
        
        prob = torch.nn.functional.log_softmax(prob, dim=1)
        
        
        return prob, h_t, c_t
    
    def init_parameters(self):
        
        for name, matrix in self.lstm.named_parameters():
            if 'weight_hh_' in name:
                for i in range(0, matrix.size(0), self.hidden_size):
                    torch.nn.init.orthogonal_(matrix[i:i+self.hidden_size], gain=0.01)
            elif 'bias_' in name:
                l = len(matrix)
                matrix[l // 4: l //2].data.fill_(1.0)

In [5]:
def test(encoder, decoder, data_iter, k=10):
    encoder.eval()
    decoder.eval()

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output, h_n, c_n = encoder(source)
        output = all_output[:,0]

        target_word = TEXT_en.vocab.stoi['<sos>']

        h_t = h_n[:,1,:]
        c_t = c_n[:,1,:]

        is_init = False


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(h_t, c_t, predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 100):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                h_t = record.h_t
                c_t = record.c_t
                predict_word_index_list = record.predict_word_index_list
                sum_log_prob = record.sum_log_prob
                target_word = predict_word_index_list[-1]
                
                if TEXT_en.vocab.stoi['<eos>'] != target_word:
                
                    prob, h_t, c_t = decoder(torch.tensor([target_word]).cuda(0), h_t, c_t, output, is_init)

                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()
                    
                    
                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record(h_t, c_t, predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/(len(new_record.predict_word_index_list) - 1)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
            is_init = False
                        
            ordered = sorted(all_candidates, key = lambda r: r.sum_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
        final_record = sequences[0]
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)

In [4]:
encoder = torch.load('../save_model/vi_to_en_encode_6')
decoder = torch.load('../save_model/vi_to_en_decoder_6')

  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "


In [8]:
class Bean_Search_Status_Record:
    
    def __init__(self, h_t, c_t, predict_word_index_list, sum_log_prob):
        self.h_t = h_t
        self.c_t = c_t
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0
        
validation_vi_en = torchtext.data.TabularDataset('../data/processed_data/test_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
  
test(encoder, decoder, validation_vi_en_iter)



CompletedProcess(args='cat ../eval/1543033030_7136176/predict.txt | sacrebleu ../eval/1543033030_7136176/target_file_name.txt', returncode=0, stdout=b'BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.2.12 = 16.5 50.3/22.9/12.1/6.8 (BP = 0.940 ratio = 0.941 hyp_len = 26626 ref_len = 28283)\n')


NameError: name 'get_blue_score' is not defined

In [None]:
import pandas as pd
import numpy as np
import torch
import torchtext
import pickle
import csv
import unicodedata
import re, random, time, string, subprocess, sys
import os, re


TEXT_vi = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = False, tokenize= lambda t:t.split(),
                                        include_lengths=True)
TEXT_en = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = False, tokenize= lambda t:t.split(),
                              lower=True, init_token='<sos>', eos_token='<eos>',include_lengths=True)
train_vi_en = torchtext.data.TabularDataset('../data/processed_data/train_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
validation_vi_en = torchtext.data.TabularDataset('../data/processed_data/dev_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])


TEXT_vi.build_vocab(train_vi_en)
TEXT_en.build_vocab(train_vi_en)

train_vi_en_iter = torchtext.data.BucketIterator(train_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))





class Bi_Multi_Layer_GRU_Encoder(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size, hidden_size, num_layers = 2, dropout = 0.1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = True
        self.embedding_layer = torch.nn.Embedding(num_vocab, self.input_size)
        self.gru = torch.nn.GRU(input_size= self.input_size, hidden_size = self.hidden_size, batch_first = False,
                                 bidirectional = self.bidirectional, dropout = self.dropout, 
                                  num_layers = self.num_layers)
        
        h_0 = torch.zeros(1, self.hidden_size)
        torch.nn.init.normal_(h_0, mean=0, std=0.001)
        self.h_0 = torch.nn.Parameter(h_0,requires_grad=True)
        
        
        
        if self.bidirectional:
            h_1 = torch.zeros(1, self.hidden_size)
            torch.nn.init.normal_(h_1, mean=0, std=0.001)
            self.h_1 = torch.nn.Parameter(h_1,requires_grad=True)
            
        
    def forward(self, X):
        
        X_data,X_len = X
        
        X_data = self.embedding_layer(X_data)
        
        h_0 = torch.cat([self.h_0]*len(X_len), dim=0).unsqueeze(0)
        
        
        if self.bidirectional:
            h_1 = torch.cat([self.h_1]*len(X_len), dim=0).unsqueeze(0)
            
            h = torch.cat([h_0,h_1], dim=0)
            
            
            h = torch.cat([h]*self.num_layers, dim=0)
           
            

        output, h_n = self.gru(X_data, h)
        #seq_len, batch, num_directions * hidden_size
        
        
        return output
        
        
        
class GRU_Decoder_With_Attention(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size, hidden_size, dropout=0.1):
        super().__init__()
        self.num_vocab = num_vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        
        self.embedding_layer = torch.nn.Embedding(self.num_vocab, self.input_size)
        self.gru = torch.nn.GRU(hidden_size= self.hidden_size, input_size= self.input_size + 2 * self.hidden_size, 
                                  num_layers= self.num_layers)
        
        self.calcu_weight_1  = torch.nn.Linear(3*self.hidden_size, hidden_size)
        self.calcu_weight_2  = torch.nn.Linear(self.hidden_size, 1)
        self.init_weight = torch.nn.Linear(self.hidden_size, self.hidden_size)
        
        self.linear_vob = torch.nn.Linear(self.hidden_size, self.num_vocab)
        
    def forward(self, input_word_index, hidden_vector, encoder_memory, is_init = False):
        #input_word_index: [num]
        #hidden_vector: 1, 1, hidden_size
        #encoder_memory: source_sen_len , 2 * hidden_size
        
        if hidden_vector.shape[0] != self.num_layers or hidden_vector.shape[2] != self.hidden_size:
            raise ValueError('The size of hidden_vector is not correct, expect '+str((self.num_layers, self.hidden_size))\
                            + ', actually get ' + str(hidden_vector.shape))
        
        if is_init:
            hidden_vector = torch.tanh(self.init_weight(hidden_vector))
        
        
        n_hidden_vector = torch.stack([hidden_vector.squeeze()]*encoder_memory.shape[0],dim=0)
        com_n_h_memory = torch.cat([n_hidden_vector, encoder_memory], dim =1)
        com_n_h_temp = torch.tanh(self.calcu_weight_1(com_n_h_memory))
        
        
        weight_vector = self.calcu_weight_2(com_n_h_temp)
        weight_vector =  torch.nn.functional.softmax(weight_vector, dim=0)
        #weight_vector: source_sen_len * 1
        
        
        convect_vector = torch.mm(weight_vector.transpose(1,0), encoder_memory)
        #convect_vector: 1 , 2 * hidden_size
        
        
        input_vector = self.embedding_layer(input_word_index).view(1,1,-1)
        
        
        input_vector = torch.cat([convect_vector.unsqueeze(0), input_vector], dim=2)
        
        
        output, h_t = self.gru(input_vector,hidden_vector)
        output = output.view(1, self.hidden_size)
        
        
        prob = self.linear_vob(output)
        #prob 1, vob_size
        
        prob = torch.nn.functional.log_softmax(prob, dim=1)
        
        
        return prob, h_t



     
        
        
        
        
        
def train(encoder, decoder, optimizer, data_iter, teacher_forcing_ratio, batch_size = 32):

    encoder.train()
    decoder.train()
    
    count = 0
    loss = 0
    
    
    for batch in data_iter:
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source)

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        

        output = all_output[:,0]
        target_word_list = target_data.squeeze()
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,int(output.shape[1]/2):]
        h_t = h_t.view([1,1,-1])

        is_init = True

        for word_index in range(1, target_len[0].item()):
            prob, h_t = decoder(target_word, h_t, output, is_init)
            is_init = False
            if use_teacher_forcing:
                target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, target_word)
            else:
                right_target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, right_target_word)
                predict_target_word_index = prob.topk(1)[1].item()

                if TEXT_en.vocab.stoi['<eos>'] == predict_target_word_index:
                    break
                else:
                    target_word = torch.tensor([predict_target_word_index]).cuda(0)
                    
        count += 1
        if count % batch_size == 0:
            
            loss = loss/batch_size
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            count = 0
            loss = 0
        
        
    if count % batch_size != 0:
        loss = loss/count
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        
        
class Bean_Search_Status_Record:
    
    def __init__(self, h_t, predict_word_index_list, sum_log_prob):
        self.h_t = h_t
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0
        
    

def test(encoder, decoder, data_iter, k=10):
    encoder.eval()
    decoder.eval()

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source)
        output = all_output[:,0]
        
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,int(output.shape[1]/2):]
        h_t = h_t.view([1,1,-1])

        is_init = True


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(h_t, predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 100):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                h_t = record.h_t
                predict_word_index_list = record.predict_word_index_list
                sum_log_prob = record.sum_log_prob
                target_word = predict_word_index_list[-1]
                
                if TEXT_en.vocab.stoi['<eos>'] != target_word:
                
                    prob, h_t = decoder(torch.tensor([target_word]).cuda(0), h_t, output, is_init)

                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()
                    
                    
                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record(h_t, predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/(len(new_record.predict_word_index_list) - 1)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
            is_init = False
                        
            ordered = sorted(all_candidates, key = lambda r: r.sum_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
        final_record = sequences[0]
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)
        
        
        
        
        
        
        



def get_blue_score(s):
    a = re.search(r'13a\+version\.1\.2\.12 = ([0-9.]+)',s)
    return float(a.group(1))


def parameters_list(encoder, decoder):
    para_list_1 = []
    para_list_2 = []
    for name, data in list(encoder.named_parameters()):
        if 'embedding' in name:
            para_list_1.append(data)
        else:
            para_list_2.append(data)

    for name, data in list(decoder.named_parameters()):
        if 'embedding' in name:
            para_list_1.append(data)
        else:
            para_list_2.append(data)
    return para_list_1, para_list_2


def parameters_list_change_grad(encoder, decoder):
    para_list = []
    for name, data in list(encoder.named_parameters()):
        if 'embedding' in name:
            data.requires_grad = False
        else:
            para_list.append(data)

    for name, data in list(decoder.named_parameters()):
        if 'embedding' in name:
            data.requires_grad = False
        else:
            para_list.append(data)
    return para_list



encoder = Bi_Multi_Layer_GRU_Encoder(num_vocab=len(TEXT_vi.vocab.stoi), input_size=300, hidden_size=300, num_layers=2)
decoder = GRU_Decoder_With_Attention(num_vocab = len(TEXT_en.vocab.stoi), input_size = 300, hidden_size=300, dropout=0.1)

encoder = encoder.cuda(0)
decoder = decoder.cuda(0)

early_stop = 2
best_blue_score = -1
best_index = -1

save_model_dir_name = '../save_model/vi_to_en_'

para_list_1, para_list_2 = parameters_list(encoder, decoder)


optimizer = torch.optim.Adam([{'params': para_list_1, 'lr': 0.001},
                              {'params': para_list_2, 'lr': 0.001}])

teacher_forcing_ratio = 0.5

for index_unique in range(100):
    train(encoder, decoder, optimizer, train_vi_en_iter, teacher_forcing_ratio)
    blue_score = test(encoder, decoder, validation_vi_en_iter)
    print('epoch: ',index_unique, ' the blue score on validation dataset is : ', blue_score)
    sys.stdout.flush()
    if best_blue_score < blue_score:
        
        best_index = index_unique
        best_blue_score = blue_score
        best_encoder = encoder
        best_decoder = decoder
        torch.save(encoder, save_model_dir_name+'encode_'+str(index_unique))
        torch.save(decoder, save_model_dir_name+'decoder_'+str(index_unique))
        
    if index_unique - best_index >= early_stop:
        break
        
        
        
        
        
print('--------------------------------------')
sys.stdout.flush()


encoder = best_encoder
decoder = best_decoder
        
        

para_list = parameters_list_change_grad(encoder, decoder)     
optimizer = torch.optim.Adam(para_list, lr = 0.001)  
save_model_dir_name = '../save_model/refined_vi_to_en_'

early_stop = 2
best_blue_score = -1
best_index = -1

for index_unique in range(100):
    train(encoder, decoder, optimizer, train_vi_en_iter, teacher_forcing_ratio)
    blue_score = test(encoder, decoder, validation_vi_en_iter)
    print('epoch: ',index_unique, ' the blue score on validation dataset is : ', blue_score)
    sys.stdout.flush()
    
    if best_blue_score < blue_score:
        
        best_index = index_unique
        best_blue_score = blue_score
        torch.save(encoder, save_model_dir_name+'encode_'+str(index_unique))
        torch.save(decoder, save_model_dir_name+'decoder_'+str(index_unique))
    if index_unique - best_index >= early_stop:
        break

CompletedProcess(args='cat ../eval/1542906497_4235842/predict.txt | sacrebleu ../eval/1542906497_4235842/target_file_name.txt', returncode=0, stdout=b'BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.2.12 = 0.1 2.0/0.1/0.0/0.0 (BP = 1.000 ratio = 2.962 hyp_len = 83763 ref_len = 28283)\n')
epoch:  0  the blue score on validation dataset is :  0.1


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


CompletedProcess(args='cat ../eval/1542907219_6010282/predict.txt | sacrebleu ../eval/1542907219_6010282/target_file_name.txt', returncode=0, stdout=b'BLEU+case.mixed+numrefs.1+smooth.exp+tok.13a+version.1.2.12 = 0.0 23.6/0.0/0.0/0.0 (BP = 0.038 ratio = 0.235 hyp_len = 6635 ref_len = 28283)\n')
epoch:  1  the blue score on validation dataset is :  0.0


In [72]:
TEXT_en.vocab.itos

[' UNK ',
 '<pad>',
 '<sos>',
 '<eos>',
 ',',
 '.',
 'the',
 'and',
 'to',
 'of',
 'a',
 'that',
 'i',
 'in',
 'it',
 'you',
 'we',
 'is',
 '&apos;s',
 'this',
 'so',
 '&quot;',
 'they',
 'for',
 'was',
 'are',
 'have',
 'what',
 'but',
 'on',
 'with',
 'can',
 '--',
 '?',
 '&apos;t',
 'about',
 'there',
 'be',
 'as',
 'all',
 'at',
 'not',
 'my',
 'do',
 'one',
 'people',
 '&apos;re',
 'from',
 'like',
 'if',
 'now',
 'an',
 'he',
 'our',
 'these',
 'just',
 ':',
 'when',
 'or',
 'because',
 'how',
 'me',
 'very',
 'by',
 'out',
 'them',
 'more',
 'going',
 'up',
 'know',
 'your',
 'who',
 'had',
 'think',
 'their',
 'which',
 'see',
 'were',
 'would',
 'really',
 'here',
 'get',
 'then',
 'us',
 'world',
 '&apos;ve',
 '&apos;m',
 'some',
 'time',
 'actually',
 'don',
 'has',
 'way',
 'into',
 'years',
 'will',
 'things',
 'where',
 'other',
 'no',
 'could',
 'want',
 'go',
 'make',
 'she',
 'well',
 'been',
 'said',
 'first',
 'something',
 'right',
 'two',
 'than',
 'those',
 'much'

In [45]:
k_prob_value_list, k_word_index_list = p.topk(10,dim=1)

In [62]:
for a, b in zip(list(k_prob_value_list), list(k_word_index_list)):
    print(type(a), type(b))

<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>
<class 'numpy.float32'> <class 'numpy.int64'>


In [17]:
model = LSTM_Decoder_With_Attention(100,100,100)
model.init_parameters()

In [57]:
for i,j in model.named_parameters():
    print(i,j.shape)

h_0 torch.Size([1, 256])
c_0 torch.Size([1, 256])
h_1 torch.Size([1, 256])
c_1 torch.Size([1, 256])
embedding_layer.weight torch.Size([47818, 256])
lstm.weight_ih_l0 torch.Size([1024, 256])
lstm.weight_hh_l0 torch.Size([1024, 256])
lstm.bias_ih_l0 torch.Size([1024])
lstm.bias_hh_l0 torch.Size([1024])
lstm.weight_ih_l0_reverse torch.Size([1024, 256])
lstm.weight_hh_l0_reverse torch.Size([1024, 256])
lstm.bias_ih_l0_reverse torch.Size([1024])
lstm.bias_hh_l0_reverse torch.Size([1024])


In [4]:
model = Bi_Multi_Layer_LSTM_Encoder(40000)
model = model.cuda(0)

In [5]:
output, h_n, c_n = model(batch.source)

In [8]:
h_n[:,1,:]

tensor([[[ 0.0644, -0.2107,  0.0619, -0.1083,  0.1643, -0.1319, -0.1833,
          -0.0865, -0.1231,  0.0314,  0.1980,  0.0591,  0.0410, -0.0407,
           0.1086,  0.1122,  0.0773, -0.1008, -0.0328,  0.0581, -0.1446,
           0.1377, -0.2802, -0.1474, -0.1325, -0.1730, -0.0371,  0.0442,
           0.0064,  0.0894,  0.0054, -0.1113, -0.0061,  0.0448,  0.0152,
           0.0657, -0.0109, -0.0666,  0.1466, -0.0901,  0.2611, -0.3704,
          -0.1226,  0.0712,  0.1101,  0.1109, -0.2114,  0.1134, -0.0082,
          -0.0465,  0.2074,  0.0117, -0.1715,  0.3426, -0.2799, -0.0629,
           0.0599, -0.1153,  0.1230,  0.2035, -0.0557, -0.1402, -0.0621,
          -0.0943,  0.0436, -0.0852, -0.1129, -0.0263, -0.1597,  0.2120,
           0.0877,  0.1135, -0.0120,  0.2677, -0.0315,  0.3531,  0.0118,
           0.1171,  0.1543,  0.0550,  0.0083,  0.0208,  0.0730, -0.0727,
          -0.0326, -0.1360,  0.0260,  0.0334, -0.1456,  0.0079, -0.0447,
           0.1556,  0.2574, -0.0306, -0.1912, -0.30

In [11]:
output[0,0,256:]

tensor([ 0.0644, -0.2107,  0.0619, -0.1083,  0.1643, -0.1319, -0.1833, -0.0865,
        -0.1231,  0.0314,  0.1980,  0.0591,  0.0410, -0.0407,  0.1086,  0.1122,
         0.0773, -0.1008, -0.0328,  0.0581, -0.1446,  0.1377, -0.2802, -0.1474,
        -0.1325, -0.1730, -0.0371,  0.0442,  0.0064,  0.0894,  0.0054, -0.1113,
        -0.0061,  0.0448,  0.0152,  0.0657, -0.0109, -0.0666,  0.1466, -0.0901,
         0.2611, -0.3704, -0.1226,  0.0712,  0.1101,  0.1109, -0.2114,  0.1134,
        -0.0082, -0.0465,  0.2074,  0.0117, -0.1715,  0.3426, -0.2799, -0.0629,
         0.0599, -0.1153,  0.1230,  0.2035, -0.0557, -0.1402, -0.0621, -0.0943,
         0.0436, -0.0852, -0.1129, -0.0263, -0.1597,  0.2120,  0.0877,  0.1135,
        -0.0120,  0.2677, -0.0315,  0.3531,  0.0118,  0.1171,  0.1543,  0.0550,
         0.0083,  0.0208,  0.0730, -0.0727, -0.0326, -0.1360,  0.0260,  0.0334,
        -0.1456,  0.0079, -0.0447,  0.1556,  0.2574, -0.0306, -0.1912, -0.3019,
         0.0219, -0.0611, -0.0219,  0.25

In [26]:
p = test(encoder,decoder, train_vi_en_iter)



In [28]:
p.shape

torch.Size([1, 47818])

In [31]:
p.topk(10)[1].item()

ValueError: only one element tensors can be converted to Python scalars

In [30]:
p.topk(10)

(tensor([[-2.0558, -2.2132, -2.9957, -3.2576, -3.3534, -3.4411, -3.5530, -3.7668,
          -3.7925, -3.8231]], device='cuda:0', grad_fn=<TopkBackward>),
 tensor([[15,  6, 20, 14,  7, 42, 70, 13, 12, 50]], device='cuda:0'))

In [1]:
import numpy as np
import torch, torchtext
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [2]:

TEXT_vi = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = True, tokenize= lambda t:t.split(),
                                        include_lengths=True)
TEXT_en = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = True, tokenize= lambda t:t.split(),
                              lower=True, init_token='<sos>', eos_token='<eos>',include_lengths=True)


train_vi_en = torchtext.data.TabularDataset('../data/processed_data/train_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
validation_vi_en = torchtext.data.TabularDataset('../data/processed_data/dev_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])


TEXT_vi.build_vocab(train_vi_en)
TEXT_en.build_vocab(train_vi_en)


train_vi_en_iter = torchtext.data.BucketIterator(train_vi_en, batch_size=1, sort_key= lambda e: len(e.source) + len(e.target),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source) + len(e.target),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))


In [3]:
import numpy as np
import torch, torchtext
import torch.nn as nn
import torch.nn.functional as F
import math, copy, sys
from torch.autograd import Variable
import matplotlib.pyplot as plt


TEXT_vi = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = True, tokenize= lambda t:t.split(),
                                        include_lengths=True)
TEXT_en = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = True, tokenize= lambda t:t.split(),
                              lower=True, init_token='<sos>', eos_token='<eos>',include_lengths=True)


train_vi_en = torchtext.data.TabularDataset('../data/processed_data/train_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
validation_vi_en = torchtext.data.TabularDataset('../data/processed_data/dev_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])


TEXT_vi.build_vocab(train_vi_en)
TEXT_en.build_vocab(train_vi_en)


train_vi_en_iter = torchtext.data.BucketIterator(train_vi_en, batch_size=4, sort_key= lambda e: len(e.source) + len(e.target),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=4, sort_key= lambda e: len(e.source) + len(e.target),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))



class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
    
    
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)
    
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)
    
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)
    
def attention(query, key, value, mask=None, dropout=None):
    '''
    query: batch, seq1, d_k
    key: batch, seq2, d_k
    value: batch, seq2, embedding_size
    mask: batch, 1, seq_2
    '''
    
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, d_k, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        
        # We assume d_v always equals d_k
        self.d_k = d_k
        self.linears = clones(nn.Linear(d_model, d_k), 2)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key = [l(x) for l, x in zip(self.linears, (query, key))]
        #query, key = batch, seq, d_k
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        #x: batch, seq_query, embedding_size
        
        
        return x
    
    
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(F.relu(self.w_1(x)))
    
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
    
    
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)
    
    
    
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.sum() > 0 and len(mask) > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))
    
    
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion
        
    def __call__(self, x, y):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1))
        return loss
    
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [4]:
model = torch.load('../../machine_translation_attention/save_model/vi_to_en_model')

  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "


In [8]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, end_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len-1):
        out = model.decode(memory, src_mask, 
                           Variable(ys), 
                           Variable(subsequent_mask(ys.size(1))
                                    .type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, 
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        if next_word.item() == end_symbol:
            break
    return ys
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source) + len(e.target),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))

for batch in (validation_vi_en_iter):
    
    source, target = batch.source, batch.target
    source_data, source_len = source[0], source[1]
    target_data, target_len = target[0], target[1]

    source_mask = (source_data != TEXT_vi.vocab.stoi['<pad>']).unsqueeze(1)
    #source_mask: batch, 1, source_sen_len

    
    
    
    out = greedy_decode(model, source_data, source_mask, 
                        max_len=60, start_symbol=TEXT_en.vocab.stoi["<sos>"],
                       end_symbol=TEXT_en.vocab.stoi["<eos>"])
    print("Translation:", end="\t")
    for i in range(1, out.size(1)):
        sym = TGT.vocab.itos[out[0, i]]
        if sym == "</s>": break
        print(sym, end =" ")
    print()
    print("Target:", end="\t")
    for i in range(1, batch.trg.size(0)):
        sym = TGT.vocab.itos[batch.trg.data[i, 0]]
        if sym == "</s>": break
        print(sym, end =" ")
    print()
    break

tensor([[    2,  9794, 12230, 30385,  3967, 37037, 37221, 37221, 37221, 37221,
         18247, 18247, 32363, 24118, 37221,  2496, 12230, 18247, 22832, 35480,
         18247,  7971, 24118, 37221, 44315, 35785, 18247, 37221,  3963, 30835,
         45696, 37221, 11409, 35150, 18247, 24973, 30835, 34219, 39120, 44471,
         44266, 18247, 45696, 35150,  9794, 18194, 15957, 45696,  8545, 46928,
         21895,  3963,  1104,  3430, 29435,  9084, 11409, 39120,  4641, 45696]],
       device='cuda:0')


In [19]:
class Bean_Search_Status_Record:
    
    def __init__(self, predict_word_index_list, sum_log_prob):
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0
        
    

def test(model, data_iter, k=10):
    model.eval()

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        source_mask = (source_data != TEXT_vi.vocab.stoi['<pad>']).unsqueeze(1)
        
        memory = model.encode(source_data, source_mask)
        
        target_word = TEXT_en.vocab.stoi['<sos>']


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 100):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                predict_word_index_list = record.predict_word_index_list
                predict_word_index_list_tensor = torch.tensor(predict_word_index_list).view(1,-1).type_as(source_data)
                sum_log_prob = record.sum_log_prob
                last_word_index = predict_word_index_list[-1]
                
                if TEXT_en.vocab.stoi['<eos>'] != last_word_index:
                
                    out = model.decode(memory, source_mask, 
                                       Variable(predict_word_index_list_tensor), 
                                       Variable(subsequent_mask(predict_word_index_list_tensor.size(1))
                                                .type_as(source_data)))
                    prob = model.generator(out[:, -1])
        
                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()
                    
                    
                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record( predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/(len(new_record.predict_word_index_list) - 1)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
                        
            ordered = sorted(all_candidates, key = lambda r: r.sum_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
            
        final_record = sequences[0]
        
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)

In [22]:
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source) + len(e.target),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))

s = test(model, validation_vi_en_iter, k=2)

KeyboardInterrupt: 

In [13]:
s.predict_word_index_list

[2, 2711, 5, 19, 17, 42, 1193, 770, 5, 3]

In [18]:
import os


In [220]:
out.shape

torch.Size([32, 35, 512])

In [256]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.sum() > 0 and len(mask) > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))
    
    
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1))
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return loss.cpu().detach().item()

In [254]:
criterion = LabelSmoothing(size=len(TEXT_en.vocab.stoi), padding_idx=TEXT_en.vocab.stoi['<pad>'], smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
compute_loss = SimpleLossCompute(model.generator, criterion, optimizer)

In [255]:
compute_loss(out, target_true_word_index)

torch.Size([32, 1, 21])

In [107]:
mask = (t!=1).unsqueeze(-2)

In [108]:
mask.shape

torch.Size([32, 1, 21])

In [109]:
mask = mask.unsqueeze(1)

In [110]:
mask.shape

torch.Size([32, 1, 1, 21])

In [113]:
scores = torch.rand(32, 8, 21, 21).cuda(0)

In [115]:
scores = scores.masked_fill(mask == 0, -1e9)

In [119]:
scores = torch.nn.functional.softmax(scores, dim = -1)

In [120]:
mask

tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [121]:
scores[4,1]

tensor([[0.0685, 0.0620, 0.0439, 0.0575, 0.0468, 0.0452, 0.0655, 0.0710, 0.0907,
         0.0386, 0.0478, 0.0623, 0.0604, 0.0673, 0.0451, 0.0695, 0.0579, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0628, 0.0387, 0.0506, 0.0471, 0.0391, 0.0772, 0.0495, 0.0927, 0.0786,
         0.0512, 0.0401, 0.0535, 0.0969, 0.0381, 0.0822, 0.0467, 0.0550, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0626, 0.0405, 0.0416, 0.0741, 0.0831, 0.0747, 0.0667, 0.0467, 0.0736,
         0.0425, 0.0484, 0.0594, 0.0748, 0.0670, 0.0400, 0.0333, 0.0707, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0490, 0.0718, 0.0441, 0.0702, 0.0914, 0.0573, 0.0439, 0.0777, 0.0786,
         0.0358, 0.0481, 0.0428, 0.0567, 0.0430, 0.0818, 0.0693, 0.0387, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0888, 0.0424, 0.0561, 0.0669, 0.0836, 0.0452, 0.0630, 0.0384, 0.0355,
         0.0545, 0.0623, 0.0389, 0.0348, 0.0746, 0.0701, 0.0589, 0.0859, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.0838, 0.0

In [123]:
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        #x: batch, target_sentence_len, target_vocab_size
        #y: batch, target_sentence_len
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.data[0] * norm

class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        '''
        x: batch * target_sentence_len, target_vob_size
        target: batch * target_sentence_len
        '''
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))


In [164]:
x = torch.rand(2*3,10, requires_grad=True)
y = torch.tensor([4,3,1,7,1,1])

In [165]:
true_dist = x.data.clone()
true_dist.fill_(0.1 / (10 - 2)).int()
true_dist.scatter_(1, y.data.unsqueeze(1), 0.9)
true_dist[:, 1] = 0
mask = torch.nonzero(y.data == 1)
true_dist.index_fill_(0, mask.squeeze(), 0.0)

tensor([[0.0125, 0.0000, 0.0125, 0.0125, 0.9000, 0.0125, 0.0125, 0.0125, 0.0125,
         0.0125],
        [0.0125, 0.0000, 0.0125, 0.9000, 0.0125, 0.0125, 0.0125, 0.0125, 0.0125,
         0.0125],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0125, 0.0000, 0.0125, 0.0125, 0.0125, 0.0125, 0.0125, 0.9000, 0.0125,
         0.0125],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000]])

In [173]:
a = torch.nn.KLDivLoss(reduction='sum')
loss = a(x[:],true_dist[:])

In [174]:
loss.backward()

In [175]:
x.grad

tensor([[-0.0125,  0.0000, -0.0125, -0.0125, -0.9000, -0.0125, -0.0125, -0.0125,
         -0.0125, -0.0125],
        [-0.0125,  0.0000, -0.0125, -0.9000, -0.0125, -0.0125, -0.0125, -0.0125,
         -0.0125, -0.0125],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [-0.0250,  0.0000, -0.0250, -0.0250, -0.0250, -0.0250, -0.0250, -1.8000,
         -0.0250, -0.0250],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000]])

In [177]:
batch.source

(tensor([[[  149,   668,   567,  ...,  1590,     3,     1],
          [ 1804, 14556,    67,  ...,    80,  2659,     1],
          [ 1804,  6614,   487,  ...,     1,     1,     1],
          ...,
          [   35,     7,     6,  ..., 20397,     3,     1],
          [    4,   193,    12,  ...,     1,     1,     1],
          [    4,   181,     5,  ...,     3,     1,     1]],
 
         [[  149,   668,   567,  ...,  1590,     3,     1],
          [ 1804, 14556,    67,  ...,    80,  2659,     1],
          [ 1804,  6614,   487,  ...,     1,     1,     1],
          ...,
          [   35,     7,     6,  ..., 20397,     3,     1],
          [    4,   193,    12,  ...,     1,     1,     1],
          [    4,   181,     5,  ...,     3,     1,     1]]], device='cuda:0'),
 tensor([20, 20, 18, 20, 17, 14, 19, 17, 17, 21, 18, 17, 21, 19, 18, 19, 21, 19,
         18, 15, 19, 19, 16, 17, 16, 19, 18, 16, 19, 20, 17, 19],
        device='cuda:0'))

In [1]:

import numpy as np
import torch, torchtext
import torch.nn as nn
import torch.nn.functional as F
import math, copy, sys, os
from torch.autograd import Variable
import pickle
import csv
import re, random, string, subprocess, time


TEXT_vi = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = True, tokenize= lambda t:t.split(),
                                        include_lengths=True)
TEXT_en = torchtext.data.ReversibleField(sequential=True, use_vocab=True, batch_first = False, tokenize= lambda t:t.split(),
                              lower=True, init_token='<sos>', eos_token='<eos>',include_lengths=True)
train_vi_en = torchtext.data.TabularDataset('/home/ql819/text_data/train_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])
validation_vi_en = torchtext.data.TabularDataset('/home/ql819/text_data/dev_vi_en.csv', format='csv', 
                             fields=[('source',TEXT_vi),('target',TEXT_en)])


TEXT_vi.build_vocab(train_vi_en, min_freq=3)
TEXT_en.build_vocab(train_vi_en, min_freq=3)

train_vi_en_iter = torchtext.data.BucketIterator(train_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))
validation_vi_en_iter = torchtext.data.BucketIterator(validation_vi_en, batch_size=1, sort_key= lambda e: len(e.source),
                             repeat = False, sort_within_batch=True, shuffle=True, device=torch.device(0))


class GRU_Decoder_With_Attention(torch.nn.Module):
    
    def __init__(self, num_vocab, input_size, hidden_size, dropout=0.1):
        super().__init__()
        self.num_vocab = num_vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = 1
        self.dropout = dropout
        
        self.embedding_layer = torch.nn.Embedding(self.num_vocab, self.input_size)
        self.gru = torch.nn.GRU(hidden_size= self.hidden_size, input_size= self.input_size + 1 * self.hidden_size, 
                                  num_layers= self.num_layers)
        
        self.calcu_weight_1  = torch.nn.Linear(2*self.hidden_size, hidden_size)
        self.calcu_weight_2  = torch.nn.Linear(self.hidden_size, 1)
        
        self.init_weight = torch.nn.Linear(self.hidden_size, self.hidden_size)
        
        self.linear_vob = torch.nn.Linear(self.hidden_size, self.num_vocab)
        
    def forward(self, input_word_index, hidden_vector, encoder_memory, is_init = False):
        #input_word_index: [num]
        #hidden_vector: 1, 1, hidden_size
        #encoder_memory: source_sen_len , 1 * hidden_size
        
        if hidden_vector.shape[0] != self.num_layers or hidden_vector.shape[2] != self.hidden_size:
            raise ValueError('The size of hidden_vector is not correct, expect '+str((self.num_layers, self.hidden_size))\
                            + ', actually get ' + str(hidden_vector.shape))
        
        if is_init:
            hidden_vector = torch.tanh(self.init_weight(hidden_vector))
        
        
        n_hidden_vector = torch.stack([hidden_vector.squeeze()]*encoder_memory.shape[0],dim=0)
        com_n_h_memory = torch.cat([n_hidden_vector, encoder_memory], dim =1)
        com_n_h_temp = torch.tanh(self.calcu_weight_1(com_n_h_memory))
        
        
        weight_vector = self.calcu_weight_2(com_n_h_temp)
        weight_vector =  torch.nn.functional.softmax(weight_vector, dim=0)
        #weight_vector: source_sen_len * 1
        
        
        convect_vector = torch.mm(weight_vector.transpose(1,0), encoder_memory)
        #convect_vector: 1 , 2 * hidden_size
        
        
        input_vector = self.embedding_layer(input_word_index).view(1,1,-1)
        
        
        input_vector = torch.cat([convect_vector.unsqueeze(0), input_vector], dim=2)
        
        
        output, h_t = self.gru(input_vector,hidden_vector)
        output = output.view(1, self.hidden_size)
        
        
        prob = self.linear_vob(output)
        #prob 1, vob_size
        
        prob = torch.nn.functional.log_softmax(prob, dim=1)
        
        
        return prob, h_t


def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, src_embed, N):
        super(Encoder, self).__init__()
        self.src_embed = src_embed
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x):
        "Pass the input (and mask) through each layer in turn."
        x = self.src_embed(x)
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)
    
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x))
        return self.sublayer[1](x, self.feed_forward)
    

    
def attention(query, key, value, dropout=None):
    '''
    query: batch, seq1, d_k
    key: batch, seq2, d_k
    value: batch, seq2, embedding_size
    mask: batch, 1, seq_2
    '''
    
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, d_k, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        
        # We assume d_v always equals d_k
        self.d_k = d_k
        self.linears = clones(nn.Linear(d_model, d_k), 2)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value):
        "Implements Figure 2"
        
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key = [l(x) for l, x in zip(self.linears, (query, key))]
        #query, key = batch, seq, d_k
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value,
                                 dropout=self.dropout)
        #x: batch, seq_query, embedding_size
        
        
        return x
    
    
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(F.relu(self.w_1(x)))
    
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
    
    
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)
    

    
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_k=64, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(d_k, d_model)
    ff = PositionwiseFeedForward(d_model, dropout)
    position = PositionalEncoding(d_model, dropout)
    encoder = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), nn.Sequential(Embeddings(d_model, src_vocab), c(position)), N)
    decoder = GRU_Decoder_With_Attention(num_vocab = tgt_vocab, input_size = d_model, hidden_size = d_model)
    for p in encoder.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return encoder, decoder



def train(encoder, decoder, optimizer, data_iter, teacher_forcing_ratio, batch_size = 64):

    encoder.train()
    decoder.train()
    
    count = 0
    loss = 0
    
    
    for batch in data_iter:
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source_data)
        #all_output: 1, source_len, embedding_size

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        

        output = all_output[:,0]
        target_word_list = target_data.squeeze()
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,:]
        h_t = h_t.view([1,1,-1])

        is_init = True

        for word_index in range(1, target_len[0].item()):
            prob, h_t = decoder(target_word, h_t, output, is_init)
            is_init = False
            if use_teacher_forcing:
                target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, target_word)
            else:
                right_target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, right_target_word)
                predict_target_word_index = prob.topk(1)[1].item()

                if TEXT_en.vocab.stoi['<eos>'] == predict_target_word_index:
                    break
                else:
                    target_word = torch.tensor([predict_target_word_index]).cuda(0)
                    
        count += 1
        if count % batch_size == 0:
            
            loss = loss/batch_size
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            count = 0
            loss = 0
        
        
    if count % batch_size != 0:
        loss = loss/count
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        
        
class Bean_Search_Status_Record:
    
    def __init__(self, h_t, predict_word_index_list, sum_log_prob):
        self.h_t = h_t
        self.predict_word_index_list = predict_word_index_list
        self.sum_log_prob = sum_log_prob
        self.avg_log_prob = 0
        
    

def test(encoder, decoder, data_iter, k=10):
    encoder.eval()
    decoder.eval()

    path_name = '../eval/'+str(time.time()).replace('.','_')+'/'
    os.mkdir(path_name)

    predict_file_name = path_name + 'predict.txt'
    target_file_name = path_name + 'target_file_name.txt'

    predict_file = open(predict_file_name, 'w')
    target_file = open(target_file_name, 'w')


    for batch in data_iter:
        
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source_data)
        output = all_output[:,0]
        
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,:]
        h_t = h_t.view([1,1,-1])

        is_init = True


        right_whole_sentence_word_index = target_data[1: target_len[0].item()-1,0]
        right_whole_sentence_word_index = list(right_whole_sentence_word_index.cpu().numpy())
        
        
        sequences = [Bean_Search_Status_Record(h_t, predict_word_index_list = [target_word], 
                                               sum_log_prob = 0.0)]
        
        t = 0
        while (t < 100):
            all_candidates = []
            for i in range(len(sequences)):
                record = sequences[i]
                h_t = record.h_t
                predict_word_index_list = record.predict_word_index_list
                sum_log_prob = record.sum_log_prob
                target_word = predict_word_index_list[-1]
                
                if TEXT_en.vocab.stoi['<eos>'] != target_word:
                
                    prob, h_t = decoder(torch.tensor([target_word]).cuda(0), h_t, output, is_init)

                    k_prob_value_list, k_word_index_list = prob.topk(k,dim=1)
                    k_prob_value_list = k_prob_value_list.cpu().detach().squeeze().numpy()
                    k_word_index_list = k_word_index_list.cpu().squeeze().numpy()
                    
                    
                    for prob_value, word_index in zip(k_prob_value_list, k_word_index_list):
                        prob_value = float(prob_value)
                        word_index = int(word_index)
                        new_record = Bean_Search_Status_Record(h_t, predict_word_index_list+[word_index], sum_log_prob+prob_value)
                        new_record.avg_log_prob = new_record.sum_log_prob/(len(new_record.predict_word_index_list) - 1)
                        all_candidates.append(new_record)
                else:
                    all_candidates.append(record)
            is_init = False
                        
            ordered = sorted(all_candidates, key = lambda r: r.sum_log_prob, reverse = True)
            sequences = ordered[:k]
            
            t += 1
        final_record = sequences[0]
        
        predict_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in final_record.predict_word_index_list[1:-1]]
        right_whole_sentence_word_index = [TEXT_en.vocab.itos[temp_index] for temp_index in right_whole_sentence_word_index]

        predict_whole_sentence = ' '.join(predict_whole_sentence_word_index)
        right_whole_sentence = ' '.join(right_whole_sentence_word_index)

        predict_file.write(predict_whole_sentence.strip() + '\n')
        target_file.write(right_whole_sentence.strip() + '\n')


    predict_file.close()
    target_file.close()

    result = subprocess.run('cat {} | sacrebleu {}'.format(predict_file_name,target_file_name),shell=True,stdout=subprocess.PIPE)
    result = str(result)
    print(result)
    sys.stdout.flush()
    
    
    return get_blue_score(result)


def get_blue_score(s):
    a = re.search(r'13a\+version\.1\.2\.12 = ([0-9.]+)',s)
    return float(a.group(1))



def parameters_list_change_grad(encoder, decoder):
    para_list = []
    for name, data in list(encoder.named_parameters()):
        if 'src_embed' in name:
            data.requires_grad = False
        else:
            para_list.append(data)
            
    for name, data in list(decoder.named_parameters()):
        if 'embedding' in name:
            data.requires_grad = False
        else:
            para_list.append(data)
    return para_list        




encoder,decoder = make_model(src_vocab=len(TEXT_vi.vocab.stoi), tgt_vocab=len(TEXT_en.vocab.stoi), N=6, 
               d_model=512, d_k=64, dropout=0.1)

encoder = encoder.cuda(0)
decoder = decoder.cuda(0)



In [None]:
def train(encoder, decoder, optimizer, data_iter, teacher_forcing_ratio, batch_size = 64):

    encoder.train()
    decoder.train()
    
    count = 0
    loss = 0
    
    
    for batch in data_iter:
        
        
        source, target = batch.source, batch.target
        

        source_data,source_len = source[0], source[1]
        target_data,target_len = target[0], target[1]
        
        all_output = encoder(source_data)
        #all_output: 1, source_len, embedding_size

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        

        output = all_output[:,0]
        target_word_list = target_data.squeeze()
        target_word = torch.tensor([TEXT_en.vocab.stoi['<sos>']]).cuda(0)

        h_t = output[0,:]
        h_t = h_t.view([1,1,-1])

        is_init = True

        for word_index in range(1, target_len[0].item()):
            prob, h_t = decoder(target_word, h_t, output, is_init)
            is_init = False
            if use_teacher_forcing:
                target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, target_word)
            else:
                right_target_word = target_word_list[[word_index]]
                loss += torch.nn.functional.nll_loss(prob, right_target_word)
                predict_target_word_index = prob.topk(1)[1].item()

                if TEXT_en.vocab.stoi['<eos>'] == predict_target_word_index:
                    break
                else:
                    target_word = torch.tensor([predict_target_word_index]).cuda(0)
                    
        count += 1
        if count % batch_size == 0:
            
            loss = loss/batch_size
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            count = 0
            loss = 0
        
        
    if count % batch_size != 0:
        loss = loss/count
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        