In [1]:
filepath = r'C:\Users\hp\Desktop\WikiTable'

In [2]:
import pandas as pd
import numpy as np
import re, random, os, torch, math, time
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, get_tokenizer, Dataset, TabularDataset

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
tokenize = lambda text: text.split()

ATTRIBUTE = Field(tokenize=tokenize)
CELL = Field(tokenize=tokenize)
CAPTION = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>')

In [5]:
data_fields = [('attributes',ATTRIBUTE),('cells',CELL),('captions',CAPTION)]

train_data, val_data, test_data = TabularDataset.splits(path=filepath, 
                                                        format='csv',
                                                        train='train.csv', 
                                                        validation='val.csv', 
                                                        test='test.csv', 
                                                        skip_header=True, 
                                                        fields=data_fields)

In [6]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(val_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 10000
Number of validation examples: 1318
Number of testing examples: 2000


In [7]:
print(vars(train_data.examples[0]))

{'attributes': ['subj_title', 'subj_subtitle', 'date', 'winning_$$_team', 'score', 'losing_$$_team'], 'cells': ['1978_$$_federation_$$_cup_$$_(_$$_tennis_$$_)', 'qualifying_$$_round', '19_$$_august', 'philippines', '3–0', 'thailand'], 'captions': ['philippines', 'won', 'thailand', 'with', '3–0', 'during', '1978', 'federation', 'cup.']}


In [8]:
ATTRIBUTE.build_vocab(train_data, min_freq = 1)
CELL.build_vocab(train_data, min_freq = 1)
CAPTION.build_vocab(train_data, min_freq = 1)

In [10]:
print(f"Unique tokens in attributes vocabulary: {len(ATTRIBUTE.vocab)}")
print(f"Unique tokens in cells vocabulary: {len(CELL.vocab)}")
print(f"Unique tokens in captions vocabulary: {len(CAPTION.vocab)}")

Unique tokens in attributes vocabulary: 2905
Unique tokens in cells vocabulary: 29688
Unique tokens in captions vocabulary: 25176


In [11]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, val_data, test_data), 
                                                                      batch_size = BATCH_SIZE, 
                                                                      device = device)

In [12]:
train_batch = next(iter(train_iterator))

print(f'shape of training batch attributes: {train_batch.attributes.shape}')
print(f'shape of training batch cells: {train_batch.cells.shape}')
print(f'shape of training batch captions: {train_batch.captions.shape}')

shape of training batch attributes: torch.Size([12, 128])
shape of training batch cells: torch.Size([12, 128])
shape of training batch captions: torch.Size([22, 128])


In [13]:
max_attr_length = max([len(train_data[i].attributes) for i in range(len(train_data))])
max_source_length = max([len(train_data[i].cells) for i in range(len(train_data))])
max_summary_length = max([len(train_data[i].captions) for i in range(len(train_data))])
print('Maximum attribute length: ', max_attr_length)
print('Maximum source length is: ', max_source_length)
print('Maximum target length is: ', max_summary_length)

Maximum attribute length:  12
Maximum source length is:  12
Maximum target length is:  47


In [14]:
word2id_cell, id2word_cell = {}, {}
for word,idx in CELL.vocab.stoi.items():
    word2id_cell[word] = idx
    id2word_cell[idx] = word

In [16]:
word2id_cap, id2word_cap = {}, {}
for word,idx in CAPTION.vocab.stoi.items():
    word2id_cap[word] = idx
    id2word_cap[idx] = word

In [17]:
class Encoder(nn.Module):
  
  '''
  Args:
    input_vocab_size: (int) Size of source vocabulary
    embed_size: (int) Embedding dimensions
    hidden_size: (int) Dimensions of hidden state
    num_layers: (int) Number of stacked GRU layers, default is 1
    bidirectional: (Bool) If RNN is required to be birectional in nature, default is False
  '''
  
  def __init__(self,attr_vocab_size, input_vocab_size, embed_size, hidden_size,num_layers=1,bidirectional=False):
    super(Encoder,self).__init__()

    self.bidirectional = bidirectional
    self.num_layers = num_layers
    
    self.hidden_size = hidden_size
    self.attr_vocab_size = attr_vocab_size
    self.input_vocab_size = input_vocab_size

    self.embedding = nn.Embedding(input_vocab_size, embed_size) #new try with embedding
    self.linear = nn.Linear(2*embed_size, embed_size)
    self.nonlinear = nn.Tanh()

    self.gru_layer = nn.GRU(embed_size, hidden_size, num_layers, bidirectional=bidirectional)

  def forward(self, input_cell, input_attr, prev_hidden_state):
    '''Arg:
        input_: Tensor of source word indices
        prev_hidden_state: Previous hidden state
    '''

    attr_tensor = input_attr
    #cell_tensor = [seq length x batch size]
    cell_tensor = input_cell
    #cell_tensor = [seq length x batch size]
    embedded_attr = self.embedding(attr_tensor)
    #embedded_attr = [seq length x batch size x embed dim]
    embedded_cell = self.embedding(cell_tensor)
    #embedded_cell = [seq length x batch size x embed dim]
    input_source = torch.cat((embedded_cell,embedded_attr), -1)
    #input_source = [seq length x batch size x 2*embed dim]
    
    input_tanh = self.linear(input_source)
    embedded_outputs = self.nonlinear(input_tanh)

    output, prev_hidden_state = self.gru_layer(embedded_outputs,prev_hidden_state)  #output is batch_size times hidden_size
    #prev_hidden_state = [n_layers*n direction x batch size x hidden dime]
    #output = [seq length x batch size x hidden dim*n direction]
    # default n direction = 1
    #outputs are always from the top hidden layer
    return output,prev_hidden_state

  def init_hidden(self):
    return torch.zeros(1,1,self.hidden_size,device=device)

In [18]:
class AttentionDecoder(nn.Module):

  '''
  Args:
    output_vocab_size: (int) Size of target vocabulary
    embed_dim: (int) Embedding dimensions
    hidden_size: (int) Dimensions of hidden state
    max_length_encoder: (int) Maximum length of encoder sequence
    dropout_value: (float) Value between 0 & 1
    num_layers: (int) Number of stacked GRU layers, default is 1
  ''' 

  def __init__(self, output_vocab_size, embed_dim, hidden_size, max_length_encoder, dropout_value, num_layers=1):
      super(AttentionDecoder,self).__init__()
   
      self.hidden_size = hidden_size
      self.num_layers = num_layers
      self.output_vocab_size = output_vocab_size
      self.dropout_p = dropout_value
      self.max_length_encoder = max_length_encoder
      
      self.embedding = nn.Embedding(output_vocab_size, embed_dim) 
      
      self.attention_layer = nn.Linear(hidden_size*2, max_length_encoder)
      self.attention_combine = nn.Linear(hidden_size*2, hidden_size)

      self.s_layer = nn.Linear(hidden_size, 1)
      self.x_layer = nn.Linear(hidden_size, 1)
      self.context_layer = nn.Linear(hidden_size, 1)
      self.linear_pgen = nn.Linear(3, 1)

      self.gru_layer = nn.GRU(embed_dim, hidden_size)
      self.output_layer = nn.Linear(hidden_size, output_vocab_size)
      self.dropout_layer = nn.Dropout(self.dropout_p)    

  def forward(self,input_tens,prev_hidden_state,encoder_output):
      
      #input_tens = [batch size]
      input_tens = input_tens.unsqueeze(0)
      embedded_outputs = self.embedding(input_tens)  #.view(1,1,-1)
      #input_tens = [1 x batch size]
      #embedded_outputs = [1 x batch size x embed dim]

      embeddings_dropout = self.dropout_layer(embedded_outputs)
      #embeddings_dropout = [1 x batch size x embed dim]

      #prev_hidden_state = [n_layers x batch size x hidden dim] = [1 x batch size x hidden dim]
      attention_layer_output = self.attention_layer(torch.cat((embeddings_dropout[0],prev_hidden_state[0]),1)) #was 0th index before
      #cat = [batch size x (embed dim + hidden dim)] = [batch size x 2*(hidden dim)]
      #in our case emdedding dimension is going to be same as hidden dimension
      #attention_layer_output = [batch size x max encoder length]

      attention_weights = nn.functional.softmax(attention_layer_output,dim=1)
      #attention_weights = [batch size x max encoder length]
      
      attention_applied = torch.bmm(attention_weights.unsqueeze(1),encoder_output.permute(1, 0, 2)) # .unsqueeze(0)
      #attention_weights = [batch size x max encoder length], after unsqueezing in 1st dim ==> [batch size x 1 x max encoder length]
      #encoder_output = [max encoder length x batch size x hidden dim], after permute ==> [batch size x max encoder length x hidden dim] 
      #attention_applied = [batch size x 1 x hidden dim]
      
      attention_applied = attention_applied.permute(1, 0, 2)
      #attention_applied = [1 x batch size x hidden dim]

      attention_combine_logits = self.attention_combine(torch.cat((embeddings_dropout[0],attention_applied[0]),1)).unsqueeze(0)  #since gru requires a batch dimension
      #embeddings_dropout = [1 x batch size x embed dim]
      #attention_applied = [1 x batch size x hidden dim]
      #cat = [batch size x (embed dim + hidden dim)] = [batch size x 2*(hidden dim)]
      #attention_combine_logits = [batch size x hidden dim], after unsqueezing in 0th dim ==> [1 x batch size x hidden dim]
      
      attention_combine_relu = nn.functional.relu(attention_combine_logits)
      #attention_combine_relu = [1 x batch size x hidden dim]

      s_output = self.s_layer(prev_hidden_state[0])
      #prev_hidden_state = [n_layers x batch size x hidden dime]
      #s_output = [batch size x 1]

      x_output = self.x_layer(embeddings_dropout[0])
      #embeddings_dropout = [seq length x batch size x embed dim]
      #x_output = [batch size x 1] as (hidden dim = embed dim)
      
      context = torch.flatten(attention_applied)
      #attention_applied = [1 x batch size x hidden dim]
      #context = [batch size * hidden dim]

      context_weights = self.context_layer(attention_applied)
      #context_weights = [1 x batch size x 1]

      sx = torch.cat((s_output[0],x_output[0]),0)
      #sx = [1 x 2*(unit)]
      sxc = torch.cat((sx,context_weights[0][0]),0)
      #sxc = [1 x 3*(unit)]
      linear_pgen = self.linear_pgen(sxc)
      #linear_pgen = [1 x 1]
      m = nn.Sigmoid()
      pgen = m(linear_pgen)
      #pgen = [1 x 1]

      output,hidden = self.gru_layer(attention_combine_relu,prev_hidden_state)
      #attention_combine_relu = [1 x batch size x hidden dim]
      #prev_hidden_state = [n_layers x batch size x hidden dime]
      #output = [1 x batch size x hidden dim]
      #hidden = [n_layers x batch size x hidden dime]

      output_logits = self.output_layer(output)
      #output_logits = [1 x batch size x output vocab size]
      output_softmax = nn.functional.log_softmax(output_logits[0],dim=1)
      #output_softmax = [batch size x output vocab size], softmax applied distribution over whole target vocab
      return output_softmax,hidden,attention_weights,pgen

  def init_hidden(self):
    return torch.zeros(1,1,self.hidden_size,device=device)

In [55]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hidden_size == decoder.hidden_size, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.num_layers == decoder.num_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, attr, src, trg, teacher_forcing_ratio = 0.5):
        
        #attr = [src len x batch size] (attr length = source length)
        #src = [src len x batch size]
        #trg = [trg len x batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        input_length = src.shape[0]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_vocab_size
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        init_hidden = torch.zeros(self.encoder.num_layers, batch_size, self.encoder.hidden_size).to(self.device) #(new)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        #hidden, cell = self.encoder(src)  (original)
        enc_output, enc_hidden = self.encoder(src, attr, init_hidden) #(new)
        
        #first input to the decoder is the <sos> tokens
        dec_input = trg[0,:]

        duplicate_words = []
        input_list = src.permute(1,0).tolist()
        i = 0
        for batch in input_list:
            j = 0
            for input_word in batch:
                if id2word_cell[input_word] in word2id_cap.keys():
                    duplicate_words.append( (i,j,word2id_cap[id2word_cell[input_word]]) )
                j += 1
            i += 1
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            #output, hidden, cell = self.decoder(dec_input, hidden, cell) (original)
            decoder_output,decoder_hidden,decoder_attention,pgen = self.decoder(dec_input, enc_hidden, enc_output)
            
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            #top1 = decoder_output.argmax(1) (original)
            
            '''experiment with pgen'''
            
            P_over_extended_vocab = torch.exp(decoder_output)*pgen.expand_as(torch.exp(decoder_output))
            #P_over_extended_vocab = [batch size x output vocab size] (exp(decoder_output)*pgen)

            decoder_attention = decoder_attention  #.squeeze(0)[0:input_length].unsqueeze(0)
            #restricting decoder attention upto only input length
            #decoder_attention = [batch size x input_length]
            p_duplicate_list = torch.zeros([batch_size, input_length, P_over_extended_vocab.size(1)], device=device)
            #p_duplicate_list = [batch size x input_length x output vocab size] 

            p_duplicate_list = p_duplicate_list.tolist()
            for (batch_id, duplicate_word_key,duplicate_word_value) in duplicate_words:
                p_duplicate_list[batch_id][duplicate_word_key][duplicate_word_value] = 1 #making duplicate key,vals apparent

            p_duplicate = torch.tensor(p_duplicate_list, dtype=torch.float, device=device)
            p_diag = torch.mm(decoder_attention.unsqueeze(1), p_duplicate)
            #decoder_attention after unsqueezing ==> [batch size x 1 x input_length]
            #p_diag = [batch size x 1 x output vocab size]
            
            p_diag = p_diag.squeeze(1)

            p_diag = p_diag*(torch.tensor([1], device=device).sub(pgen)).expand_as(p_diag)
            #p_diag = p_diag*(1 - pgen)
            
            p_add_diag = []
            for i in range(batch_size):
                diag = torch.diag(p_diag[i],diagonal=0)
                p_add_diag.append(diag.tolist())
            
            p_add_diag = torch.tensor(p_add_diag, dtype=torch.float, device=device) #new
            #p_add_diag = torch.diag(p_diag.squeeze(0),diagonal=0) #p_diag.squeeze(0) ==> [output vocab size]
            #p_add_diag = [batch size x output vocab size x output vocab size] 

            P_over_extended_vocab = torch.mm(P_over_extended_vocab.unsqueeze(1),p_add_diag).add(P_over_extended_vocab.unsqueeze(1))
            #mm = [batch size x 1 x output vocab size]
            #P_over_extended_vocab = [batch size x 1 x output vocab size] (element wise summation)
            
            P_over_extended_vocab = P_over_extended_vocab.squeeze(1)
            '''for batch_id in range(batch_size):
                for i in range(input_length):
                    if not (1 in p_duplicate_list[batch_id][i]):
                        P_over_extended_vocab[batch_id] = torch.cat((P_over_extended_vocab[batch_id], 
                                                           torch.mm(decoder_attention[batch_id][i].unsqueeze(0).unsqueeze(0), 
                                                                    torch.tensor([1], device=device).sub(pgen).unsqueeze(0)).squeeze(0)),0)'''

            #place predictions in a tensor holding predictions for each token
            outputs[t] = P_over_extended_vocab   # decoder_output was before           
            
            top1 = P_over_extended_vocab.argmax(1)
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            dec_input = trg[t] if teacher_force else top1
        
        return outputs

In [56]:
p_diag = torch.randn(2, 3)
p_add_diag = []
for i in range(2):
    diag = torch.diag(p_diag[i],diagonal=0)
    p_add_diag.append(diag.tolist())

p_add_diag = torch.tensor(p_add_diag, dtype=torch.float, device=device)

p_diag[0][0].unsqueeze(0).unsqueeze(0)

tensor([[-1.4466]])

In [57]:
p_diag = torch.randn(2, 3)
p_diag

tensor([[-0.1650,  1.8783,  0.0852],
        [ 0.2867,  0.1171,  0.2655]])

In [58]:
rep = torch.randn(1)
rep

tensor([1.1255])

In [59]:
p_diag[0] = torch.cat((p_diag[0], rep),0)
p_diag[0]

RuntimeError: The expanded size of the tensor (3) must match the existing size (4) at non-singleton dimension 0.  Target sizes: [3].  Tensor sizes: [4]

In [60]:
ATTR_DIM = len(ATTRIBUTE.vocab)
INPUT_DIM = len(CELL.vocab)
OUTPUT_DIM = len(CAPTION.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 128
MAX_CELL_LEN = max_source_length
N_LAYERS = 1
DEC_DROPOUT = 0.2

enc = Encoder(attr_vocab_size=ATTR_DIM, 
              input_vocab_size=INPUT_DIM, 
              embed_size=ENC_EMB_DIM, 
              hidden_size=HID_DIM, 
              num_layers=1, 
              bidirectional=False)
dec = AttentionDecoder(output_vocab_size=OUTPUT_DIM, 
                       embed_dim=DEC_EMB_DIM, 
                       hidden_size=HID_DIM, 
                       max_length_encoder=MAX_CELL_LEN, 
                       dropout_value=DEC_DROPOUT, 
                       num_layers=1)

model = Seq2Seq(enc, dec, device).to(device)

In [61]:
enc.hidden_size

128

In [62]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(29688, 128)
    (linear): Linear(in_features=256, out_features=128, bias=True)
    (nonlinear): Tanh()
    (gru_layer): GRU(128, 128)
  )
  (decoder): AttentionDecoder(
    (embedding): Embedding(25176, 128)
    (attention_layer): Linear(in_features=256, out_features=12, bias=True)
    (attention_combine): Linear(in_features=256, out_features=128, bias=True)
    (s_layer): Linear(in_features=128, out_features=1, bias=True)
    (x_layer): Linear(in_features=128, out_features=1, bias=True)
    (context_layer): Linear(in_features=128, out_features=1, bias=True)
    (linear_pgen): Linear(in_features=3, out_features=1, bias=True)
    (gru_layer): GRU(128, 128)
    (output_layer): Linear(in_features=128, out_features=25176, bias=True)
    (dropout_layer): Dropout(p=0.2, inplace=False)
  )
)

In [63]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,537,707 trainable parameters


In [64]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [65]:
CAPTION_PAD_IDX = CAPTION.vocab.stoi[CAPTION.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index=CAPTION_PAD_IDX)

In [66]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        attr = batch.attributes
        src = batch.cells
        trg = batch.captions
        
        optimizer.zero_grad()
        
        output = model(attr, src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [67]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            attr = batch.attributes
            src = batch.cells
            trg = batch.captions

            output = model(attr, src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [68]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [69]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

RuntimeError: matrices expected, got 3D, 3D tensors at C:\cb\pytorch_1000000000000\work\aten\src\TH/generic/THTensorMath.cpp:36

In [14]:
print('Size of encoder vocab: ',len(word2Index_enc))
print('Size of decoder vocab: Full {} | Frequent {}'.format(len(word2Index_dec_big),len(word2Index_dec)))

Size of encoder vocab:  29687
Size of decoder vocab: Full 25175 | Frequent 5565


In [15]:
def train(encoder, decoder, input_tensor, attr_tensor, target_tensor, 
          encoder_optimizer, decoder_optimizer, criterion, max_length, iters, 
          teacher_forcing_ratio = 0.5, clip = 0.4):
  '''
  Arg:
    encoder: encoder model to train
    decoder: decoder model to train
    input_tensor: source seq in tensor form [seq length x 1] batch size = 1
    target_tensor: target seq in tensor form [seq length x 1] batch size = 1
    encoder_optimizer: optimizer for encoder
    decoder_optimizer: optimizer for decoder
    citerion: 
    max length: maximum source length 
    iters: number of iterations
    teacher_forcing_ratio: if teacher forcing, actual next token is useed as next input
    clip: to prevent gradients from exploding 
  '''
  encoder_optimizer.zero_grad() #initialize encoder_optimizer at zero gradient
  decoder_optimizer.zero_grad() #initialize decoder_optimizer at zero gradient

  #prev_unk_word = ''
  encoder_hidden = encoder.init_hidden()
  #encoder_hidden = [1 x 1 x hidden dim]

  encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device = device)
  #encoder_outputs = [seq length x hidden dim]

  input_length = input_tensor.size(0)
  output_length = target_tensor.size(0)

  loss = 0

  for encoder_index in range(0, input_length):
    encoder_output,encoder_hidden = encoder(input_tensor[encoder_index], attr_tensor[encoder_index], encoder_hidden)
    #input_tensor[encoder_index] = [1 x 1 x embed dim] (embed dim = hidden dim)
    #encoder_hidden = [1 x 1 x hidden dim] {encoder arg inp}
    #encoder_hidden = [n_layers*n direction x 1 x hidden dime] {encoder product}
    #encoder_output = [seq length x 1 x hidden dim*n direction]
    #seq length, n_layers, n direction = 1  

    encoder_outputs[encoder_index] = encoder_output[0,0] # [1 x hidden dim]
    #encoder_outputs: [seq length x hidden dim] ==> [seq length x hidden dim] (hidden state from all 0 to new)

  decoder_input = torch.tensor([word2Index_dec['<START>']],device=device)
  #decoder_input = [1 x 1]
  decoder_hidden = encoder_hidden
  #decoder_hidden = [1 x 1 x hidden dim]
  use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  extended_vocab = psuInd2Word_dec.copy()
  reverse_extended_vocab = word2PsuInd_dec.copy()
  duplicate_words = {}
  extend_key = len(word2Index_dec.keys())
  input_list = input_tensor.tolist()
  i =0
  for input_word in input_list:
    if ind2Word_enc[input_word[0]] in word2Index_dec.keys():
      duplicate_words[i] = word2PsuInd_dec[ind2Word_enc[input_word[0]]]
    else:
      extended_vocab[extend_key] = ind2Word_enc[input_word[0]]
      reverse_extended_vocab[ind2Word_enc[input_word[0]]] = extend_key
      extend_key += 1
    i = i+1

  if use_teacher_forcing:
    for decoder_index in range(output_length):
      decoder_output,decoder_hidden,decoder_attention,pgen = decoder(decoder_input,decoder_hidden,encoder_outputs)
      #decoder_input = [1 x 1]
      #decoder_hidden = [1 x 1 x hidden dim]
      #encoder_outputs = [seq length x hidden dim]

      #decoder_output = [1 x output vocab size]
      #decoder_hidden = [1 x 1 x hidden dime]
      #decoder_attention = [1 x seq length]
      #pgen = [1 x 1]

      P_over_extended_vocab = torch.exp(decoder_output)*pgen.expand_as(torch.exp(decoder_output))
      #P_over_extended_vocab = [1 x output vocab size] (exp(decoder_output)*pgen)

      decoder_attention = decoder_attention.squeeze(0)[0:input_length].unsqueeze(0)
      #restricting decoder attention upto only input length
      #decoder_attention = [1 x input_length]
      p_duplicate_list = torch.zeros([input_length, P_over_extended_vocab.size(1)], device=device)
      #p_duplicate_list = [input_length x output vocab size] 

      p_duplicate_list = p_duplicate_list.tolist()
      for (duplicate_word_key,duplicate_word_value) in duplicate_words.items():
        p_duplicate_list[duplicate_word_key][duplicate_word_value] = 1 #making duplicate key,vals apparent
      
      p_duplicate = torch.tensor(p_duplicate_list, dtype=torch.float, device=device)
      p_diag = torch.mm(decoder_attention, p_duplicate)
      #p_diag = [1 x output vocab size]
      
      p_diag = p_diag*(torch.tensor([1], device=device).sub(pgen)).expand_as(p_diag)
      #p_diag = p_diag*(1 - pgen)

      p_add_diag = torch.diag(p_diag.squeeze(0),diagonal=0) #p_diag.squeeze(0) ==> [output vocab size]
      #p_add_diag = [output vocab size x output vocab size]

      P_over_extended_vocab = torch.mm(P_over_extended_vocab,p_add_diag).add(P_over_extended_vocab)
      #mm = [1 x output vocab size]
      #P_over_extended_vocab = [1 x output vocab size] (element wise summation)

      for i in range(input_length):
        if not (1 in p_duplicate_list[i]):
          P_over_extended_vocab = torch.cat((P_over_extended_vocab[0], torch.mm(decoder_attention.squeeze(0)[i].unsqueeze(0).unsqueeze(0), torch.tensor([1], device=device).sub(pgen).unsqueeze(0)).squeeze(0)),0).unsqueeze(0)

      try:
        loss += -torch.log(P_over_extended_vocab[0][ reverse_extended_vocab[ ind2Word_dec_big[ target_tensor[decoder_index].item() ] ] ] + 1e-12)
        loss.backward(retain_graph=True)
      except KeyError:
        loss += torch.tensor(0,dtype=torch.float,device=device)
      decoder_input = target_tensor[decoder_index]
  else:

    for decoder_index in range(output_length):
      decoder_output,decoder_hidden,decoder_attention,pgen = decoder(decoder_input,decoder_hidden,encoder_outputs) 
      P_over_extended_vocab = torch.exp(decoder_output)*pgen.expand_as(torch.exp(decoder_output))

      decoder_attention = decoder_attention.squeeze(0)[0:input_length].unsqueeze(0)
      p_duplicate_list = torch.zeros([input_length, P_over_extended_vocab.size(1)], device=device)
      p_duplicate_list = p_duplicate_list.tolist()
      for (duplicate_word_key,duplicate_word_value) in duplicate_words.items():
        p_duplicate_list[duplicate_word_key][duplicate_word_value] = 1
      p_duplicate = torch.tensor(p_duplicate_list, dtype=torch.float, device=device)
      p_diag = torch.mm(decoder_attention, p_duplicate)
      p_diag = p_diag*(torch.tensor([1], device=device).sub(pgen)).expand_as(p_diag)
      p_add_diag = torch.diag(p_diag.squeeze(0),diagonal=0)
      P_over_extended_vocab = torch.mm(P_over_extended_vocab,p_add_diag).add(P_over_extended_vocab)

      for i in range(input_length):
        if not (1 in p_duplicate_list[i]):
          P_over_extended_vocab = torch.cat((P_over_extended_vocab[0], torch.mm(decoder_attention.squeeze(0)[i].unsqueeze(0).unsqueeze(0), torch.tensor([1], device=device).sub(pgen).unsqueeze(0)).squeeze(0)),0).unsqueeze(0)

      try:
        loss += -torch.log(P_over_extended_vocab[0][ reverse_extended_vocab[ ind2Word_dec_big[ target_tensor[decoder_index].item() ] ] ] + 1e-12)
        loss.backward(retain_graph=True)
      except KeyError:
        loss += torch.tensor(0,dtype=torch.float,device=device)
      idx = torch.topk(P_over_extended_vocab, k=1, dim=1)[1]
      if idx.item() < len(word2Index_dec_big.keys()):   
        decoder_input = torch.tensor([idx.item()],dtype=torch.long,device=device)
      elif idx.item() >= len(word2Index_dec_big.keys()):
        #prev_unk_word = extended_vocab[idx.item()] # use <UNK> if doesn't work
        decoder_input = torch.tensor([0],dtype=torch.long,device=device)
      
      if (decoder_input.item() == word2Index_dec['<END>']):
        break

  if iters > 20000:
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item()/output_length

In [16]:
def validate(encoder, decoder, input_tensor, attr_tensor, target_tensor, criterion, max_length):
  '''
  Arg:
    encoder: encoder model trained
    decoder: decoder model trained
    input_tensor: source seq in tensor form [seq length x 1 x embed dim] batch size = 1
    target_tensor: target seq in tensor form [seq length x 1 x embed dim] batch size = 1
    citerion: 
    max length: maximum target length desired
  '''
  with torch.no_grad():
    
    #prev_unk_word = ''
    
    encoder_hidden = encoder.init_hidden()
    #encoder_hidden = [1 x 1 x hidden dim]

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device = device)
    #encoder_outputs = [seq length x hidden dim]

    input_length = input_tensor.size(0)
    output_length = target_tensor.size(0)

    loss = 0

    for encoder_index in range(0, input_length):
      encoder_output,encoder_hidden = encoder(input_tensor[encoder_index], attr_tensor[encoder_index], encoder_hidden)
      #input_tensor[encoder_index] = [1 x 1 x embed dim] (embed dim = hidden dim)
      #encoder_hidden = [1 x 1 x hidden dim] {encoder arg inp}
      #encoder_hidden = [n_layers*n direction x 1 x hidden dime] {encoder product}
      #encoder_output = [seq length x 1 x hidden dim*n direction]
      #seq length, n_layers, n direction = 1  

      encoder_outputs[encoder_index] = encoder_output[0,0] # [1 x hidden dim]
      #encoder_outputs: [seq length x hidden dim] ==> [seq length x hidden dim] (hidden state from all 0 to new)

    decoder_input = torch.tensor([word2Index_dec['<START>']],device=device)   
    #decoder_input = [1 x 1]
    decoder_hidden = encoder_hidden
    #decoder_hidden = [1 x 1 x hidden dim]

    extended_vocab = psuInd2Word_dec.copy()
    reverse_extended_vocab = word2PsuInd_dec.copy()
    duplicate_words = {}
    extend_key = len(word2Index_dec.keys())
    input_list = input_tensor.tolist()
    i =0
    for input_word in input_list:
      if ind2Word_enc[input_word[0]] in word2Index_dec.keys():
        duplicate_words[i] = word2PsuInd_dec[ind2Word_enc[input_word[0]]]
      else:
        extended_vocab[extend_key] = ind2Word_enc[input_word[0]]
        reverse_extended_vocab[ind2Word_enc[input_word[0]]] = extend_key
        extend_key += 1
      i = i+1

    for decoder_index in range(output_length):
      decoder_output,decoder_hidden,decoder_attention,pgen = decoder(decoder_input,decoder_hidden,encoder_outputs)
      #decoder_input = [1 x 1]
      #decoder_hidden = [1 x 1 x hidden dim]
      #encoder_outputs = [seq length x hidden dim]

      #decoder_output = [1 x output vocab size]
      #decoder_hidden = [1 x 1 x hidden dime]
      #decoder_attention = [1 x seq length]
      #pgen = [1 x 1]

      P_over_extended_vocab = torch.exp(decoder_output)*pgen.expand_as(torch.exp(decoder_output))
      #P_over_extended_vocab = [1 x output vocab size] (exp(decoder_output)*pgen)

      decoder_attention = decoder_attention.squeeze(0)[0:input_length].unsqueeze(0)
      #restricting decoder attention upto only input length
      #decoder_attention = [1 x input_length]
      p_duplicate_list = torch.zeros([input_length, P_over_extended_vocab.size(1)], device=device)
      #p_duplicate_list = [input_length x output vocab size] 

      p_duplicate_list = p_duplicate_list.tolist()
      for (duplicate_word_key,duplicate_word_value) in duplicate_words.items():
        p_duplicate_list[duplicate_word_key][duplicate_word_value] = 1 #making duplicate key,vals apparent
      
      p_duplicate = torch.tensor(p_duplicate_list, dtype=torch.float, device=device)
      p_diag = torch.mm(decoder_attention, p_duplicate)
      #p_diag = [1 x output vocab size]
      
      p_diag = p_diag*(torch.tensor([1], device=device).sub(pgen)).expand_as(p_diag)
      #p_diag = p_diag*(1 - pgen)

      p_add_diag = torch.diag(p_diag.squeeze(0),diagonal=0) #p_diag.squeeze(0) ==> [output vocab size]
      #p_add_diag = [output vocab size x output vocab size]

      P_over_extended_vocab = torch.mm(P_over_extended_vocab,p_add_diag).add(P_over_extended_vocab)
      #mm = [1 x output vocab size]
      #P_over_extended_vocab = [1 x output vocab size] (element wise summation)

      for i in range(input_length):
        if not (1 in p_duplicate_list[i]):
          P_over_extended_vocab = torch.cat((P_over_extended_vocab[0], 
                                              torch.mm(decoder_attention.squeeze(0)[i].unsqueeze(0).unsqueeze(0), 
                                                      torch.tensor([1], device=device).sub(pgen).unsqueeze(0)).squeeze(0)),0).unsqueeze(0)

      try:
        loss += -torch.log(P_over_extended_vocab[0][ reverse_extended_vocab[ ind2Word_dec_big[ target_tensor[decoder_index].item() ] ] ] + 1e-12)
      except KeyError:
        loss += torch.tensor(0,dtype=torch.float,device=device)
      idx = torch.topk(P_over_extended_vocab, k=1, dim=1)[1]
      if idx.item() < len(word2Index_dec_big.keys()):   
        decoder_input = torch.tensor([idx.item()],dtype=torch.long,device=device)
      elif idx.item() >= len(word2Index_dec_big.keys()):
        #prev_unk_word = extended_vocab[idx.item()]
        decoder_input = torch.tensor([0],dtype=torch.long,device=device)
      if (decoder_input.item() == word2Index_dec['<END>']):
        break

  return loss.item()/output_length

In [17]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    if percent != 0:
      es = s / (percent)
      rs = es - s
      return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
    else:
      return 0

In [18]:
import os

if not os.path.exists('checkpoints_luong/encoder'):
    os.makedirs('checkpoints_luong/encoder')
if not os.path.exists('checkpoints_luong/decoder'):
    os.makedirs('checkpoints_luong/decoder')

In [19]:
# Dictionary for creating loss graph
loss_graph = {}

def train_Iters(encoder,decoder,n_iters, print_every=1, plot_every=5,learning_rate = 0.0005):
  # start = time.time()
  plot_losses = []
  print_loss_total = 0  # Reset every print_every
  plot_loss_total = 0

  print_val_loss = 0

  encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)
  
  train_attr, train_cell, train_cap = attr_cell_caption_split(train_file)
  attr_input = [[word2Index_att[word] if word in word2Index_att.keys() else word2Index_att['<UNK>'] for word in sentence.split()] for sentence in train_attr ]
  encoder_input = [[word2Index_enc[word] if word in word2Index_enc.keys() else word2Index_enc['<UNK>'] for word in sentence.split()] for sentence in train_cell ]
  decoder_input = [[word2Index_dec_big[word] if word in word2Index_dec_big.keys() else word2Index_dec_big['<UNK>'] for word in sentence.split()] for sentence in train_cap ]
  train_pairs = [[attr,enc,dec] for attr,enc,dec in zip(attr_input,encoder_input,decoder_input)]
  training_pairs = [random.choice(train_pairs) for i in range(n_iters)]
    
  
  val_attr, val_cell, val_cap = attr_cell_caption_split(val_file)
  attr_val = [[word2Index_att[word] if word in word2Index_att.keys() else word2Index_att['<UNK>'] for word in sentence.split()] for sentence in val_attr ]
  encoder_val = [[word2Index_enc[word] if word in word2Index_enc.keys() else word2Index_enc['<UNK>'] for word in sentence.split()] for sentence in val_cell ]
  decoder_val = [[word2Index_dec_big[word] if word in word2Index_dec_big.keys() else word2Index_dec_big['<UNK>'] for word in sentence.split()] for sentence in val_cap ]
  val_pairs = [[attr,enc,dec] for attr,enc,dec in zip(attr_val,encoder_val,decoder_val)]
  validation_pairs = [random.choice(val_pairs) for i in range(n_iters)]

  criterion = nn.NLLLoss()
  for iters in range(n_iters):
    training_pair = training_pairs[iters - 1]
    attr_tensor = training_pair[0]
    input_tensor = training_pair[1]
    target_tensor = training_pair[2]

    attr_tensor = torch.tensor(attr_tensor, dtype=torch.long, device = device).view(-1, 1)
    input_tensor = torch.tensor(input_tensor, dtype=torch.long, device = device).view(-1, 1)
    target_tensor = torch.tensor(target_tensor, dtype=torch.long, device = device).view(-1, 1)

    loss = train(encoder,decoder,input_tensor,attr_tensor,target_tensor,
                 encoder_optimizer,decoder_optimizer,criterion,max_source_length, iters=n_iters)
    print_loss_total += loss
    plot_loss_total += loss

    validation_pair = validation_pairs[iters - 1]
    val_attr_tensor = validation_pair[0]
    val_input_tensor = validation_pair[1]
    val_target_tensor = validation_pair[2]

    val_attr_tensor = torch.tensor(val_attr_tensor, dtype=torch.long, device = device).view(-1, 1)
    val_input_tensor = torch.tensor(val_input_tensor, dtype=torch.long, device = device).view(-1, 1)
    val_target_tensor = torch.tensor(val_target_tensor, dtype=torch.long, device = device).view(-1, 1)

    val_loss = validate(encoder, decoder, val_input_tensor,val_attr_tensor, val_target_tensor, criterion, max_source_length)
    print_val_loss += val_loss

    '''if iters % print_every == 0:
        torch.save(encoder, 'checkpoints_luong/encoder/encoder_{}.pt'.format(iters))
        torch.save(decoder, 'checkpoints_luong/decoder/decoder_{}.pt'.format(iters))'''
    
    if iters % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        val_loss_avg = print_val_loss / print_every
        print_val_loss = 0

        print('Iteration: {}, Train Loss: {:.4f}, Val Loss: {:.4f}'.format(iters, print_loss_avg, val_loss_avg)) # iters / len(arr) * 100, before
        evaluateRandomly(rnn_encoder, rnn_decoder, train_attr, train_cell, train_cap)
        if iters > 0:
          loss_graph[iters] = print_loss_avg

    if iters % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0

  # showPlot(plot_losses)
  

In [26]:
hidden_size = embed_dim = 128
rnn_encoder = Encoder(len(word2Index_att.keys()),len(word2Index_enc.keys()),embed_dim,hidden_size).to(device=device)
rnn_decoder = AttentionDecoder(len(word2Index_dec_big.keys()),embed_dim,hidden_size,max_source_length,0.2).to(device=device)

In [21]:
def evaluateRandomly(encoder, decoder, attr_seqs=None, source_seqs=None, target_seqs=None, n=1):
    for i in range(n):
        idx = random.choice(range(len(source_seqs)))
        source_seq,attr_seq = source_seqs[idx], attr_seqs[idx]
        attr_inp = [word2Index_att[word] if word in word2Index_att.keys() else word2Index_att['<UNK>'] for word in attr_seq.split()]
        source_inp = [word2Index_enc[word] if word in word2Index_enc.keys() else word2Index_enc['<UNK>'] for word in source_seq.split()]
        '''
        len_enc_vocab = len(word2Index_enc)
        source_inp = list()
        for word in source_seq.split():
            i = 0
            if word in word2Index_enc.keys():
                source_inp.append(word2Index_enc[word])
            else:
                word2Index_enc[word] = len_enc_vocab + i
                ind2Word_enc[len_enc_vocab + i] = word
                i += 1
                source_inp.append(word2Index_enc[word])
         '''       
        source_tensor = torch.tensor(source_inp,dtype=torch.long,device=device)
        output_words, attentions = evaluate(encoder, decoder, source_tensor)
        output_seq = ' '.join(output_words)
        print('ATTRIBUTE: ',attr_seq)
        print('   SOURCE: ',source_seq)
        if target_seqs is not None:
            target_seq = target_seqs[idx]
            print('   ACTUAL: ',target_seq)
            #target_out = [[word2Index_dec_big[word] if word in word2Index_dec_big.keys() else word2Index_dec_big['<UNK>'] for word in target_seq.split()]]
        print('PREDICTED: ',output_seq)

In [22]:
def evaluate(encoder, decoder, encoder_tensor, attr_tensor,
             max_source_length=max_source_length, max_summary_length=max_summary_length):
    with torch.no_grad():
        input_tensor = encoder_tensor
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.init_hidden()

        #prev_unk_word = ''

        encoder_outputs = torch.zeros(max_source_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0),attr_tensor[ei].unsqueeze(0),
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        extended_vocab = psuInd2Word_dec.copy()
        duplicate_words = {}
        extend_key = len(word2Index_dec.keys())
        input_list = input_tensor.tolist()
        i =0
        for input_word in input_list:
          if ind2Word_enc[input_word] in word2Index_dec.keys():
            duplicate_words[i] = word2PsuInd_dec[ind2Word_enc[input_word]]
          else:
            extended_vocab[extend_key] = ind2Word_enc[input_word]
            extend_key += 1
          i = i+1

        decoder_input = torch.tensor([word2Index_dec['<START>']], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_summary_length, max_source_length)

        for di in range(max_summary_length):
            decoder_output, decoder_hidden, decoder_attention,pgen = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data

            P_over_extended_vocab = torch.exp(decoder_output)*pgen.expand_as(torch.exp(decoder_output))

            decoder_attention = decoder_attention.squeeze(0)[0:input_length].unsqueeze(0)
            p_duplicate_list = torch.zeros([input_length, P_over_extended_vocab.size(1)], device=device)
            p_duplicate_list = p_duplicate_list.tolist()
            for (duplicate_word_key,duplicate_word_value) in duplicate_words.items():
              p_duplicate_list[duplicate_word_key][duplicate_word_value] = 1
            p_duplicate = torch.tensor(p_duplicate_list, dtype=torch.float, device=device)
            p_diag = torch.mm(decoder_attention, p_duplicate)
            p_diag = p_diag*(torch.tensor([1], device=device).sub(pgen)).expand_as(p_diag)
            p_add_diag = torch.diag(p_diag.squeeze(0),diagonal=0)
            P_over_extended_vocab = torch.mm(P_over_extended_vocab,p_add_diag).add(P_over_extended_vocab)

            for i in range(input_length):
              if not (1 in p_duplicate_list[i]):
                P_over_extended_vocab = torch.cat((P_over_extended_vocab[0], torch.mm(decoder_attention.squeeze(0)[i].unsqueeze(0).unsqueeze(0), torch.tensor([1], device=device).sub(pgen).unsqueeze(0)).squeeze(0)),0).unsqueeze(0)

            idx = torch.topk(P_over_extended_vocab, k=1, dim=1)[1]
            if idx.item() < len(word2Index_dec_big.keys()):   
              decoder_input = torch.tensor([idx.item()],dtype=torch.long,device=device)
              decoded_words.append(extended_vocab[idx.item()])
            elif idx.item() >= len(word2Index_dec_big.keys()):
              decoder_input = torch.tensor([0],dtype=torch.long,device=device)
              #prev_unk_word = extended_vocab[idx.item()] 
              decoded_words.append('<UNK>')
            if idx.item() == word2Index_dec['<END>']:
              decoded_words.append('<END>')
              break

        return decoded_words, decoder_attentions[:di + 1]

In [25]:
train_Iters(rnn_encoder,rnn_decoder,100, 1)

RuntimeError: size mismatch, m1: [2 x 128], m2: [256 x 128] at C:\cb\pytorch_1000000000000\work\aten\src\TH/generic/THTensorMath.cpp:41