## SET UP

In [2]:
#################################
# SET UP
#################################

import random
import re
import time
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Select the non-interactive 'agg' backend, then enable the notebook's inline
# rendering via the IPython magic on the next line.
plt.switch_backend('agg')
%matplotlib inline

# Device setup: use the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'device={device}')

#################################
# CONSTANTS
#################################

# Directory prefix that readDataset() prepends to the dataset file name.
DATA_DIR = "../data/"
# Vocabulary index of the start-of-sequence token (first decoder input).
# NOTE: get_dataloader() pads with zeros, so padding shares this index.
SOS_token = 0
# Vocabulary index of the end-of-sequence token appended to every sentence.
EOS_token = 1

device=cuda


## Funções e Classes

In [3]:
#################################
# FUNÇÕES
#################################

def readDataset():
  """Read the tab-separated dataset and return its rows.

  The file is expected to hold one sample per line in the form
  ``input<TAB>output``. A random preview of up to 15 rows is printed.

  Returns:
  - A list with one entry per line; each entry is the list of fields obtained
    by splitting that line on tab characters.
  """
  file = DATA_DIR + 'datahumana-datamaquina.txt'
  # Use a context manager so the file handle is closed deterministically.
  with open(file, encoding='utf-8') as handle:
    lines = handle.read().strip().split('\n')
  pairs = [line.split('\t') for line in lines]
  print(f'Read {len(pairs)} samples from {file}')
  print()
  # random.sample raises ValueError when asked for more items than exist,
  # so cap the preview size for small datasets.
  preview = random.sample(pairs, min(15, len(pairs)))
  print("\n".join(map(str, preview)))
  return pairs
  
def _addTokensToVocab(sentence, word_to_index, index_to_word, next_index):
  """Register every unseen, non-blank token of `sentence`; return the next free index."""
  for token in re.split('([^a-zA-Z0-9])', sentence):
    # Skip empty/whitespace-only pieces: indexesFromSentence never looks them
    # up, so storing them would only inflate the vocabulary.
    if not token.strip() or token in word_to_index:
      continue
    word_to_index[token] = next_index
    index_to_word[next_index] = token
    next_index += 1
  return next_index


def prepareData(pairs):
  """Build the input and output vocabularies from the dataset pairs.

  Sentences are tokenized by splitting on every non-alphanumeric character
  (the separators themselves are kept as tokens); blank tokens are dropped on
  both sides, matching what indexesFromSentence does when encoding.

  Parameters:
  - pairs: Iterable of two-element sequences ``(input_sentence, target_sentence)``.

  Returns:
  - input_dict: token -> index for the input side.
  - output_dict: token -> index for the output side.
  - input_dict_reverse: index -> token for the input side.
  - output_dict_reverse: index -> token for the output side.

  Raises:
  - ValueError: if an element of `pairs` does not have exactly two items.
  """
  input_dict  = {"SOS": SOS_token, "EOS": EOS_token}
  output_dict = {"SOS": SOS_token, "EOS": EOS_token}

  input_dict_reverse  = {SOS_token: "SOS", EOS_token: "EOS"}
  output_dict_reverse = {SOS_token: "SOS", EOS_token: "EOS"}

  # Track the next free index with a counter instead of recomputing
  # max(dict.values()) on every insertion (which is quadratic overall).
  next_input = max(input_dict.values()) + 1
  next_output = max(output_dict.values()) + 1

  for pair in pairs:
    ipt, tgt = pair
    next_input = _addTokensToVocab(ipt, input_dict, input_dict_reverse, next_input)
    next_output = _addTokensToVocab(tgt, output_dict, output_dict_reverse, next_output)

  print(f'Number of input words: {len(input_dict)}')
  print(f'Number of output words: {len(output_dict)}')

  return input_dict, output_dict, input_dict_reverse, output_dict_reverse

def indexesFromSentence(language_dict, sentence, verbose=False):
  """Encode a sentence as the list of vocabulary indices of its tokens.

  The sentence is split on every non-alphanumeric character (separators are
  kept as tokens) and blank pieces are discarded.

  Parameters:
  - language_dict: Mapping token -> index.
  - sentence: The text to encode.
  - verbose: When true, print the sentence next to its encoding.

  Returns:
  - List of indices, one per non-blank token.

  Raises:
  - KeyError: if a token is missing from `language_dict`.
  """
  encoded = []
  for piece in re.split('([^a-zA-Z0-9])', sentence):
    if not piece.strip():
      continue
    encoded.append(language_dict[piece])
  if verbose:
    print(f"'{sentence}' => {encoded}")
  return encoded
  
# Utility functions
def tensorFromSentence(language_dict, sentence):
  """Encode `sentence`, append EOS, and return it as a (1, n_tokens + 1) long tensor on `device`."""
  token_ids = indexesFromSentence(language_dict, sentence) + [EOS_token]
  encoded = torch.tensor(token_ids, dtype=torch.long, device=device)
  return encoded.view(1, -1)

def tensorsFromPair(pair):
    """Return (input_tensor, target_tensor) for a pair, using the global vocabularies.

    pair[0] is encoded with `input_dict` and pair[1] with `output_dict`.
    """
    return (
        tensorFromSentence(input_dict, pair[0]),
        tensorFromSentence(output_dict, pair[1]),
    )
  
def get_dataloader(batch_size, pairs, input_dict, output_dict):
  """Build a shuffling DataLoader over the encoded, zero-padded dataset.

  Every sentence is encoded, terminated with EOS_token and written
  left-aligned into a row of width MAX_LENGTH; unused positions stay 0.

  Parameters:
  - batch_size: Number of samples per batch.
  - pairs: Sequence of (input_sentence, target_sentence).
  - input_dict: token -> index mapping for inputs.
  - output_dict: token -> index mapping for targets.

  Returns:
  - A DataLoader yielding (input_ids, target_ids) long tensors on `device`.
  """
  shape = (len(pairs), MAX_LENGTH)
  source_matrix = np.zeros(shape, dtype=np.int32)
  target_matrix = np.zeros(shape, dtype=np.int32)

  for row, (source_text, target_text) in enumerate(pairs):
    source_ids = indexesFromSentence(input_dict, source_text) + [EOS_token]
    target_ids = indexesFromSentence(output_dict, target_text) + [EOS_token]
    source_matrix[row, :len(source_ids)] = source_ids
    target_matrix[row, :len(target_ids)] = target_ids

  dataset = TensorDataset(
      torch.LongTensor(source_matrix).to(device),
      torch.LongTensor(target_matrix).to(device),
  )
  return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
  
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
  """
  Run one optimisation pass over every batch of `dataloader`.

  Parameters:
  - dataloader: Sized iterable yielding (input_tensor, target_tensor) batches.
  - encoder: Callable returning (encoder_outputs, encoder_hidden).
  - decoder: Callable taking (encoder_outputs, encoder_hidden, target_tensor)
    and returning a 3-tuple whose first item holds the per-step outputs.
  - encoder_optimizer: Optimizer stepping the encoder's parameters.
  - decoder_optimizer: Optimizer stepping the decoder's parameters.
  - criterion: Loss applied to the flattened outputs and flattened targets.

  Returns:
  - The sum of the batch losses divided by the number of batches.
  """
  optimizers = (encoder_optimizer, decoder_optimizer)
  batch_losses = []

  for input_tensor, target_tensor in dataloader:
    for optimizer in optimizers:
      optimizer.zero_grad()

    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

    # Flatten (batch, steps, vocab) -> (batch * steps, vocab) to match the flat targets.
    vocab_size = decoder_outputs.size(-1)
    loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))
    loss.backward()

    for optimizer in optimizers:
      optimizer.step()

    batch_losses.append(loss.item())

  return sum(batch_losses) / len(dataloader)
  
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001):
  """Train the encoder/decoder pair with Adam and NLLLoss.

  Parameters:
  - train_dataloader: Batches of (input_tensor, target_tensor).
  - encoder: The encoder model.
  - decoder: The decoder model.
  - n_epochs: Number of passes over the dataloader.
  - learning_rate: Learning rate used for both Adam optimizers.

  Returns:
  - List with the average loss of each epoch, in order.
  """
  encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
  criterion = nn.NLLLoss()

  # evaluate() switches the models to eval mode and never switches back, so
  # make sure dropout is active again whenever training is (re)started.
  encoder.train()
  decoder.train()

  loss_points = []
  for epoch in range(1, n_epochs + 1):
    loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
    loss_points.append(loss)

    # Report progress every 10 epochs.
    if epoch % 10 == 0:
      print(f'Epoch: {epoch:4}/{n_epochs:4} - Loss: {loss:.4f}')

  return loss_points

def plot_train_loss(loss_points):
  """Plot the per-epoch training loss on the current matplotlib figure."""
  plt.plot(loss_points)
  plt.ylabel('Loss')
  plt.xlabel('Epochs')
  plt.title('Training Loss')
  
def evaluate(encoder, decoder, sentence):
  """Greedily decode a single sentence.

  Parameters:
  - encoder: The encoder model.
  - decoder: The decoder model; called without a target, so it feeds back
    its own predictions.
  - sentence: Input text; it is encoded with the global `input_dict`.

  Returns:
  - decoded_words: Output tokens up to (not including) the first EOS.
  - decoder_attn: Third value returned by the decoder (attention weights,
    or None for a decoder without attention).

  NOTE: both models are left in eval mode after this call.
  """
  encoder.eval()
  decoder.eval()
  with torch.no_grad():
    input_tensor = tensorFromSentence(input_dict, sentence)

    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_outputs, _, decoder_attn = decoder(encoder_outputs, encoder_hidden)

    # topi is (1, steps, 1). Flatten it explicitly: a bare squeeze() would
    # return a 0-d tensor when there is a single step, which cannot be iterated.
    _, topi = decoder_outputs.topk(1)
    decoded_ids = topi.reshape(-1).tolist()

  decoded_words = []
  for token_id in decoded_ids:
    if token_id == EOS_token:
      break
    decoded_words.append(output_dict_reverse[token_id])
  return decoded_words, decoder_attn
  
def evaluateRandomly(encoder, decoder, n=5):
  """Print input, ground truth and prediction for `n` randomly chosen samples of the global `pairs`."""
  for _ in range(n):
    sample = random.choice(pairs)
    source_text = sample[0]
    print(f'Input: {source_text}')
    print(f'Ground truth: {sample[1]}')
    predicted_tokens, _ = evaluate(encoder, decoder, source_text)
    print(f'Predicted: {"".join(predicted_tokens)}')
    print('')

  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
    """
    Run the attention decoder for MAX_LENGTH steps.

    NOTE(review): at this position and indentation the def is nested inside
    evaluateRandomly rather than attached to a class.

    Parameters:
    - encoder_outputs: Encoder output sequence (batch_size, seq_len, hidden_size).
    - encoder_hidden: Last encoder hidden state (1, batch_size, hidden_size).
    - target_tensor: Optional targets; when given, target token i is fed as the
      input of step i + 1 (teacher forcing).

    Returns:
    - decoder_outputs: Log-probabilities per step (batch_size, MAX_LENGTH, output_size).
    - decoder_hidden: Hidden state after the last step (1, batch_size, hidden_size).
    - attentions: Per-step attention weights concatenated along dim 1
      (batch_size, MAX_LENGTH, seq_len when each step yields (batch_size, 1, seq_len)).
    """
    n_batch = encoder_outputs.size(0)
    # Every sequence starts from the SOS token.
    step_input = torch.empty(n_batch, 1, dtype=torch.long, device=device).fill_(SOS_token)
    state = encoder_hidden
    step_logits, step_attn = [], []
    use_teacher_forcing = target_tensor is not None

    for step in range(MAX_LENGTH):
      logits, state, weights = self.forward_step(step_input, state, encoder_outputs)
      step_logits.append(logits)
      step_attn.append(weights)

      if use_teacher_forcing:
        # Feed the ground-truth token as the next input.
        step_input = target_tensor[:, step].unsqueeze(1)
      else:
        # Feed back the most likely token, detached from the graph.
        step_input = logits.topk(1)[1].squeeze(-1).detach()

    # Stack the steps along the time axis and normalise to log-probabilities.
    log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
    return log_probs, state, torch.cat(step_attn, dim=1)


  def forward_step(self, input, hidden, encoder_outputs):
    """
    Run one decoding step with attention.

    Parameters:
    - input: Current input token indices (batch_size, 1).
    - hidden: Current decoder hidden state (1, batch_size, hidden_size).
    - encoder_outputs: Encoder outputs (batch_size, seq_len, hidden_size).

    Returns:
    - output: Unnormalised scores over the output vocabulary (batch_size, 1, output_size).
    - hidden: Updated hidden state (1, batch_size, hidden_size).
    - attn_weights: Weights returned by self.attention for this step.
    """
    token_vectors = self.dropout(self.embedding(input))

    # (1, batch, hidden) -> (batch, 1, hidden): the hidden state is the attention query.
    attention_query = hidden.permute(1, 0, 2)
    context, attn_weights = self.attention(attention_query, encoder_outputs)

    # The GRU consumes the embedded token concatenated with the context vector.
    gru_out, hidden = self.gru(torch.cat((token_vectors, context), dim=2), hidden)

    return self.out(gru_out), hidden, attn_weights

  
#################################
# CLASSES
#################################
  
class EncoderRNN(nn.Module):
  """Encoder: embedding -> dropout -> single-layer GRU."""

  def __init__(self, input_size, hidden_size, dropout_p=0.1):
    """
    Build the encoder layers.

    Parameters:
    - input_size: Size of the input vocabulary (number of embedding rows).
    - hidden_size: Width of the embeddings and of the GRU hidden state.
    - dropout_p: Dropout probability applied to the embedded input.
    """
    super().__init__()
    self.hidden_size = hidden_size

    # Token index -> dense vector of size hidden_size.
    self.embedding = nn.Embedding(input_size, hidden_size)
    # batch_first=True: tensors are laid out as (batch_size, seq_len, hidden_size).
    self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
    # Applied right after the embedding lookup.
    self.dropout = nn.Dropout(dropout_p)

  def forward(self, input):
    """
    Encode a batch of token-index sequences.

    Parameter:
    - input: Token indices (batch_size, seq_len).

    Returns:
    - output: GRU output at every position (batch_size, seq_len, hidden_size).
    - hidden: Final GRU hidden state (1, batch_size, hidden_size).
    """
    token_vectors = self.embedding(input)
    token_vectors = self.dropout(token_vectors)
    output, hidden = self.gru(token_vectors)
    return output, hidden
    
class DecoderRNN(nn.Module):
  """Decoder without attention: embedding -> ReLU -> GRU -> linear projection."""

  def __init__(self, hidden_size, output_size):
    """
    Build the decoder layers.

    Parameters:
    - hidden_size: Width of the embeddings and of the GRU hidden state.
    - output_size: Size of the output vocabulary.
    """
    super().__init__()

    # Token index -> dense vector of size hidden_size.
    self.embedding = nn.Embedding(output_size, hidden_size)
    # batch_first=True: tensors are laid out as (batch_size, seq_len, hidden_size).
    self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
    # Projects each GRU output onto the output vocabulary.
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
    """
    Decode for MAX_LENGTH steps, starting from SOS and the encoder's hidden state.

    Parameters:
    - encoder_outputs: Encoder outputs (batch_size, seq_len, hidden_size);
      only the batch size is read from it here.
    - encoder_hidden: Last encoder hidden state (1, batch_size, hidden_size).
    - target_tensor: Optional targets; when given, target token i is fed as the
      input of step i + 1 (teacher forcing).

    Returns:
    - decoder_outputs: Log-probabilities per step (batch_size, MAX_LENGTH, output_size).
    - decoder_hidden: Hidden state after the last step (1, batch_size, hidden_size).
    - None: Keeps the return arity equal to the attention decoder's.
    """
    n_batch = encoder_outputs.size(0)

    # Every sequence starts from the SOS token.
    step_input = torch.empty(n_batch, 1, dtype=torch.long, device=device).fill_(SOS_token)
    state = encoder_hidden
    step_logits = []
    use_teacher_forcing = target_tensor is not None

    for step in range(MAX_LENGTH):
      logits, state = self.forward_step(step_input, state)
      step_logits.append(logits)

      if use_teacher_forcing:
        # Feed the ground-truth token as the next input.
        step_input = target_tensor[:, step].unsqueeze(1)
      else:
        # Feed back the most likely token, detached from the graph.
        step_input = logits.topk(1)[1].squeeze(-1).detach()

    # Stack the steps along the time axis and normalise to log-probabilities.
    log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
    return log_probs, state, None

  def forward_step(self, input, hidden):
    """
    Run one decoding step.

    Parameters:
    - input: Current input token indices (batch_size, 1).
    - hidden: Current hidden state (1, batch_size, hidden_size).

    Returns:
    - output: Unnormalised scores over the output vocabulary (batch_size, 1, output_size).
    - hidden: Updated hidden state (1, batch_size, hidden_size).
    """
    activated = F.relu(self.embedding(input))
    gru_out, hidden = self.gru(activated, hidden)
    return self.out(gru_out), hidden
    
    
    
# Bahdanau attention, also known as additive attention, is a commonly used
# attention mechanism in sequence-to-sequence models, particularly in
# neural machine translation tasks. It was introduced by Bahdanau et al.
# in their paper titled [Neural Machine Translation by Jointly Learning to
# Align and Translate](https://arxiv.org/pdf/1409.0473.pdf). This
# attention mechanism employs a learned alignment model to compute
# attention scores between the encoder and decoder hidden states. It
# utilizes a feed-forward neural network to calculate alignment scores.

class BahdanauAttention(nn.Module):
  """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

  def __init__(self, hidden_size):
    '''Create the three linear maps used to score query/key pairs.

    Parameters:
    - hidden_size: Feature size of both the query and the keys.
    '''
    super().__init__()
    self.Wa = nn.Linear(hidden_size, hidden_size)
    self.Ua = nn.Linear(hidden_size, hidden_size)
    self.Va = nn.Linear(hidden_size, 1)

  def forward(self, query, keys):
    '''Compute attention weights over `keys` and the resulting context.

    Parameters:
    - query: Tensor broadcastable against `keys` after projection,
      e.g. (batch_size, 1, hidden_size).
    - keys: Tensor of shape (batch_size, seq_len, hidden_size).

    Returns:
    - context: Weighted sum of the keys (batch_size, 1, hidden_size).
    - weights: Softmax-normalised scores (batch_size, 1, seq_len).
    '''
    projected_query = self.Wa(query)
    projected_keys = self.Ua(keys)
    energies = self.Va(torch.tanh(projected_query + projected_keys))

    # (batch, seq_len, 1) -> (batch, 1, seq_len) so bmm can mix the keys.
    scores = energies.squeeze(2).unsqueeze(1)
    weights = F.softmax(scores, dim=-1)

    context = torch.bmm(weights, keys)
    return context, weights
    
class AttnDecoderRNN(nn.Module):
  """Decoder with Bahdanau attention over the encoder outputs.

  The class defines its own forward/forward_step; without a forward method,
  calling the module would raise NotImplementedError from nn.Module.
  """

  def __init__(self, hidden_size, output_size, dropout_p=0.1):
    """
    Build the decoder layers.

    Parameters:
    - hidden_size: Width of the embeddings and of the GRU hidden state.
    - output_size: Size of the output vocabulary.
    - dropout_p: Dropout probability applied to the embedded input token.
    """
    super(AttnDecoderRNN, self).__init__()

    # Token index -> dense vector of size hidden_size.
    self.embedding = nn.Embedding(output_size, hidden_size)

    # Produces a context vector from the encoder outputs at every step.
    self.attention = BahdanauAttention(hidden_size)

    # Input is the embedded token concatenated with the context vector.
    self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)

    # Projects each GRU output onto the output vocabulary.
    self.out = nn.Linear(hidden_size, output_size)

    # Applied right after the embedding lookup.
    self.dropout = nn.Dropout(dropout_p)

  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
    """
    Decode for MAX_LENGTH steps, attending over the encoder outputs.

    Parameters:
    - encoder_outputs: Encoder outputs (batch_size, seq_len, hidden_size).
    - encoder_hidden: Last encoder hidden state (1, batch_size, hidden_size).
    - target_tensor: Optional targets; when given, target token i is fed as the
      input of step i + 1 (teacher forcing).

    Returns:
    - decoder_outputs: Log-probabilities per step (batch_size, MAX_LENGTH, output_size).
    - decoder_hidden: Hidden state after the last step (1, batch_size, hidden_size).
    - attentions: Attention weights per step (batch_size, MAX_LENGTH, seq_len).
    """
    batch_size = encoder_outputs.size(0)
    # Every sequence starts from the SOS token.
    decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
    decoder_hidden = encoder_hidden
    decoder_outputs = []
    attentions = []

    for i in range(MAX_LENGTH):
      decoder_output, decoder_hidden, attn_weights = self.forward_step(
          decoder_input, decoder_hidden, encoder_outputs)
      decoder_outputs.append(decoder_output)
      attentions.append(attn_weights)

      if target_tensor is not None:
        # Teacher forcing: feed the ground-truth token as the next input.
        decoder_input = target_tensor[:, i].unsqueeze(1)
      else:
        # Feed back the most likely token, detached from the graph.
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze(-1).detach()

    # Stack the steps along the time axis and normalise to log-probabilities.
    decoder_outputs = F.log_softmax(torch.cat(decoder_outputs, dim=1), dim=-1)
    attentions = torch.cat(attentions, dim=1)

    return decoder_outputs, decoder_hidden, attentions

  def forward_step(self, input, hidden, encoder_outputs):
    """
    Run one decoding step with attention.

    Parameters:
    - input: Current input token indices (batch_size, 1).
    - hidden: Current hidden state (1, batch_size, hidden_size).
    - encoder_outputs: Encoder outputs (batch_size, seq_len, hidden_size).

    Returns:
    - output: Unnormalised scores over the output vocabulary (batch_size, 1, output_size).
    - hidden: Updated hidden state (1, batch_size, hidden_size).
    - attn_weights: Attention weights for this step (batch_size, 1, seq_len).
    """
    embedded = self.dropout(self.embedding(input))

    # (1, batch, hidden) -> (batch, 1, hidden): the hidden state is the attention query.
    query = hidden.permute(1, 0, 2)
    context, attn_weights = self.attention(query, encoder_outputs)

    # The GRU consumes the embedded token concatenated with the context vector.
    input_gru = torch.cat((embedded, context), dim=2)
    output, hidden = self.gru(input_gru, hidden)

    return self.out(output), hidden, attn_weights



## Seq2Seq data -> data

### Sem camada de atenção

In [4]:
#################################
# EXPERIMENT CONFIGURATION
#################################
# MAX_LENGTH defines the sequence length limit: it is the padded row width in
# get_dataloader() and the number of decoding steps run by the decoders.
MAX_LENGTH = 18

# Embedding / GRU hidden width passed to the encoder and decoder constructors.
hidden_size = 128
# Number of samples per batch passed to get_dataloader().
batch_size = 1024


In [5]:
#################################
# DATA PREPARATION
#################################

# Read the dataset
print('Realizando a leitura do dataset:')
pairs = readDataset()

# Build the input/output vocabularies (and their reverse lookups)
print('\n\nRealizando a preparacao dos dados:')
input_dict, output_dict, input_dict_reverse, output_dict_reverse = prepareData(pairs)


# Sanity checks: encode one sample sentence per vocabulary, then build the
# tensor pair for a sample (the latter allocates on `device`).
indexesFromSentence(input_dict, 'segunda-feira 18 de outubro de 1999', verbose=True);
indexesFromSentence(output_dict, '30/09/2022', verbose=True);
tensorsFromPair(['segunda-feira 18 de outubro de 1999', '18/10/1999'])

Realizando a leitura do dataset:
Read 10000 samples from ../data/datahumana-datamaquina.txt

['domingo 8 de fevereiro de 1998', '08/02/1998']
['26.01.85', '26/01/1985']
['domingo 5 de outubro de 1980', '05/10/1980']
['16 de julho de 1998', '16/07/1998']
['11 outubro 2014', '11/10/2014']
['14 jul. 2014', '14/07/2014']
['18 janeiro 1977', '18/01/1977']
['agosto 28 1978', '28/08/1978']
['abril 21 2015', '21/04/2015']
['terca-feira 29 de novembro de 1988', '29/11/1988']
['02/01/2018', '02/01/2018']
['10/06/1993', '10/06/1993']
['3 marco 1978', '03/03/1978']
['26 de setembro de 2012', '26/09/2012']
['8 de jul. de 2018', '08/07/2018']


Realizando a preparacao dos dados:
Number of input words: 164
Number of output words: 89
'segunda-feira 18 de outubro de 1999' => [33, 11, 12, 102, 7, 66, 7, 39]
'30/09/2022' => [46, 3, 7, 3, 47]


RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Baseline experiment: encoder + plain decoder (no attention).
train_dataloader = get_dataloader(batch_size, pairs, input_dict, output_dict)

encoder = EncoderRNN(input_size=len(input_dict), hidden_size=hidden_size).to(device)
decoder = DecoderRNN(hidden_size=hidden_size, output_size=len(output_dict)).to(device)

# Train for 100 epochs, plot the loss curve and print a few sample predictions.
loss_points = train(train_dataloader, encoder, decoder, 100, learning_rate=0.001)
plot_train_loss(loss_points)
evaluateRandomly(encoder, decoder)



device=cuda
Read 10000 samples from ../data/datahumana-datamaquina.txt

['18 ago. 1994', '18/08/1994']
['19/10/2006', '19/10/2006']
['quarta-feira 29 de setembro de 2010', '29/09/2010']
['28 julho 2020', '28/07/2020']
['05 mar. 2004', '05/03/2004']
['30/04/1991', '30/04/1991']
['domingo 24 de outubro de 2021', '24/10/2021']
['2 nov. 1982', '02/11/1982']
['domingo 19 de agosto de 1984', '19/08/1984']
['sexta-feira 19 de abril de 1985', '19/04/1985']
['1 de setembro de 2019', '01/09/2019']
['sabado 8 de agosto de 1992', '08/08/1992']
['quinta-feira 16 de agosto de 2001', '16/08/2001']
['quarta-feira 25 de fevereiro de 1981', '25/02/1981']
['21 de out. de 2003', '21/10/2003']
Number of input words: 164
Number of output words: 89
'segunda-feira 18 de outubro de 1999' => [33, 11, 12, 102, 7, 66, 7, 39]
'30/09/2022' => [46, 3, 7, 3, 47]
Epoch:   10/ 100 - Loss: 0.6112
Epoch:   20/ 100 - Loss: 0.5300
Epoch:   30/ 100 - Loss: 0.3955
Epoch:   40/ 100 - Loss: 0.2605
Epoch:   50/ 100 - Loss: 0.17

### Com camada de Atenção

In [None]:
#################################
# RUN WITH ATTENTION
#################################

hidden_size = 128
batch_size = 1024

train_dataloader = get_dataloader(batch_size, pairs, input_dict, output_dict)

# Same encoder as the baseline; the decoder is swapped for the attention variant.
encoder = EncoderRNN(input_size=len(input_dict), hidden_size=hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size=hidden_size, output_size=len(output_dict)).to(device)

# Train for 100 epochs, plot the loss curve and print a few sample predictions.
loss_points = train(train_dataloader, encoder, decoder, 100, learning_rate=0.001)
plot_train_loss(loss_points)
evaluateRandomly(encoder, decoder)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
def showAttention(input_sentence, output_words, attentions):
  """Draw an attention heat map with one row per output token.

  Parameters:
  - input_sentence: Source text; its tokens label the x axis.
  - output_words: Decoded tokens; they label the y axis.
  - attentions: Tensor of attention weights whose last axis indexes input
    positions; any leading axes are flattened into rows.
  """
  fig = plt.figure()
  ax = fig.add_subplot()
  # Always keep a 2-D (rows, columns) matrix. A bare squeeze() would collapse
  # it to 1-D when there is a single output token or a single input column,
  # which breaks both matshow and the (i, j) unpacking below.
  data = np.atleast_2d(attentions.cpu().numpy())
  data = data.reshape(-1, data.shape[-1])
  cax = ax.matshow(data, cmap='bone')
  fig.colorbar(cax)

  # Set up axes
  input_tokens = [token for token in re.split('([^a-zA-Z0-9])', input_sentence) if token.strip()]
  ax.set_xticks(ticks=range(len(input_tokens)), labels=input_tokens)
  ax.set_yticks(ticks=range(len(output_words)), labels=output_words)

  # Show label at every tick
  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  # Print each weight inside its cell.
  for (i, j), z in np.ndenumerate(data):
    ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')

  plt.show()


def evaluateAndShowAttention(input_sentence):
  """Decode `input_sentence` with the global encoder/decoder and plot its attention map.

  Requires a decoder that returns attention weights as its third output.
  """
  output_words, attentions = evaluate(encoder, decoder, input_sentence)
  print('input =', input_sentence)
  print('output =', ''.join(output_words))
  # The encoder input is the sentence tokens followed by EOS, while
  # showAttention labels the columns with the sentence tokens only. Drop the
  # trailing EOS column (not the first token's column) so labels line up.
  # showAttention already calls plt.show().
  showAttention(input_sentence, output_words, attentions[0, :len(output_words), :-1])



# Plot the attention map for a few hand-picked date strings.
evaluateAndShowAttention('20 de novembro de 2015')

evaluateAndShowAttention('22 de maio 1981')

evaluateAndShowAttention('15.11.2013')

evaluateAndShowAttention('01 de jan. 1989')