https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

seq-to-seq  
encoder: seq to vector
decoder: vector to seq

character-level RNN  
a character -> one-hot vector, 'b' to [0, 1, ...] with len 26  
word-level RNN  
a word -> one-hot vector, 'the' to [0, 0, 1, 0, ..., 0] with a length equal to the vocabulary size (the language's `n_words` in the example)


Attention  
If only the context vector is passed between the encoder and decoder, that single vector carries the burden of encoding the entire sentence.  
Attention allows the decoder network to “focus” on a different part of the encoder’s outputs for every step of the decoder’s own outputs.  

a context vector is not enough, so using encoder hidden states additionally  
CHECK encoder output or hidden state?

In this example, encoder outputs are collected in encoder_outputs of (MAX_LENGTH=10, hidden_size)

어텐션의 기본 아이디어는 디코더에서 출력 단어를 예측하는 매 시점(time step)마다, 인코더에서의 전체 입력 문장을 다시 한 번 참고한다는 점입니다. 단, 전체 입력 문장을 전부 다 동일한 비율로 참고하는 것이 아니라, 해당 시점에서 예측해야할 단어와 연관이 있는 입력 단어 부분을 좀 더 집중(attention)해서 보게 됩니다.

The KEY difference from the simple decoder (DecoderRNN)  
the first hidden for the gru is the same,  
but the input is processed through attention layers before passing to the gru

In [1]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

AttributeError: module 'torch.cuda' has no attribute 'is_availabe'

In [15]:
# Reserved vocabulary indices; Lang.index2word starts with 0 -> 'SOS' and 1 -> 'EOS'.
SOS_token = 0  # start-of-sentence marker
EOS_token = 1  # end-of-sentence marker (appended to every sentence tensor)

class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Indices 0 and 1 are reserved for the 'SOS' and 'EOS' markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: 'SOS', 1: 'EOS'}
        # Counts the two reserved markers; doubles as the next free index.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # A new word takes index n when n entries already exist.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [4]:
def unicodeToAscii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' (nonspacing mark) is dropped. Characters that have
    no decomposition are left untouched, so the result is not guaranteed to
    be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, trim, and remove non-letter characters from ``s``.

    Steps: lowercase and strip the input, drop accents via unicodeToAscii,
    put a space in front of each '.', '!' or '?', then collapse every run of
    characters other than letters and those three marks into one space.
    The result is not stripped again after the substitutions, so it can
    still begin or end with a space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

In [9]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated sentence pairs for two languages.

    Args:
        lang1: name of the first language; part of the file name
            ``data/{lang1}-{lang2}.txt`` and first column of each line.
        lang2: name of the second language; second column of each line.
        reverse: when true, each pair is reversed and the roles of the two
            languages are swapped (lang2 becomes the input language).

    Returns:
        ``(input_lang, output_lang, pairs)`` where the first two are empty
        ``Lang`` vocabularies and ``pairs`` is a list of lists of
        normalized sentences, one inner list per line of the file.
    """
    print("Reading lines...")

    # The context manager closes the file even if reading raises.
    with open(f"data/{lang1}-{lang2}.txt", encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [10]:
# Upper bound used by filterPair: both sentences of a pair must have fewer
# than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 10

# Sentence beginnings accepted by filterPair (matched with str.startswith).
# normalizeString replaces apostrophes with spaces, hence "i m " for "i'm".
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    """Return True when pair ``p`` is short enough and has an accepted prefix.

    Both ``p[0]`` and ``p[1]`` must split into fewer than MAX_LENGTH
    space-separated tokens, and ``p[1]`` must start with one of
    eng_prefixes.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    second = p[1]
    if len(second.split(' ')) >= MAX_LENGTH:
        return False
    return second.startswith(eng_prefixes)
    
def filterPairs(pairs):
    """Return a new list with only the pairs accepted by filterPair."""
    return list(filter(filterPair, pairs))

In [12]:
def prepareData(lang1, lang2, reverse=False):
    """Load the sentence pairs, filter them, and build both vocabularies.

    Reads the pairs with readLangs, keeps those accepted by filterPairs,
    feeds element 0 of each kept pair to the input vocabulary and element 1
    to the output vocabulary, and prints progress along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with the filled vocabularies
        and the filtered list of pairs.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print(f"Read {len(all_pairs)} sentence pairs")

    kept_pairs = filterPairs(all_pairs)
    print(f"Trimmed to {len(kept_pairs)} sentence pairs")

    print("Counting words...")
    for kept in kept_pairs:
        source_sentence, target_sentence = kept[0], kept[1]
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, kept_pairs

# reverse=True swaps each pair, so 'fra' becomes the input language and
# 'eng' the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))  # show one random [input, output] sample

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['je me brosse les dents .', 'i am brushing my teeth .']


In [17]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index.

    Looks every token up in ``lang.word2index``; an unknown token raises
    KeyError.
    """
    lookup = lang.word2index
    return [lookup[token] for token in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    Returns a ``torch.long`` tensor on ``device`` with shape
    (number of tokens + 1, 1); the extra last row holds EOS_token.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorsFromPair(pair):
    """Convert a sentence pair into an ``(input_tensor, target_tensor)`` tuple.

    ``pair[0]`` is encoded with the global input_lang vocabulary and
    ``pair[1]`` with the global output_lang vocabulary.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [None]:
tensorsFromPair(random.choice(pairs))[0][ei], a word, will be the input to Encoder

In [34]:
# Row 1 of a random pair's input tensor: a (1,)-shaped tensor holding one word index.
_input = tensorsFromPair(random.choice(pairs))[0][1]

In [35]:
# Throwaway embedding layer (embedding dimension 100) applied to _input.
_output = nn.Embedding(input_lang.n_words, 100)(_input)

In [37]:
_input.shape
_output.shape  # only this last expression is echoed by the notebook cell

torch.Size([1, 100])

In [33]:
# nn.Embedding transforms an index to an embedding vector of size (1, 100)

class EncoderRNN(nn.Module):
    """GRU encoder that consumes one word index per call.

    Args:
        input_size: number of rows in the embedding table, i.e. the size of
            the input vocabulary (``input_lang.n_words`` in this notebook).
        hidden_size: dimension of both the word embedding and the GRU
            hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single word.

        Args:
            input: tensor holding one word index.
            hidden: previous hidden state of shape (1, 1, hidden_size).

        Returns:
            ``(output, hidden)``, both of shape (1, 1, hidden_size).
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size);
        # seq_len is 1 because the input is a single word.
        embedded = self.embedding(input).view(1, 1, -1)
        # Fix: the GRU must receive the embedded word. The previous code
        # passed the undefined name `output`, which raised NameError.
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [38]:
# the first decoder input will be <SOS>, with the context vector as the initial hidden state

class DecoderRNN(nn.Module):
    """Plain GRU decoder (no attention) that emits one word per call.

    Args:
        hidden_size: dimension of the word embedding and GRU hidden state.
        output_size: size of the output vocabulary
            (``output_lang.n_words`` in this notebook).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Embeds a word index of the output language.
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` has shape (1, 1, hidden_size).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        gru_out, hidden = self.gru(F.relu(embedded), hidden)
        # gru_out is (1, 1, hidden_size); gru_out[0] drops the leading
        # dimension so the Linear layer sees (1, hidden_size).
        scores = self.out(gru_out[0])
        return self.softmax(scores), hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
torch.bmm: batch-matrix-multiplication, bmm with (10, 3, 4) and (10, 4, 5) -> (10, 3, 5)

attn_weights (1, MAX_LENGTH)
encoder_outputs (MAX_LENGTH, hidden_size)
CHECK : why MAX_LENGTH? may be for accounting for inputs

bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)) -> attn_applied
(1, 1, MAX_LENGTH) * (1, MAX_LENGTH, hidden_size) -> (1, 1, hidden_size)

encoder_outputs is not directly referred,
but is multiplied by attention weights in bmm and adjusted to attn_applied before using

one encoder output for one word is (1, hidden_size) vector
encoder_outputs (MAX_LENGTH, hidden_size) is a collection of (1, hidden_size) vectors of MAX_LENGTH words 
The MAX_LENGTH encoder outputs are each multiplied by their corresponding weight and summed, condensing them into a single (1, hidden_size) vector shaped like one word's encoder output

each weight element <-> each input word
[0, 0, 0.5, 0.5, 0] means 3rd 4th words are important at this decoding step

weights are obtained from input and previous hidden as context

attn_combine
combines the information in the input and the transformed encoder_outputs 

In [None]:
# tensor[0] is just a squeezing, (1, 1, features) -> (1, features), for passing to Linear layers
# unsqueezed for passing to other layers

class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-length buffer of encoder outputs.

    Args:
        hidden_size: dimension of the word embedding and GRU hidden state.
        output_size: size of the output vocabulary.
        dropout_p: dropout probability applied to the embedded input word.
        max_length: number of encoder positions the attention layer scores;
            ``encoder_outputs`` passed to forward must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step while attending over the encoder outputs.

        Args:
            input: tensor holding one word index of the output language.
            hidden: previous hidden state, shape (1, 1, hidden_size).
            encoder_outputs: tensor of shape (max_length, hidden_size).

        Returns:
            ``(log_probs, hidden, attn_weights)`` with shapes
            (1, output_size), (1, 1, hidden_size) and (1, max_length).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # embedded[0] and hidden[0] drop the leading dimension, giving two
        # (1, hidden_size) tensors; concatenated they form the
        # (1, 2 * hidden_size) input that scores each encoder position.
        attn_scores = self.attn(torch.cat([embedded[0], hidden[0]], 1))
        attn_weights = F.softmax(attn_scores, dim=1)  # normalize over positions
        # (1, 1, max_length) x (1, max_length, hidden_size) -> (1, 1, hidden_size)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        # attn_combine maps (1, 2 * hidden_size) -> (1, hidden_size);
        # unsqueeze restores the (1, 1, hidden_size) shape the GRU expects.
        output = self.attn_combine(torch.cat([embedded[0], attn_applied[0]], 1)).unsqueeze(0)

        output = F.relu(output)
        # Fix: the previous code passed the misspelled name `outupt`,
        # which raised NameError.
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)
        
CHECK F.softmax v F.log_softmax        

In [6]:
re.sub(r"([.!?])", r" \1", ".!?")  # \1 refers to the text matched by the first capture group

' . ! ?'

In [8]:
# reversed() returns an iterator; list() materializes it.
list(reversed([1,2,3,4,5]))

[5, 4, 3, 2, 1]