In [None]:
%matplotlib inline

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!wget https://download.pytorch.org/tutorial/data.zip

!unzip data.zip

--2023-04-13 04:10:59--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 18.65.39.99, 18.65.39.75, 18.65.39.6, ...
Connecting to download.pytorch.org (download.pytorch.org)|18.65.39.99|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip.1’


2023-04-13 04:10:59 (19.1 MB/s) - ‘data.zip.1’ saved [2882130/2882130]

Archive:  data.zip
replace data/eng-fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary for one language: word/index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from ``s`` by dropping its combining marks.

    The string is decomposed with NFD and every character in Unicode
    category 'Mn' is removed. Characters that have no decomposition are
    kept unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase, trim and simplify ``s`` for tokenisation by single spaces.

    After lowercasing, stripping and accent removal (``unicodeToAscii``),
    a space is inserted before each '.', '!' or '?', and every run of
    characters other than letters and those three marks becomes one space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation so it becomes a token of its own.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse everything that is neither a letter nor . ! ? into one space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated corpus ``data/<lang1>-<lang2>.txt``.

    Args:
        lang1: Name of the first language; used in the file name.
        lang2: Name of the second language; used in the file name.
        reverse: When true, each pair is reversed and the two ``Lang``
            objects swap roles, so ``lang2`` becomes the input language.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)``. The two ``Lang``
        vocabularies are returned empty; ``pairs`` is a list with one entry
        per line, each a list of the line's normalized tab-separated fields.
    """
    print("Reading lines...")

    # Read the whole file inside a context manager so the handle is closed
    # even if reading fails.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    # Split every line into its fields and normalize each of them.
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    # Reverse pairs if requested, and create the matching Lang instances.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


# filterPair drops any pair in which either sentence has MAX_LENGTH or more
# space-separated tokens.
MAX_LENGTH = 10

# filterPair keeps a pair only when its second sentence starts with one of
# these prefixes. Contractions appear as "i m ", "he s ", ... because
# normalizeString replaces the apostrophe with a space.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Return True when pair ``p`` is short enough and has a wanted prefix.

    Both sentences must have fewer than MAX_LENGTH space-separated tokens
    and the second one must start with one of ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return a new list holding only the pairs accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))


def prepareData(lang1, lang2, reverse=False):
    """Load the corpus, filter it and build both vocabularies.

    Reads the pairs with ``readLangs``, keeps those accepted by
    ``filterPairs`` and feeds the survivors into the two ``Lang`` objects,
    printing progress along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with populated vocabularies and
        the filtered list of pairs.
    """
    source_vocab, target_vocab, sentence_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(sentence_pairs))

    sentence_pairs = filterPairs(sentence_pairs)
    print("Trimmed to %s sentence pairs" % len(sentence_pairs))

    print("Counting words...")
    for entry in sentence_pairs:
        source_vocab.addSentence(entry[0])
        target_vocab.addSentence(entry[1])

    print("Counted words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)
    return source_vocab, target_vocab, sentence_pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['j adore la musique .', 'i am fond of music .']


In [None]:
type(pairs)

list

In [None]:
pairs[0:5]

[['j ai ans .', 'i m .'],
 ['je vais bien .', 'i m ok .'],
 ['ca va .', 'i m ok .'],
 ['je suis gras .', 'i m fat .'],
 ['je suis gros .', 'i m fat .']]

# The architecture we are building

![image](https://miro.medium.com/max/1838/1*tXchCn0hBSUau3WO0ViD7w.jpeg)

As we can see here, we will have an encoder, an attention mechanism block and a decoder. In the final code, the attention mechanism block and the decoder will be merged into a single block, as we need both to work together. 

As we can see here, we need to create a copy of h1, h2, h3 and h4. These are encoder outputs for a sentence with 4 words. 

# Encoder

We will build our encoder with a GRU, but that's all we know so far. Let's NOT build a class straight away; instead, let's see how to arrive at one for the Encoder. We need to answer a few questions first:
1. What would be the hidden size of our GRU?
2. What would be the input size?
3. What would be the embedding dimensions?

For simplicity, let's set both 1. and 3. to 256. 

We can't feed our input directly to GRU, we need to tensorize it, convert to embeddings first. 

`embedding = nn.Embedding(input_size, hidden_size) `

## What is input_size?

Remember the line below?

`input_lang, output_lang, pairs = prepareData('eng', 'fra', True)`

In [None]:
input_lang

<__main__.Lang at 0x7f7d1ce620d0>

In [None]:
help(input_lang)

Help on Lang in module __main__ object:

class Lang(builtins.object)
 |  Lang(name)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, name)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  addSentence(self, sentence)
 |  
 |  addWord(self, word)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [None]:
input_lang.__dict__.items()

dict_items([('name', 'fra'), ('word2index', {'j': 2, 'ai': 3, 'ans': 4, '.': 5, 'je': 6, 'vais': 7, 'bien': 8, 'ca': 9, 'va': 10, 'suis': 11, 'gras': 12, 'gros': 13, 'en': 14, 'forme': 15, 'touche': 16, '!': 17, 'touchee': 18, 'malade': 19, 'triste': 20, 'timide': 21, 'mouille': 22, 'mouillee': 23, 'il': 24, 'est': 25, 'revenu': 26, 'me': 27, 'revoila': 28, 'chauve': 29, 'occupe': 30, 'occupee': 31, 'calme': 32, 'froid': 33, 'fini': 34, 'tout': 35, 'libre': 36, 'disponible': 37, 'repu': 38, 'rassasie': 39, 'content': 40, 'chez': 41, 'moi': 42, 'retard': 43, 'paresseux': 44, 'faineant': 45, 'paresseuse': 46, 'faineante': 47, 'porte': 48, 'securite': 49, 'certain': 50, 'sur': 51, 'sure': 52, 'grande': 53, 'mince': 54, 'ordonne': 55, 'ordonnee': 56, 'laid': 57, 'laide': 58, 'faible': 59, 'vieux': 60, 'dj': 61, 'bon': 62, 'riche': 63, 'ici': 64, 'flic': 65, 'un': 66, 'homme': 67, 'seule': 68, 'seul': 69, 'arme': 70, 'armee': 71, 'reveille': 72, 'aveugle': 73, 'fauche': 74, 'fou': 75, 'foll

In [None]:
input_size = input_lang.n_words
hidden_size = 256
input_size

4345

In [None]:
embedding = nn.Embedding(input_size, hidden_size)
gru = nn.GRU(hidden_size, hidden_size)

In [None]:
sample = random.choice(pairs)
sample

['vous devez venir avec moi .', 'you are to come with me .']

In [None]:
device

device(type='cuda')

In [None]:
# embedding_input = embedding(sample[0])

In [None]:
sample

['vous devez venir avec moi .', 'you are to come with me .']

In [None]:
input_sentence = sample[0]
output_sentence = sample[1]

input_lang.word2index['elle']

119

In [None]:
for word in input_sentence:
  print(word)

v
o
u
s
 
d
e
v
e
z
 
v
e
n
i
r
 
a
v
e
c
 
m
o
i
 
.


In [None]:
for word in input_sentence.split(' '):
  print(word)

vous
devez
venir
avec
moi
.


In [None]:
input_indices = [input_lang.word2index[word] for word in input_sentence.split(' ')]
output_indices = [output_lang.word2index[word] for word in output_sentence.split(' ')]
input_indices, output_indices

([118, 2941, 1819, 1073, 42, 5], [129, 124, 532, 990, 677, 343, 4])

In [None]:
# embedding_input = embedding(input_indices)

TypeError: ignored

In [None]:
input_indices.append(EOS_token)
output_indices.append(EOS_token)
input_indices, output_indices

([118, 2941, 1819, 1073, 42, 5, 1], [129, 124, 532, 990, 677, 343, 4, 1])

In [None]:
input_tensor = torch.tensor(input_indices, dtype=torch.long, device=device)
output_tensor = torch.tensor(output_indices, dtype=torch.long, device=device)

In [None]:
input_tensor.shape, output_tensor.shape

(torch.Size([7]), torch.Size([8]))

In [None]:
# embedding_input = embedding(input_tensor)

RuntimeError: ignored

In [None]:
embedding = nn.Embedding(input_size, hidden_size).to(device)
gru = nn.GRU(hidden_size, hidden_size).to(device)

In [None]:
embedding_input = embedding(input_tensor)
embedding_input.shape

torch.Size([7, 256])

In [None]:
input_tensor

tensor([ 118, 2941, 1819, 1073,   42,    5,    1], device='cuda:0')

In [None]:
input_tensor.shape, input_tensor.view(-1, 1).shape

(torch.Size([7]), torch.Size([7, 1]))

In [None]:
print(embedding_input.shape)
embedding_input = embedding(input_tensor.view(-1, 1))
print(embedding_input.shape)

torch.Size([7, 256])
torch.Size([7, 1, 256])


In [None]:
# output, hidden = gru(embedde_input, ?)
hidden = torch.zeros(1, 1, 256, device = device)

In [None]:
embedding_input = embedding(input_tensor.view(-1, 1))
output, hidden = gru(embedding_input, hidden)

output.shape, output[0, 0].shape

(torch.Size([7, 1, 256]), torch.Size([256]))

In [None]:
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
encoder_outputs.shape

torch.Size([10, 256])

In [None]:
input_tensor.size()[0]

7

In [None]:
# One row per possible input position; rows past the end of the sentence
# are never written and therefore stay zero.
encoder_outputs = torch.zeros(MAX_LENGTH, 256, device=device)
# Initial GRU hidden state: all zeros, shape (1, 1, 256).
hidden = torch.zeros(1, 1, 256, device = device)

# Feed the sentence to the GRU one token at a time, carrying `hidden` forward.
for i in range(input_tensor.size()[0]):
  # Embed a single token index.
  embedding_input = embedding(input_tensor[i].view(-1, 1))
  output, hidden = gru(embedding_input, hidden)
  # Store this step's 256-dim output vector in row i.
  encoder_outputs[i] += output[0, 0]

In [None]:
encoder_outputs.shape, hidden.shape

(torch.Size([10, 256]), torch.Size([1, 1, 256]))

In [None]:
encoder_outputs[0:4]

tensor([[-0.0349,  0.4887, -0.0850,  ...,  0.2338, -0.3539, -0.3723],
        [ 0.1534,  0.3510,  0.1890,  ...,  0.2318,  0.0094, -0.1518],
        [ 0.0856,  0.4238,  0.0250,  ...,  0.3733,  0.0673,  0.2829],
        [ 0.3812, -0.2869, -0.1479,  ..., -0.2405, -0.1512,  0.5838]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [None]:
encoder_outputs[7:10]

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

# 😁

Finally our Encoder is fully ready. Now let's look at the class we wrote in the last class to see what we missed!

```
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
```

Cool! Next, let's build our Decoder, which has attention built in.

# Decoder with Attention

Here is the plan. 

1. The first input to the decoder will be SOS_token; later inputs will be the words it predicted (unless we implement teacher forcing).
2. The decoder GRU's hidden state will be initialized with the encoder's last hidden state.
3. We will use the GRU's hidden state and the last prediction to generate attention weights using an FC layer.
4. These attention weights will be used to weigh the encoder_outputs using batch matrix multiplication. This will give us a NEW view on how to look at encoder_states.
5. These attention-applied encoder_states will then be concatenated with the input, sent to a linear layer and _then_ sent to the GRU.
6. The GRU's output will be sent to an FC layer to predict one of the output_language words.

Let's prepare all the inputs we need to do this


In [None]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden
decoded_words = []

In [None]:
# Decoder input S0: look up the embedding of the SOS token.
# NOTE: this rebinds `embedding`, previously the input-language (encoder)
# table, to a new table sized for the output vocabulary.
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
embedded.shape  # embedded is the 256-dim embedding of decoder_input (S0, the SOS token); it is an input to the decoder, not an output of it

torch.Size([1, 1, 256])

In [None]:
# 256 * 2 >> after concatenation
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)  # why 256 * 2 -> this is because we are concatenating S0 and H1

In [None]:
embedded.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [None]:
torch.cat((embedded[0], decoder_hidden[0]), 1).shape # S0H1

torch.Size([1, 512])

In [None]:
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))  # Linear network sent data to get values of alpha - attn weights
attn_weights  # these are 10 because there can be at max 10 words in the sentence as set by max length parameter.

tensor([[-0.3210,  0.0267, -0.0545, -0.2105,  0.1906, -0.5162, -0.4084, -0.4496,
          0.0484, -0.4991]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
import torch.nn.functional as F
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_weights


tensor([[0.0400, 0.0859, 0.1655, 0.0431, 0.1778, 0.0720, 0.0991, 0.0903, 0.1208,
         0.1055]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [None]:
attn_weights.shape, encoder_outputs.shape

(torch.Size([1, 10]), torch.Size([10, 256]))

In [None]:
# attn_applied = torch.bmm(attn_weights, encoder_outputs)

RuntimeError: ignored

In [None]:
attn_weights.unsqueeze(0).shape, encoder_outputs.unsqueeze(0).shape

(torch.Size([1, 1, 10]), torch.Size([1, 10, 256]))

In [None]:
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
attn_applied.shape

torch.Size([1, 1, 256])

So now we have this 256-dimensional attn_applied view of the encoder_outputs, capturing what we should focus on at this step. We also have the input embedding we already generated, which is 256-dimensional as well. The GRU takes only 256 dimensions, so we need to concatenate the two, send them through a linear layer to reduce the dimensions, and then send the result to the GRU.
![image](https://static.wikia.nocookie.net/mycun-the-movie/images/c/c2/Gru-icon.png/revision/latest/scale-to-width-down/250?cb=20151223171656)

In [None]:
input_to_gru_layer = nn.Linear(256 * 2, 256).to(device)
embedded.shape, attn_applied.shape  # embedded is S0 and attn_applied is C1. we send both to GRU unit.

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [None]:
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru.shape

torch.Size([1, 256])

In [None]:
# Fresh GRU for the decoder (this rebinds `gru`, which was the encoder GRU above).
gru = nn.GRU(256, 256).to(device)
# decoder_hidden is the state handed to the GRU together with the step input;
# it was initialised from the encoder's final hidden state. input_to_gru is
# still 2-D here and gets a leading dimension added before the GRU call.
decoder_hidden.shape, input_to_gru.shape

(torch.Size([1, 1, 256]), torch.Size([1, 256]))

In [None]:
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
decoder_hidden.shape, input_to_gru.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [None]:
# nn.GRU takes (input, h_0): the step input comes first, the previous hidden
# state second. Both are (1, 1, 256) here, so swapping them raises no error
# but silently feeds the hidden state in as the input.
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output.shape, decoder_hidden.shape

(torch.Size([1, 1, 256]), torch.Size([1, 1, 256]))

In [None]:
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)

In [None]:
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim=1)
output.shape, output, output.data.topk(1)

(torch.Size([1, 2803]),
 tensor([[0.0003, 0.0003, 0.0003,  ..., 0.0004, 0.0003, 0.0003]],
        device='cuda:0', grad_fn=<SoftmaxBackward0>),
 torch.return_types.topk(
 values=tensor([[0.0005]], device='cuda:0'),
 indices=tensor([[1490]], device='cuda:0')))

In [None]:
topv, topi = output.data.topk(1)
output_lang.index2word[topi.item()]

'restaurant'

In [None]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden #decoder_hidden = encoder_hidden
output_size = output_lang.n_words
embedding = nn.Embedding(output_size, 256).to(device)
embedded = embedding(decoder_input)
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru_layer = nn.Linear(256 * 2, 256).to(device)
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
gru = nn.GRU(256, 256).to(device)
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()]



'shortcomings'

In [None]:
# Decoder layers, created once so the following cells can reuse them.
# NOTE: `output_size` is read here before this cell assigns it below; the
# value comes from an earlier cell.
embedding = nn.Embedding(output_size, 256).to(device)
# Scores over the 10 (MAX_LENGTH) encoder positions, from [embedded ; hidden].
attn_weight_layer = nn.Linear(256 * 2, 10).to(device)
# Reduces [embedded ; attention context] from 512 back to the GRU's 256 inputs.
input_to_gru_layer = nn.Linear(256 * 2, 256).to(device)
gru = nn.GRU(256, 256).to(device)
# Projects the GRU output onto the output vocabulary.
output_word_layer = nn.Linear(256, output_lang.n_words).to(device)

# First decoding step: input is SOS, state is the encoder's final hidden state.
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden
output_size = output_lang.n_words
embedded = embedding(decoder_input)
# Attention weights: linear layer on the concatenation, then softmax so they sum to 1.
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
# Weighted sum of the encoder outputs: (1, 1, 10) x (1, 10, 256) -> (1, 1, 256).
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
# Add the leading dimension so the GRU receives a (1, 1, 256) input.
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
# Probability for every word of the output vocabulary.
output = F.softmax(output_word_layer(output[0]), dim = 1)
# Greedy choice: the single most probable word.
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('test',
 tensor([[0.0630, 0.0873, 0.1406, 0.0490, 0.1016, 0.0761, 0.1071, 0.1146, 0.0457,
          0.2151]], device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [None]:
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = hidden
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('test',
 tensor([[0.0630, 0.0873, 0.1406, 0.0490, 0.1016, 0.0761, 0.1071, 0.1146, 0.0457,
          0.2151]], device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [None]:
# Next decoding step: feed back the word predicted by the previous cell.
decoder_input = torch.tensor([[top_index.item()]], device=device)
# Keep the decoder_hidden produced by the previous step. Resetting it to the
# encoder's final `hidden` here would discard what the decoder has already
# generated; that initialisation belongs to the first (SOS) step only.
# Re-running this cell therefore advances the decoder by one more step.
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('herself',
 tensor([[0.0611, 0.1441, 0.1188, 0.0708, 0.0604, 0.1543, 0.0831, 0.0641, 0.1237,
          0.1195]], device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [None]:
# Next decoding step: feed back the word predicted by the previous cell.
decoder_input = torch.tensor([[top_index.item()]], device=device)
# Keep the decoder_hidden produced by the previous step. Resetting it to the
# encoder's final `hidden` here would discard what the decoder has already
# generated; that initialisation belongs to the first (SOS) step only.
# Re-running this cell therefore advances the decoder by one more step.
output_size = output_lang.n_words
embedded = embedding(decoder_input)
attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
attn_weights = F.softmax(attn_weights, dim = 1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
input_to_gru = input_to_gru.unsqueeze(0)
output, decoder_hidden = gru(input_to_gru, decoder_hidden)
output = F.relu(output)
output = F.softmax(output_word_layer(output[0]), dim = 1)
top_value, top_index = output.data.topk(1)
output_lang.index2word[top_index.item()], attn_weights

('fortunate',
 tensor([[0.1066, 0.0916, 0.1007, 0.0608, 0.1345, 0.0851, 0.1197, 0.1049, 0.0708,
          0.1252]], device='cuda:0', grad_fn=<SoftmaxBackward0>))

In [None]:
# Walk over the target sentence: at step i the ground-truth word is fed to the
# decoder and its (still untrained) prediction is printed beside it.
target_words = output_sentence.split(" ")
output_size = output_lang.n_words

# Initialise the decoder state from the encoder once, before the loop, so that
# each step continues from the hidden state the previous step produced.
decoder_hidden = hidden

# Iterate over the actual words rather than a hard-coded count, so sentences
# shorter than six words do not raise IndexError.
for i, target_word in enumerate(target_words):
  decoder_input = torch.tensor([[output_indices[i]]], device=device)
  embedded = embedding(decoder_input)
  attn_weights = attn_weight_layer(torch.cat((embedded[0], decoder_hidden[0]), 1))
  attn_weights = F.softmax(attn_weights, dim = 1)
  attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
  input_to_gru = input_to_gru_layer(torch.cat((embedded[0], attn_applied[0]), 1))
  input_to_gru = input_to_gru.unsqueeze(0)
  output, decoder_hidden = gru(input_to_gru, decoder_hidden)
  output = F.relu(output)
  output = F.softmax(output_word_layer(output[0]), dim = 1)
  top_value, top_index = output.data.topk(1)
  print(target_word, output_indices[i], output_lang.index2word[top_index.item()], top_index.item() )
  print(attn_weights)

you 129 hungry 100
tensor([[0.0501, 0.0856, 0.1523, 0.0685, 0.0717, 0.1069, 0.1756, 0.1306, 0.0960,
         0.0627]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
are 124 hungry 100
tensor([[0.0320, 0.1305, 0.1038, 0.1246, 0.1644, 0.1112, 0.1186, 0.0976, 0.0481,
         0.0692]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
to 532 attic 781
tensor([[0.0941, 0.1130, 0.1371, 0.0826, 0.0745, 0.1217, 0.0901, 0.0864, 0.0857,
         0.1149]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
come 990 skeptical 1167
tensor([[0.1627, 0.0374, 0.0885, 0.0759, 0.1759, 0.1122, 0.0703, 0.0702, 0.1147,
         0.0922]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
with 677 hungry 100
tensor([[0.0777, 0.1719, 0.1664, 0.0473, 0.0580, 0.0804, 0.1062, 0.1564, 0.0822,
         0.0534]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
me 343 hungry 100
tensor([[0.0633, 0.1197, 0.2250, 0.0522, 0.0934, 0.0704, 0.1305, 0.0642, 0.0788,
         0.1027]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [None]:
output_indices, output_sentence, input_sentence

([129, 124, 532, 990, 677, 343, 4, 1],
 'you are to come with me .',
 'vous devez venir avec moi .')

In [None]:
%matplotlib inline

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

device(type='cuda')

In [None]:
!wget https://download.pytorch.org/tutorial/data.zip

!unzip data.zip

--2023-04-13 06:08:11--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 18.65.39.99, 18.65.39.56, 18.65.39.75, ...
Connecting to download.pytorch.org (download.pytorch.org)|18.65.39.99|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip.2’


2023-04-13 06:08:11 (19.8 MB/s) - ‘data.zip.2’ saved [2882130/2882130]

Archive:  data.zip
replace data/eng-fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/eng-fra.txt        
replace data/names/Arabic.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data

In [None]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary for one language: word/index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from ``s`` by dropping its combining marks.

    The string is decomposed with NFD and every character in Unicode
    category 'Mn' is removed. Characters that have no decomposition are
    kept unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase, trim and simplify ``s`` for tokenisation by single spaces.

    After lowercasing, stripping and accent removal (``unicodeToAscii``),
    a space is inserted before each '.', '!' or '?', and every run of
    characters other than letters and those three marks becomes one space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation so it becomes a token of its own.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse everything that is neither a letter nor . ! ? into one space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

In [None]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated corpus ``data/<lang1>-<lang2>.txt``.

    Args:
        lang1: Name of the first language; used in the file name.
        lang2: Name of the second language; used in the file name.
        reverse: When true, each pair is reversed and the two ``Lang``
            objects swap roles, so ``lang2`` becomes the input language.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)``. The two ``Lang``
        vocabularies are returned empty; ``pairs`` is a list with one entry
        per line, each a list of the line's normalized tab-separated fields.
    """
    print("Reading lines...")

    # Read the whole file inside a context manager so the handle is closed
    # even if reading fails.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    # Split every line into its fields and normalize each of them.
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    # Reverse pairs if requested, and create the matching Lang instances.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
# filterPair drops any pair in which either sentence has MAX_LENGTH or more
# space-separated tokens.
MAX_LENGTH = 10

# filterPair keeps a pair only when its second sentence starts with one of
# these prefixes. Contractions appear as "i m ", "he s ", ... because
# normalizeString replaces the apostrophe with a space.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Return True when pair ``p`` is short enough and has a wanted prefix.

    Both sentences must have fewer than MAX_LENGTH space-separated tokens
    and the second one must start with one of ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return a new list holding only the pairs accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))

In [None]:
def prepareData(lang1, lang2, reverse=False):
    """Load the corpus, filter it and build both vocabularies.

    Reads the pairs with ``readLangs``, keeps those accepted by
    ``filterPairs`` and feeds the survivors into the two ``Lang`` objects,
    printing progress along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with populated vocabularies and
        the filtered list of pairs.
    """
    source_vocab, target_vocab, sentence_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(sentence_pairs))

    sentence_pairs = filterPairs(sentence_pairs)
    print("Trimmed to %s sentence pairs" % len(sentence_pairs))

    print("Counting words...")
    for entry in sentence_pairs:
        source_vocab.addSentence(entry[0])
        target_vocab.addSentence(entry[1])

    print("Counted words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)
    return source_vocab, target_vocab, sentence_pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['il n est pas un saint .', 'he s no saint .']


In [None]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU by a single step.

        Args:
            input: Tensor holding a single token index.
            hidden: Previous hidden state, shape (1, 1, hidden_size).

        Returns:
            ``(output, hidden)`` as produced by the GRU for this step.
        """
        # Reshape to (1, 1, hidden_size): one time step, batch of one.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The tensor is created with the dtype and device of the module's own
        parameters, so it always matches the GRU it is fed to and does not
        depend on a notebook-level ``device`` variable.
        """
        return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed number of encoder outputs.

    Args:
        hidden_size: Width of the embeddings and of the GRU hidden state.
        output_size: Size of the output vocabulary.
        dropout_p: Dropout probability applied to the input embedding.
        max_length: Number of encoder positions the attention layer scores;
            ``encoder_outputs`` passed to ``forward`` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Scores every encoder position from [embedded ; hidden].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Reduces [embedded ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Tensor holding the current input token index (SOS on the
                first step, afterwards the previous target or prediction).
            hidden: Previous decoder hidden state, shape (1, 1, hidden_size).
            encoder_outputs: Tensor of shape (max_length, hidden_size).

        Returns:
            ``(output, hidden, attn_weights)``: log-probabilities over the
            output vocabulary with shape (1, output_size), the new hidden
            state, and the attention weights with shape (1, max_length).
        """
        # Embedding of the current input token, reshaped to (1, 1, hidden_size).
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention distribution over the encoder positions.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output, hidden = self.gru(output, hidden)
        output = F.relu(output)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The tensor is created with the dtype and device of the module's own
        parameters instead of relying on a notebook-level ``device`` variable.
        """
        return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated word of ``sentence`` to its index.

    Indices are looked up in ``lang.word2index``; a word missing from that
    mapping raises ``KeyError``.
    """
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    Returns a ``torch.long`` tensor of shape (number of words + 1, 1) placed
    on the module-level ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a sentence pair into ``(input_tensor, target_tensor)``.

    ``pair[0]`` is encoded with the module-level ``input_lang`` and
    ``pair[1]`` with ``output_lang``.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [None]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    The input is fed through the encoder one position at a time, then the
    decoder is unrolled over the target. With probability
    ``teacher_forcing_ratio`` the whole target sequence is teacher-forced;
    otherwise the decoder consumes its own top-1 prediction and decoding
    stops as soon as it predicts ``EOS_token``.

    Args:
        input_tensor: source token indexes, iterated along dimension 0.
        target_tensor: target token indexes, iterated along dimension 0.
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: optimisers stepped once each.
        criterion: loss applied to each decoder output / target token pair.
        max_length: number of rows in the encoder output buffer; an input
            with more positions than this raises ``IndexError``.

    Returns:
        The summed loss divided by the target length. When decoding stops
        early on EOS the sum covers fewer steps than ``target_length`` but is
        still divided by the full target length.
    """
    encoder_hidden = encoder.initHidden()

    # Start from clean gradients on both models.
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encode the source, keeping one output row per position (rows past the
    # input length stay zero).
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position in range(input_tensor.size(0)):
        encoder_output, encoder_hidden = encoder(
            input_tensor[position], encoder_hidden)
        encoder_outputs[position] = encoder_output[0, 0]

    # The decoder starts from SOS and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[step]
            continue

        # Otherwise feed back the top-1 prediction, detached from history.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    Both fields are rendered with ``%d``, so fractional seconds are truncated.
    """
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    Args:
        since: start timestamp, as returned by ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero
            because the elapsed time is divided by it.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration linearly from the progress so far.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))



In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder`` and ``decoder`` for ``n_iters`` single-pair steps.

    Each step trains on one sentence pair drawn at random (with replacement)
    from ``pairs``. Every ``print_every`` steps a progress line with the
    average loss over that window is printed; every ``plot_every`` steps the
    window average is recorded, and the recorded averages are passed to
    ``showPlot`` at the end.

    Args:
        encoder, decoder: models to optimise, each with its own SGD optimiser.
        n_iters: number of training steps.
        print_every: window size for the printed average loss.
        plot_every: window size for the plotted average loss.
        learning_rate: learning rate for both SGD optimisers.
    """
    start = time.time()

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All training pairs are sampled and converted to tensors up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            progress = step / n_iters
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot a sequence of values (e.g. averaged losses) on a new figure.

    Args:
        points: sequence of y-values, plotted against their index.

    Only one figure is created per call. Calling ``plt.figure()`` before
    ``plt.subplots()`` would open a second, empty figure that is never used
    or closed.
    """
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` and return the words plus attention rows.

    Both models are switched to eval mode for the duration of the call, so
    layers such as the decoder's dropout are disabled and decoding does not
    depend on dropout noise. Their previous training/eval mode is restored
    before returning, even if decoding raises.

    Args:
        encoder, decoder: the trained ``nn.Module`` models.
        sentence: source sentence, tokenised by ``tensorFromSentence`` with
            ``input_lang``; unknown words raise ``KeyError``.
        max_length: size of the encoder output buffer and the maximum number
            of decoding steps. A sentence with more tokens than this
            (including EOS) raises ``IndexError``.

    Returns:
        ``(decoded_words, attentions)``: the predicted words, ending with
        ``'<EOS>'`` if the decoder emitted EOS within ``max_length`` steps,
        and a CPU tensor with one attention row per decoding step.
    """
    # Remember the current modes so a surrounding training loop is unaffected.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size(0)
            encoder_hidden = encoder.initHidden()

            # One row per input position; rows past the input stay zero.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] = encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention
                topv, topi = decoder_output.topk(1)
                predicted = topi.item()
                if predicted == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                decoded_words.append(output_lang.index2word[predicted])

                # Feed the greedy prediction back in as the next input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print the model's translation for ``n`` pairs sampled from ``pairs``.

    For each sample three lines are printed, followed by an empty line:
    ``>`` the source sentence, ``=`` the reference, ``<`` the model output.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        source_sentence, reference_sentence = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference_sentence)
        predicted_words, _attentions = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(predicted_words))
        print('')

In [None]:
# Build the encoder and attention decoder with a shared hidden size, move them
# to `device`, and train them.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 75000 single-pair training steps, printing the running average loss every
# 5000 steps.
# NOTE(review): in the recorded output below the average loss bottoms out near
# 1.78 around 45000 iterations and then climbs back to about 2.81 by 75000,
# i.e. this run became worse in its final third -- worth investigating before
# relying on these weights. No random seed is set in this cell, so the exact
# numbers are presumably not reproducible -- confirm against earlier cells.
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

1m 5s (- 15m 22s) (5000 6%) 2.9590
2m 8s (- 13m 56s) (10000 13%) 2.4354
3m 11s (- 12m 46s) (15000 20%) 2.2446
4m 14s (- 11m 41s) (20000 26%) 2.1022
5m 18s (- 10m 36s) (25000 33%) 1.9860
6m 21s (- 9m 32s) (30000 40%) 1.8936
7m 24s (- 8m 28s) (35000 46%) 1.8562
8m 27s (- 7m 24s) (40000 53%) 1.7869
9m 31s (- 6m 20s) (45000 60%) 1.7840
10m 34s (- 5m 17s) (50000 66%) 1.8139
11m 37s (- 4m 13s) (55000 73%) 1.8486
12m 39s (- 3m 9s) (60000 80%) 2.1487
13m 39s (- 2m 6s) (65000 86%) 2.6762
14m 40s (- 1m 2s) (70000 93%) 2.7531
15m 40s (- 0m 0s) (75000 100%) 2.8134
