<a href="https://colab.research.google.com/github/ronenbendavid/IDC_NLP/blob/master/Notebook_5_Seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy
from torch import autograd, nn, optim
import torch.nn.functional as F
import re
import random
import time
import math
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem; read_langs() loads the
# parallel corpus from a path under '/content/drive'.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Reserved vocabulary indices. Real words are numbered from 3 upwards.
SOS_TOKEN = 0
EOS_TOKEN = 1
UNKNOWN = 2
# Sentences with MAX_LENGTH or more space-separated tokens are filtered out.
MAX_LENGTH = 10


class Vocab:
    """Bidirectional word <-> index mapping for one language.

    Indices 0, 1 and 2 are reserved for the start-of-sentence, end-of-sentence
    and unknown-word tokens. ``n_words`` counts the reserved tokens too, so it
    can be used directly as the size of an embedding table.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        # Every reserved index gets a printable entry so that decoding any
        # index below n_words never raises KeyError.
        self.index2word = {
            SOS_TOKEN: '<SOS>',
            EOS_TOKEN: '<EOS>',
            UNKNOWN: '__unk__',
        }
        self.n_words = 3

    def index_sentence(self, sentence, write=True):
        """Return the list of indices for the space-separated words of ``sentence``.

        With ``write=True`` unseen words are added to the vocabulary; with
        ``write=False`` they map to ``UNKNOWN`` and the vocabulary is untouched.
        """
        return [self.index_word(w, write) for w in sentence.split(' ')]

    def index_word(self, word, write=True):
        """Return the index of ``word``, registering it first if ``write`` is true.

        Unseen words map to ``UNKNOWN`` when ``write`` is false.
        """
        index = self.word2index.get(word)
        if index is None:
            if not write:
                return UNKNOWN
            index = self.n_words
            self.word2index[word] = index
            self.index2word[index] = word
            self.n_words = index + 1
        return index

In [None]:
def normalize_string(s):
    """Lower-case ``s``, split off sentence punctuation and drop other symbols.

    ``.``, ``!`` and ``?`` are separated from the preceding word by a space.
    Every run of characters other than Latin letters, Hebrew letters and those
    three punctuation marks is collapsed into a single space.

    The result is stripped *after* the substitutions: replacing a leading or
    trailing symbol (e.g. a quote) with a space would otherwise leave edge
    whitespace, which turns into empty-string tokens under ``split(' ')``.
    """
    s = s.lower()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Zא-ת.!?]+", r" ", s)
    return s.strip()

In [None]:
def read_langs(lang1, lang2, reverse=False,
               data_dir='/content/drive/My Drive/Courses/Practical ML - BIU 2020/notebooks/data'):
    """Load the tab-separated parallel corpus ``<data_dir>/<lang1>-<lang2>.txt``.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: if true, swap the columns so that ``lang2`` becomes the input
            language and ``lang1`` the output language.
        data_dir: directory containing the corpus file. Defaults to the
            original Google Drive location, so existing calls are unaffected.

    Returns:
        ``(input_vocab, output_vocab, pairs)`` where the vocabularies are still
        empty ``Vocab`` objects and ``pairs`` is a list of
        ``[input_sentence, output_sentence]`` normalized strings.
    """
    print("Reading lines...")

    path = '%s/%s-%s.txt' % (data_dir, lang1, lang2)
    # Explicit UTF-8 (the corpus contains Hebrew) and a context manager so the
    # file handle is closed deterministically.
    with open(path, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Split every line into a pair and normalize. Only the first two columns
    # are used (some corpus dumps carry a trailing attribution column), and
    # lines without a tab separator are skipped since they cannot form a pair.
    pairs = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        pairs.append([normalize_string(field) for field in fields[:2]])

    if reverse:
        pairs = [pair[::-1] for pair in pairs]
        input_vocab = Vocab(lang2)
        output_vocab = Vocab(lang1)
    else:
        input_vocab = Vocab(lang1)
        output_vocab = Vocab(lang2)

    return input_vocab, output_vocab, pairs


def print_pair(p):
    """Print the first two elements of ``p``, one per line."""
    print(p[0], p[1], sep='\n')

In [None]:
def filter_pair(p):
    """Return True when both sentences of ``p`` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    """Return the pairs accepted by ``filter_pair``, in their original order."""
    return list(filter(filter_pair, pairs))

In [None]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    """Read the corpus, drop over-long pairs and populate both vocabularies.

    Returns ``(input_vocab, output_vocab, pairs)`` where ``pairs`` holds only
    the sentence pairs that passed ``filter_pairs`` and both vocabularies
    contain every word of those pairs.
    """
    input_vocab, output_vocab, all_pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filter_pairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Indexing words...")
    # index_sentence defaults to write=True, so this pass builds the vocabularies.
    for kept in kept_pairs:
        input_vocab.index_sentence(kept[0])
        output_vocab.index_sentence(kept[1])

    return input_vocab, output_vocab, kept_pairs

# Load the 'eng-heb' corpus with reverse=True: the columns are swapped, so the
# input vocabulary is built for 'heb' and the output vocabulary for 'eng'.
input_vocab, output_vocab, pairs = prepare_data('eng', 'heb', True)

# Print an example pair (a random one, so the output differs between runs)
print_pair(random.choice(pairs))

Reading lines...
Read 125860 sentence pairs
Trimmed to 104771 sentence pairs
Indexing words...
היא בכנסיה ברגע זה .
she is at church right now .


In [None]:
class EncoderRNN(nn.Module):
    """Embedding + multi-layer LSTM encoder that processes one sentence at a time.

    Args:
        input_size: size of the input vocabulary (rows of the embedding table).
        embedding_size: dimensionality of the word embeddings.
        hidden_size: dimensionality of the LSTM hidden and cell state.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers)

    def forward(self, word_inputs, hidden):
        """Encode a whole sentence.

        Args:
            word_inputs: 1-D LongTensor of word indices.
            hidden: ``(h_0, c_0)`` tuple, e.g. from ``init_hidden()``.

        Returns:
            ``(output, hidden)`` as returned by ``nn.LSTM``; ``output`` has
            shape ``(seq_len, 1, hidden_size)``.
        """
        seq_len = len(word_inputs)
        # Reshape to (seq_len, batch=1, embedding_size), the LSTM's default layout.
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        """Return a zero ``(h_0, c_0)`` pair on the same device as the module.

        The device is taken from the module's own parameters instead of being
        hard-coded to CUDA, so the encoder works on CPU as well as on GPU.
        """
        device = self.embedding.weight.device
        h_0 = torch.zeros(self.n_layers, 1, self.hidden_size, device=device)
        c_0 = torch.zeros(self.n_layers, 1, self.hidden_size, device=device)
        return (h_0, c_0)

In [None]:
# This is a simple decoder - it does not use attention at all

class SimpleDecoderRNN(nn.Module):
    """Attention-free decoder: embedding -> LSTM -> linear -> log-softmax.

    The decoder is driven one word at a time; the caller feeds back the
    returned hidden state on the next step.
    """

    def __init__(self, hidden_size, embedding_size, output_size, n_layers=1):
        super().__init__()

        # Hyper-parameters, stored for later reference
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.n_layers = n_layers

        # Layers
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, word_input, last_hidden):
        """Run a single decoding step.

        Args:
            word_input: index tensor holding the previous output word.
            last_hidden: ``(h, c)`` LSTM state from the previous step.

        Returns:
            ``(output, hidden)`` where ``output`` holds log-probabilities over
            the output vocabulary with shape ``(1, output_size)``.
        """
        # Embed the single input word as a (seq=1, batch=1, embedding) tensor
        step_input = self.embedding(word_input).view(1, 1, -1)

        lstm_out, hidden = self.lstm(step_input, last_hidden)

        # Drop the length-1 sequence axis, then project to vocabulary scores
        logits = self.out(lstm_out.squeeze(0))
        output = F.log_softmax(logits, dim=1)

        return output, hidden

In [None]:
# Testing with Simple Decoder
#===============================
# Smoke test on tiny, untrained models: checks that tensor shapes line up and
# that the encoder's final state can seed the decoder. Requires a CUDA device.

# Encoder: vocab of 10 words, embedding size 30, hidden size 10, 2 layers.
encoder_test = EncoderRNN(10, 30, 10, 2).cuda()
# Decoder: hidden size 10, embedding size 30, 10 output words, 2 layers.
decoder_test = SimpleDecoderRNN(10, 30, 10, 2).cuda()

encoder_hidden = encoder_test.init_hidden()

# Encode a dummy three-word sentence in a single call.
word_input = torch.LongTensor([1, 2, 3]).cuda()
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

# Decoder inputs are fed one word per step; the decoder starts from the
# encoder's final (h, c) state.
word_inputs = torch.LongTensor([1, 2, 3]).cuda()
decoder_hidden = encoder_hidden

for i in range(3):
    decoder_output, decoder_hidden = decoder_test(word_inputs[i], decoder_hidden)
    print('Word', i, decoder_output)


Word 0 tensor([[-2.1110, -2.1440, -2.5603, -2.2097, -2.7003, -2.2141, -2.3971, -2.2770,
         -2.4199, -2.1565]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
Word 1 tensor([[-2.0954, -2.1332, -2.5680, -2.1901, -2.7035, -2.2075, -2.4106, -2.2944,
         -2.4133, -2.1815]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
Word 2 tensor([[-2.0944, -2.1305, -2.5776, -2.1799, -2.7041, -2.1961, -2.4235, -2.3031,
         -2.4072, -2.1868]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)


In [None]:
# Now, let's add Additive Attention

class Attn(nn.Module):
    """Attention over encoder outputs with a concat-style score.

    The score of one encoder output ``e`` against decoder state ``h`` is
    ``v . W[h; e]``, and the scores are softmax-normalized over the sequence.

    NOTE(review): classic additive (Bahdanau) attention applies ``tanh`` to
    ``W[h; e]`` before the dot product with ``v``; this implementation does
    not. Left unchanged to preserve the model - confirm whether it is intended.
    """

    def __init__(self, hidden_size):
        super(Attn, self).__init__()

        self.hidden_size = hidden_size
        # Allocate on the default device and initialize explicitly: the
        # previous torch.FloatTensor(n) left the weights as uninitialized
        # memory (possibly NaN/inf). Moving the module with .cuda()/.to()
        # moves this Parameter along with the rest.
        self.v = nn.Parameter(torch.empty(self.hidden_size))
        bound = self.hidden_size ** -0.5
        nn.init.uniform_(self.v, -bound, bound)
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape ``(1, 1, seq_len)``.

        Args:
            hidden: current decoder state, concatenated with each encoder
                output along dim 1 (so a ``(1, hidden_size)`` tensor).
            encoder_outputs: sequence of encoder outputs; ``encoder_outputs[i]``
                must be a ``(1, hidden_size)`` tensor.
        """
        seq_len = len(encoder_outputs)

        # Follow the device of the inputs instead of assuming CUDA.
        attn_energies = torch.zeros(seq_len, device=encoder_outputs.device)

        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies, 0).unsqueeze(0).unsqueeze(0)

    def score(self, hidden, encoder_output):
        """Return the scalar energy ``v . W[hidden; encoder_output]``."""
        attn_input = torch.cat((hidden, encoder_output), 1)
        energy = self.attn(attn_input)
        energy = self.v.dot(energy.view(-1))
        return energy

In [None]:
class AttnDecoderRNN(nn.Module):
    """One-step-at-a-time LSTM decoder that is fed an attention context.

    At every step the LSTM input is the embedded previous word concatenated
    with the context vector returned by the previous step. A new context is
    then computed by attending over the encoder outputs with the fresh LSTM
    output. The word distribution itself is computed from the LSTM output
    only; the new context influences the model through the next step's input.
    """

    def __init__(self, hidden_size, embedding_size, output_size, n_layers=1):
        super().__init__()

        # Hyper-parameters, stored for later reference
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.n_layers = n_layers

        # Layers (the LSTM input is word embedding + previous context)
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size + hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(hidden_size)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        """Run a single decoding step.

        Args:
            word_input: index tensor holding the previous output word.
            last_context: context vector from the previous step; it is
                unsqueezed to 3-D and concatenated with the word embedding.
            last_hidden: ``(h, c)`` LSTM state from the previous step.
            encoder_outputs: encoder outputs to attend over
                (assumed ``(seq_len, 1, hidden_size)`` - TODO confirm).

        Returns:
            ``(output, context, hidden)``: log-probabilities over the output
            vocabulary, the new context vector and the new LSTM state.
        """
        # Embed the single input word as a (seq=1, batch=1, embedding) tensor
        word_embedded = self.embedding(word_input).view(1, 1, -1)

        # LSTM input = [embedded word ; previous context]
        step_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
        lstm_out, hidden = self.lstm(step_input, last_hidden)

        # Drop the length-1 sequence axis; this state drives both the
        # attention scores and the output projection.
        decoder_state = lstm_out.squeeze(0)

        # Attention weights (1 x 1 x seq_len) applied to the encoder outputs
        attn_weights = self.attn(decoder_state, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # Next-word log-probabilities from the LSTM output alone
        output = F.log_softmax(self.out(decoder_state), dim=1)

        return output, context, hidden

In [None]:
# Testing with Attention
# Smoke test on tiny, untrained models: checks that the attention decoder's
# shapes line up with the encoder's outputs. Requires a CUDA device.

# Encoder: vocab of 10 words, embedding size 30, hidden size 10, 2 layers.
encoder_test = EncoderRNN(10, 30, 10, 2).cuda()
# Decoder: hidden size 10, embedding size 30, 10 output words, 2 layers.
decoder_test = AttnDecoderRNN(10, 30, 10, 2).cuda()

encoder_hidden = encoder_test.init_hidden()

# Encode a dummy three-word sentence in a single call.
word_input = torch.LongTensor([1, 2, 3]).cuda()
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

# The decoder starts from the encoder's final state and an all-zero context.
word_inputs = torch.LongTensor([1, 2, 3]).cuda()
decoder_hidden = encoder_hidden
decoder_context = torch.zeros(1, decoder_test.hidden_size).cuda()

# One decoding step per word; context and hidden state are fed back each step.
for i in range(3):
    decoder_output, decoder_context, decoder_hidden = decoder_test(word_inputs[i], decoder_context, decoder_hidden, encoder_outputs)
    print('Word', i, decoder_output)


Word 0 tensor([[-2.2632, -2.0632, -2.0785, -2.6232, -2.1132, -2.2636, -2.3090, -2.3885,
         -2.5113, -2.6021]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
Word 1 tensor([[-2.2646, -2.0828, -2.0934, -2.5988, -2.1272, -2.2789, -2.3145, -2.3581,
         -2.4633, -2.6067]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
Word 2 tensor([[-2.2497, -2.0978, -2.1015, -2.5749, -2.1509, -2.2803, -2.3242, -2.3564,
         -2.4179, -2.6165]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)


In [None]:
# Probability of feeding the ground-truth word (rather than the model's own
# prediction) as the next decoder input for a given training sentence.
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimization step on a single sentence pair.

    Args:
        input_tensor: 1-D LongTensor of input word indices.
        target_tensor: 1-D LongTensor of target word indices.
        encoder: encoder module exposing ``init_hidden()``.
        decoder: attention decoder called as
            ``decoder(word, context, hidden, encoder_outputs)``.
        encoder_optimizer: optimizer over the encoder's parameters.
        decoder_optimizer: optimizer over the decoder's parameters.
        criterion: loss taking ``(1, vocab)`` log-probabilities and a
            ``(1,)`` target index.
        max_length: unused; kept for backward compatibility with callers.

    Returns:
        The summed loss divided by the target length. Without teacher forcing
        decoding stops at a predicted EOS, so the sum may cover fewer steps
        than the target length.
    """
    target_length = target_tensor.size()[0]
    if target_length == 0:
        # Nothing to learn from; also avoids calling .backward() on an int
        # and dividing by zero below.
        return 0.0

    # Work on whatever device the data lives on instead of assuming CUDA.
    device = input_tensor.device

    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_tensor, encoder_hidden)

    # Prepare decoder inputs
    decoder_input = torch.tensor([[SOS_TOKEN]], dtype=torch.long, device=device)
    decoder_context = torch.zeros(1, decoder.hidden_size, device=device)
    decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder

    # Choose once per sentence whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_context, decoder_hidden = decoder(
            decoder_input, decoder_context, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output[0].view(-1).unsqueeze(0), target_tensor[di].unsqueeze(0))

        if use_teacher_forcing:
            # Teacher forcing: the ground-truth target is the next input
            decoder_input = target_tensor[di]
        else:
            # Otherwise feed back the network's own most likely word
            _, topi = decoder_output.topk(1)
            ni = topi[0][0].item()
            decoder_input = torch.tensor([[ni]], dtype=torch.long, device=device)

            # Stop at end of sentence (not necessary when using known targets)
            if ni == EOS_TOKEN:
                break

    # Backpropagation
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
# Model hyper-parameters
hidden_size = 500
embedding_size = 300
n_layers = 2
# NOTE: AttnDecoderRNN.__init__ takes no dropout argument, so this value is
# currently not consumed by the model (passing it raised a TypeError).
dropout_p = 0.05

# Initialize models
encoder = EncoderRNN(input_vocab.n_words, embedding_size, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(hidden_size, embedding_size, output_vocab.n_words, n_layers).cuda()

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
# The decoder outputs log-probabilities (log_softmax), so negative
# log-likelihood is the matching loss.
criterion = nn.NLLLoss()

In [None]:
# Number of training iterations; the loop below trains on ONE randomly chosen
# sentence pair per iteration, so these are not passes over the full dataset.
n_epochs = 50000
# Report the average loss once every this many iterations.
print_every = 200

print_loss_total = 0 # Running loss sum; reset every print_every iterations


In [None]:
# Begin!
# Build tensors on the device the model lives on.
train_device = next(encoder.parameters()).device

for epoch in range(1, n_epochs + 1):

    # Get training data for this cycle: one random sentence pair.
    # write=False keeps the vocabularies (and hence the embedding sizes) fixed.
    pair = random.choice(pairs)
    input_indexes = input_vocab.index_sentence(pair[0], write=False)
    # Terminate the target with EOS so the decoder learns when to stop;
    # without it the model never emits EOS and decodes until max length.
    target_indexes = output_vocab.index_sentence(pair[1], write=False) + [EOS_TOKEN]
    input_tensor = torch.tensor(input_indexes, dtype=torch.long, device=train_device)
    target_tensor = torch.tensor(target_indexes, dtype=torch.long, device=train_device)

    # Run the train function
    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    # Keep track of loss
    print_loss_total += loss

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('(Epoch %d) %.4f' % (epoch, print_loss_avg))

print("Done!")

(Epoch 200) 7.3780
(Epoch 400) 5.9807
(Epoch 600) 5.8746
(Epoch 800) 5.7627
(Epoch 1000) 5.4878
(Epoch 1200) 5.4703
(Epoch 1400) 5.4229
(Epoch 1600) 5.3556
(Epoch 1800) 5.3679
(Epoch 2000) 5.2351
(Epoch 2200) 5.1931
(Epoch 2400) 5.3306
(Epoch 2600) 5.1281
(Epoch 2800) 5.1100
(Epoch 3000) 5.0842
(Epoch 3200) 4.9742
(Epoch 3400) 5.0244
(Epoch 3600) 4.9717
(Epoch 3800) 4.9417
(Epoch 4000) 5.1147
(Epoch 4200) 5.0232
(Epoch 4400) 5.0495
(Epoch 4600) 4.9528
(Epoch 4800) 4.7680
(Epoch 5000) 4.9683
(Epoch 5200) 4.9016
(Epoch 5400) 4.9039
(Epoch 5600) 4.8233
(Epoch 5800) 4.7177
(Epoch 6000) 4.7143
(Epoch 6200) 4.8374
(Epoch 6400) 4.7655
(Epoch 6600) 4.7540
(Epoch 6800) 4.7064
(Epoch 7000) 4.7293
(Epoch 7200) 4.7990
(Epoch 7400) 4.7441
(Epoch 7600) 4.6468
(Epoch 7800) 4.6972
(Epoch 8000) 4.9011
(Epoch 8200) 4.8244
(Epoch 8400) 4.7138
(Epoch 8600) 4.7208
(Epoch 8800) 4.7072
(Epoch 9000) 4.7683
(Epoch 9200) 4.5813
(Epoch 9400) 4.6413
(Epoch 9600) 4.6094
(Epoch 9800) 4.5139
(Epoch 10000) 4.6866
(Ep

In [None]:
def evaluate(sentence, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` with the global ``encoder``/``decoder``.

    Args:
        sentence: space-separated input sentence (expected to be normalized
            the same way as the training data).
        max_length: maximum number of decoding steps.

    Returns:
        List of decoded words; ends with ``'<EOS>'`` if the model emitted the
        end-of-sentence token within ``max_length`` steps.
    """
    device = next(encoder.parameters()).device

    # write=False: evaluation must not grow the vocabulary. A newly added word
    # would get an index outside the encoder's embedding table.
    input_indexes = input_vocab.index_sentence(sentence, write=False)
    input_tensor = torch.tensor(input_indexes, dtype=torch.long, device=device)

    decoded_words = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Run through encoder
        encoder_hidden = encoder.init_hidden()
        encoder_outputs, encoder_hidden = encoder(input_tensor, encoder_hidden)

        # Create starting vectors for decoder
        decoder_input = torch.tensor([[SOS_TOKEN]], dtype=torch.long, device=device) # SOS
        decoder_context = torch.zeros(1, decoder.hidden_size, device=device)
        decoder_hidden = encoder_hidden

        # Run through decoder
        for _ in range(max_length):
            decoder_output, decoder_context, decoder_hidden = decoder(
                decoder_input, decoder_context, decoder_hidden, encoder_outputs)

            # Choose top word from output
            _, topi = decoder_output.cpu().topk(1)
            ni = topi[0][0].item()
            if ni == EOS_TOKEN:
                decoded_words.append('<EOS>')
                break

            # The model may predict a reserved index that has no entry in
            # index2word (e.g. SOS); fall back to a placeholder instead of
            # raising KeyError.
            fallback = '<SOS>' if ni == SOS_TOKEN else '__unk__'
            decoded_words.append(output_vocab.index2word.get(ni, fallback))

            # Next input is chosen word
            decoder_input = torch.tensor([[ni]], dtype=torch.long, device=device)

    return decoded_words

In [None]:
def evaluate_randomly():
    """Translate one randomly chosen pair and print source, reference and output."""
    sample = random.choice(pairs)
    source_sentence = sample[0]

    predicted_sentence = ' '.join(evaluate(source_sentence))

    print(source_sentence)
    print(sample[1])
    print(predicted_sentence)


In [None]:
# Show five random translations: each prints the source sentence, the
# reference translation and the model output, followed by blank lines.
for i in range(5):
    evaluate_randomly()
    print('\n')


תחכו עד מחר בבוקר .
wait until tomorrow morning .
i tomorrow tomorrow tomorrow . tomorrow . . . .


אתם יכולים ללכת במקומי בבקשה ?
can you please go for me ?
can you please please please please ? ? ? ?


כבר ראיתי אותו פעם .
i have seen him once .
i have already to him . . . . .


לך ודבר עם טום .
go and talk to tom .
you you have to tom . . . . .


אעדיף בהרבה להיות בבית .
i d much rather be at home .
you must be home to be home . . .


