In [None]:
# Basic utilities needed in the code

import torch
import spacy
from torchtext.data.metrics import bleu_score

def translate_sentence(model, sentence, german, english, device, max_length=50):
    """
    Greedily translate a German sentence into a list of English tokens.

    German sentence --> German indices --> Encoder --> context vectors --> Decoder --> English indices --> English tokens

    The model is put into evaluation mode while translating (so dropout does
    not randomise the result) and its previous train/eval mode is restored
    before returning, even if decoding raises.

    :param model: the sequence-to-sequence model; must expose ``Encoder_LSTM`` and ``Decoder_LSTM``
    :param sentence: the input "german" sentence, either a raw string or an
                     already tokenized list of strings (the list is not modified)
    :param german: the german Field object
    :param english: the english Field object
    :param device: cuda / cpu
    :param max_length: maximum number of decoding steps
    :return: list of predicted English tokens without the leading <sos>; the
             trailing <eos> is present only if the model emitted it within
             ``max_length`` steps
    """

    if isinstance(sentence, str):
        # Only raw text needs the spaCy pipeline; pre-tokenized input
        # (e.g. dataset examples) skips the expensive load entirely.
        spacy_german = spacy.load("de")
        tokens = [token.text.lower() for token in spacy_german(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # insert the start and end sequence
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # (N, ) --> (1 X N)
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(0).to(device)

    sos_index = english.vocab.stoi["<sos>"]
    eos_index = english.vocab.stoi["<eos>"]

    # Remember the caller's mode so a training loop is not silently left in eval mode
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            # Retrieve the hidden_state and cell_state from the encoder
            hidden_state, cell_state = model.Encoder_LSTM(sentence_tensor)

            # start decoding from the start token and the encoder states
            outputs = [sos_index]

            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)

                # output shape : 1 X |Eng_Vocab|
                output, hidden_state, cell_state = model.Decoder_LSTM(previous_word, hidden_state, cell_state)

                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # Model stops predicting once it predicts the <eos> index
                if best_guess == eos_index:
                    break
    finally:
        model.train(was_training)

    # Map the predicted indices back to English tokens and drop the leading <sos>
    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    return translated_sentence[1:]

def bleu(data, model, german, english, device):
    """
    Compute the corpus BLEU score of the model's translations.

    *** reference : https://www.youtube.com/watch?v=DejHQYAGb7Q ***
    :param data: iterable of examples exposing ``ger_sent`` and ``eng_sent`` attributes
    :param model: the model
    :param german: the german Field object
    :param english: the english Field object
    :param device: cuda / cpu
    :return: the value returned by ``bleu_score`` for all examples in ``data``
    """

    targets = []
    outputs = []

    for example in data:
        fields = vars(example)
        ger_sent = fields["ger_sent"]
        eng_sent = fields["eng_sent"]

        prediction = translate_sentence(model, ger_sent, german, english, device)

        # Strip the end marker only when decoding actually produced it; a
        # translation cut off at max_length ends with a real word that must
        # stay in the hypothesis.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        # bleu_score expects a list of reference translations per hypothesis
        targets.append([eng_sent])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


In [None]:
import torch
import torch.nn as nn
import random

# ---------------------------- ENCODER ----------------------------
class Encoder(nn.Module):
    """Embeds source token indices and summarises them with a stacked, batch-first LSTM."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_prob):
        """
        :param input_size: number of rows in the embedding table (source vocabulary size)
        :param embedding_size: the embedding dimension
        :param hidden_size: the hidden dimension used in the LSTM model
        :param num_layers: number of layers in the LSTM model
        :param drop_prob: dropout probability, used after the embedding and between LSTM layers
        """
        super().__init__()

        # regularisation applied to the embedded tokens
        self.dropout = nn.Dropout(p=drop_prob)

        # index --> dense vector lookup
        self.embedding = nn.Embedding(input_size, embedding_size)

        # recurrent core; consumes N X T X D input
        lstm_options = dict(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_prob,
            batch_first=True,
        )
        self.rnn = nn.LSTM(**lstm_options)

    def forward(self, x):
        """
        :param x: batch of sentences as vocabulary indices, shape N X T
        :return: (hidden_state, cell_state), each of shape
                 L X N X H (num_layers X batch_size X hidden_size)
        """
        # N X T --> N X T X D
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # the per-step outputs are not needed, only the final states
        _, final_states = self.rnn(embedded)
        hidden_state, cell_state = final_states

        return hidden_state, cell_state




# ---------------------------- DECODER ----------------------------
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per batch element in, one row of vocabulary scores out."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_prob, output_size):
        """
        :param input_size: number of rows in the embedding table (target vocabulary size)
        :param embedding_size: the embedding dimension
        :param hidden_size: the hidden dimension used in the LSTM model
        :param num_layers: number of layers in the LSTM model
        :param drop_prob: dropout probability, used after the embedding and between LSTM layers
        :param output_size: the output size of the linear layer after the decoding
        """
        super().__init__()

        # regularisation applied to the embedded token
        self.dropout = nn.Dropout(p=drop_prob)

        # index --> dense vector lookup
        self.embedding = nn.Embedding(input_size, embedding_size)

        # sequence-first LSTM (batch_first is left at its default)
        lstm_options = dict(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_prob,
        )
        self.rnn = nn.LSTM(**lstm_options)

        # projects the LSTM output onto the output vocabulary
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden_state, cell_state):
        """
        :param x: the previous token index for every batch element, shape (N,)
        :param hidden_state: LSTM hidden state, shape L X N X H
        :param cell_state: LSTM cell state, shape L X N X H
        :return: (predictions of shape N X output_size, new hidden_state, new cell_state)
        """
        # (N,) --> 1 X N : a sequence of length one
        step_input = x.unsqueeze(0)

        # 1 X N --> 1 X N X D
        embedded = self.dropout(self.embedding(step_input))

        # outputs : 1 X N X H ; states : L X N X H
        outputs, (hidden_state, cell_state) = self.rnn(embedded, (hidden_state, cell_state))

        # 1 X N X H --> 1 X N X output_size --> N X output_size
        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden_state, cell_state




# ---------------------------- SEQUENCE-TO-SEQUENCE ----------------------------
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time with optional teacher forcing."""

    def __init__(self, Encoder_LSTM, Decoder_LSTM):
        """
        :param Encoder_LSTM: the encoder part for the Seq2Seq model
        :param Decoder_LSTM: the decoder part for the Seq2Seq model
        """

        super(Seq2Seq, self).__init__()
        self.Encoder_LSTM = Encoder_LSTM
        self.Decoder_LSTM = Decoder_LSTM

    def forward(self, source, target, eng_vocab_size, tfr=0.5):
        """
        :param source: padded sentences in German as vocabulary indices,
                       batch first: shape [#Sentences, sentence length + padding]
        :param target: padded sentences in English as vocabulary indices,
                       time first: shape [sentence length + padding, #Sentences]
        :param eng_vocab_size: size of the english vocab
        :param tfr: teacher forcing ratio, the probability of feeding the
                    ground-truth token (instead of the prediction) to the next step
        :return: tensor of shape [target length, #Sentences, eng_vocab_size];
                 row 0 is left as zeros because decoding starts from target[0]
        """

        batch_size = source.shape[0]
        target_len = target.shape[0]

        # Allocate on the same device as the input instead of relying on a
        # notebook-level ``device`` global.
        outputs = torch.zeros(target_len, batch_size, eng_vocab_size, device=source.device)

        # retaining the context vectors from the encoder
        hidden_state, cell_state = self.Encoder_LSTM(source)

        # first decoder input: the first target row (the start token for every sentence)
        x = target[0]

        for i in range(1, target_len):

            # output : batch_size X |Eng_Vocab_Size|
            output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)

            outputs[i] = output

            # index of the highest scoring word for every sentence
            best_guess = output.argmax(1)

            # Teacher forcing: either pass the next correct word from the
            # dataset or use the predicted word
            x = target[i] if random.random() < tfr else best_guess

        return outputs


if __name__ == '__main__':

    # ..................... Smoke test: build the model with dummy vocabulary sizes .....................

    # run on the GPU when one is available
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # ---- encoder settings ----
    input_size_encoder = 5000  # vocab size
    encoder_embedding_size = 300
    hidden_size = 1024
    num_layers = 2
    encoder_dropout = 0.5

    encoder_lstm = Encoder(
        input_size_encoder,
        encoder_embedding_size,
        hidden_size,
        num_layers,
        encoder_dropout,
    )
    encoder_lstm = encoder_lstm.to(device)

    # ---- decoder settings ----
    input_size_decoder = 4500
    decoder_embedding_size = 300
    hidden_size = 1024
    num_layers = 2
    decoder_dropout = 0.5
    output_size = 4500

    decoder_lstm = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        num_layers,
        decoder_dropout,
        output_size,
    )
    decoder_lstm = decoder_lstm.to(device)

    # ---- full model: print the layer summary ----
    model = Seq2Seq(encoder_lstm, decoder_lstm)
    print(model)

In [None]:
def testing_Iterators(train_iterator, test_iterator, GERMAN_VOCAB, ENGLISH_VOCAB):
    """
    This function just prints the batches

    :param train_iterator: iterator for training
    :param test_iterator: iterator for testing
    :param GERMAN_VOCAB: the German vocab
    :param ENGLISH_VOCAB: the English vocab
    
    """
    for data in train_iterator:
        # print(f"Length : {data.ger_sent.shape}")  # "German :", *data.ger_sent, 
        # print(f"Length : {data.eng_sent.shape}")  # "English :", *data.eng_sent, 
        
        print("-------------- GERMAN SENTENCES ------------")
        print()
        temp = data.ger_sent.permute(1, 0)
        for ele in temp:
            for num in ele:
                print(GERMAN_VOCAB.itos[num.item()], end=" ")

            print()

        print()

        print("-------------- ENGLISH SENTENCES ------------")
        print()
        temp = data.eng_sent.permute(1, 0)
        for ele in temp:
            for num in ele:
                print(ENGLISH_VOCAB.itos[num.item()], end=" ")

            print()

        print()
        break

    for data in test_iterator:
        # print(f"Length : {data.ger_sent.shape}")  # "German :", *data.ger_sent, 
        # print(f"Length : {data.eng_sent.shape}")  # "English :", *data.eng_sent, 
        
        print("-------------- GERMAN SENTENCES ------------")
        print()
        temp = data.ger_sent.permute(1, 0)
        for ele in temp:
            for num in ele:
                print(GERMAN_VOCAB.itos[num.item()], end=" ")

            print()

        print()

        print("-------------- ENGLISH SENTENCES ------------")
        print()
        temp = data.eng_sent.permute(1, 0)
        for ele in temp:
            for num in ele:
                print(ENGLISH_VOCAB.itos[num.item()], end=" ")

            print()

        print()
        break


In [None]:
# Training hyper-parameters used by the cells below
epochs = 100            # upper bound on the number of training epochs
learning_rate = 3e-3    # learning rate passed to the optimizer
train_batch_size = 128  # examples per training batch
test_batch_size = 256   # examples per test batch

In [None]:
import torch
import torch.nn as nn

def train(data_loader, model, optimizer, criterion, english_vocab_size, device):
    """
    Run one training epoch and return the mean loss per batch.

    :param data_loader: iterable of batches exposing ``ger_sent`` and
                        ``eng_sent`` index tensors, both time first (T X N)
    :param model: model (encoder - decoder model)
    :param optimizer: torch optimizer, e.g. adam, sgd, etc.
    :param criterion: loss function taking (logits, class indices)
    :param english_vocab_size: size of the english vocabulary
    :param device: this can be "cuda" or "cpu"
    :return: sum of the batch losses divided by the number of batches
    :raises ZeroDivisionError: if ``data_loader`` yields no batches
    """

    # set the model to training mode
    model.train()

    batch_loss = 0.0
    batches = 0
    for data in data_loader:

        # the model takes the source batch first : T X N --> N X T
        source = data.ger_sent.to(device).permute(1, 0)
        # the target stays time first : T X N
        target = data.eng_sent.to(device)

        optimizer.zero_grad()

        # output : T X N X |Eng_Vocab|
        output = model(source, target, english_vocab_size)

        # Drop timestep 0 (the start token, for which the model emits no
        # prediction) along the TIME axis, then flatten so that every row of
        # logits lines up with the target token it should predict.
        vocab_dim = output.shape[2]
        output = output[1:].reshape(-1, vocab_dim)
        target = target[1:].reshape(-1)

        # calculate the loss
        loss = criterion(output, target)

        # back-prop
        loss.backward()

        # rescale the gradients when their overall norm exceeds 1 => NORM clipping  (https://www.youtube.com/watch?v=_-CZr06R5CQ)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # update the weight values
        optimizer.step()

        batches += 1
        batch_loss += loss.item()

    return batch_loss / batches


def evaluate(data_loader, model, criterion, device, english_vocab_size=None):
    """
    Compute the mean loss per batch without updating the model.

    The batches are prepared the same way as in ``train``: the source is fed
    batch first, the start-token timestep is dropped, and logits and targets
    are flattened before the criterion is applied. Teacher forcing is
    disabled (``tfr=0.0``) so the result does not depend on random draws.

    :param data_loader: iterable of batches exposing ``ger_sent`` and
                        ``eng_sent`` index tensors, both time first (T X N)
    :param model: model (encoder - decoder model); called as
                  ``model(source, target, vocab_size, tfr=0.0)``
    :param criterion: loss function taking (logits, class indices)
    :param device: this can be "cuda" or "cpu"
    :param english_vocab_size: size of the english vocabulary; when omitted it
                               is read from ``model.Decoder_LSTM.fc.out_features``
    :return: sum of the batch losses divided by the number of batches
    :raises ZeroDivisionError: if ``data_loader`` yields no batches
    """

    if english_vocab_size is None:
        # the decoder's final linear layer emits one score per english word
        english_vocab_size = model.Decoder_LSTM.fc.out_features

    batch_loss = 0.0
    batches = 0

    # put the model in evaluation mode
    model.eval()

    with torch.no_grad():

        for data in data_loader:

            # the model takes the source batch first : T X N --> N X T
            source = data.ger_sent.to(device).permute(1, 0)
            # the target stays time first : T X N
            target = data.eng_sent.to(device)

            # output : T X N X |Eng_Vocab|
            output = model(source, target, english_vocab_size, tfr=0.0)

            # drop the start-token timestep and flatten for the criterion
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            loss = criterion(output, target)

            batches += 1
            batch_loss += loss.item()

    return batch_loss / batches


In [None]:
!python -m spacy download en
!python -m spacy download de

In [None]:
def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss, checkpoint_dir='/kaggle/working'):
    """
    Save the current best model to ``checkpoint_dir``.

    Two files are written:
      * ``checkpoint-NMT-BEST.pth``    : the whole model object together with
        the best loss, the epoch, the torch RNG state and the optimizer state
      * ``checkpoint-NMT-BEST-SD.pth`` : only the model's state dict

    :param model: the model to save
    :param best_loss: best loss seen so far, stored in the checkpoint
    :param epoch: epoch index, stored in the checkpoint
    :param optimizer: optimizer whose state dict is stored in the checkpoint
    :param epoch_loss: not used; kept so existing callers keep working
    :param checkpoint_dir: directory receiving both files; it is created if
                           missing and defaults to the Kaggle working directory
    """
    import os  # local import: needed only for path handling in this helper

    print('saving')
    print()

    os.makedirs(checkpoint_dir, exist_ok=True)

    state = {
        'model': model,
        'best_loss': best_loss,
        'epoch': epoch,
        'rng_state': torch.get_rng_state(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, os.path.join(checkpoint_dir, 'checkpoint-NMT-BEST.pth'))
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'checkpoint-NMT-BEST-SD.pth'))

In [None]:
import torch
import torch.nn as nn
import torchtext
import spacy
from torchtext.data.metrics import bleu_score
from torchtext.data import Field, TabularDataset, BucketIterator
import torch.optim as optim
import warnings
warnings.simplefilter('ignore')


def tokenize_german(text):
    """
    Split German text into a list of token strings using the tokenizer of
    the module-level ``spacy_german`` pipeline.
    """
    token_texts = []
    for token in spacy_german.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

def tokenize_english(text):
    """
    Split English text into a list of token strings using the tokenizer of
    the module-level ``spacy_english`` pipeline.
    """
    token_texts = []
    for token in spacy_english.tokenizer(text):
        token_texts.append(token.text)
    return token_texts


if __name__ == '__main__':

    # spaCy pipelines read by tokenize_german / tokenize_english
    spacy_german = spacy.load("de")
    spacy_english = spacy.load("en")

    # One Field per language: lower-cased tokens wrapped in <sos> ... <eos>
    german = Field(tokenize=tokenize_german, lower=True, init_token="<sos>", eos_token="<eos>")
    english = Field(tokenize=tokenize_english, lower=True, init_token="<sos>", eos_token="<eos>")

    # Read the parallel corpus; the header row of the CSV is skipped
    dataset = TabularDataset(
        path="../input/german-to-english/dataset.csv",
        format='csv',
        skip_header=True,
        fields=[('ger_sent', german), ('eng_sent', english)],
    )

    # 80% of the examples for training, the rest for testing
    train_dataset, test_dataset = dataset.split(split_ratio=0.80)

    # Vocabularies are built from the training split only
    german.build_vocab(train_dataset, max_size=10000, min_freq=3)
    english.build_vocab(train_dataset, max_size=10000, min_freq=3)

    GERMAN_VOCAB = german.vocab
    ENGLISH_VOCAB = english.vocab

    print(f"German Vocab Size : {len(GERMAN_VOCAB)}")
    print(f"English Vocab Size : {len(ENGLISH_VOCAB)}")

    # use the GPU when one is available
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    TRAIN_BATCH_SIZE = train_batch_size
    TEST_BATCH_SIZE = test_batch_size

    # Batch iterators; within a batch, examples are sorted by German sentence length
    train_iterator, test_iterator = BucketIterator.splits(
        (train_dataset, test_dataset),
        batch_sizes=(TRAIN_BATCH_SIZE, TEST_BATCH_SIZE),
        sort_within_batch=True,
        sort_key=lambda example: len(example.ger_sent),
        device=device,
    )

    # Print the first batch of each iterator as a sanity check
    testing_Iterators(train_iterator, test_iterator, GERMAN_VOCAB, ENGLISH_VOCAB)

In [None]:
# Build the translation model, sized from the vocabularies

# ---- ENCODER ----
input_size_encoder = len(GERMAN_VOCAB)  # vocab size
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = Encoder(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    encoder_dropout,
)
encoder_lstm = encoder_lstm.to(device)

# ---- DECODER ----
input_size_decoder = len(ENGLISH_VOCAB)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(ENGLISH_VOCAB)

decoder_lstm = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    num_layers,
    decoder_dropout,
    output_size,
)
decoder_lstm = decoder_lstm.to(device)

# ---- FULL MODEL ----
my_model = Seq2Seq(encoder_lstm, decoder_lstm)
my_model = my_model.to(device)

# Let's train the model
print("Model Training started :)")

EPOCHS = epochs

epoch_loss = 0.0
best_loss = 10**7
best_epoch = -1
optimizer = optim.Adam(my_model.parameters(), lr=learning_rate)

# padding positions must not contribute to the loss
pad_idx = ENGLISH_VOCAB.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# number of consecutive epochs without a new best training loss
early_stopping_counter = 0
print(my_model, end="\n")

# for checking the model at every step
sample_sentence = "ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster"

train_losses = []
test_bleu_scores = []

# translation of the sample sentence before any training
print(translate_sentence(my_model, sample_sentence, german, english, device))
for epoch in range(EPOCHS):

    epoch_loss = train(train_iterator, my_model, optimizer, criterion, len(ENGLISH_VOCAB), device)

    # Append the training loss
    train_losses.append(epoch_loss)
    print(f"Epoch : {epoch} ; Epoch Loss : {epoch_loss}")

    # BLEU on a fixed slice of the test set (examples 1..99); the score is
    # recorded as well as printed so it can be inspected after training
    test_bleu = bleu(test_dataset[1:100], my_model, german, english, device)
    test_bleu_scores.append(test_bleu)
    print(f"Testing Bleu Score : {test_bleu}")

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_epoch = epoch
        # an improvement ends the current run of bad epochs
        early_stopping_counter = 0
        checkpoint_and_save(my_model, best_loss, epoch, optimizer, epoch_loss)

    else:
        early_stopping_counter += 1

    # stop after more than 5 epochs in a row without improvement
    if early_stopping_counter > 5:
        print("Early Stopping...")
        break

    print(translate_sentence(my_model, sample_sentence, german, english, device))

In [None]:
## Loading the model

In [None]:
# NOTE(review): torch.load unpickles arbitrary Python objects (the checkpoint
# stores the whole model object) - only load checkpoint files you created yourself.
# map_location moves the stored tensors onto the active device, so a
# checkpoint written on a GPU machine can also be restored on CPU.
chk = torch.load("./checkpoint-NMT-BEST.pth", map_location=device)
mm = chk['model']

# the restored model is used for inference only: switch dropout off
mm.eval()

sd = torch.load("./checkpoint-NMT-BEST-SD.pth", map_location=device)
mm.load_state_dict(sd)

In [None]:
# Sanity check: translate the sample sentence with the restored model
restored_translation = translate_sentence(mm, sample_sentence, german, english, device)
print(restored_translation)