In [1]:
#dictionary class that keeps track of all words in a given language as well as assigning a
#token to them

PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2

#from pytorch's documentation website on NLP
class Dictionary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_TOKEN: "PAD", SOS_TOKEN: "SOS", EOS_TOKEN: "EOS"}
        self.n_count = 3

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_count
            self.word2count[word] = 1
            self.index2word[self.n_count] = word 
            self.n_count += 1
        else:
            self.word2count[word] += 1


In [2]:
import re
import unicodedata
import torch

PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = re.compile(r'https?://\S+|www\.\S+').sub(r'', s)
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

from string import digits
remove_digits = str.maketrans('', '', digits)
import string
abc = string.punctuation
abc = abc[:12]+abc[14:]

def normalizeString_hindi(s):
    s = re.compile(r'https?://\S+|www\.\S+').sub(r'', s)
    s = re.sub("\n", "", s)
    s = re.sub("-"," ",s)
    s = re.sub("“"," ",s)
    s = re.sub("”"," ",s)
    s = re.sub("’’"," ",s)
    s = re.sub("‘‘"," ",s)
    s = re.sub("‘"," ",s)
    s = re.sub("’"," ",s)
    s = s.translate(str.maketrans('', '', abc ))
    s = re.sub("'", ' ', s)
    s = s.translate(remove_digits)
    s = re.sub("[२३०८१५७९४६]", " ", s)
    s = re.sub("[a-zA-Z]", " ", s)
    s= s.strip()
    s = re.sub(" +", " ", s)
    return s

#load language into lists of sentences where corresponding indeces are translations from
#one language to the other

#reverse controls with language is input and output
#FALSE: lang1 is input and lang2 is output
#TRUE: lang2 is input and lang1 is output
def load_files(lang1, lang2, reverse=True, MAX_FILE_SIZE=100000, MAX_LENGTH=60):
    #load first language to list
    lang1_list = []
    lang1_file = open('../input/' + lang1 + '1' + lang2 + '/' + lang1 + '.txt', 'r', encoding='utf8')
    for i, (line) in enumerate(lang1_file):
        if i < MAX_FILE_SIZE:
            lang1_list.append(line)
        else:
            break

    # load second langauge to list
    lang2_list = []
    lang2_file = open('../input/' + lang1 + '1' + lang2 + '/' + lang2 + '.txt', 'r', encoding='utf8')
    for i, (line) in enumerate(lang2_file):
        if i < MAX_FILE_SIZE:
            lang2_list.append(line)
        else:
            break

    #preprocess strings

    if(lang1 == "hindi"):
        lang1_normalized = list(map(normalizeString_hindi, lang1_list))
    else:
        lang1_normalized = list(map(normalizeString, lang1_list))

    if(lang2 == "hindi"):
        lang2_normalized = list(map(normalizeString_hindi, lang2_list))
    else:
        lang2_normalized = list(map(normalizeString, lang2_list))

   

    lang1_sentences = []
    lang2_sentences = []

    for i in range(len(lang1_normalized)):
        tokens1 = lang1_normalized[i].split(' ')
        tokens2 = lang2_normalized[i].split(' ')
        if len(tokens1) <= MAX_LENGTH and len(tokens2) <= MAX_LENGTH:
            lang1_sentences.append(lang1_normalized[i])
            lang2_sentences.append(lang2_normalized[i])

    del lang1_normalized
    del lang2_normalized

    if reverse:
        input_dic = Dictionary(lang2)
        output_dic = Dictionary(lang1)
        return input_dic, output_dic, lang2_sentences, lang1_sentences
    else:
        input_dic = Dictionary(lang1)
        output_dic = Dictionary(lang2)
        return input_dic, output_dic, lang1_sentences, lang2_sentences

#takes in a sentence and dictionary, and tokenizes based on dictionary
def tokenize(sentence, dictionary, MAX_LENGTH=60):
    split_sentence = [word for word in sentence.split(' ')]
    token = [SOS_TOKEN]
    token += [dictionary.word2index[word] for word in sentence.split(' ')]
    token.append(EOS_TOKEN)
    token += [PAD_TOKEN]*(MAX_LENGTH - len(split_sentence))
    return token

#create dataloader from a batch size and the two language lists
def load_batches(input_lang, output_lang, batch_size, device):
    data_loader = []
    for i in range(0, len(input_lang), batch_size):
        seq_length = min(len(input_lang) - batch_size, batch_size)
        input_batch = input_lang[i:i+seq_length][:]
        target_batch = output_lang[i:i+seq_length][:]
        input_tensor = torch.LongTensor(input_batch).to(device)
        target_tensor = torch.LongTensor(target_batch).to(device)
        data_loader.append([input_tensor, target_tensor])
    return data_loader

In [3]:
import torch 
import torch.nn as nn

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hidden_size, n_heads, dropout, device):
        super().__init__()
        
        assert hidden_size % n_heads == 0
        
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.head_size = hidden_size // n_heads
        
        self.fc_query = nn.Linear(hidden_size, hidden_size)
        self.fc_key = nn.Linear(hidden_size, hidden_size)
        self.fc_value = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, hidden_size)
    
        self.dp = nn.Dropout(dropout)
        
        self.coefficient = torch.sqrt(torch.FloatTensor([self.head_size])).to(device)
        
    def forward(self, query, key, value, mask=None):
        b_size = query.shape[0]
   
        query_output = self.fc_query(query)
        key_output = self.fc_key(key)
        value_output = self.fc_value(value)
     
        query_output = query_output.view(b_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        key_output = key_output.view(b_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        value_output = value_output.view(b_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
      
        energy = torch.matmul(query_output, key_output.permute(0, 1, 3, 2)) / self.coefficient
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)    
        output = torch.matmul(self.dp(attention), value_output)
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(b_size, -1, self.hidden_size)  
        output = self.fc_out(output)
        return output, attention



class FeedForwardLayer(nn.Module):
    def __init__(self, hidden_size, ff_size, dropout):
        super().__init__()

        self.ff_layer = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            
            nn.Dropout(dropout),
            nn.Linear(ff_size, hidden_size)
        )
        
    def forward(self, input):
        output = self.ff_layer(input)
        return output

class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, n_heads, ff_size,  dropout, device):
        super().__init__()
        
        self.self_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout, device)
        self.self_atten_norm = nn.LayerNorm(hidden_size)
        self.ff_layer = FeedForwardLayer(hidden_size, ff_size, dropout)
        self.dp = nn.Dropout(dropout)
        self.ff_layer_norm = nn.LayerNorm(hidden_size)
        
    def forward(self, input, input_mask):
        #self attention
        atten_result, _ = self.self_atten(input, input, input, input_mask)
        
        atten_norm = self.self_atten_norm(input + self.dp(atten_result))
        ff_result = self.ff_layer(atten_norm)
        
        output = self.ff_layer_norm(atten_norm + self.dp(ff_result))
        return output

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, n_heads, ff_size,dropout, device, MAX_LENGTH=100):
        super().__init__()

        self.device = device
        
        self.te = nn.Embedding(input_size, hidden_size)
        self.pe = nn.Embedding(MAX_LENGTH, hidden_size)
        
        encoding_layers = []
        for _ in range(n_layers):
            encoding_layers.append(EncoderLayer(hidden_size, n_heads, ff_size, dropout, device))
        self.encode_sequence = nn.Sequential(*encoding_layers)
        
        self.dp = nn.Dropout(dropout)
        
        self.coefficient = torch.sqrt(torch.FloatTensor([hidden_size])).to(device)
        
    def forward(self, input, input_mask):
        b_size = input.shape[0]
        input_size = input.shape[1]
        
        pos = torch.arange(0, input_size).unsqueeze(0).repeat(b_size, 1).to(self.device)
        input = self.dp((self.te(input) * self.coefficient) + self.pe(pos))

        for layer in self.encode_sequence:
            input = layer(input, input_mask)
  
        return input

class DecoderLayer(nn.Module):
    def __init__(self, hidden_size, n_heads, ff_size, dropout, device):
        super().__init__()
        
        self.self_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout, device)
        self.self_atten_norm = nn.LayerNorm(hidden_size)
        self.encoder_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout, device)
        self.encoder_atten_norm = nn.LayerNorm(hidden_size)
        self.ff_layer = FeedForwardLayer(hidden_size, ff_size, dropout)
        self.ff_layer_norm = nn.LayerNorm(hidden_size)
        self.dp = nn.Dropout(dropout)
        
    def forward(self, target, encoded_input, target_mask, input_mask):
        #self attention
        atten_result, _ = self.self_atten(target, target, target, target_mask)
        
        atten_norm = self.self_atten_norm(target + self.dp(atten_result))

        atten_encoded, attention = self.encoder_atten(atten_norm, encoded_input, encoded_input, input_mask)
        
        encoded_norm = self.encoder_atten_norm(atten_norm + self.dp(atten_encoded))

        ff_result = self.ff_layer(encoded_norm)

        output = self.ff_layer_norm(encoded_norm + self.dp(ff_result))

        return output, attention

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, n_layers, n_heads, ff_size, dropout, device, MAX_LENGTH=100):
        super().__init__()
        
        self.device = device
        
        self.te = nn.Embedding(output_size, hidden_size)
        self.pe = nn.Embedding(MAX_LENGTH, hidden_size)

        decoding_layers = []
        for _ in range(n_layers):
            decoding_layers.append(DecoderLayer(hidden_size, n_heads, ff_size, dropout, device))
        
        self.decode_sequence = nn.Sequential(*decoding_layers) 
        
        self.fc_out = nn.Linear(hidden_size, output_size)
        
        self.dp = nn.Dropout(dropout)
        
        self.coefficient = torch.sqrt(torch.FloatTensor([hidden_size])).to(device)
        
    def forward(self, target, encoded_input, target_mask, input_mask):    
        b_size = target.shape[0]
        target_size = target.shape[1]
        
        pos = torch.arange(0, target_size).unsqueeze(0).repeat(b_size, 1).to(self.device)
        target = self.dp((self.te(target) * self.coefficient) + self.pe(pos))
        for layer in self.decode_sequence:
            target, attention = layer(target, encoded_input, target_mask, input_mask)

        output = self.fc_out(target)
        return output, attention

class Transformer(nn.Module):
    def __init__(self, encoder, decoder, device, padding_index=0):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.padding_index = padding_index
        self.device = device
        
    def make_input_mask(self, input):

        input_mask = (input != self.padding_index).unsqueeze(1).unsqueeze(2)
        return input_mask
    
    def make_target_mask(self, target):

        target_pad_mask = (target != self.padding_index).unsqueeze(1).unsqueeze(2)
        target_sub_mask = torch.tril(torch.ones((target.shape[1], target.shape[1]), device = self.device)).bool()
        target_mask = target_pad_mask & target_sub_mask
        return target_mask

    def forward(self, input, target):   
        input_mask = self.make_input_mask(input)
        target_mask = self.make_target_mask(target)

        #encoder feed through
        encoded_input = self.encoder(input, input_mask)

        #decoder feed_through
        output, attention = self.decoder(target, encoded_input, target_mask, input_mask)

        return output, attention

In [4]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np 
import time
import matplotlib.pyplot as plt
import argparse
import torch.optim as optim
from random import shuffle
import pickle

torch.cuda.empty_cache()

PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2

class Trainer():

    def initialize_weights(self, model):
        if hasattr(model, 'weight') and model.weight.dim() > 1:
            nn.init.xavier_uniform_(model.weight.data)

    def save_dictionary(self, dictionary, input=True):
        if input is True:
            with open('/kaggle/working/' + 'input_dic.pkl', 'wb') as f:
                pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)
        else:
            with open('/kaggle/working/' + 'output_dic.pkl', 'wb') as f:
                pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)

    def __init__(self, lang1, lang2,reverse, MAX_LENGTH, MAX_FILE_SIZE, batch_size, lr=0.0005, hidden_size=256, encoder_layers=3, decoder_layers=3,
                 encoder_heads=8, decoder_heads=8, encoder_ff_size=512, decoder_ff_size=512, encoder_dropout=0.1, decoder_dropout=0.1, device='cpu'):

        self.MAX_LENGTH = MAX_LENGTH
        self.MAX_FILE_SIZE = MAX_FILE_SIZE
        self.device = device

        self.input_lang_dic, self.output_lang_dic, self.input_lang_list, self.output_lang_list = load_files(lang1, lang2,reverse, self.MAX_FILE_SIZE, self.MAX_LENGTH)
        
        for sentence in self.input_lang_list:
            self.input_lang_dic.add_sentence(sentence)

        for sentence in self.output_lang_list:
            self.output_lang_dic.add_sentence(sentence)

        self.save_dictionary(self.input_lang_dic, input=True)
        self.save_dictionary(self.output_lang_dic, input=False)

        self.tokenized_input_lang = [tokenize(sentence, self.input_lang_dic, self.MAX_LENGTH) for sentence in self.input_lang_list]
        self.tokenized_output_lang = [tokenize(sentence, self.output_lang_dic, self.MAX_LENGTH) for sentence in self.output_lang_list]

        self.batch_size = batch_size

        self.data_loader = load_batches(self.tokenized_input_lang, self.tokenized_output_lang, self.batch_size, self.device)

        input_size = self.input_lang_dic.n_count
        output_size = self.output_lang_dic.n_count

        #define encoder and decoder parts of transformer
        encoder_part = Encoder(input_size, hidden_size, encoder_layers, encoder_heads, encoder_ff_size, encoder_dropout, self.device)
        decoder_part = Decoder(output_size, hidden_size, decoder_layers, decoder_heads, decoder_ff_size, decoder_dropout, self.device)

        self.transformer = Transformer(encoder_part, decoder_part, self.device, PAD_TOKEN).to(self.device)
        self.transformer.apply(self.initialize_weights)

        self.loss_func = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
        self.optimizer = optim.Adam(self.transformer.parameters(), lr=lr)


    def train(self, epochs, saved_model_directory):
        start_time = time.time()

        for epoch in range(epochs):
            #shuffle batches to prevent overfitting
            shuffle(self.data_loader)

            start_time = time.time()
            train_loss = 0

            for input, target in self.data_loader:
                #zero gradient
                self.optimizer.zero_grad()

                #pass through transformer
                output, _ = self.transformer(input, target[:,:-1])
                output_dim = output.shape[-1]

                #flatten and omit SOS from target
                output = output.contiguous().view(-1, output_dim)
                target = target[:,1:].contiguous().view(-1)

                #loss
                loss = self.loss_func(output, target)

                #backprop
                loss.backward()
                nn.utils.clip_grad_norm_(self.transformer.parameters(), 1)
                self.optimizer.step()

                train_loss += loss.item()
                
            train_loss /= len(self.data_loader)

            end_time = int(time.time() - start_time)
            torch.save(self.transformer.state_dict(), '/kaggle/working/transformer_model_{}.pt'.format(epoch))

            print('Epoch: {},   Time: {}s,  Estimated {} seconds remaining.'.format(epoch, end_time, (epochs-epoch)*end_time))
            print('\tTraining Loss: {:.4f}\n'.format(train_loss))
        print('Training finished!')

def main():
    
    lang1 = "hindi"
    lang2 = "english"
    reverse = 1
    MAX_LENGTH = 60
    MAX_FILE_SIZE = 200000
    batch_size = 64
    lr = 0.0005
    hidden_size = 256
    encoder_layers = 3
    decoder_layers = 3
    encoder_heads = 8
    decoder_heads = 8
    encoder_ff_size = 512
    decoder_ff_size = 512
    encoder_dropout = 0.1
    decoder_dropout = 0.1
    device = 'cuda'
    epochs = 10
    saved_model_directory = '/kaggle/working/'

    transformer = Trainer(lang1, lang2, reverse, MAX_LENGTH, MAX_FILE_SIZE, batch_size, lr, hidden_size, encoder_layers, decoder_layers, 
                            encoder_heads, decoder_heads, encoder_ff_size, decoder_ff_size, encoder_dropout, decoder_dropout, device)
    transformer.train(epochs, saved_model_directory)


if __name__ == "__main__":
    main()


Epoch: 0,   Time: 640s,  Estimated 6400 seconds remaining.
	Training Loss: 5.4790

Epoch: 1,   Time: 639s,  Estimated 5751 seconds remaining.
	Training Loss: 4.1972

Epoch: 2,   Time: 637s,  Estimated 5096 seconds remaining.
	Training Loss: 3.5283

Epoch: 3,   Time: 638s,  Estimated 4466 seconds remaining.
	Training Loss: 3.0398

Epoch: 4,   Time: 638s,  Estimated 3828 seconds remaining.
	Training Loss: 2.6605

Epoch: 5,   Time: 638s,  Estimated 3190 seconds remaining.
	Training Loss: 2.3742

Epoch: 6,   Time: 637s,  Estimated 2548 seconds remaining.
	Training Loss: 2.1617

Epoch: 7,   Time: 638s,  Estimated 1914 seconds remaining.
	Training Loss: 2.0004

Epoch: 8,   Time: 638s,  Estimated 1276 seconds remaining.
	Training Loss: 1.8728

Epoch: 9,   Time: 638s,  Estimated 638 seconds remaining.
	Training Loss: 1.7650

Training finished!


In [5]:
import torch
import argparse
import pickle
import sys


SOS_TOKEN = 1

def load_dictionary(directory):
    with open(directory, 'rb') as f:
        return pickle.load(f)

def translate_sentence(sentence, input_dic, output_dic, model, device, max_len):
    
    model.eval()
    normalized_sentence = normalizeString(sentence)
    tokens = tokenize(normalized_sentence, input_dic)
    input_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    input_mask = model.make_input_mask(input_tensor)
    
    with torch.no_grad():
        encoded_input = model.encoder(input_tensor, input_mask)

    target_tokens = [SOS_TOKEN]

    for i in range(max_len):

        target_tensor = torch.LongTensor(target_tokens).unsqueeze(0).to(device)
        target_mask = model.make_target_mask(target_tensor)
    
        with torch.no_grad():
            output, attention = model.decoder(target_tensor, encoded_input, target_mask, input_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        target_tokens.append(pred_token)
        if pred_token == EOS_TOKEN:
            break
    
    target_results = [output_dic.index2word[i] for i in target_tokens]
    print(target_results)
    return ' '.join(target_results[1:-1]), attention

def translate(text):

    input_text = text
    input_lang = "english"
    output_lang = "hindi"
    models_dir = "."
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    #hyper parameters
    MAX_LENGTH = 60
    hidden_size = 256
    encoder_layers = 3
    decoder_layers = 3
    encoder_heads = 8
    decoder_heads = 8
    encoder_ff_size = 512
    decoder_ff_size = 512
    encoder_dropout = 0.1
    decoder_dropout = 0.1

    transformer_location = models_dir + '/' 

    #load dictionaries
    input_lang_dic = load_dictionary(transformer_location + 'input_dic.pkl')
    output_lang_dic = load_dictionary(transformer_location + 'output_dic.pkl')

    input_size = input_lang_dic.n_count
    output_size = output_lang_dic.n_count
    
    #define models
    encoder_part = Encoder(input_size, hidden_size, encoder_layers, encoder_heads, encoder_ff_size, encoder_dropout, device)
    decoder_part = Decoder(output_size, hidden_size, decoder_layers, decoder_heads, decoder_ff_size, decoder_dropout, device)

    translator = Transformer(encoder_part, decoder_part, device).to(device)
    translator.load_state_dict(torch.load(transformer_location + 'transformer_model_9.pt'))

    translation, attention = translate_sentence(input_text, input_lang_dic, output_lang_dic, translator, device, MAX_LENGTH)
    print(input_lang + ': ' + input_text)
    print('\n' + output_lang + ': ')
    return translation



In [6]:
s = translate("Several agreements are expected to be signed after the talks.")
print(s)

['SOS', 'बातचीत', 'के', 'बाद', 'कई', 'समझौतों', 'पर', 'हस्ताक्षर', 'किए', 'जा', 'सकते', 'हैं।', 'EOS']
english: Several agreements are expected to be signed after the talks.

hindi: 
बातचीत के बाद कई समझौतों पर हस्ताक्षर किए जा सकते हैं।


In [7]:
sentence = 'मौके पर पहुंची पुलिस ने तफ्तीश की.'
for word in sentence.split(' '):
  print(word)

मौके
पर
पहुंची
पुलिस
ने
तफ्तीश
की.
