In [91]:
from torchtext.datasets import Multi30k
import numpy as np
import spacy
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
class Vocab:
    def __init__(self):
        self.clear()
        
    def build_vocab(self,tokenized_text,max_len=None,unk_token="<UNK>", others=None):
        
        if self.__vocab_built:
            raise Exception("Vocab already built. Please use the .clear() methode if you want to rebuild!")
        
        self.clear()
        self.__word_to_index[unk_token]=self.__special_cnt
        self.__index_to_word[self.__special_cnt] = unk_token
        self.__special_cnt+=1
        
        if others:
            for token in others:
                if token not in self.__word_to_index:
                    self.__word_to_index[token] = self.__special_cnt
                    self.__index_to_word[self.__special_cnt] = token
                    self.__special_cnt += 1
        
        
        tokens =  list(np.concatenate(tokenized_text).flatten())
        tokens =  sorted(tokens)
        word_to_count={}
        
        if max_len:
            if(max_len<self.__special_cnt):
                raise Exception('Max length must be larger than number of special tokens')
                
            self.max_len = max_len 
            
            for token in tokens:
                if token in word_to_count:
                    word_to_count[token] += 1
                else:
                     word_to_count[token] = 1
        
        
        tokens = list(set(tokens))
            
        if max_len:
            current_length = self.__special_cnt + len(tokens)
            if current_length > max_len:
                sorted_count = dict(sorted(word_to_count.items(), key = lambda x: x[1]))
                    
                for i, (word,count) in enumerate(sorted_count.items()):
                    if i >= current_length - max_len:
                        break
                    tokens.remove(word)
                    
                    
        for token in tokens:
            self.__word_to_index[token] = self.__special_cnt + self.__len
            self.__index_to_word[self.__special_cnt + self.__len] = token
            self.__len +=1
    
        self.__vocab_built=True
        
        
    def __len__(self):
        return self.__len+self.__special_cnt
        
    def clear(self):
        self.__special_cnt=0
        self.__len =0
        self.__word_to_index={}
        self.__index_to_word={}
        self.max_len=0
        self.__vocab_built = False
                
    def tokens_to_bow(self,tokenized_sentence):
        if not self.__vocab_built:
            raise Exception("Vocabulary not built yet!")

        out =[]
        for token in tokenized_sentence:
            out.append(self.__word_to_index[token] if token in self.__word_to_index else 0)
        return out
    
    def bow_to_tokens(self, bow):
        if not self.__vocab_built:
            raise Exception("Vocabulary not built yet!")
        
        out=[]
        for idx in bow:
            if idx not in self.__index_to_word:
                raise Exception('Invalid index')
            out.append(self.__index_to_word[idx])
        return out
    
    def get_vocab(self):
        if not self.__vocab_built:
            raise Exception("Vocabulary not built yet!")
            
        return self.__word_to_index

In [2]:
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')

In [3]:
def tokenizer_en(text):
    return [tok.text for tok in spacy_eng.tokenizer(text.lower())]
def tokenizer_de(text):
    return [tok.text for tok in spacy_ger.tokenizer(text.lower())]

In [121]:
train_data, validation_data, test_data = Multi30k()

In [5]:
en_train = []
de_train = []

for (de,en) in train_data:
    de_train.append(tokenizer_de(de))
    en_train.append(tokenizer_en(en))


In [6]:
english_vocab = Vocab()
english_vocab.build_vocab(en_train, max_len=10000, unk_token='<UNK>', others=['<SOS>','<EOS>','<PAD>'])

In [7]:
german_vocab = Vocab()
german_vocab.build_vocab(de_train,max_len=10000, unk_token='<UNK>', others=['<SOS>','<EOS>','<PAD>'])

In [8]:
class Train_dataset(Dataset):
    def __init__(self):
        self.de = de_train
        self.en = en_train
        self.len = len(de_train)
        
    def __getitem__(self,idx):
        return self.de[idx] , self.en[idx]
    
    def __len__(self):
        return self.len
train_dataset =  Train_dataset()

In [9]:
def collate_fn(batch):
    out_de, out_en = [],[] 
    
    
    for de, en in batch:
        out_de.append(german_vocab.tokens_to_bow(['<SOS>']+de+['<EOS>']))
        out_en.append(english_vocab.tokens_to_bow(['<SOS>']+en+['<EOS>']))
    
    
    
    de_pad = german_vocab.tokens_to_bow(['<PAD>'])
    en_pad = english_vocab.tokens_to_bow(['<PAD>'])
    
    
    max_len_de = len(max(out_de,key=len))
    max_len_en = len(max(out_en,key=len))
    
    for i in range(len(out_de)):
        out_de[i].extend(de_pad*(max_len_de - len(out_de[i])))
    
    for i in range(len(out_en)):
        out_en[i].extend(en_pad*(max_len_en - len(out_en[i])))
        
    return torch.tensor(out_de,dtype=torch.int64),torch.tensor(out_en,dtype=torch.int64)

In [10]:
class Encoder(nn.Module):
    def __init__(self, input_size , embedding_size, hidden_size, num_layers, dropout_p):
        super(Encoder,self).__init__()
    
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout=nn.Dropout(p=dropout_p)
        self.embedding=nn.Embedding(input_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size,hidden_size,num_layers=num_layers, dropout=dropout_p, batch_first=True)
        
    def forward(self,x):
        
        
        out = self.dropout(self.embedding(x))
        
        out, (hidden,cell) = self.lstm(out)
        
        return hidden,cell

In [11]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_p):
        super(Decoder,self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout=nn.Dropout(p=dropout_p)
        self.embedding=nn.Embedding(input_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size,hidden_size,num_layers=num_layers, dropout=dropout_p, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self,x, hidden, cell):
        out = self.dropout(self.embedding(x))
        
        out, (hidden,cell) = self.lstm(out,(hidden,cell))
        
        out = self.fc(out)
        
        return out, hidden, cell
        
        

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self,x,y,teacher_force_ratio=0.5):
        
        hidden,cell = self.encoder(x)
        
        batch_size = x.shape[0]
        sentence_length = y.shape[1]
        target_vocab_size = len(english_vocab)
        
        # output =  length of batch*sentence* target vocabulary size
        output = torch.zeros(batch_size,sentence_length,target_vocab_size).to(device)
        
        # first token passed will be <SOS> for eng (i.e. y)
        # y is batchsize*len
        # decoder will take 1 word of all samples in batch at a time
        # so decoder will batch_size * 1
        
        inp = y[:,0].reshape(-1,1)
        
        for i in range(1,y.shape[1]):
            #print(inp.shape)
            out,hidden,cell = self.decoder(inp,hidden,cell)
            out =  out.reshape(batch_size,-1)
            output[:,i,:] = out
            
            # out is in the shape of batch_size * vocab_size 
            #print('out',out.shape)
            guess = out.argmax(1).reshape(-1,1)
            
            inp = guess if random.random()<teacher_force_ratio else y[:,i].reshape(-1,1)
        
        return output
        
        
        
        

In [13]:
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german_vocab)
input_size_decoder = len(english_vocab)
output_size = len(english_vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

In [14]:
train_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True, collate_fn=collate_fn)

In [15]:
encoder = Encoder(input_size_encoder,encoder_embedding_size,hidden_size,num_layers,enc_dropout).to(device)
decoder = Decoder(input_size_decoder,decoder_embedding_size,hidden_size,output_size,num_layers,dec_dropout).to(device)
model = Seq2Seq(encoder,decoder).to(device)

In [16]:
pad_idx = english_vocab.tokens_to_bow(['<PAD>'])[0]

In [21]:
def Save_model(file_name = 'data.pth'):
    data = {
        'input_size_encoder':input_size_encoder,
        'input_size_decoder':input_size_decoder,
        'output_size' : output_size,
        'encoder_embedding_size' : encoder_embedding_size,
        'decoder_embedding_size' : decoder_embedding_size,
        'hidden_size'  : hidden_size,
        'num_layers'   : num_layers,
        'enc_dropout'  : enc_dropout,
        'dec_dropout'  : dec_dropout,
        'tokenizer_en' : tokenizer_en,
        'tokenizer_de' : tokenizer_de,
        'english_vocab': english_vocab,
        'german_vocab' : german_vocab,
        'encoder_sate' : encoder.state_dict(),
        'decoder_state': decoder.state_dict(),
        'seq2seq_state': model.state_dict(),
    }
    torch.save(data,file_name)
def Save_state_dicts():
    torch.save(encoder.state_dict(),'enc.pth')
    torch.save(decoder.state_dict(),'dec.pth')
    torch.save(model.state_dict(),'s2s.pth')

In [18]:
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [19]:
if(load_model):
    encoder.load_state_dict(torch.load('enc.pth'))
    decoder.load_state_dict(torch.load('dec.pth'))
    model.load_state_dict(torch.load('s2s.pth'))

for epoch in range(num_epochs):
    for de,en  in train_loader:
        de , en = de.to(device), en.to(device)
        out = model(de,en)
        
        en = en[:,1:].reshape(-1)
        out = out[:,1:,:].reshape(-1,output_size)
        
        loss = criterion(out,en)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    Save_state_dicts()
        
    print(loss.item())

3.9907619953155518
4.4972243309021
3.3224780559539795
3.1153604984283447
2.545489549636841
1.8848296403884888
3.277812957763672
1.836174726486206
2.100801706314087
2.7182295322418213
2.0909359455108643
3.1290619373321533
2.2757534980773926
1.7318495512008667
2.0310328006744385
1.7104735374450684
1.560773491859436
1.074495553970337
2.125704050064087
1.9563441276550293
1.9879615306854248
1.1547415256500244
1.4417831897735596
1.9300510883331299
1.01298987865448
1.0182204246520996
1.2741928100585938
1.151524543762207
0.9243612885475159
1.2822558879852295
0.9856067299842834
0.6004963517189026
0.5417779088020325
1.3924328088760376
0.6913865804672241
0.8250875473022461
0.9933826327323914
0.8478679656982422
0.852372407913208
1.1250052452087402
0.8091843724250793
0.8083226084709167
0.7912936210632324
0.8868674039840698
0.8334425091743469
0.3057456612586975
1.2683159112930298
0.3969021439552307
0.680704653263092
1.0086941719055176
0.9130188226699829
0.7296795845031738
0.5330160856246948
0.456276

In [22]:
Save_model()

In [23]:
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10000, 300)
    (lstm): LSTM(300, 1024, num_layers=2, batch_first=True, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(9796, 300)
    (lstm): LSTM(300, 1024, num_layers=2, batch_first=True, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=9796, bias=True)
  )
)

In [75]:
def translate_sentence(german_sentence):
    
    tokens = ['<SOS>'] + tokenizer_de(german_sentence) + ['<EOS>']
    bow_ger = german_vocab.tokens_to_bow(tokens)
    bow_ger = torch.tensor(bow_ger,dtype=torch.int64).reshape(1,-1).to(device)
        
    english_translation= english_vocab.tokens_to_bow(['<SOS>'])
    eos_token = english_vocab.tokens_to_bow(['<EOS>'])[0]
    max_len = 50
    
    with torch.no_grad():   
        hidden,cell = encoder(bow_ger)
    
        for i in range(1,50):
            if(english_translation[-1] == eos_token):
                break
            last_word = torch.tensor(english_translation[-1],dtype=torch.int64).reshape(1,1).to(device)
        
            predicted,hidden,cell = decoder(last_word,hidden,cell)
        
            english_translation.append(predicted.reshape(-1).argmax().item())
        return  english_vocab.bow_to_tokens(english_translation[1:-1])
        
        

In [122]:
def get_validation_predictions():
    correct_eng =[]
    predicted_eng = []
    for german, eng in validation_data:
        correct_eng.append(eng)
        predicted_eng.append(' '.join(translate_sentence(german)))
    
   
    return correct_eng,predicted_eng
        
    

In [123]:
correct,predicted = get_validation_predictions()

In [124]:
for i,j  in zip(correct[:20],predicted[:20]):
    print(i,':',j)

A group of men are loading cotton onto a truck
 : group of men loading organizing a a long boat . 

A man sleeping in a green room on a couch.
 : a man is on a green couch in a bed . 

A boy wearing headphones sits on a woman's shoulders.
 : a boy boy with a face is sitting on a woman 's shoulders . 

Two men setting up a blue ice fishing hut on an iced over lake
 : two men are a blue on a red container on a concrete surface . 

A balding man wearing a red life jacket is sitting in a small boat.
 : a man with red short no no shirt sitting in a small car . 

A lady in a red coat, holding a bluish hand bag likely of asian descent, jumping off the ground for a snapshot.
 : a woman in a red coat holds a digital camera out of a into the plastic to another a red bucket with a far out . 

A brown dog is running after the black dog.
 : a brown dog runs over the and white . 

A young boy wearing a Giants jersey swings a baseball bat at an incoming pitch.
 : a young boy wearing a baseball hat is