In [0]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [0]:
import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
lines = pd.read_table('mar.txt', names=['eng', 'mar', 'fff'])
lines = lines.iloc[:,0:2]
lines.eng = lines.eng.apply(lambda x : x.lower())


# Remove quotes
lines.eng=lines.eng.apply(lambda x: re.sub("'", '', x))
lines.mar=lines.mar.apply(lambda x: re.sub("'", '', x))
exclude = set(string.punctuation) # Set of all special characters

# Remove all the special characters
lines.eng=lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.mar=lines.mar.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines.eng=lines.eng.apply(lambda x: x.translate(remove_digits))
lines.mar = lines.mar.apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines.eng=lines.eng.apply(lambda x: x.strip())
lines.mar=lines.mar.apply(lambda x: x.strip())
lines.eng=lines.eng.apply(lambda x: re.sub(" +", " ", x))
lines.mar=lines.mar.apply(lambda x: re.sub(" +", " ", x))

# Add start and end tokens to target sequences
lines.mar = lines.mar.apply(lambda x : 'START '+ x + ' END')
#lines.eng = lines.eng.apply(lambda x : 'START '+ x + ' END')

In [0]:
print(lines.eng.shape)
print(lines.mar.shape)

(36503,)
(36503,)


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [0]:
tokenizer_eng = Tokenizer(oov_token='<OOV>')
tokenizer_eng.fit_on_texts(lines.eng)
word_index_eng = tokenizer_eng.word_index
sequences_eng = tokenizer_eng.texts_to_sequences(lines.eng)
padded_eng = pad_sequences(sequences_eng, padding='post')

In [0]:
tokenizer_mar = Tokenizer(oov_token='<OOV>')
tokenizer_mar.fit_on_texts(lines.mar)
word_index_mar = tokenizer_mar.word_index
sequences_mar = tokenizer_mar.texts_to_sequences(lines.mar)
padded_mar = pad_sequences(sequences_mar, padding='post')

In [0]:
word_index_mar['0']=0
word_index_eng['0']=0

In [0]:
rev_word_index_eng = dict((i, word) for word, i in word_index_eng.items())
rev_word_index_mar = dict((i, word) for word, i in word_index_mar.items())

In [0]:
print(padded_eng.shape)
print(padded_mar.shape)
print(len(word_index_eng))
print(len(word_index_mar))

(36503, 34)
(36503, 37)
5490
13063


In [0]:
print(type(padded_eng))
print(type(padded_mar))
max_length_eng = padded_eng.shape[1]
max_length_mar = padded_mar.shape[1]
print(max_length_eng)
print(max_length_mar)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
34
37


In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
from sklearn.model_selection import train_test_split
X, y = padded_eng, padded_mar
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [0]:
print(X_train.shape)
print(X_test.shape)

(32852, 34)
(3651, 34)


In [0]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, emb_dim, hidden_units, n_layers, dropout):
    super().__init__()
        
    self.hidden_units = hidden_units
    self.n_layers = n_layers
       
    self.embedding = nn.Embedding(vocab_size, emb_dim)
    self.lstm = nn.LSTM(emb_dim, hidden_units, n_layers, dropout = dropout, batch_first = True)
    self.dropout = nn.Dropout(dropout)
        
  def forward(self, src):   
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.lstm(embedded)
    return hidden, cell

In [0]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, emb_dim, hidden_units, n_layers, dropout):
    super().__init__()
    self.vocab_size = vocab_size
    self.hidden_units = hidden_units
    self.n_layers = n_layers
    
    self.embedding = nn.Embedding(vocab_size, emb_dim)
    self.lstm = nn.LSTM(emb_dim, hidden_units, n_layers, dropout = dropout, batch_first = True)
    self.fc_out = nn.Linear(hidden_units, vocab_size)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, input, hidden, cell):
    #input = [batch size]
    input = input.unsqueeze(1)
    #input = [batch size, 1]
    embedded = self.dropout(self.embedding(input))
    
    #embedded = [batch size, 1, emb dim]
            
    output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
    #print('output.shape ', output.shape)
    
    prediction = self.fc_out(output)
    #print('prediction.shape ', prediction.shape)
    
    #prediction = [batch size, 1, output dim]
    
    return prediction, hidden, cell

In [0]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
        
  def forward(self, src, trg, teacher_forcing_ratio, eval):
    
    batch_size = trg.shape[0]
    trg_len = trg.shape[1]
    trg_vocab_size = self.decoder.vocab_size
    

    hidden, cell = self.encoder(src)
    input = trg[:,0]

    if eval == 1:
      for t in range(10):
        # input.shape  torch.Size([1]) i.e torch.Size([batch_size])
        prediction, hidden, cell = self.decoder(input, hidden, cell)
        # output is nothing but prediction ---->  prediction.shape  torch.Size([1, 1, 13063])
        
        if t==0:
          p = prediction
        elif t==1:
          outputs = torch.cat((p, prediction), 1)
        elif t>1:
          outputs = torch.cat((outputs, prediction), 1)
        
        teacher_force = random.random() < teacher_forcing_ratio
        top1 = prediction.argmax(2) 
        top1 = top1.squeeze(1)
        input = top1

    elif eval == 0:   
      for t in range(0, trg_len-1):
        # input.shape  torch.Size([128]) i.e torch.Size([batch_size])
        output, hidden, cell = self.decoder(input, hidden, cell)
        # output is nothing but prediction ---->  prediction.shape  torch.Size([128, 1, 13063])
        
        if t==0:
          p = output
        elif t==1:
          outputs = torch.cat((p, output), 1)
        elif t>1:
          outputs = torch.cat((outputs, output), 1)
        
        teacher_force = random.random() < teacher_forcing_ratio
        top1 = output.argmax(2)
        top1 = top1.squeeze(1)
        input = trg[:, t+1] if teacher_force else top1
    
    outputs.to(self.device)
    return outputs

In [0]:
vocab_size_eng = len(word_index_eng)
vocab_size_mar = len(word_index_mar)
emb_dim = 256
hidden_units = 512
n_layers = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
batch_size = 128

encoder = Encoder(vocab_size_eng, emb_dim, hidden_units, n_layers, ENC_DROPOUT)
decoder = Decoder(vocab_size_mar, emb_dim, hidden_units, n_layers, DEC_DROPOUT)

model = Seq2Seq(encoder, decoder, device).to(device)

In [0]:
optimizer = optim.Adam(model.parameters())

In [0]:
# TRAINING 
src = X_train
trg = y_train
src = torch.tensor(src)
trg = torch.tensor(trg)
src = src.long()
trg = trg.long()
src = src.to(device)
trg = trg.to(device)

end = int(X_train.shape[0]/batch_size)
print(end)

for epoch in range(100):
  epoch_loss = 0
  for i in range(  end ):   
    
    optimizer.zero_grad()
    a = src[ (0 + i*batch_size) : (  (i+1) * batch_size ) ,  : ]
    b = trg[ (0 + i*batch_size) : (  (i+1) * batch_size ) ,  : ]

    output = model(a, b, 1, 0)
    output = output.view(  batch_size*(max_length_mar - 1), len(word_index_mar)  )
    
    vocab_size_mar = output.shape[1]
    
    b = b[:, 1:]
    b = b.reshape(  (max_length_mar-1) * batch_size,  )
    
    
    loss = F.cross_entropy(output, b)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    epoch_loss += loss.item()
  print("epoch loss : ", epoch, ' ',  epoch_loss/end)

256
epoch loss :  0   1.0438387931790203
epoch loss :  1   0.8356543455738574
epoch loss :  2   0.7313334529753774
epoch loss :  3   0.6362467114813626
epoch loss :  4   0.5524572368012741
epoch loss :  5   0.48183869605418295
epoch loss :  6   0.42273825756274164
epoch loss :  7   0.37268149852752686
epoch loss :  8   0.3304470519069582
epoch loss :  9   0.29504168493440375
epoch loss :  10   0.2647537913871929
epoch loss :  11   0.240221394225955
epoch loss :  12   0.21832720295060426
epoch loss :  13   0.19955158763332292
epoch loss :  14   0.18343240965623409
epoch loss :  15   0.1695384561899118
epoch loss :  16   0.15666136422078125
epoch loss :  17   0.1448961968999356
epoch loss :  18   0.13558041770011187
epoch loss :  19   0.12688005427480675
epoch loss :  20   0.11913271885714494
epoch loss :  21   0.11086317172157578
epoch loss :  22   0.10492519781109877
epoch loss :  23   0.09881433265400119
epoch loss :  24   0.09276260642218404
epoch loss :  25   0.08798053406644613
epo

In [0]:
torch.save(model.state_dict(), 'nmt_weight.pth')

In [0]:
model = Seq2Seq(encoder, decoder, device).to(device)
model.load_state_dict(torch.load('nmt_weight.pth'))

In [0]:
#TESTING
src = X_train
trg = y_train
src = torch.tensor(src)
trg = torch.tensor(trg)
src = src.long()
trg = trg.long()
src = src.to(device)
trg = trg.to(device)

end = int(X_test.shape[0]/batch_size)
print(end)

for epoch in range(10):
  epoch_loss = 0
  for i in range(  end ):   
    
    optimizer.zero_grad()
    a = src[ (0 + i*batch_size) : (  (i+1) * batch_size ) ,  : ]
    b = trg[ (0 + i*batch_size) : (  (i+1) * batch_size ) ,  : ]

    output = model(a, b, 0, 0)
    output = output.view(  batch_size*(max_length_mar - 1) , len(word_index_mar) )
    
    vocab_size_mar = output.shape[1]
    
    b = b[:, 1:]
    b = b.reshape( (max_length_mar - 1)*batch_size,)
    
    
    loss = F.cross_entropy(output, b)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    epoch_loss += loss.item()
  print("epoch loss : ", epoch, ' ',  epoch_loss/20)

In [0]:
def translation(eng_sent):
  eng_sent = pd.Series(eng_sent)
  eng_sent = eng_sent.apply(lambda x : x.lower())
  sequences_eng_sent = tokenizer_eng.texts_to_sequences(eng_sent)
  padded_eng_sent = pad_sequences(sequences_eng_sent, padding='post')
  padded_eng_sent = torch.tensor(padded_eng_sent)
  padded_eng_sent = padded_eng_sent.long()
  padded_eng_sent = padded_eng_sent.to(device)
  c = torch.tensor([[ word_index_mar['start'] ]]).long()
  c = c.to(device)
  output = model(padded_eng_sent, c, 0, 1)
  q = output.argmax(2)
  q = q.squeeze(0)
  translated = ''
  for i in q:
    translated = translated + ' ' +  rev_word_index_mar[int(i)]  
  return translated

In [0]:
eng_sent = 'i will give you this book' # Your Engish Sentence
sentence = translation(eng_sent)
print(sentence)

 मी तुला एक घर देईन end 0 0 0 0


In [0]:
eng_sent = 'tom was eating a sandwich' # Your Engish Sentence
sentence = translation(eng_sent)
print(sentence)

 टॉम एक कप होता end 0 0 0 0 0
