In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
!wget https://download.pytorch.org/tutorial/data.zip

--2023-02-15 16:39:37--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 13.33.88.59, 13.33.88.36, 13.33.88.63, ...
Connecting to download.pytorch.org (download.pytorch.org)|13.33.88.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2023-02-15 16:39:37 (74.0 MB/s) - ‘data.zip’ saved [2882130/2882130]



In [3]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating: data/names/Vietnamese.txt  


In [4]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
max_length = 15

# eng_prefixes = (
#     "i am ", "i m ",
#     "he is", "he s ",
#     "she is", "she s ",
#     "you are", "you re ",
#     "we are", "we re ",
#     "they are", "they re "
# )


def filterPair(p):
    return len(p[0].split(' ')) < max_length and \
        len(p[1].split(' ')) < max_length #and \
        # p[0].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', False)
# print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 130143 sentence pairs
Counting words...
Counted words:
eng 12362
fra 20391


In [9]:
len(input_lang.word2count)

12360

In [10]:
random.choice(pairs)

['don t expect anyone to help you .',
 'ne t attends pas a ce que qui que ce soit t aide !']

In [11]:
input_lang.word2index['come']

52

In [12]:
def indexesFromSentence(lang,sentence):
  return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang,sentence):
  indexes=indexesFromSentence(lang,sentence)
  indexes.append(EOS_token)
  return torch.tensor(indexes,dtype=torch.long,device=device).view(-1,1)

def tensorsFromPair(input_lang,output_lang,pair):
  input_tensor = tensorFromSentence(input_lang, pair[0])
  target_tensor = tensorFromSentence(output_lang, pair[1])
  return (input_tensor, target_tensor)

In [13]:
sent=pairs[90000][0]
indexesFromSentence(input_lang,sent)

[164, 108, 951, 2410, 200, 235, 53, 3]

In [25]:
class EncoderRNN(nn.Module):
  def __init__(self,input_size,hidden_size,embedding_size=256,num_layers=1):
    super(EncoderRNN,self).__init__()
    self.input_size=input_size
    self.hidden_size=hidden_size
    self.embedding_size=embedding_size
    self.num_layers=num_layers
    self.embedding=nn.Embedding(input_size,self.embedding_size)
    self.gru=nn.GRU(self.embedding_size,self.hidden_size,num_layers=self.num_layers)

  def forward(self,input,hidden):
    embedded=self.embedding(input).view(1,1,-1)
    output=embedded
    output,hidden=self.gru(output,hidden)
    return output,hidden

  def init_hidden(self):
    return torch.zeros(self.num_layers,1,self.hidden_size,device=device)

In [26]:
class DecoderRNN(nn.Module):
  def __init__(self,output_size,hidden_size,embedding_size=256,num_layers=1):
    super().__init__()
    self.output_size=output_size
    self.hidden_size=hidden_size
    self.embedding_size=embedding_size
    self.num_layers=num_layers
    self.embedding=nn.Embedding(output_size,embedding_size)
    self.gru=nn.GRU(self.embedding_size,hidden_size,num_layers=num_layers)
    self.out=nn.Linear(hidden_size,output_size)
    self.softmax=nn.LogSoftmax(dim=1)
  
  def forward(self,input,hidden):
    output=self.embedding(input).view(1,1,-1)
    embedd=F.relu(output)
    output,hidden=self.gru(embedd,hidden)
    prediction=self.softmax(self.out(output[0]))
    return prediction,hidden

  def init_hidden(self):
    return torch.zeros(self.num_layers,1,self.hidden_size,device=device)

In [27]:
# class Seq2Seq(nn.Module):
#   def __init__(self,encoder,decoder,device,MAX_LENGTH=MAX_LENGTH):
#     super().__init__()
#     self.encoder=encoder
#     self.decoder=decoder
#     self.device=device
  
#   def forward(self,source,target,teacher_forcing_ratio=0.5):
#     input_length=source.size(0)
#     batch_size=target.shape[1]
#     target_length=target.shape[0]
#     vocab_size=self.decoder.output_size

#     #Initialize a variable to hold the predicted outputs
#     outputs=torch.zeros(target_length,batch_size,vocab_size).to(device)
    
#     #encode every word in the sentence
#     for i in range(input_length):
#       encoder_output,encoder_hidden=self.encoder(source[i])
    
#     #Use encoder's hidden layer as decoder hidden
#     decoder_hidden=encoder_hidden.to(device)

#     #add a token before the first predicted word
#     decoder_input=torch.tensor([SOS_token],device=device)

#     #topk is used to get the top K value over a list
#     #predict the output word from the current target word. If we enable the teaching force,  then the #next decoder input is the next word, else, use the decoder output highest value. 
#     for t in range(target_length):
#       decoder_output,decoder_hidden=self.decoder(decoder_input,decoder_hidden)
#       outputs[t]=decoder_output
#       teacher_force=random.random() < teacher_forcing_ratio
#       topv,topi=decoder_output.topk(1)
#       input=(target[t] if teacher_force else topi)
#       if(teacher_force==False and input.item()==EOS_token):
#         break
    
#     return outputs


In [28]:
# teacher_forcing_ratio=0.5
# def clacModel(model,input_tensor,target_tensor,optimizer,criterion):
#   optimizer.zero_grad()
#   input_length=input_tensor.size(0)
#   loss=0
#   epoch_loss=0
#   output=model(input_tensor,target_tensor)
#   num_iter=output.size(0)
#   #Calculate loss from predicted sentence with expected result
#   for i in range(num_iter):
#     loss+=criterion(output[i],target_tensor[i])
#   loss.backward()
#   optimizer.step()
#   epoch_loss=loss.item()/num_iter
#   return epoch_loss

In [29]:
# def train(model,source,target,pairs,num_iteration=200000):
#   model.train()
#   optimizer=optim.SGD(model.parameters(),lr=0.01)
#   criterion=nn.NLLLoss()
#   total_loss_iterations=0

#   training_pairs=[tensorsFromPair(source,target,random.choice(pairs)) for i in range(num_iteration)]
#   for iter in range(1,num_iteration+1):
#     training_pair=[iter-1]
#     input_tensor=training_pair[0]

#     target_tensor=training_pair[1]
#     loss=clacModel(model,input_tensor,target_tensor,optimizer,criterion)
    
#     total_loss_iterations+=loss

#     if iter%5000==0:
#       average_loss=total_loss_iterations/5000
#       total_loss_iterations=0
#       print('%d %.4f'%(iter,average_loss))
    
#   torch.save(model.state_dict(),'mytraining.pt')

In [30]:
teacher_forcing_ratio=0.5

In [44]:
def train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion,max_length):
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()
  input_length=input_tensor.size(0)
  target_length=target_tensor.size(0)
  encoder_hidden=encoder.init_hidden()
  encoder_outputs=torch.zeros(max_length,encoder.hidden_size,device=device)
  loss=0
  for ei in range(input_length):
    encoder_output,encoder_hidden=encoder(
        input_tensor[ei],encoder_hidden
    )
    # print(encoder_output.shape)
    # print(encoder_hidden.shape)
    encoder_outputs[ei]=encoder_output[0,0]
  decoder_input=torch.tensor([[SOS_token]],device=device)
  decoder_hidden=encoder_hidden

  teacher_forcing=True if random.random() < teacher_forcing_ratio else False

  if teacher_forcing:
      # Teacher forcing: Feed the target as the next input
      for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]  # Teacher forcing
  else:
    # Without teacher forcing: use its own predictions as the next input
    for di in range(target_length):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      topv, topi = decoder_output.topk(1)
      decoder_input = topi.squeeze().detach()  # detach from history as input
      loss += criterion(decoder_output, target_tensor[di])
      if decoder_input.item() == EOS_token:
        break
  loss.backward()
  encoder_optimizer.step()
  decoder_optimizer.step()
  return loss.item()/(target_length)

In [57]:
def train_iter(pairs,encoder,decoder,num_iters,max_length,lr=0.005):
  encoder_optimizer=torch.optim.SGD(encoder.parameters(),lr)
  decoder_optimizer=torch.optim.SGD(decoder.parameters(),lr)
  training_pairs=[tensorsFromPair(input_lang,output_lang,random.choice(pairs)) for i in range(num_iters)]
  criterion=nn.NLLLoss()
  total_loss=0
  for iter in range(1,(num_iters+1)):
    training_pair=training_pairs[iter-1]
    input_tensor=training_pair[0]
    target_tensor=training_pair[1]
    loss=train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion,max_length)
    total_loss+=loss

    if iter%5000==0:
      train_loss=total_loss/5000
      val_loss=evaluate(tensorsFromPair(input_lang,output_lang,random.choice(pairs)),encoder,decoder,criterion,max_length)
      print('Iter:{} , Train_Loss:{:.4f}, Val_Loss:{:.4}'.format(iter,train_loss,val_loss))

In [58]:
def evaluate(pair,encoder,decoder,criterion,max_length):
  encoder_hidden=encoder.init_hidden()
  input_tensor=pair[0]
  target_tensor=pair[1]
  input_length=input_tensor.size()[0]
  target_length=target_tensor.size()[0]
  encoder_outputs=torch.zeros(max_length,encoder.hidden_size,device=device)
  loss=0
  for ei in range(input_length):
    encoder_output,encoder_hidden=encoder(
        input_tensor[ei],encoder_hidden
    )
    encoder_outputs[ei]+=encoder_output[0,0]
  decoder_input=torch.tensor([[SOS_token]],device=device)
  decoder_hidden=encoder_hidden

  for di in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    topv, topi = decoder_output.topk(1)
    decoder_input = topi.squeeze().detach()  # detach from history as input
    loss += criterion(decoder_output, target_tensor[di])
    if decoder_input.item() == EOS_token:
      break
  return loss.item()/target_length

In [59]:
lang1='eng'
lang2='fra'
input_size=input_lang.n_words
output_size=output_lang.n_words
embed_size=256
hidden_size=512
num_layers=4
num_iters=100000

encoder=EncoderRNN(input_size,hidden_size,embed_size,num_layers).to(device)
decoder=DecoderRNN(output_size,hidden_size,embed_size,num_layers).to(device)
# model=Seq2Seq(encoder,decoder,device).to(device)

In [60]:
train_iter(pairs,encoder,decoder,num_iters,max_length,lr=0.005)

Iter:5000 , Train_Loss:5.1452, Val_Loss:2.915
Iter:10000 , Train_Loss:9.6939, Val_Loss:5.349
Iter:15000 , Train_Loss:14.0969, Val_Loss:3.588
Iter:20000 , Train_Loss:18.3665, Val_Loss:5.897
Iter:25000 , Train_Loss:22.5022, Val_Loss:4.118
Iter:30000 , Train_Loss:26.5093, Val_Loss:5.968
Iter:35000 , Train_Loss:30.3810, Val_Loss:6.373
Iter:40000 , Train_Loss:34.1577, Val_Loss:4.265
Iter:45000 , Train_Loss:37.8428, Val_Loss:5.412
Iter:50000 , Train_Loss:41.4590, Val_Loss:3.759
Iter:55000 , Train_Loss:45.0142, Val_Loss:2.89
Iter:60000 , Train_Loss:48.5010, Val_Loss:4.089
Iter:65000 , Train_Loss:51.9370, Val_Loss:4.548
Iter:70000 , Train_Loss:55.2462, Val_Loss:2.726
Iter:75000 , Train_Loss:58.5295, Val_Loss:2.321
Iter:80000 , Train_Loss:61.7562, Val_Loss:2.223
Iter:85000 , Train_Loss:64.9359, Val_Loss:1.606
Iter:90000 , Train_Loss:68.0708, Val_Loss:1.747
Iter:95000 , Train_Loss:71.1437, Val_Loss:2.165
Iter:100000 , Train_Loss:74.1686, Val_Loss:3.477


In [77]:
def predict(encoder,decoder,sentence,max_length=max_length):
  with torch.no_grad():
    input_tensor=tensorFromSentence(input_lang,sentence)
    input_length=input_tensor.size()[0]
    encoder_hidden=encoder.init_hidden()
    encoder_outputs=torch.zeros(max_length,encoder.hidden_size,device=device)
    # Encoder
    for ei in range(input_length):
      encoder_output,encoder_hidden=encoder(input_tensor[ei],encoder_hidden)
      encoder_outputs[ei]+=encoder_output[0,0]
    # Decoder
    decoder_input=torch.tensor([[SOS_token]],device=device)
    decoder_hidden=encoder_hidden
    decoded_words=[]
    for di in range(max_length):
      decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden)
      topv,topi=decoder_output.data.topk(1)
      if topi.item()==EOS_token:
        decoded_words.append('<EOS>')
      else:
        decoded_words.append(output_lang.index2word[topi.item()])
      decoder_input=topi.squeeze().detach()
    
    return decoded_words

In [78]:
def get_prediction(encoder,decoder,sentence,max_length):
  output_arr=predict(encoder,decoder,sentence,max_length)
  output_words=' '.join(output_arr)
  return output_words

In [79]:
sentence='he is speaking fluent in french'
output=get_prediction(encoder,decoder,sentence,max_length)
print(output)

il est en francais au francais francais . <EOS> <EOS> <EOS> . <EOS> <EOS> .


In [81]:
sentence='the world is a beautiful place to live in'
output=get_prediction(encoder,decoder,sentence,max_length)
print(output)

le monde est une qui est qui dans dans la <EOS> . <EOS> . <EOS>


In [82]:
sentence='we are here to protect you'
output=get_prediction(encoder,decoder,sentence,max_length)
print(output)

nous sommes ici pour toi . <EOS> . <EOS> . <EOS> . <EOS> . <EOS>


In [67]:
torch.save(encoder.state_dict(),'encoder.pt')
torch.save(decoder.state_dict(),'decoder.pt')