In [1]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random

In [2]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 15

#initialize Lang Class
class Lang:
    def __init__(self):
        #initialize containers to hold the words and corresponding index
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    #split a sentence into words and add it to the container
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    #If the word is not in the container, the word will be added to it, 
    #else, update the word counter
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def indexesFromSentence(lang, sentence):
   return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
   indexes = indexesFromSentence(lang, sentence)
   indexes.append(EOS_token)
   return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(source, target, pair):
   input_tensor = tensorFromSentence(source, pair[0])
   target_tensor = tensorFromSentence(target, pair[1])
   return (input_tensor, target_tensor)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
class Encoder(nn.Module):
   def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers):
       super(Encoder, self).__init__()
      
       #set the encoder input dimesion , embbed dimesion, hidden dimesion, and number of layers 
       self.input_dim = input_dim
       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.num_layers = num_layers

       #initialize the embedding layer with input and embbed dimention
       self.embedding = nn.Embedding(input_dim, self.embbed_dim)
       #intialize the GRU to take the input dimetion of embbed, and output dimention of hidden and
       #set the number of GRU layers
       self.GRU = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
              
   def forward(self, src):
      
       embedded = self.embedding(src).view(1,1,-1)
       outputs, hidden = self.GRU(embedded)
       return outputs, hidden

In [6]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
        super(Decoder, self).__init__()

        #set the encoder output dimension, embed dimension, hidden dimension, and number of layers 
        self.embbed_dim = embbed_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.dropout = torch.nn.Dropout(0.1)

        # initialize every layer with the appropriate dimension. For the decoder layer, it will consist of an embedding, GRU, a Linear layer and a Log softmax activation function.
        self.embedding = nn.Embedding(output_dim, self.embbed_dim)
        self.GRU = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
        self.out = nn.Linear(self.hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
      
    def forward(self, input, hidden):
        

        # reshape the input to (1, batch_size)
        input = input.view(1, -1)
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        output, hidden = self.GRU(embedded, hidden)       
        prediction = self.softmax(self.out(output[0]))
      
        return prediction, hidden


In [7]:
class Seq2Seq(nn.Module):
   def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()
      
        #initialize the encoder and decoder
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
     
   def forward(self, source, target, teacher_forcing_ratio=0.5):

        input_length = source.size(0) #get the input length (number of words in sentence)
        target_length = target.shape[0]
        batch_size = target.shape[1] 
        vocab_size = self.decoder.output_dim
      
        #initialize a variable to hold the predicted outputs
        outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)

        #encode every word in a sentence
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[i])

        #use the encoder’s hidden layer as the decoder hidden
        decoder_hidden = encoder_hidden.to(device)
    
        #add a token before the first predicted word
        decoder_input = torch.tensor([SOS_token], device=device)  # SOS

        #topk is used to get the top K value over a list
        #predict the output word from the current target word. If we enable the teaching force,  then the #next decoder input is the next word, else, use the decoder output highest value. 

        for t in range(target_length):   
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            topv, topi = decoder_output.topk(1)
            input = (target[t] if teacher_force else topi)
            if(teacher_force == False and input.item() == EOS_token):
                break

        return outputs

In [8]:
teacher_forcing_ratio = 0.5

def clacModel(model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    loss = 0
    epoch_loss = 0

    output = model(input_tensor, target_tensor)

    num_iter = output.size(0)

     #calculate the loss from a predicted sentence with the expected result
    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])

    loss.backward()
    model_optimizer.step()
    epoch_loss = loss.item() / num_iter

    return epoch_loss

def trainModel(model, source, target, pairs, num_iteration=20000):
    model.train()

    optimizer = optim.SGD(model.parameters(), lr=0.001)
    criterion = nn.NLLLoss()
    total_loss_iterations = 0
    training_pairs = [tensorsFromPair(source, target, random.choice(pairs)) for i in range(num_iteration)]
    
    print("num of iterations is", num_iteration)
    for iter in range(1, num_iteration + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        total_loss_iterations += clacModel(model, input_tensor, target_tensor, optimizer, criterion)

        if iter % 50 == 0:
            avarage_loss= total_loss_iterations / 50
            total_loss_iterations = 0
            print('%d %.4f' % (iter, avarage_loss))
        
            
    torch.save(model.state_dict(), 'mytraining.pt')
    return model

In [9]:
def evaluate(model, input_lang, output_lang, sentences, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentences[0])
        output_tensor = tensorFromSentence(output_lang, sentences[1])
    
        decoded_words = []
    
        output = model(input_tensor, output_tensor)
        print(output_tensor)
        print(output)
        print("----")
        print(output.size(0))
        print(EOS_token)
    
        for ot in range(output.size(0)):
            topv, topi = output[ot].topk(1)

            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()])
        print(output_lang.index2word)
        print(decoded_words)
    return decoded_words

def evaluateRandomly(model, source, target, pairs, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print("source {}".format(pair[0]))
        print("target {}".format(pair[1]))
        output_words = evaluate(model, source, target, pair)
        output_sentence = ' '.join(output_words)
        print("predicted {}".format(output_sentence))

In [10]:
#Normalize every sentence
def normalize_sentence(df, lang):
    sentence = df[lang].str
    sentence = sentence.encode('utf-8', errors='ignore').str.decode('utf-8')
    return sentence

def read_sentence(df, first, second):
   sentence1 = normalize_sentence(df, first)
   sentence2 = normalize_sentence(df, second)
   return sentence1, sentence2

def read_file(loc, first, second):
   df = pd.read_csv(loc, delimiter=' , ', header=None, names=[first, second])
   return df

def process_data(first, second):
    df = read_file('/content/ferdosi.txt', first, second)
    print("Read %s sentence pairs" % len(df))
    sentence1, sentence2 = read_sentence(df, first, second)

    source = Lang()
    target = Lang()
    pairs = []

    for i in range(len(df)):
        if len(sentence1[i].split(' ')) < MAX_LENGTH and len(sentence2[i].split(' ')) < MAX_LENGTH:
            full = [sentence1[i], sentence2[i]]
            source.addSentence(sentence1[i])
            target.addSentence(sentence2[i])
            pairs.append(full)

    return source, target, pairs

In [None]:
source, target, pairs = process_data('first', 'second')

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

#print number of words
input_size = source.n_words
output_size = target.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 5000

#create encoder-decoder model
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

#print model 
print(encoder)
print(decoder)

model = trainModel(model, source, target, pairs, num_iteration)

In [13]:
evaluateRandomly(model, source, target, pairs)

source سراپرده و خیمه زد بر دو میل
target بپوشید گیتی به نعل و به پیل
tensor([[ 843],
        [ 130],
        [  56],
        [3155],
        [  12],
        [  56],
        [ 562],
        [   1]])
tensor([[[-10.6476, -10.6687,  -8.4437,  ..., -10.3910, -11.0812, -11.2551]],

        [[-10.7964,  -7.1752,  -8.5215,  ..., -10.4022, -11.2634, -11.4134]],

        [[-10.9332,  -4.5708,  -8.7015,  ..., -10.4653, -11.3470, -11.4744]],

        ...,

        [[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]]])
----
8
1
{0: 'SOS', 1: 'EOS', 2: 'کزین', 3: 'برتر', 4: 'اندیشه', 5: 'برنگذرد', 6: 'خداوند', 7: 'روزی', 8: 'ده', 9: 'رهنمای', 10: 'فروزنده', 11: 'ماه', 12: 'و', 13: 'ناهید', 14: 'مهر', 15: 'نگارنده', 16: 'بر', 17: 'شده', 18: 'پیکرست', 19: 'نبینی', 20: 'مرنجان', 21: 'دو', 22: 'بیننده', 23: 'را', 24: 'که', 25: 'او'