In [15]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random

In [16]:
def normalize_sentence(df, part):
    sentence = df[part].str
    sentence = sentence.encode('utf-8', errors='ignore').str.decode('utf-8')
    return sentence

def read_sentence(df, first, second):
    sentence1 = normalize_sentence(df, first)
    sentence2 = normalize_sentence(df, second)
    return sentence1, sentence2

def read_file(file, first, second):
    df = pd.read_csv(file, delimiter=' , ', header=None, names=[first, second])
    return df

def process_data(first, second, MAX_LENGTH = 15):
    df = read_file('/content/ferdosi.txt', first, second)
    print("Read %s sentence pairs" % len(df))
    
    sentence1, sentence2 = read_sentence(df, first, second)

    source = Language()
    target = Language()
    pairs = []

    for i in range(len(df)):
        if len(sentence1[i].split(' ')) < MAX_LENGTH and len(sentence2[i].split(' ')) < MAX_LENGTH :
            full = [sentence1[i], sentence2[i]]
            source.add_sentence(sentence1[i])
            target.add_sentence(sentence2[i])
            pairs.append(full)
    
    return source, target, pairs


In [17]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 15

class Language:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)
    
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [18]:
MAX_LENGTH = 15

def indexes_from_sentence(language, sentence):
    words =  [language.word2index[word] for word in sentence.split(' ')]
    return [SOS_token] + words + [EOS_token] * (MAX_LENGTH - len(words) - 1)

def tensor_from_sentence(language, sentence):
    indexes = indexes_from_sentence(language, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype= torch.long, device = device).view(-1, 1)

def tensors_from_pair(source, target, pair):
    input_tensor = tensor_from_sentence(source, pair[0])
    target_tensor = tensor_from_sentence(target, pair[1])
    return (input_tensor, target_tensor) 

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embed_dim, decoder_hidden_dim = 512, num_layers=1):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(self.input_dim, self.embed_dim)
        self.hidden_fc = torch.nn.Sequential(
            torch.nn.Linear(hidden_dim, decoder_hidden_dim),
            torch.nn.ReLU(inplace=True)
        )
        self.lstm = nn.LSTM(self.embed_dim, self.hidden_dim, num_layers = self.num_layers)

    def forward(self, source):
        # (1,1,-1)
        embedded = self.embedding(source)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

In [21]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, embed_dim, num_layers):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.dropout = nn.Dropout(0.1)


        self.embedding = nn.Embedding(output_dim, self.embed_dim)
        self.lstm = nn.LSTM(self.embed_dim, self.hidden_dim, num_layers = self.num_layers)
        self.out = nn.Linear(self.hidden_dim, self.output_dim)
        self.softmax = nn.LogSoftmax(dim = -1)

    def forward(self, input, hidden, cell):
        input = input.view(1, -1)
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))       
        prediction = self.softmax(self.out(output[0]))

        return prediction, hidden, cell

In [22]:
class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
        super().__init__()

        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, source, target, teacher_forcing_ratio = 0.5):
        source_length = source.shape[0]
        target_length = target.shape[0]
        batch_size = target.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(target_length,  batch_size, vocab_size).to(self.device)
        encoder_output, encoder_hidden, cell = self.encoder(source)
        decoder_hidden = encoder_hidden.to(self.device)

        decoder_input = torch.tensor([SOS_token], device = self.device)

        for i in range(target_length):
            decoder_output, decoder_hidden, cell = self.decoder(decoder_input, decoder_hidden, cell)
            outputs[i] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            top_prob, top_ind = decoder_output.topk(1)
            input = target[i] if teacher_force else top_ind
            if teacher_force == False and input.item() == EOS_token:
                break
            
        return outputs

In [23]:
def get_batches(source, target, axis, batch_size):
    num_batches = int(np.ceil(source.shape[axis] * 1.0 / batch_size))

    for batch in range(num_batches):
        yield batch, source[:, batch * batch_size: min((batch + 1) * batch_size, source.shape[axis])], target[:, batch * batch_size: min((batch + 1) * batch_size, source.shape[axis])]

In [24]:
teacher_forcing_ratio = 0.5

def clacModel(model, input_tensor, target_tensor, model_optimizer, criterion):
    
    model_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    loss = 0
    epoch_loss = 0
    output = model(input_tensor, target_tensor)

    num_iter = output.size(0)

    #calculate the loss from a predicted sentence with the expected result
    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])

    loss.backward()
    model_optimizer.step()
    epoch_loss = loss.item() / num_iter

    return epoch_loss

In [34]:
def train(model, source, target, pairs, optimizer, batch_size = 128):
    model.train()

    criterion = nn.NLLLoss()
    total_loss_iterations = 0
    loss = 0
    training_pairs = [tensors_from_pair(source, target, random.choice(pairs)) for i in range(num_iteration)]
    
    print("num of iterations is", batch_size)
    for iter in range(1, batch_size + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        total_loss_iterations += clacModel(model, input_tensor, target_tensor, optimizer, criterion)
        loss += total_loss_iterations
        
        if iter % 100 == 0:
            avarage_loss = total_loss_iterations / 50
            total_loss_iterations = 0
            print('%d %.4f' % (iter, avarage_loss))        
            
    torch.save(model.state_dict(), 'mytraining.pt')
    return model , (loss / iter)


In [30]:
def evaluate(model, input_lang, output_lang, sentences, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentences[0])
        output_tensor = tensor_from_sentence(output_lang, sentences[1])
    
        decoded_words = []
    
        output = model(input_tensor, output_tensor)
    
        for ot in range(output.size(0)):
            topv, topi = output[ot].topk(1)

            if topi[0].item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi[0].item()])
    
    return decoded_words

def evaluateRandomly(model, source, target, pairs, n=5):
    for i in range(n):
        pair = random.choice(pairs)
        print("source {}".format(pair[0]))        
        output_words = evaluate(model, source, target, pair)
        output_sentence = ' '.join(output_words)
        print("target {}".format(pair[1]))
        print("predicted {}".format(output_sentence))

In [35]:
source, target, pairs = process_data('first', 'second')

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

#print number of words
input_size = source.n_words
output_size = target.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 1000

#create encoder-decoder model
DEC_HID_DIM = 512
encoder = Encoder(input_size, hidden_size, embed_size,DEC_HID_DIM, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2seq(encoder, decoder, device).to(device)

#print model 
print(encoder)
print(decoder)

  if sys.path[0] == '':


Read 49609 sentence pairs
random sentence ['بدو گفت زینسان که گفتی بساز', 'مرا کردی از خون او بی نیاز']
Input : 12458 Output : 13071
Encoder(
  (embedding): Embedding(12458, 256)
  (hidden_fc): Sequential(
    (0): Linear(in_features=512, out_features=512, bias=True)
    (1): ReLU(inplace=True)
  )
  (lstm): LSTM(256, 512)
)
Decoder(
  (dropout): Dropout(p=0.1, inplace=False)
  (embedding): Embedding(13071, 256)
  (lstm): LSTM(256, 512)
  (out): Linear(in_features=512, out_features=13071, bias=True)
  (softmax): LogSoftmax(dim=-1)
)


In [None]:
from time import time

epochs = 300
optimizer = optim.Adam(model.parameters(), lr=0.001)

for i in range(epochs):

    start_time = time()
    
    model, train_loss = train(model, source, target, pairs, optimizer, 128)
    evaluateRandomly(model, source, target, pairs)

    print(f'\tTrain Loss: {train_loss:.3e} | Train PPL: {np.exp(train_loss):.3e}')

    end_time = time()

    print("ended in: ", end_time - start_time, "sec")

num of iterations is 128
100 4.7280
source بپرهیز و پیچان شو از خشم اوی
target ندیدی که خشم اورد چشم اوی
predicted SOS که و و و <EOS>
source سه پور جوان را سپهدار گفت
target پراگنده باشید با گنج جفت
predicted SOS که و و و <EOS>
source سواریش دیدم چو سرو سهی
target خردمند و با زیب و با فرهی
predicted SOS که و و و <EOS>
source بپیچید زان پس سوی دست راست
target بدانست کان روز روز بلاست
predicted SOS که و و و <EOS>
source همه سخته باید که راند سخن
target که گفتار نیکو نگردد کهن
predicted SOS که و و و <EOS>
	Train Loss: 1.009e+02 | Train PPL: 6.388e+43
ended in:  62.04293370246887 sec
num of iterations is 128
100 4.8008
source جهان دیده باید عنان دار کس
target سنان و سپر بایدش یار بس
predicted SOS که و و و <EOS>
source ز شه خواستند ان زمان زینهار
target فروریختند الت کارزار
predicted SOS که از و و <EOS>
source همان بیژن گیو برجست زود
target کجا بود در جنگ برسان دود
predicted SOS که و و و <EOS>
source سه دیگر که نزدیک موبد برند
target گزیت و سر باژها بشمرند
predicted SOS که از و و <EOS>
sour