In [2]:
# Imports required
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.char2index = {"SOS": 0, "EOS": 1}
        self.char2count = {}
        self.index2char = {0: "SOS", 1: "EOS"}
        self.n_chars = 2  # Count SOS and EOS

    def addword(self, word):
        # Splits a word into individual characters and adds them to the object instance
        for char in list(word):
            self.addchar(char)

    def addchar(self, char):
        # If the char is being encountered for the first time
        if char not in self.char2index:
            self.char2index[char] = self.n_chars
            self.char2count[char] = 1
            self.index2char[self.n_chars] = char
            # Increment count of unique chars
            self.n_chars += 1
        else:
            # Increment count to depict how many times this char has occured
            self.char2count[char] += 1

In [4]:
def readLangs(data_path, lang1, lang2):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(data_path, encoding="utf-8").\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[s for s in l.split(",")] for l in lines]

    # Make Lang instances
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [5]:
def  prepareData(data_path, lang1, lang2):
    input_lang, output_lang, pairs = readLangs(data_path, lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting chars...")
    for pair in pairs:
        input_lang.addword(pair[0])
        output_lang.addword(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_chars)
    print(output_lang.name, output_lang.n_chars)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData("../../data/Aksharantar/aksharantar_sampled/aksharantar_sampled/tam/tam_train.csv", 'eng', 'tam')
print(random.choice(pairs))

Reading lines...
Read 51200 sentence pairs
Counting chars...
Counted words:
eng 28
tam 48
['deivaththanmai', 'தெய்வத்தன்மை']


In [6]:
input_lang.char2count

{'t': 56253,
 'h': 51106,
 'o': 16365,
 'a': 172788,
 'c': 6050,
 'r': 36211,
 'y': 16109,
 'm': 25043,
 'e': 20573,
 'n': 38419,
 'i': 64107,
 'v': 21484,
 'u': 45117,
 'd': 17659,
 'g': 14824,
 'l': 32680,
 'z': 3554,
 'k': 40135,
 'p': 27530,
 's': 10519,
 'b': 2550,
 'w': 406,
 'j': 1488,
 'q': 24,
 'f': 148,
 'x': 68}

In [7]:
print(output_lang.char2count)

{'த': 41674, 'ொ': 3498, 'ட': 26961, '்': 100092, 'ா': 29506, 'ச': 14466, 'ர': 25180, 'ய': 17666, 'ம': 25036, 'ெ': 5686, 'ன': 19584, 'ை': 16577, 'அ': 3916, 'வ': 21882, 'ற': 12550, 'ி': 44664, 'ு': 42713, 'ஆ': 1219, 'ண': 5762, 'க': 53703, 'ள': 14456, 'ல': 17951, 'ழ': 3727, 'ஒ': 602, 'ந': 8539, 'ே': 7402, 'ப': 30399, 'ூ': 2859, 'ீ': 2631, 'ோ': 6002, 'ஜ': 1184, 'எ': 1300, 'இ': 2441, 'ஹ': 427, 'ங': 3923, 'உ': 1622, 'ஷ': 799, 'ஏ': 364, 'ஞ': 606, 'ஃ': 92, 'ஸ': 1510, 'ஈ': 155, 'ஓ': 229, 'ௌ': 66, 'ஊ': 229, 'ஐ': 66}


In [8]:
# Device is a cuda device if compatible NVidia GPU is found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [42]:
rnn_type_dict = {
    "rnn" : nn.RNN,
    "gru" : nn.GRU,
    "lstm" : nn.LSTM
}

In [43]:
MAX_LENGTH = 50

class AttnDecoderRNN(nn.Module):
    def __init__(self, rnn_type, output_embedding_dict_size, output_embedding_size, hidden_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_embedding_dict_size = output_embedding_dict_size
        self.output_embedding_size = output_embedding_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_embedding_dict_size, self.output_embedding_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.decoder = rnn_type_dict[rnn_type](self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_embedding_dict_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.decoder(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [44]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [45]:
def indexesFromword(lang, word):
    return [lang.char2index[char] for char in list(word)]


def tensorFromword(lang, word):
    indexes = indexesFromword(lang, word)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromword(input_lang, pair[0])
    target_tensor = tensorFromword(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [46]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [47]:
def evaluate(encoder, decoder, word, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromword(input_lang, word)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_chars = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            # decoder_output, decoder_hidden, decoder_attention = decoder(
            #     decoder_input, decoder_hidden, encoder_outputs)
            # decoder_attentions[di] = decoder_attention.data
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_chars.append('<EOS>')
                break
            else:
                decoded_chars.append(output_lang.index2char[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_chars, decoder_attentions[:di + 1]

In [48]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        # output_chars, attentions = evaluate(encoder, decoder, pair[0])
        output_chars = evaluate(encoder, decoder, pair[0])
        output_word = ' '.join(output_chars)
        print('<', output_chars)
        print('')

In [49]:
class EncoderRNN(nn.Module):
    def __init__(self, rnn_type, input_embedding_dict_size, input_embedding_size, hidden_size, num_layers=1, dropout=0, bidirectional=False):
        super(EncoderRNN, self).__init__()
        self.input_embedding_dict_size = input_embedding_dict_size
        self.embedding_size = input_embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.embedding = nn.Embedding(input_embedding_dict_size, input_embedding_size)
        self.encoder = rnn_type_dict[rnn_type](input_embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)

    def forward(self, input, hidden):
        embedded_input = self.embedding(input).view(1, 1, -1)
        output, hidden = self.encoder(embedded_input, hidden)
        print(output.shape, hidden.shape)
        if self.rnn_type == "lstm":
            hidden = hidden[0]
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size, device=device)

In [50]:
class DecoderRNN(nn.Module):
    def __init__(self, rnn_type, output_embedding_dict_size, output_embedding_size, hidden_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_embedding_dict_size = output_embedding_dict_size
        self.output_embedding_size = output_embedding_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_embedding_dict_size, output_embedding_size)
        self.rnn_type = rnn_type
        self.decoder = rnn_type_dict[rnn_type](output_embedding_size, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, output_embedding_dict_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, output, hidden):
        embedded_output = self.embedding(output).view(1, 1, -1)
        embedded_output_relued = F.relu(embedded_output)
        output, hidden = self.decoder(embedded_output_relued, hidden)
        if self.rnn_type == "lstm":
            hidden = hidden[0]
        final_output = self.softmax(self.out(output[0]))
        return final_output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size, device=device)

In [51]:
def trainIters(encoder, decoder, n_epochs, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()    
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_epochs + 1):
        for pair in pairs:
            training_pair = tensorsFromPair(pair)
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]

            loss = train(input_tensor, target_tensor, encoder,
                        decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_epochs),
                                            iter, iter / n_epochs * 100, print_loss_avg))

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    showPlot(plot_losses)

In [52]:
teacher_forcing_ratio = 0.5
MAX_LENGTH = 50

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    print(input_tensor.size(), input_tensor.shape, input_length)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]  

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            # decoder_output, decoder_hidden, decoder_attention = decoder(
            #     decoder_input, decoder_hidden, encoder_outputs)
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            # decoder_output, decoder_hidden, decoder_attention = decoder(
            #     decoder_input, decoder_hidden, encoder_outputs)
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [53]:
hidden_size = 256
encoder1 = EncoderRNN(rnn_type="gru", input_embedding_dict_size=input_lang.n_chars, input_embedding_size=hidden_size, hidden_size=hidden_size, num_layers=3).to(device)
# attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_chars, dropout_p=0.1).to(device)
decoder1 = DecoderRNN(rnn_type="gru", output_embedding_dict_size=output_lang.n_chars, output_embedding_size=hidden_size, hidden_size=hidden_size, num_layers=3).to(device)
trainIters(encoder1, decoder1, 5, print_every=1)

torch.Size([13, 1]) torch.Size([13, 1]) 13
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
0m 0s (- 0m 2s) (1 20%) 3.8863
torch.Size([12, 1]) torch.Size([12, 1]) 12
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 256]) torch.Size([3, 1, 256])
torch.Size([1, 1, 2

KeyboardInterrupt: 

In [59]:
evaluateRandomly(encoder1, decoder1)

> kaaththunindraar
= காத்துநின்றார்
< ['ந', 'ி', 'ன', '்', 'ற', 'ு', 'ந', '்', 'த', 'ு', 'ன', '்', '<EOS>']

> aloagangkalin
= அலோகங்களின்
< ['க', 'ொ', 'ல', '்', 'க', 'ள', 'ி', 'ன', '்', '<EOS>']

> mudivudaiya
= முடிவுடைய
< ['ம', 'ு', 'ட', 'ு', 'வ', 'ி', 'ட', 'ு', 'ட', 'ு', 'வ', 'ட', 'ு', '<EOS>']

> kaaynthupoayulla
= காய்ந்துபோயுள்ள
< ['ந', 'ா', 'ன', '்', 'ப', 'ு', 'ப', '்', 'ப', 'ு', 'ள', '்', 'ள', 'ு', 'ள', '்', '<EOS>']

> marakkura
= மறக்குற
< ['ம', 'ு', 'ர', 'ு', 'க', '்', 'க', 'க', 'ு', 'க', '்', 'க', '<EOS>']

> asaivinaal
= அசைவினால்
< ['வ', 'ி', 'ன', '்', 'ன', 'ி', 'ன', '்', '<EOS>']

> vidaikandathu
= விடைகண்டது
< ['வ', 'ி', 'ன', '்', 'க', 'ு', 'க', '்', 'க', 'ு', 'க', '்', 'க', 'ு', '<EOS>']

> makkalinathumaana
= மக்களினதுமான
< ['க', 'ி', 'ம', 'ன', '்', 'க', 'ு', 'க', 'ள', 'ு', 'க', '்', 'க', 'ள', 'ு', 'ம', '்', '<EOS>']

> thanjaiyilirunthu
= தஞ்சையிலிருந்து
< ['ந', 'ி', 'ன', '்', 'த', 'ி', 'ய', 'ு', 'ன', '்', '<EOS>']

> sittruthadugalukkul
= சிற்றுதடுகளுக்குள்
< ['ச',

In [39]:
output_chars, attentions = evaluate(
    encoder1, attn_decoder1, "jaavaiyum")
plt.matshow(attentions.numpy())

<matplotlib.image.AxesImage at 0x1cafc67f948>

In [40]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")

KeyError: ' '