In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import sys
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the dataset file below is reachable.
drive.mount('/content/gdrive/')
# Location of the tab-separated sentence-pair file that is later passed to process_data.
# NOTE(review): hard-coded absolute Drive path — adjust it to your own Drive layout.
system_path = "/content/gdrive/MyDrive/pytorch/data/data_name/eng-fra.txt"

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Reserved vocabulary indices: 0 marks start-of-sentence, 1 marks end-of-sentence.
# They line up with the two entries every Language starts with in index2word.
SOS_token = 0
EOS_token = 1

In [None]:
def unicodeToAscii(s):
    """Strip combining marks (accents) from a string.

    The text is decomposed with Unicode NFD normalisation and every character
    whose category is 'Mn' (nonspacing mark) is dropped; all other characters
    are kept unchanged, whether or not they are ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalize_string(s):
  """Lower-case a sentence, remove accents and reduce it to letters and .!? tokens.

  Steps, in order:
    1. lower-case and trim the input, then drop combining marks via unicodeToAscii;
    2. insert a space before each '.', '!' or '?' so punctuation is its own token;
    3. replace every run of characters other than ASCII letters and '.', '!', '?'
       with a single space;
    4. trim the result.

  Returns:
    The cleaned string. Splitting it on ' ' yields the tokens used for the vocabulary.
  """
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  # Steps 2 and 3 can leave a space at either end (e.g. a sentence that starts
  # or ends with a quote or digit). Without trimming, split(' ') would produce
  # empty-string tokens and a leading space would defeat prefix matching.
  return s.strip()

def read_data(file_path):
  """Load a tab-separated sentence file and normalise every field.

  Args:
    file_path: path to a UTF-8 text file with one example per line, the
      fields of each line separated by tab characters.

  Returns:
    A list with one entry per line; each entry is the list of that line's
    tab-separated fields after normalize_string has been applied.
  """
  # Read inside a context manager so the handle is closed as soon as the text
  # is loaded, and pin the encoding so accented characters decode the same way
  # regardless of the platform's default locale.
  with open(file_path, encoding='utf-8') as corpus:
    lines = corpus.read().strip().split('\n')
  return [[normalize_string(field) for field in line.split('\t')] for line in lines]

In [None]:
# A sentence is kept only if it has fewer than MAX_LENGTH space-separated tokens
# (see filter_pair).
MAX_LENGTH = 10
# Only pairs whose first sentence starts with one of these prefixes are kept.
# Normalisation replaces apostrophes with spaces, hence "i m " for "i'm".
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filter_pair(p):
  """Decide whether a sentence pair is kept for training.

  A pair passes when both sentences have fewer than MAX_LENGTH space-separated
  tokens and the first sentence starts with one of eng_prefixes.
  """
  first, second = p[0], p[1]
  if len(first.split(' ')) >= MAX_LENGTH:
    return False
  if len(second.split(' ')) >= MAX_LENGTH:
    return False
  return first.startswith(eng_prefixes)

def filter(pairs):
  """Return, in order, the pairs accepted by filter_pair.

  Note: this module-level name shadows Python's builtin ``filter``.
  """
  kept = []
  for candidate in pairs:
    if filter_pair(candidate):
      kept.append(candidate)
  return kept

In [None]:
class Language:
  """Vocabulary for one language: word <-> index maps plus word counts.

  Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers in
  index2word; real words receive indices from 2 upwards in order of first
  appearance.
  """

  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0: "SOS", 1: "EOS"}
    # Next free index; starts at 2 because SOS and EOS are already present.
    self.n_words = 2

  def add_word(self, word):
    """Register one occurrence of ``word``, assigning a new index if unseen."""
    if word in self.word2index:
      self.word2count[word] += 1
      return

    new_index = self.n_words
    self.word2index[word] = new_index
    self.word2count[word] = 1
    self.index2word[new_index] = word
    self.n_words = new_index + 1

  def add_sentence(self, sentence):
    """Register every space-separated token of ``sentence``."""
    for token in sentence.split(' '):
      self.add_word(token)

In [None]:
def process_data(file_path, reverse):
  """Read, filter and index a sentence-pair file.

  The pairs are read with read_data and filtered with filter *before* any
  reversal, so the prefix test always applies to the first column of the file.

  Args:
    file_path: path handed to read_data.
    reverse: when true, each pair is reversed and the languages are labelled
      'french' (input) and 'eng' (output); otherwise 'eng' and 'french'.

  Returns:
    (input_language, output_language, pairs), where the two Language objects
    have been populated from the first and second sentence of every kept pair.
  """
  pairs = filter(read_data(file_path))

  if not reverse:
    source_lang, target_lang = Language('eng'), Language('french')
  else:
    pairs = [pair[::-1] for pair in pairs]
    source_lang, target_lang = Language('french'), Language('eng')

  for pair in pairs:
    source_lang.add_sentence(pair[0])
    target_lang.add_sentence(pair[1])

  return source_lang, target_lang, pairs

In [None]:
# reverse=True: the 'french' Language is the input side and 'eng' the output side.
# NOTE: `input` shadows the Python builtin of the same name from here on.
input, output, pairs = process_data(system_path, True)

In [None]:
# Report each language's vocabulary size (the count includes the SOS and EOS entries).
print(input.name, input.n_words)
print(output.name, output.n_words)

french 4345
eng 2803


### For every input word, the encoder outputs a vector and a hidden state; the hidden state is then fed back in when processing the next input word.

## Classical encoder-decoder 

In [None]:
class EncoderRNN(nn.Module):
  """GRU encoder that consumes one token index per call.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_size: embedding dimension, also the GRU input size.
    hidden_size: GRU hidden-state size.
  """

  def __init__(self, vocab_size, embed_size, hidden_size):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(vocab_size, embed_size)
    self.gru = nn.GRU(embed_size, hidden_size)

  def forward(self, input, hidden):
    """Embed ``input``, reshape it to (1, 1, -1) and run one GRU step.

    Returns the GRU's (output, hidden) pair.
    """
    embedded = self.embedding(input)
    step_input = embedded.view(1, 1, -1)
    return self.gru(step_input, hidden)

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
    return torch.zeros((1, 1, self.hidden_size), device=device)

In [None]:
class DecoderRNN(nn.Module):
  """Single-step GRU decoder with separate embedding and hidden sizes.

  NOTE(review): a later cell defines another ``DecoderRNN`` whose constructor
  takes ``(hidden_size, output_size)``; when the notebook is run top to bottom
  that later definition replaces this one.

  Args:
    vocab_size: size of the output vocabulary (embedding rows and logits).
    embed_size: embedding dimension, also the GRU input size.
    hidden_size: GRU hidden-state size.
  """

  def __init__(self, vocab_size, embed_size, hidden_size):
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(vocab_size, embed_size)
    self.gru = nn.GRU(embed_size, hidden_size)
    self.linear1 = nn.Linear(hidden_size, vocab_size)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input, hidden):
    """Decode one step.

    Args:
      input: tensor holding the previous token index; its embedding is
        reshaped to (1, 1, -1).
      hidden: GRU hidden state from the previous step.

    Returns:
      (log_probs, hidden): log-probabilities over the vocabulary with shape
      (1, vocab_size), and the updated hidden state.
    """
    embedded = F.relu(self.embedding(input).view(1, 1, -1))
    output, hidden = self.gru(embedded, hidden)
    logits = self.linear1(output[0])
    # Return the log-softmax, not the raw logits: the training loop scores the
    # decoder output with nn.NLLLoss, which expects log-probabilities.
    return self.softmax(logits), hidden

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
    return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder whose embedding size equals its hidden size.

    Args:
        hidden_size: embedding dimension and GRU hidden-state size.
        output_size: size of the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step and return (log_probs, hidden).

        The embedded token is reshaped to (1, 1, -1), passed through ReLU and
        one GRU step; the step output is projected to the vocabulary and
        log-softmaxed along dim 1.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(embedded, hidden)
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [None]:
# given a sentence, return the index representation of the sentence
def indexesFromSentence(language, sentence):
  """Map each space-separated token of ``sentence`` to its index in ``language.word2index``.

  Raises KeyError for a token that is not in the vocabulary.
  """
  lookup = language.word2index
  indexes = []
  for token in sentence.split(' '):
    indexes.append(lookup[token])
  return indexes

def tensorFromSentence(language, sentence):
  """Encode a sentence as a column tensor of token indices terminated by EOS_token.

  Returns a torch.long tensor on ``device`` reshaped to (-1, 1), i.e. one row
  per token plus a final row for EOS_token.
  """
  token_ids = indexesFromSentence(language, sentence) + [EOS_token]
  flat = torch.tensor(token_ids, dtype=torch.long, device=device)
  return flat.view(-1, 1)

def tensorFromPair(input, output, pair):
  """Encode both sentences of ``pair`` as index tensors.

  ``pair[0]`` is encoded with the ``input`` language and ``pair[1]`` with the
  ``output`` language; the result is the tuple (input_tensor, target_tensor).
  """
  return (
    tensorFromSentence(input, pair[0]),
    tensorFromSentence(output, pair[1]),
  )

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
  """Run one optimisation step on a single (input, target) sentence pair.

  The encoder reads the input one token at a time; its final hidden state
  initialises the decoder. The decoder starts from SOS_token and is fed its own
  greedy prediction at every step (no teacher forcing), stopping early when it
  predicts EOS_token.

  Args:
    input_tensor: token indices of the source sentence, indexed along dim 0.
    target_tensor: token indices of the target sentence, indexed along dim 0.
    encoder, decoder: the two modules being trained.
    encoder_optimizer, decoder_optimizer: their optimizers.
    criterion: loss applied to (decoder_output, target_tensor[j]) at each step.
    max_length: unused; kept so existing callers that pass it keep working.

  Returns:
    The summed loss divided by the target length. Because decoding can stop
    early, the sum may cover fewer steps than ``target_length``.
  """
  encoder_hidden = encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length = input_tensor.size(0)
  target_length = target_tensor.size(0)

  loss = 0

  # Only the final hidden state is passed on to the decoder, so the per-step
  # encoder outputs are not stored. (Storing them in a fixed max_length buffer
  # served no purpose here and failed on inputs longer than max_length.)
  for i in range(input_length):
    _, encoder_hidden = encoder(input_tensor[i], encoder_hidden)

  decoder_input = torch.tensor([[SOS_token]], device=device)
  decoder_hidden = encoder_hidden

  for j in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    loss += criterion(decoder_output, target_tensor[j])

    # Greedy choice becomes the next input; detach so gradients do not flow
    # through the sampled token.
    _, topi = decoder_output.topk(1)
    decoder_input = topi.squeeze().detach()
    if decoder_input.item() == EOS_token:
      break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length


In [None]:
import time
import math
import matplotlib.pyplot as plt
# NOTE(review): 'agg' is a non-interactive matplotlib backend — confirm that
# figures produced by showPlot still render in the notebook with it selected.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with %d, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a task that is ``percent`` complete.

    Args:
        since: start timestamp as returned by time.time().
        percent: completed fraction of the work; the total duration is
            estimated as elapsed / percent, so it must be non-zero.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def showPlot(points):
    """Plot a sequence of values against their index on one new figure.

    Args:
        points: sequence of y-values (here, averaged training losses).
    """
    # plt.subplots() already creates the figure. A preceding plt.figure() call
    # would leave an additional, empty figure open on every invocation.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def trainIters(input, target, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
  """Train the encoder/decoder for ``n_iters`` single-pair SGD steps.

  Training examples are drawn with random.choice from the module-level
  ``pairs`` list and converted to tensors up front. The running average loss is
  printed every ``print_every`` steps and recorded every ``plot_every`` steps;
  the recorded averages are passed to showPlot at the end.

  Args:
    input: Language used to index the first sentence of each pair.
    target: Language used to index the second sentence of each pair.
    encoder, decoder: modules to train.
    n_iters: number of training steps.
    print_every: logging interval, in steps.
    plot_every: interval, in steps, at which an averaged loss is recorded.
    learning_rate: learning rate for both SGD optimizers.
  """
  started_at = time.time()
  averaged_losses = []
  loss_since_print = 0
  loss_since_plot = 0

  encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
  decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
  training_pairs = [tensorFromPair(input, target, random.choice(pairs)) for _ in range(n_iters)]
  criterion = nn.NLLLoss()

  for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
    step_loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
    loss_since_print += step_loss
    loss_since_plot += step_loss

    if step % print_every == 0:
      average = loss_since_print / print_every
      loss_since_print = 0
      print('%s (%d %d%%) %.4f' % (timeSince(started_at, step / n_iters),
                                   step, step / n_iters * 100, average))
    if step % plot_every == 0:
      averaged_losses.append(loss_since_plot / plot_every)
      loss_since_plot = 0

  showPlot(averaged_losses)

In [None]:
# Model sizes. The encoder's final hidden state is handed to the decoder inside
# train(), so both networks are built with the same hidden_size.
hidden_size = 256
embed_size = 256
encoder1 = EncoderRNN(input.n_words, embed_size, hidden_size).to(device)
# Two-argument constructor: this is the DecoderRNN(hidden_size, output_size) definition.
decoder1 = DecoderRNN(hidden_size, output.n_words).to(device)

# 75,000 single-pair training steps, printing the running average loss every 5,000.
trainIters(input, output, encoder1, decoder1, 75000, print_every=5000)

1m 1s (- 14m 18s) (5000 6%) 3.0091
1m 57s (- 12m 46s) (10000 13%) 2.4785
2m 55s (- 11m 42s) (15000 20%) 2.1726
3m 53s (- 10m 40s) (20000 26%) 1.8788


KeyboardInterrupt: ignored