In [None]:
# List the GPUs visible to this runtime (one line per device).
!nvidia-smi -L

# 0. Setup

In [None]:
# Install pinned torch / torchtext versions; the notebook imports the
# `torchtext.legacy` API further down.
!pip install -U torch==1.8.0 torchtext==0.9.0
# NOTE(review): exit() terminates the running kernel — presumably so the runtime
# restarts and imports the freshly installed versions; confirm.
exit()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.8.0
  Downloading torch-1.8.0-cp37-cp37m-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 13 kB/s 
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 31.8 MB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.0
    Uninstalling torchtext-0.13.0:
      Successfully uninstalled torchtext-0.13.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.1

In [None]:
from google.colab import drive

# Mount Google Drive at ./drive (relative to the working directory); the
# dataset files are copied out of this mount point in a later cell.
drive.mount("./drive")

Mounted at ./drive


## JSON files

In [None]:
!mkdir data
!cp ./drive/MyDrive/projects/language-translation/data/json/language-translation/es-en/es-en-10p*.json ./data/

# Language Translation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator, TabularDataset


import os
import pandas as pd
from spacy.tokenizer import Tokenizer
import spacy
import random

# Seed every RNG the notebook draws from: `random` drives the teacher-forcing
# coin flips in the seq2seq model, while torch drives weight initialisation and
# dropout. Seeding only `random` leaves training non-reproducible.
random.seed(42)
torch.manual_seed(42)

## Models

### seq2seq LSTM

In [None]:
class Encoder(nn.Module):
  """Embed a source token sequence and summarise it with a stacked LSTM."""

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    """Build the embedding table, dropout and LSTM layers.

    Args:
      input_size: number of rows in the embedding table (source vocabulary size).
      embedding_size: dimensionality of each token embedding.
      hidden_size: number of features in the LSTM hidden state.
      num_layers: number of stacked LSTM layers.
      p: dropout probability, applied to the embeddings and between LSTM layers.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

  def forward(self, x):
    """Encode `x` and return the LSTM's final `(hidden, cell)` states.

    `x` holds token indices; the LSTM is not batch-first, so the leading
    dimension is treated as time. The per-step LSTM outputs are discarded.
    """
    embedded = self.embedding(x)
    embedded = self.dropout(embedded)
    _, final_state = self.rnn(embedded)
    hidden, cell = final_state
    return hidden, cell

class Decoder(nn.Module):
  """Single-step LSTM decoder: one token per sequence in, one score vector out."""

  def __init__(
      self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
    """Build the embedding table, dropout, LSTM and output projection.

    Args:
      input_size: number of rows in the embedding table (target vocabulary size).
      embedding_size: dimensionality of each token embedding.
      hidden_size: number of features in the LSTM hidden state.
      output_size: number of scores produced per step by the linear layer.
      num_layers: number of stacked LSTM layers.
      p: dropout probability, applied to the embeddings and between LSTM layers.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, cell):
    """Run one decoding step.

    Args:
      x: token indices for the current step, one per sequence in the batch.
      hidden: LSTM hidden state carried over from the previous step.
      cell: LSTM cell state carried over from the previous step.

    Returns:
      `(predictions, hidden, cell)` where `predictions` are the linear-layer
      scores with the length-1 time dimension removed, and `hidden` / `cell`
      are the updated LSTM states.
    """
    # The LSTM expects a leading time dimension, so add one of length 1.
    step_input = x.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_input))
    rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    scores = self.fc(rnn_out).squeeze(0)
    return scores, hidden, cell

In [None]:
class seq2seq(nn.Module):
  """Encoder/decoder wrapper that decodes the target one step at a time."""

  def __init__(self, encoder, decoder):
    """Store the two sub-modules.

    Args:
      encoder: module whose forward returns `(hidden, cell)` for a source batch.
      decoder: module whose forward maps `(tokens, hidden, cell)` to
        `(scores, hidden, cell)` and which exposes its output projection as
        `fc` (used to size the output buffer).
    """
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    """Decode `target` conditioned on `source`.

    Args:
      source: source token indices, indexed `[time, batch]`.
      target: target token indices, indexed `[time, batch]`; row 0 is fed to
        the decoder as the first input.
      teacher_force_ratio: probability of feeding the ground-truth token
        (instead of the model's own best guess) as the next decoder input.

    Returns:
      Tensor indexed `[time, batch, vocab]` with the decoder scores for steps
      `1..target_len-1`. Row 0 is left as zeros because no prediction is made
      for the first target token.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Size the buffer from the decoder itself rather than a notebook global.
    target_vocab_size = self.decoder.fc.out_features

    # Plain buffer: it must NOT require grad itself. Gradients flow through the
    # decoder outputs that are written into it below.
    outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                          device=source.device)
    hidden, cell = self.encoder(source)
    x = target[0]

    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      # Store this step's scores; without this the model returns all zeros and
      # the loss carries no gradient back to the parameters.
      outputs[t] = output
      best_guess = output.argmax(1)
      x = target[t] if random.random() < teacher_force_ratio else best_guess

    return outputs

## Single Language
Spanish to English

In [None]:
# Download the Spanish spaCy pipeline that the Spanish field/tokenizer loads.
# NOTE(review): en_core_web_sm is loaded further down but never downloaded
# here — presumably it is pre-installed in the runtime; confirm.
!python -m spacy download es_core_news_sm

Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [None]:
# Stand-alone spaCy tokenizers, each built from the vocabulary of the
# corresponding language pipeline.
spanish_tokenizer = Tokenizer(spacy.load("es_core_news_sm").vocab)
english_tokenizer = Tokenizer(spacy.load("en_core_web_sm").vocab)


def tokenizer_es(data):
  """Return the text of every token `spanish_tokenizer` produces for `data`."""
  pieces = []
  for tok in spanish_tokenizer(data):
    pieces.append(tok.text)
  return pieces


def tokenizer_en(data):
  """Return the text of every token `english_tokenizer` produces for `data`."""
  pieces = []
  for tok in english_tokenizer(data):
    pieces.append(tok.text)
  return pieces

In [None]:
# Options shared by both language fields: spaCy tokenization, lower-casing and
# explicit start/end-of-sentence markers.
_FIELD_OPTIONS = dict(
    tokenize="spacy",
    lower=True,
    sequential=True,
    use_vocab=True,
    init_token="<sos>",
    eos_token="<eos>",
)

english = Field(tokenizer_language="en_core_web_sm", **_FIELD_OPTIONS)
spanish = Field(tokenizer_language="es_core_news_sm", **_FIELD_OPTIONS)

# Maps each JSON key to the (attribute name, Field) pair used to load it.
fields = {
    "spanish": ("spanish", spanish),
    "english": ("english", english),
}

In [None]:
# Load the train / validation / test JSON files from ./data, parsing the keys
# declared in `fields`.
train_data, valid_data, test_data = TabularDataset.splits(
    path="data",
    format="json",
    fields=fields,
    train="es-en-10p-train.json",
    validation="es-en-10p-validation.json",
    test="es-en-10p-test.json",
)

In [None]:
# Build both vocabularies from the training split only: at most 10,000 entries
# each, dropping tokens that occur fewer than 2 times.
for language_field in (spanish, english):
  language_field.build_vocab(train_data, max_size=10000, min_freq=2) #vectors='glove.6B.100d'

# Batches of 32 examples, sorted inside each batch by Spanish sentence length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_within_batch=True,
    sort_key=lambda example: len(example.spanish),
)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary-derived sizes: the encoder embeds Spanish tokens, the decoder
# embeds and predicts English tokens.
INPUT_SIZE_ENCODER = len(spanish.vocab)
INPUT_SIZE_DECODER = len(english.vocab)
OUTPUT_SIZE = INPUT_SIZE_DECODER

# Model hyper-parameters (identical for encoder and decoder).
EMBEDDING_SIZE_ENCODER = EMBEDDING_SIZE_DECODER = 300
HIDDEN_SIZE = 512
NUM_LAYERS = 2
DROPOUT_ENCODER = DROPOUT_DECODER = 0.5

In [None]:
# Build both halves with keyword arguments so the long, same-typed positional
# lists cannot be silently mis-ordered.
encoder = Encoder(
    input_size=INPUT_SIZE_ENCODER,
    embedding_size=EMBEDDING_SIZE_ENCODER,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    p=DROPOUT_ENCODER,
).to(device)

# The decoder's output layer is sized with OUTPUT_SIZE, the constant defined
# for that purpose (it equals the English vocabulary size).
decoder = Decoder(
    input_size=INPUT_SIZE_DECODER,
    embedding_size=EMBEDDING_SIZE_DECODER,
    hidden_size=HIDDEN_SIZE,
    output_size=OUTPUT_SIZE,
    num_layers=NUM_LAYERS,
    p=DROPOUT_DECODER,
).to(device)

seq2seq_model = seq2seq(encoder, decoder).to(device)

In [None]:
# Optimisation settings.
LR = 1e-3
BATCH_SIZE = 32

# Index of the padding token in the English vocabulary; the loss skips targets
# equal to it.
PAD_INDEX = english.vocab.stoi["<pad>"]
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)

optimizer = optim.Adam(params=seq2seq_model.parameters(), lr=LR)

### Training

In [None]:
from tqdm.notebook import tqdm

N_EPOCHS = 10
N_BATCHES = len(train_iterator)
for epoch_num in range(1, N_EPOCHS+1):
  print("Epoch [{}/{}]".format(epoch_num, N_EPOCHS))

  # Make sure dropout is active even if an earlier cell switched to eval mode.
  seq2seq_model.train()
  running_loss = 0.0

  # Let tqdm drive the iterator directly instead of zipping it with a range.
  loop = tqdm(train_iterator, total=N_BATCHES)

  for i, batch in enumerate(loop, start=1):
    loop.set_description("{}/{}".format(i, N_BATCHES))
    source_data = batch.spanish.to(device)
    target_data = batch.english.to(device)

    output = seq2seq_model(source_data, target_data)
    # Drop step 0 (no prediction is made for the first target token) and
    # flatten time and batch so the loss sees one row per target token.
    flat_scores = output[1:].reshape(-1, output.shape[2])
    flat_targets = target_data[1:].reshape(-1)

    optimizer.zero_grad()
    loss = loss_fn(flat_scores, flat_targets)
    loss.backward()

    # Clip the global gradient norm to 1 before the update.
    torch.nn.utils.clip_grad_norm_(seq2seq_model.parameters(), max_norm=1)
    optimizer.step()

    # Show the running mean loss so training progress is actually visible.
    running_loss += loss.item()
    loop.set_postfix(loss=running_loss / i)

Epoch [1/10]


  0%|          | 0/4301 [00:00<?, ?it/s]