In [None]:
# List the GPUs visible to this runtime.
!nvidia-smi -L

# 0. Setup

In [2]:
# Install pinned torch/torchtext versions (presumably because the notebook imports
# `torchtext.legacy` further down -- confirm before bumping either pin).
!pip install -U torch==1.8.0 torchtext==0.9.0
# Terminate the kernel process; presumably so the runtime restarts and loads the
# replaced packages (execution counts start again from 1 in the cells that follow).
exit()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.8.0
  Downloading torch-1.8.0-cp37-cp37m-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 13 kB/s 
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 51.0 MB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.0
    Uninstalling torchtext-0.13.0:
      Successfully uninstalled torchtext-0.13.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.1

In [3]:
from google.colab import drive

# Mount Google Drive at ./drive; the dataset files are copied from it in a later cell.
drive.mount("./drive")

Mounted at ./drive


## JSON files

In [1]:
!mkdir data
!cp ./drive/MyDrive/projects/language-translation/data/json/language-translation/es-en/es-en-10p*.json ./data/

# Language Translation

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator, TabularDataset


import os
import pandas as pd
from spacy.tokenizer import Tokenizer
import spacy
import random

# Seed both RNGs the notebook draws from: Python's `random` (used for the
# teacher-forcing coin flip) and torch (weight initialisation, dropout).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

## Models

### seq2seq LSTM

In [26]:
class Encoder(nn.Module):
  """LSTM encoder that turns a batch of token indices into a final recurrent state.

  Args:
    input_size: number of rows in the embedding table.
    embedding_size: width of each embedding vector.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    dropout_rate: probability used on the embeddings and between LSTM layers.
  """

  def __init__(self, input_size, embedding_size=100, hidden_size=100, num_layers=2, dropout_rate=.5):
    super().__init__()
    self.embedding_size, self.hidden_size, self.num_layers = embedding_size, hidden_size, num_layers

    # Sub-modules are created in the same order as before so that parameter
    # initialisation consumes the torch RNG identically.
    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout_rate,
    )
    self.dropout = nn.Dropout(p=dropout_rate)

  def forward(self, X):
    """Embed ``X``, run it through the LSTM and return the final ``(h, c)`` pair.

    The LSTM is not ``batch_first``, so ``X`` is read as (seq_len, batch).
    The per-step LSTM outputs are discarded.
    """
    embedded = self.embedding(X)
    _step_outputs, final_state = self.rnn(self.dropout(embedded))
    hidden, cell = final_state
    return hidden, cell

class Decoder(nn.Module):
  """Single-step LSTM decoder that scores the next token over the vocabulary.

  Args:
    input_size: vocabulary size; used for both the embedding table and the
      width of the output projection.
    embedding_size: width of each embedding vector.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    dropout_rate: probability used on the embeddings and between LSTM layers.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_rate):
    super().__init__()
    self.embedding_size, self.hidden_size, self.num_layers = embedding_size, hidden_size, num_layers

    # Sub-modules are created in the same order as before so that parameter
    # initialisation consumes the torch RNG identically.
    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout_rate,
    )
    self.dropout = nn.Dropout(p=dropout_rate)
    # The projection maps back onto the same vocabulary the decoder reads from.
    self.fcc = nn.Linear(in_features=hidden_size, out_features=input_size)

  def forward(self, X, h, c):
    """Decode one time step.

    ``X`` gets a leading length-1 axis before the LSTM, and that axis is
    squeezed away from the scores again. Returns ``(prediction, h, c)`` where
    ``h`` and ``c`` are the updated recurrent state.
    """
    step_tokens = X.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_tokens))

    step_output, (h, c) = self.rnn(embedded, (h, c))

    scores = self.fcc(step_output)
    return scores.squeeze(0), h, c

In [27]:
class seq2seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence step by step.

  Args:
    encoder: module whose call returns the initial ``(h, c)`` state for the decoder.
    decoder: module called as ``decoder(X, h, c)`` returning ``(scores, h, c)``;
      it must expose its output projection as ``fcc`` (an ``nn.Linear``).
  """

  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=.5):
    """Return per-step vocabulary scores of shape (target_len, batch, vocab).

    Args:
      source: source token indices; axis 1 is read as the batch axis.
      target: target token indices; axis 0 is read as the time axis.
      teacher_force_ratio: probability of feeding the ground-truth token
        (rather than the model's own argmax) as the next decoder input.

    Row 0 of the result is left at zero: decoding starts from ``target[0]``
    and the first scores written are for position 1.
    """
    target_len, batch_size = target.shape[0], source.shape[1]
    # Read the vocabulary size and device from the model and its input instead of
    # notebook globals, so the module does not depend on cell execution order.
    target_vocab_size = self.decoder.fcc.out_features

    outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)
    h, c = self.encoder(source)
    X = target[0]

    for i in range(1, target_len):
      output, h, c = self.decoder(X, h, c)
      outputs[i] = output
      predictions = output.argmax(1)  # best-scoring token for every batch element
      # One coin flip per step decides between teacher forcing and free running.
      X = target[i] if random.random() < teacher_force_ratio else predictions

    return outputs

## Single Language
Spanish to English

In [3]:
# Download the Spanish spaCy model used for tokenisation.
# NOTE(review): only the Spanish model is fetched here; the English model
# `en_core_web_sm` used below is assumed to be preinstalled -- confirm.
!python -m spacy download es_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [None]:
# Stand-alone spaCy tokenizers, each built from its language model's vocabulary.
_es_vocab = spacy.load("es_core_news_sm").vocab
spanish_tokenizer = Tokenizer(_es_vocab)
_en_vocab = spacy.load("en_core_web_sm").vocab
english_tokenizer = Tokenizer(_en_vocab)


def tokenizer_es(data):
  """Return the token texts produced by `spanish_tokenizer` for `data`."""
  texts = []
  for token in spanish_tokenizer(data):
    texts.append(token.text)
  return texts


def tokenizer_en(data):
  """Return the token texts produced by `english_tokenizer` for `data`."""
  texts = []
  for token in english_tokenizer(data):
    texts.append(token.text)
  return texts

In [4]:
# Both Fields lower-case the text, tokenise it with the matching spaCy model and
# wrap every sequence in <sos> ... <eos>.
english = Field(tokenize="spacy", lower=True, sequential=True, use_vocab=True,
                init_token="<sos>", eos_token="<eos>", tokenizer_language="en_core_web_sm")

spanish = Field(tokenize="spacy", lower=True, sequential=True, use_vocab=True,
                init_token="<sos>", eos_token="<eos>", tokenizer_language="es_core_news_sm")

# JSON key -> (attribute name on each example, Field).  Every language needs its own
# attribute name: sharing one name would make the English column overwrite the
# Spanish one and leave the examples without an `english` attribute.
fields = {"spanish": ("spanish", spanish), "english": ("english", english)}

In [5]:
# File name of each split inside ./data.
split_files = dict(
    train="es-en-10p-train.json",
    validation="es-en-10p-validation.json",
    test="es-en-10p-test.json",
)

# Parse the three JSON files with the shared `fields` mapping.
train_data, valid_data, test_data = TabularDataset.splits(
    path="data",
    format="json",
    fields=fields,
    **split_files,
)

In [6]:
# Build both vocabularies from the training split only, with the same size and
# frequency limits for each language.
for language_field in (spanish, english):
  language_field.build_vocab(train_data, max_size=10000, min_freq=2)  # vectors='glove.6B.100d'


def _spanish_length(example):
  """Sort key for bucketing: length of the example's `spanish` attribute."""
  return len(example.spanish)


train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_within_batch=True,
    sort_key=_spanish_length,
)

In [10]:
# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Vocabulary sizes: the encoder reads Spanish, the decoder reads and emits English.
INPUT_SIZE_ENCODER, INPUT_SIZE_DECODER = len(spanish.vocab), len(english.vocab)
OUTPUT_SIZE = INPUT_SIZE_DECODER

# Architecture hyper-parameters, identical for encoder and decoder.
EMBEDDING_SIZE_ENCODER = EMBEDDING_SIZE_DECODER = 300
HIDDEN_SIZE = 512
NUM_LAYERS = 2
DROPOUT_ENCODER = DROPOUT_DECODER = 0.5

In [25]:
# Optimisation settings.
LR = .001
# NOTE(review): the iterators built earlier pass batch_size=32 literally and do not read this constant.
BATCH_SIZE = 32
# Index of the padding token in the English vocabulary; the loss ignores targets equal to it.
PAD_INDEX = english.vocab.stoi["<pad>"]

loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
# NOTE(review): `seq2seq_model` is assigned in a cell further down (execution count 13,
# versus 25 for this cell), so a top-to-bottom run raises NameError here unless that
# cell has been executed first.
optimizer = optim.Adam(seq2seq_model.parameters(), lr=LR)

In [13]:
# Instantiate both halves from the shared configuration and move them to `device`.
encoder = Encoder(
    input_size=INPUT_SIZE_ENCODER,
    embedding_size=EMBEDDING_SIZE_ENCODER,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_ENCODER,
).to(device)
decoder = Decoder(
    input_size=INPUT_SIZE_DECODER,
    embedding_size=EMBEDDING_SIZE_DECODER,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_DECODER,
).to(device)

# Wrap encoder and decoder in the full translation model.
seq2seq_model = seq2seq(encoder=encoder, decoder=decoder).to(device)