In [None]:
# Mount Google Drive; the dataset CSV read later in this notebook lives under /content/drive.
from google.colab import drive
drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [None]:
!pip install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.8.0.post1-py3-none-any.whl (796 kB)
[K     |████████████████████████████████| 796 kB 23.2 MB/s 
Collecting lightning-lite==1.8.0.post1
  Downloading lightning_lite-1.8.0.post1-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 86.7 MB/s 
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.10.2-py3-none-any.whl (529 kB)
[K     |████████████████████████████████| 529 kB 84.3 MB/s 
Collecting lightning-utilities==0.3.*
  Downloading lightning_utilities-0.3.0-py3-none-any.whl (15 kB)
Collecting fire
  Downloading fire-0.4.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 9.3 MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115942 sha256=3346ff9cf0dd488

In [None]:
import string
import unicodedata

import pandas as pd
import torch
from torch import nn
from torch.optim import lr_scheduler, Adam
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [None]:
# Load the table of names and their categories from the mounted Drive, then display it.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Datasets/RNN/data/names_with_countries.csv",
)
df

Unnamed: 0,Category,Name
0,Japanese,Abe
1,Japanese,Abukara
2,Japanese,Adachi
3,Japanese,Aida
4,Japanese,Aihara
...,...,...
20069,Vietnamese,Truong
20070,Vietnamese,Van
20071,Vietnamese,Vinh
20072,Vietnamese,Vuong


In [None]:
# Number of names per category (the classes are heavily imbalanced).
df.loc[:, "Category"].value_counts()

Russian       9408
English       3668
Arabic        2000
Japanese       991
German         724
Italian        709
Czech          519
Spanish        298
Dutch          297
French         277
Chinese        268
Irish          232
Greek          203
Polish         139
Scottish       100
Korean          94
Portuguese      74
Vietnamese      73
Name: Category, dtype: int64

We will treat each letter as a separate element of a sequence. Thanks to that, we don't need any sophisticated tokenization.

Besides letters, we will add two additional tokens: <pad> and <eos>. The first is needed for padding (we will pad the names with 0 so that all sequences in a single batch have the same length), while the second is used to "announce" to the model that the name generation process has ended.

In [None]:
# Vocabulary layout: <pad> first (index 0), then ASCII letters and a few
# punctuation characters, and <eos> last.
all_letters = ["<pad>", *string.ascii_letters, *" .,;'-", "<eos>"]
n_letters = len(all_letters)
n_letters

60

In [None]:
# stoi: token -> integer id; itos: integer id -> token (a plain copy of the vocabulary list).
stoi = dict(zip(all_letters, range(len(all_letters))))
itos = list(all_letters)

In [None]:
# <eos> is the last vocabulary entry, so both lookups should agree.
stoi["<eos>"], itos[-1]

(59, '<eos>')

In [None]:
# Sanity check: the lookup table should have one entry per vocabulary token.
len(stoi)

60

To provide the data to the model we need a Dataset. The one defined by us will return a dictionary of five elements.

The three most important are:

category_tensor - one-hot representation of one of the 18 categories.

E.g. English might be represented as [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]] (the position of the 1 is the index of the category in the dataset's category list).

input_tensor - integer representation of the letter tokens of the name.

target_tensor - integer representation of the letter tokens of the target name. The target differs from the input in that it skips the first letter and adds <eos> at the end.

We use 0 for <pad> token and 59 for <eos>

In [None]:
class NameDataset(Dataset):
  """Character-level dataset of (category, name) pairs.

  Each item is a dict with:
    category        - the category string of the row
    name            - the name exactly as stored in the dataframe
    category_tensor - one-hot float tensor of shape (1, n_categories)
    input_tensor    - long tensor with the token ids of the name
    target_tensor   - long tensor with the input shifted left by one token
                      and the <eos> id appended

  Args:
    df: dataframe with a "Category" and a "Name" column.
    stoi: mapping from token string to integer id.
    eos_token: key in `stoi` of the end-of-sequence token.
  """

  def __init__(self, df, stoi, eos_token="<eos>"):
    self.stoi = stoi
    self.eos_token = eos_token
    self.n_tokens = len(self.stoi)

    self.categories = df["Category"].tolist()
    self.names = df["Name"].tolist()

    # Sorted so that a category always gets the same one-hot index; the
    # iteration order of a set of strings can change between Python sessions,
    # which would silently re-label categories for a saved model.
    self.all_categories = sorted(set(self.categories))
    self.n_categories = len(self.all_categories)

  def __getitem__(self, item):
    category = self.categories[item]
    name = self.names[item]

    category_tensor = self.get_category_tensor(category)

    token_ids = self.encode(name)
    eos_id = self.stoi[self.eos_token]

    # dtype is given explicitly so that an empty id list still yields a long tensor.
    input_tensor = torch.tensor(token_ids, dtype=torch.long)
    target_tensor = torch.tensor(token_ids[1:] + [eos_id], dtype=torch.long)

    item_dict = {
        "category": category,
        "name": name,
        "category_tensor": category_tensor,
        "input_tensor": input_tensor,
        "target_tensor": target_tensor
    }

    return item_dict

  def __len__(self):
    return len(self.categories)

  def encode(self, name):
    """Convert a name into a list of token ids using this dataset's vocabulary.

    The name is first decomposed (Unicode NFD) so that accented letters split
    into a base letter plus combining marks; every character that is not a key
    of `self.stoi` (combining marks, digits, other symbols) is then dropped
    instead of raising KeyError.
    """
    decomposed = unicodedata.normalize("NFD", name)
    return [self.stoi[char] for char in decomposed if char in self.stoi]

  def get_category_tensor(self, category):
    """Return the one-hot (1, n_categories) float tensor for `category`.

    Raises:
      ValueError: if `category` is not one of `self.all_categories`.
    """
    li = self.all_categories.index(category)
    tensor = torch.zeros(1, self.n_categories)
    tensor[0][li] = 1
    return tensor

In [None]:
# Build the dataset from the loaded dataframe and inspect one sample.
ds = NameDataset(df=df, stoi=stoi)
ds[1]

{'category': 'Japanese',
 'name': 'Abukara',
 'category_tensor': tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 'input_tensor': tensor([27,  2, 21, 11,  1, 18,  1]),
 'target_tensor': tensor([ 2, 21, 11,  1, 18,  1, 59])}



The Dataset returns sequences of different lengths, which would cause problems when building a batch. To handle this we define a collate_fn function that adds padding (0) at the end of every sequence that is shorter than the longest name in the batch. Thanks to that, we can work with batches of size other than one.

In [None]:
def collate_fn(data):
  """Combine a list of dataset items into one padded batch.

  Args:
    data: list of dicts with the keys "category", "name", "category_tensor",
      "input_tensor" and "target_tensor".

  Returns:
    Tuple (categories, names, category_tensors, input_tensors, target_tensors):
    the first two are plain lists, category_tensors is the concatenation of the
    per-item category tensors, and the last two are long tensors right-padded
    with 0 to the longest sequence in the batch.
  """
  def merge(sequences):
    # Right-pad each sequence with zeros up to the longest one in the batch.
    lengths = list(map(len, sequences))
    padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
    for row, (seq, length) in enumerate(zip(sequences, lengths)):
      padded_seqs[row, :length] = seq[:length]
    return padded_seqs, lengths

  categories, names = [], []
  category_rows, input_seqs, target_seqs = [], [], []
  for sample in data:
    categories.append(sample["category"])
    names.append(sample["name"])
    category_rows.append(sample["category_tensor"])
    input_seqs.append(sample["input_tensor"])
    target_seqs.append(sample["target_tensor"])

  category_tensors = torch.cat(category_rows)
  input_tensors = merge(input_seqs)[0]
  target_tensors = merge(target_seqs)[0]

  return categories, names, category_tensors, input_tensors, target_tensors

In [None]:
# Shuffled loader over the dataset, one name per batch, padded by collate_fn.
dl = DataLoader(dataset=ds, batch_size=1, shuffle=True, collate_fn=collate_fn)

In [None]:
# Pull a single batch to inspect what collate_fn produces.
next(iter(dl))

(['Russian'],
 ['Andronikov'],
 tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 tensor([[27, 14,  4, 18, 15, 14,  9, 11, 15, 22]]),
 tensor([[14,  4, 18, 15, 14,  9, 11, 15, 22, 59]]))

To pass everything into the Lightning training loop, we combine all of the previous steps and define a LightningDataModule.

In [None]:
class NamesDatamodule(pl.LightningDataModule):
  """LightningDataModule that serves padded training batches of names.

  Reads the names CSV, builds the character vocabulary (<pad>, ASCII letters,
  a few punctuation characters, <eos>) and wraps the data in a NameDataset.
  Only a training dataloader is provided.

  Args:
    batch_size: number of names per batch.
    csv_path: location of the CSV file with the names; defaults to the path
      on the mounted Google Drive used throughout this notebook.
  """

  DEFAULT_CSV_PATH = "/content/drive/MyDrive/Datasets/RNN/data/names_with_countries.csv"

  def __init__(self, batch_size, csv_path=DEFAULT_CSV_PATH):
    super().__init__()
    self.batch_size = batch_size
    self.df = pd.read_csv(csv_path)

    self.all_letters = ["<pad>"] + list(string.ascii_letters + " .,;'-") + ["<eos>"]
    self.stoi = {letter: idx for idx, letter in enumerate(self.all_letters)}
    # Id used to fill the padded positions of a batch (0 for this vocabulary).
    self.pad_idx = self.stoi["<pad>"]

  def setup(self, stage=None):
    self.train_set = NameDataset(self.df, self.stoi)

  def train_dataloader(self):
    return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)

  def _pad(self, sequences):
    """Stack 1-D id tensors into a (batch, longest) long tensor, right-padded with the <pad> id."""
    longest = max(len(seq) for seq in sequences)
    padded = torch.full((len(sequences), longest), self.pad_idx, dtype=torch.long)
    for row, seq in enumerate(sequences):
      padded[row, :len(seq)] = seq
    return padded

  def collate_fn(self, data):
    """Combine a list of NameDataset items into one batch dict.

    Returns:
      Dict with "categories" and "names" (lists of strings),
      "category_tensors" (concatenation of the per-item category tensors) and
      "input_tensors" / "target_tensors" (padded long tensors).
    """
    item_dict = {
        "categories": [x["category"] for x in data],
        "names": [x["name"] for x in data],
        "category_tensors": torch.cat([x["category_tensor"] for x in data]),
        "input_tensors": self._pad([x["input_tensor"] for x in data]),
        "target_tensors": self._pad([x["target_tensor"] for x in data])
    }

    return item_dict



We define an RNN (an LSTM-based model). As the loss function, we will use CrossEntropyLoss.

In [None]:
class RNN(pl.LightningModule):
  """Category-conditioned, character-level LSTM that predicts the next letter.

  At every time step the embedding of the current letter is concatenated with
  the category vector, fed through the LSTM and projected to vocabulary logits.

  Args:
    input_size: number of tokens in the vocabulary.
    hidden_size: size of the LSTM hidden and cell states.
    embedding_size: size of the letter embeddings.
    n_categories: length of the category vector.
    n_layers: number of stacked LSTM layers.
    output_size: number of output classes (logits per step).
    p: dropout probability (applied to the embeddings and between LSTM layers).
    pad_idx: target id that marks padding; such positions are excluded from the loss.
  """
  lr = 5e-4

  def __init__(self, input_size, hidden_size, embedding_size, n_categories, n_layers, output_size, p, pad_idx=0):
    super().__init__()

    # Padded target positions must not contribute to the loss, otherwise the
    # model is also trained to emit <pad> after <eos> whenever batch_size > 1.
    self.criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    self.n_layers = n_layers
    self.hidden_size = hidden_size

    # NOTE(review): the table has n_categories more rows than the letter ids
    # seem to require; the size is left unchanged so that the parameter shapes
    # (and therefore existing checkpoints) stay the same.
    self.embedding = nn.Embedding(input_size + n_categories, embedding_size)
    # batch_first=True: forward() builds inputs of shape (batch, 1, features).
    # Without it the LSTM would read the batch axis as the time axis.
    self.lstm = nn.LSTM(embedding_size + n_categories, hidden_size, n_layers, dropout=p, batch_first=True)
    self.fc_out = nn.Linear(hidden_size, output_size)

    self.dropout = nn.Dropout(p)

  def forward(self, batch_of_category, batch_of_letter, hidden, cell):
    """Run a single time step.

    Args:
      batch_of_category: (batch, n_categories) category vectors.
      batch_of_letter: (batch,) token ids of the current letter.
      hidden, cell: LSTM states of shape (n_layers, batch, hidden_size).

    Returns:
      Tuple (logits, (hidden, cell)) where logits has shape (batch, output_size).
    """
    embedding = self.dropout(self.embedding(batch_of_letter))
    category_plus_letter = torch.cat((batch_of_category, embedding), 1)

    # Add the time axis: sequence_length = 1.
    category_plus_letter = category_plus_letter.unsqueeze(1)

    out, (hidden, cell) = self.lstm(category_plus_letter, (hidden, cell))
    out = self.fc_out(out)
    out = out.squeeze(1)

    return out, (hidden, cell)

  def configure_optimizers(self):
    """Adam with the class-level learning rate, decayed by 10x every 7 epochs."""
    optimizer = Adam(self.parameters(), self.lr)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    return [optimizer], [scheduler]

  def training_step(self, batch, batch_idx):
    """Teacher-forced pass over one batch; returns the mean loss over non-pad targets."""
    batch_of_category = batch["category_tensors"]
    inputs = batch["input_tensors"]
    targets = batch["target_tensors"]

    # The state must match the actual batch size, not a hard-coded 1.
    hidden, cell = self.init_hidden(inputs.size(0))

    # We loop over letter positions, feeding one letter of every name at a time.
    step_logits = []
    for t in range(inputs.size(1)):
      output, (hidden, cell) = self(batch_of_category, inputs[:, t], hidden, cell)
      step_logits.append(output)

    # (batch, seq, vocab) -> (batch * seq, vocab) so that a single criterion
    # call averages over every real (non-pad) target token.
    logits = torch.stack(step_logits, dim=1)
    loss = self.criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))

    # self.log replaces the "log" entry of the returned dict, which current
    # Lightning versions no longer read.
    self.log("train_loss", loss)

    return {"loss": loss}

  def init_hidden(self, batch_size):
    """Return zero-initialised (hidden, cell) states for `batch_size` sequences on the module's device."""
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size, device=self.device)
    cell = torch.zeros_like(hidden)

    return hidden, cell

Finally, after defining the model and the datamodule, we can start training. For a reason that is not entirely clear to us, the model performs best when it is trained with a batch size of 1. After only 2 or 3 epochs it should be capable of generating names.

In [None]:
# One name per batch, as described in the text above.
dm = NamesDatamodule(1)

rnn_model = RNN(
    input_size=ds.n_tokens,
    hidden_size=256,
    embedding_size=128,
    n_layers=2,
    n_categories=ds.n_categories,
    output_size=ds.n_tokens,
    p=0.3
)

# `gpus=1` is deprecated since Lightning 1.7 (see the warning it prints);
# accelerator/devices is the supported way to request a single GPU.
# logger=False states explicitly that no experiment logger is wanted.
trainer = Trainer(
    max_epochs=3,
    logger=False,
    accelerator="gpu",
    devices=1,
)

trainer.fit(rnn_model, dm)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | embedding | Embedding        | 10.0 K
2 | lstm      | LSTM             | 940 K 
3 | fc_out    | Linear           | 15.4 K
4 | dropout   | Dropout          | 0     
-----------------------------------------------
965 K     Trainable params
0         Non-trainable params
965 K     Total params
3.862     Total estimated model para

Training: 0it [00:00, ?it/s]

KeyError: ignored