In [3]:
import collections
import copy
import csv
import os
from io import StringIO

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from torch.distributions import categorical
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm as tqdm
# from google.colab import files
import matplotlib.pyplot as plt




# Before you start
You need to save a copy in your own Google Drive then you could edit on this colab.

Google offers free GPU in the colab environments, but you may need to configure the environment.

You can turn on the GPU mode in `Edit -> Notebook Settings` and change the `Runtime type` to be `Python3` and `Hardware accelerator` to be `GPU`.

In [4]:
print("GPU Model: %s" % torch.cuda.get_device_name(0))
print("You should get either a Tesla P100 or Tesla T4 GPU.")
print("Tesla P100 is probably 3x faster than T4 but both should work.")

GPU Model: NVIDIA GeForce RTX 4080 Laptop GPU
You should get either a Tesla P100 or Tesla T4 GPU.
Tesla P100 is probably 3x faster than T4 but both should work.


In [5]:
%load_ext tensorboard

In [6]:
writer = SummaryWriter('./')

In [7]:
PADDING_TOKEN = 0

# RNN modules

In [8]:
class GRUCell(nn.Module):
  """Implementation of GRU cell from https://arxiv.org/pdf/1406.1078.pdf."""

  def __init__(self, input_size, hidden_size, bias=False):
    super().__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bias = bias

    # Learnable weights and bias for `update gate`
    self.W_z = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b_z = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b_z', None)

    # Learnable weights and bias for `reset gate`
    self.W_r = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b_r = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b_r', None)

    # Learnable weights and bias for `output gate`
    self.W = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b', None)

    self.reset_parameters()

  def forward(self, x, prev_state):
    if prev_state is None:
      batch = x.shape[0]
      prev_h = torch.zeros((batch, self.hidden_size), device=x.device)
    else:
      prev_h = prev_state

    concat_hx = torch.cat((prev_h, x), dim=1)
    z = torch.sigmoid(F.linear(concat_hx, self.W_z, self.b_z))
    r = torch.sigmoid(F.linear(concat_hx, self.W_r, self.b_r))
    h_tilde = torch.tanh(
        F.linear(torch.cat((r * prev_h, x), dim=1), self.W, self.b))
    next_h = (1 - z) * prev_h + z * h_tilde
    return next_h

  def reset_parameters(self):
    sqrt_k = (1. / self.hidden_size)**0.5
    with torch.no_grad():
      for param in self.parameters():
        param.uniform_(-sqrt_k, sqrt_k)
    return

  def extra_repr(self):
    return 'input_size={}, hidden_size={}, bias={}'.format(
        self.input_size, self.hidden_size, self.bias is not True)

  def count_parameters(self):
    print('Total Parameters: %d' %
          sum(p.numel() for p in self.parameters() if p.requires_grad))
    return

In [9]:
class LSTMCell(nn.Module):

  def __init__(self, input_size, hidden_size, bias=True):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bias = bias

    # Learnable weights and bias for `input gate`
    self.W_i = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b_i = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b_i', None)

    # Learnable weights and bias for `forget gate`
    self.W_f = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b_f = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b_f', None)

    
    # Learnable weights and bias for `output gate`
    self.W_o = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b_o = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b_o', None)

    # Learnable weights and bias for `cell gate`
    self.W_c = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
      self.b_c = nn.Parameter(torch.Tensor(hidden_size))
    else:
      self.register_parameter('b_c', None)

    self.reset_parameters()

  def forward(self, x, prev_state):
      if prev_state is None:
          batch = x.shape[0]
          prev_h = torch.zeros((batch, self.hidden_size), device=x.device)
          prev_c = torch.zeros((batch, self.hidden_size), device=x.device)
      else:
          prev_h, prev_c = prev_state
      
      # Concatenate the previous hidden state and the current input
      concat_hx = torch.cat((prev_h, x), dim=1)
      
      # Compute the input gate activation
      i = torch.sigmoid(F.linear(concat_hx, self.W_i, self.b_i))
      
      # Compute the forget gate activation
      f = torch.sigmoid(F.linear(concat_hx, self.W_f, self.b_f))
      
      # Compute the output gate activation
      o = torch.sigmoid(F.linear(concat_hx, self.W_o, self.b_o))
      
      # Compute the cell gate (candidate) activation
      c_tilde = torch.tanh(F.linear(concat_hx, self.W_c, self.b_c))
      
      # Update the cell state
      c = f * prev_c + i * c_tilde
      
      # Compute the new hidden state
      h = o * torch.tanh(c)
      
      return h, c

  def reset_parameters(self):
    sqrt_k = (1. / self.hidden_size)**0.5
    with torch.no_grad():
      for param in self.parameters():
        param.uniform_(-sqrt_k, sqrt_k)
    return

  def extra_repr(self):
    return 'input_size={}, hidden_size={}, bias={}'.format(
        self.input_size, self.hidden_size, self.bias is not True)

  def count_parameters(self):
    print('Total Parameters: %d' %
          sum(p.numel() for p in self.parameters() if p.requires_grad))
    return

In [10]:
class PeepholedLSTMCell(nn.Module):
  def __init__(self, input_size, hidden_size, bias=True):
      super().__init__()
      self.input_size = input_size
      self.hidden_size = hidden_size
      self.bias = bias

      # Adjusting the sizes for the concatenated input: [C_{t-1}, h_{t-1}, x_t] and [C_t, h_{t-1}, x_t]
      self.W_i = nn.Parameter(torch.Tensor(hidden_size, hidden_size*2 + input_size))
      self.W_f = nn.Parameter(torch.Tensor(hidden_size, hidden_size*2 + input_size))
      self.W_o = nn.Parameter(torch.Tensor(hidden_size, hidden_size*2 + input_size))
      self.W_c = nn.Parameter(torch.Tensor(hidden_size, hidden_size*2 + input_size))

      if bias:
          self.b_i = nn.Parameter(torch.Tensor(hidden_size))
          self.b_f = nn.Parameter(torch.Tensor(hidden_size))
          self.b_o = nn.Parameter(torch.Tensor(hidden_size))
          self.b_c = nn.Parameter(torch.Tensor(hidden_size))
      else:
          self.register_parameter('b_i', None)
          self.register_parameter('b_f', None)
          self.register_parameter('b_o', None)
          self.register_parameter('b_c', None)

      self.reset_parameters()

  def forward(self, x, prev_state):
      if prev_state is None:
          batch_size = x.size(0)
          prev_h = torch.zeros(batch_size, self.hidden_size, device=x.device)
          prev_c = torch.zeros(batch_size, self.hidden_size, device=x.device)
      else:
          prev_h, prev_c = prev_state

      # Manually concatenate [C_{t-1}, h_{t-1}, x_t] for the inputs
      concat_chx = torch.cat((prev_c, prev_h, x), dim=1)

      # Apply the gates
      i = torch.sigmoid(F.linear(concat_chx, self.W_i, self.b_i))
      f = torch.sigmoid(F.linear(concat_chx, self.W_f, self.b_f))
      g = torch.tanh(F.linear(concat_chx, self.W_c, self.b_c))

      # Update cell state
      c = f * prev_c + i * g

      # For the output gate, now using [C_t, h_{t-1}, x_t]
      concat_chx1 = torch.cat((c, prev_h, x), dim=1)
      o = torch.sigmoid(F.linear(concat_chx1, self.W_o, self.b_o))

      # Compute the new hidden state
      h = o * torch.tanh(c)

      return h, c

  def reset_parameters(self):
    sqrt_k = (1. / self.hidden_size)**0.5
    with torch.no_grad():
      for param in self.parameters():
        param.uniform_(-sqrt_k, sqrt_k)
    return

  def extra_repr(self):
    return 'input_size={}, hidden_size={}, bias={}'.format(
        self.input_size, self.hidden_size, self.bias is not True)

  def count_parameters(self):
    print('Total Parameters: %d' %
          sum(p.numel() for p in self.parameters() if p.requires_grad))
    return

In [11]:
class CoupledLSTMCell(nn.Module):

  def __init__(self, input_size, hidden_size, bias=True):
    super(CoupledLSTMCell, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bias = bias

    # Coupled input and forget gate weights
    self.W_if = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
        self.b_if = nn.Parameter(torch.Tensor(hidden_size))
    else:
        self.register_parameter('b_if', None)

    # Output gate weights
    self.W_o = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
        self.b_o = nn.Parameter(torch.Tensor(hidden_size))
    else:
        self.register_parameter('b_o', None)

    # Cell gate (candidate) weights
    self.W_c = nn.Parameter(torch.Tensor(hidden_size, hidden_size + input_size))
    if bias:
        self.b_c = nn.Parameter(torch.Tensor(hidden_size))
    else:
        self.register_parameter('b_c', None)

    self.reset_parameters()

  def forward(self, x, prev_state):
    if prev_state is None:
        batch_size = x.size(0)
        prev_h = torch.zeros(batch_size, self.hidden_size, device=x.device)
        prev_c = torch.zeros(batch_size, self.hidden_size, device=x.device)
    else:
        prev_h, prev_c = prev_state

    # Concatenate the previous hidden state and the current input
    concat_hx = torch.cat((prev_h, x), dim=1)

    # Coupled input and forget gate
    if_gate = torch.sigmoid(F.linear(concat_hx, self.W_if, self.b_if))
    i = if_gate  # Update (input) gate
    f = 1 - if_gate  # Forget gate is simply 1 - input gate

    # Cell gate (candidate)
    g = torch.tanh(F.linear(concat_hx, self.W_c, self.b_c))

    # Update the cell state
    c = f * prev_c + i * g

    # Output gate
    o = torch.sigmoid(F.linear(concat_hx, self.W_o, self.b_o))

    # Compute the new hidden state
    h = o * torch.tanh(c)

    return h, c

  def reset_parameters(self):
    sqrt_k = (1. / self.hidden_size)**0.5
    with torch.no_grad():
      for param in self.parameters():
        param.uniform_(-sqrt_k, sqrt_k)
    return

  def extra_repr(self):
    return 'input_size={}, hidden_size={}, bias={}'.format(
        self.input_size, self.hidden_size, self.bias is not True)

  def count_parameters(self):
    print('Total Parameters: %d' %
          sum(p.numel() for p in self.parameters() if p.requires_grad))
    return

In [12]:
RNN_MODULES = {
  'gru': GRUCell,
  'lstm': LSTMCell,
  'peepholed_lstm': PeepholedLSTMCell,
  'coupled_lstm': CoupledLSTMCell,
}

# Upload data
Please use the following code snippet to upload

* imdb_train.csv
* imdb_test.csv
* shakespeare.txt

You can choose multiple files to upload all at once.

You Can choose the uncomment the below code if you are using google colab, I run it at the local environment since google colab would only allows you to train on CPU once the quota runs out.

In [13]:

# uploaded = files.upload()

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

In [14]:
def upload_files(file_paths):
    """
    Mimics the behavior of files.upload() from Google Colab in a local environment.
    Reads the specified files and returns a dictionary with file names as keys
    and file contents as values.
    
    :param file_paths: A list of strings, where each string is a file path.
    :return: A dictionary with file names as keys and file contents (as bytes) as values.
    """
    uploaded = {}
    for file_path in file_paths:
        try:
            with open(file_path, 'rb') as file:  # Open the file in binary mode
                uploaded[file_path.split('/')[-1]] = file.read()
            print('User uploaded file "{}" with length {} bytes'.format(
                file_path.split('/')[-1], len(uploaded[file_path.split('/')[-1]])
            ))
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return uploaded

# Specify the paths to your files
file_paths = [
    'imdb_train.csv',
    'imdb_test.csv',
    'shakespeare.txt',
]

# Use the function to read the files
uploaded = upload_files(file_paths)

train_dataset_text = uploaded['imdb_train.csv']
test_dataset_text = uploaded['imdb_test.csv']
shakespeare_text = uploaded['shakespeare.txt']

User uploaded file "imdb_train.csv" with length 60197565 bytes
User uploaded file "imdb_test.csv" with length 6640779 bytes
User uploaded file "shakespeare.txt" with length 1115394 bytes


# Sentiment analysis

In [15]:
### Hyperparameters for training (previously defined in FLAGS)
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0
BATCH_SIZE = 4096
EPOCHS = 51
GRADIENT_CLIP_NORM = 1.0

### Hyperparameters for sentence analysis model
EMBEDDING_DIM = 128
HIDDEN_SIZE = 100
REVIEW_MAX_LENGTH = 200
VOCABULARY_MIN_COUNT = 100
VOCABULARY_MAX_SIZE = 20000
RNN_MODULE = 'gru'    # You need to try 'lstm', 'peepholed_lstm', 'coupled_lstm'

In [16]:
class IMDBReviewDataset(Dataset):

  def __init__(self,
               csv_text,
               vocabulary=None,
               vocab_min_count=10,
               vocab_max_size=None,
               review_max_length=200):
    self.csv_text = csv_text
    self.vocab_min_count = vocab_min_count
    self.vocab_max_size = vocab_max_size
    self.review_max_length = review_max_length - 2

    self.data = []

    encoded_text = csv_text.strip().decode(encoding='utf-8')
    fp = StringIO(encoded_text)
    reader = csv.DictReader(fp, delimiter=',')
    for row in tqdm(reader):
      self.data.append((row['review'].split(' ')[:review_max_length],
                        int(row['sentiment'] == 'positive')))
    fp.close()

    if vocabulary is not None:
      print('Using external vocabulary - vocab-related configs ignored.')
      self.vocabulary = vocabulary
    else:
      self.vocabulary = self._build_vocabulary()

    self.word2index = {w: i for (i, w) in enumerate(self.vocabulary)}
    self.index2word = {i: w for (i, w) in enumerate(self.vocabulary)}
    self.oov_token_id = self.word2index['OOV_TOKEN']
    self.pad_token_id = self.word2index['PAD_TOKEN']

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    review, label = self.data[index]
    review = ['BEGIN_TOKEN'] + review + ['END_TOKEN']
    token_ids = [self.word2index.get(w, self.oov_token_id) for w in review]
    return token_ids, label

  def _build_vocabulary(self):
    special_tokens = ['PAD_TOKEN', 'BEGIN_TOKEN', 'OOV_TOKEN', 'END_TOKEN']

    counter = collections.Counter()
    for review, _ in self.data:
      counter.update(review)

    vocab = counter.most_common(self.vocab_max_size - 4)
    if self.vocab_min_count is not None:
      vocab_tokens = [w for (w, c) in vocab if c >= self.vocab_min_count]
    else:
      vocab_tokens, _ = zip(vocab)

    return special_tokens + vocab_tokens

  def get_vocabulary(self):
    return self.vocabulary

  def print_statistics(self):
    reviews, labels = zip(*self.data)
    lengths = [len(x) for x in reviews]
    positive = np.sum(labels)
    negative = len(labels) - positive
    print('Total instances: %d, positive: %d, negative: %d' %
          (len(self.data), positive, negative))
    print('Review lengths: max: %d, min: %d, mean: %d, median: %d' %
          (max(lengths), min(lengths), np.mean(lengths), np.median(lengths)))
    print('Vocabulary size: %d' % len(self.vocabulary))
    return


def imdb_collate_fn(batch_data, padding_token_id=PADDING_TOKEN):
  """Padding variable-length sequences."""
  batch_tokens, batch_labels = zip(*batch_data)
  lengths = [len(x) for x in batch_tokens]
  max_length = max(lengths)

  padded_tokens = []
  for tokens, length in zip(batch_tokens, lengths):
    padded_tokens.append(tokens + [padding_token_id] * (max_length - length))

  padded_tokens = torch.tensor(padded_tokens, dtype=torch.int64)
  lengths = torch.tensor(lengths, dtype=torch.int64)
  labels = torch.tensor(batch_labels, dtype=torch.int64)

  return padded_tokens, lengths, labels

In [17]:
class SentimentClassification(nn.Module):

  def __init__(self,
               vocabulary_size,
               embedding_dim,
               rnn_module,
               hidden_size,
               bias=False):
    super().__init__()
    self.vocabulary_size = vocabulary_size
    self.rnn_module = rnn_module
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.bias = bias

    self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                  embedding_dim=embedding_dim,
                                  padding_idx=PADDING_TOKEN)
    self.rnn_model = self.rnn_module(input_size=embedding_dim,
                                     hidden_size=hidden_size,
                                     bias=bias)
    self.classifier = nn.Linear(hidden_size, 2)
    return

  def forward(self, batch_reviews, batch_lengths):
    data = self.embedding(batch_reviews)

    state = None
    batch_size, total_steps, _ = data.shape
    full_outputs = []
    for step in range(total_steps):
      next_state = self.rnn_model(data[:, step, :], state)
      if isinstance(next_state, tuple):
        h, c = next_state
        full_outputs.append(h)
      else:
        full_outputs.append(next_state)
      state = next_state

    full_outputs = torch.stack(full_outputs, dim=1)
    outputs = full_outputs[torch.arange(batch_size), batch_lengths - 1, :]
    logits = self.classifier(outputs)
    return logits

In [18]:
def imdb_trainer(batch_size, epochs):
  writer = SummaryWriter()
  train_dataset = IMDBReviewDataset(csv_text=train_dataset_text,
                                    vocab_min_count=VOCABULARY_MIN_COUNT,
                                    vocab_max_size=VOCABULARY_MAX_SIZE,
                                    review_max_length=REVIEW_MAX_LENGTH)
  train_dataset.print_statistics()
  train_loader = DataLoader(train_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=0,
                            collate_fn=imdb_collate_fn)
  vocabulary = train_dataset.get_vocabulary()

  # Validation dataset should use the same vocabulary as the training set.
  val_dataset = IMDBReviewDataset(csv_text=test_dataset_text,
                                  vocabulary=vocabulary,
                                  review_max_length=REVIEW_MAX_LENGTH)
  val_dataset.print_statistics()
  val_loader = DataLoader(val_dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=0,
                          collate_fn=imdb_collate_fn)

  best_model = None
  best_acc = 0.0

  full_train_loss = []
  full_train_accuracy = []
  full_val_loss = []
  full_val_accuracy = []

  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  model = SentimentClassification(vocabulary_size=len(vocabulary),
                                  embedding_dim=EMBEDDING_DIM,
                                  rnn_module=RNN_MODULES[RNN_MODULE],
                                  hidden_size=HIDDEN_SIZE)
  
  model.to(device)

  model.rnn_model.count_parameters()

  print('Model Architecture:\n%s' % model)

  criterion = nn.CrossEntropyLoss(reduction='mean')
  optimizer = torch.optim.Adam(model.parameters(),
                               lr=LEARNING_RATE,
                               weight_decay=WEIGHT_DECAY)

  for epoch in range(epochs):
    for phase in ('train', 'eval'):
      if phase == 'train':
        model.train()
        dataset = train_dataset
        data_loader = train_loader
      else:
        model.eval()
        dataset = val_dataset
        data_loader = val_loader

      running_loss = 0.0
      running_corrects = 0

      for step, (reviews, lengths, labels) in tqdm(enumerate(data_loader)):
        reviews = reviews.to(device)
        lengths = lengths.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        with torch.set_grad_enabled(phase == 'train'):
          outputs = model(reviews, lengths)
          _, preds = torch.max(outputs, 1)
          loss = criterion(outputs, labels)

          if phase == 'train':
            loss.backward()

            # RNN model is easily getting exploded gradients, thus we perform
            # gradients clipping to mitigate this issue.
            nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_NORM)
            optimizer.step()

        running_loss += loss.item() * reviews.size(0)
        running_corrects += torch.sum(preds == labels.data)

      epoch_loss = running_loss / len(dataset)
      epoch_acc = running_corrects.double() / len(dataset)
      if phase == 'train':
        writer.add_scalar('Loss/train', epoch_loss, epoch)
        writer.add_scalar('Accuracy/train', epoch_acc, epoch)
        full_train_accuracy.append(epoch_acc)
        full_train_loss.append(epoch_loss)
      elif phase == 'eval':
        writer.add_scalar('Loss/val', epoch_loss, epoch)
        writer.add_scalar('Accuracy/val', epoch_acc, epoch)
        full_val_accuracy.append(epoch_acc)
        full_val_loss.append(epoch_loss)

      print('[Epoch %d] %s accuracy: %.4f, loss: %.4f' %
            (epoch + 1, phase, epoch_acc, epoch_loss))

      if phase == 'eval':
        if epoch_acc > best_acc:
          best_acc = epoch_acc
          best_model = copy.deepcopy(model.state_dict())

  # state_dict = {"model": best_model.cpu().state_dict(),
  #               "vocabulary": vocabulary}
  print("Best validation accuracy: %.4f" % best_acc)
  logs = (full_train_loss, full_train_accuracy, full_val_loss, full_val_accuracy)

  writer.close()
  return logs

In [19]:
logs = imdb_trainer(BATCH_SIZE, EPOCHS)

0it [00:00, ?it/s]

Total instances: 45000, positive: 22500, negative: 22500
Review lengths: max: 200, min: 8, mean: 168, median: 198
Vocabulary size: 4835


0it [00:00, ?it/s]

Using external vocabulary - vocab-related configs ignored.
Total instances: 5000, positive: 2500, negative: 2500
Review lengths: max: 200, min: 11, mean: 167, median: 195
Vocabulary size: 4835
Total Parameters: 68400
Model Architecture:
SentimentClassification(
  (embedding): Embedding(4835, 128, padding_idx=0)
  (rnn_model): GRUCell(input_size=128, hidden_size=100, bias=True)
  (classifier): Linear(in_features=100, out_features=2, bias=True)
)


0it [00:00, ?it/s]

[Epoch 1] train accuracy: 0.5027, loss: 0.7004


0it [00:00, ?it/s]

[Epoch 1] eval accuracy: 0.5222, loss: 0.6919


0it [00:00, ?it/s]

[Epoch 2] train accuracy: 0.5414, loss: 0.6871


0it [00:00, ?it/s]

[Epoch 2] eval accuracy: 0.5456, loss: 0.6859


0it [00:00, ?it/s]

[Epoch 3] train accuracy: 0.5668, loss: 0.6793


0it [00:00, ?it/s]

[Epoch 3] eval accuracy: 0.5722, loss: 0.6760


0it [00:00, ?it/s]

[Epoch 4] train accuracy: 0.5972, loss: 0.6607


0it [00:00, ?it/s]

[Epoch 4] eval accuracy: 0.6230, loss: 0.6454


0it [00:00, ?it/s]

[Epoch 5] train accuracy: 0.6735, loss: 0.6123


0it [00:00, ?it/s]

[Epoch 5] eval accuracy: 0.6768, loss: 0.6011


0it [00:00, ?it/s]

[Epoch 6] train accuracy: 0.7159, loss: 0.5608


0it [00:00, ?it/s]

[Epoch 6] eval accuracy: 0.7288, loss: 0.5453


0it [00:00, ?it/s]

[Epoch 7] train accuracy: 0.7646, loss: 0.5006


0it [00:00, ?it/s]

[Epoch 7] eval accuracy: 0.7656, loss: 0.4961


0it [00:00, ?it/s]

[Epoch 8] train accuracy: 0.7917, loss: 0.4576


0it [00:00, ?it/s]

[Epoch 8] eval accuracy: 0.7716, loss: 0.4918


0it [00:00, ?it/s]

[Epoch 9] train accuracy: 0.7997, loss: 0.4443


0it [00:00, ?it/s]

[Epoch 9] eval accuracy: 0.7946, loss: 0.4560


0it [00:00, ?it/s]

[Epoch 10] train accuracy: 0.8242, loss: 0.4024


0it [00:00, ?it/s]

[Epoch 10] eval accuracy: 0.8108, loss: 0.4327


0it [00:00, ?it/s]

[Epoch 11] train accuracy: 0.8364, loss: 0.3809


0it [00:00, ?it/s]

[Epoch 11] eval accuracy: 0.8152, loss: 0.4231


0it [00:00, ?it/s]

[Epoch 12] train accuracy: 0.8494, loss: 0.3608


0it [00:00, ?it/s]

[Epoch 12] eval accuracy: 0.8228, loss: 0.4170


0it [00:00, ?it/s]

[Epoch 13] train accuracy: 0.8578, loss: 0.3444


0it [00:00, ?it/s]

[Epoch 13] eval accuracy: 0.8306, loss: 0.4044


0it [00:00, ?it/s]

[Epoch 14] train accuracy: 0.8640, loss: 0.3312


0it [00:00, ?it/s]

[Epoch 14] eval accuracy: 0.8370, loss: 0.4019


0it [00:00, ?it/s]

[Epoch 15] train accuracy: 0.8691, loss: 0.3214


0it [00:00, ?it/s]

[Epoch 15] eval accuracy: 0.8392, loss: 0.3849


0it [00:00, ?it/s]

[Epoch 16] train accuracy: 0.8732, loss: 0.3119


0it [00:00, ?it/s]

[Epoch 16] eval accuracy: 0.8362, loss: 0.3874


0it [00:00, ?it/s]

[Epoch 17] train accuracy: 0.8792, loss: 0.3022


0it [00:00, ?it/s]

[Epoch 17] eval accuracy: 0.8424, loss: 0.3881


0it [00:00, ?it/s]

[Epoch 18] train accuracy: 0.8851, loss: 0.2896


0it [00:00, ?it/s]

[Epoch 18] eval accuracy: 0.8388, loss: 0.3957


0it [00:00, ?it/s]

[Epoch 19] train accuracy: 0.8882, loss: 0.2820


0it [00:00, ?it/s]

[Epoch 19] eval accuracy: 0.8444, loss: 0.3834


0it [00:00, ?it/s]

[Epoch 20] train accuracy: 0.8921, loss: 0.2749


0it [00:00, ?it/s]

[Epoch 20] eval accuracy: 0.8446, loss: 0.3902


0it [00:00, ?it/s]

[Epoch 21] train accuracy: 0.8954, loss: 0.2680


0it [00:00, ?it/s]

[Epoch 21] eval accuracy: 0.8430, loss: 0.3802


0it [00:00, ?it/s]

[Epoch 22] train accuracy: 0.8986, loss: 0.2616


0it [00:00, ?it/s]

[Epoch 22] eval accuracy: 0.8444, loss: 0.3853


0it [00:00, ?it/s]

[Epoch 23] train accuracy: 0.9012, loss: 0.2555


0it [00:00, ?it/s]

[Epoch 23] eval accuracy: 0.8506, loss: 0.3811


0it [00:00, ?it/s]

[Epoch 24] train accuracy: 0.9055, loss: 0.2478


0it [00:00, ?it/s]

[Epoch 24] eval accuracy: 0.8528, loss: 0.3724


0it [00:00, ?it/s]

[Epoch 25] train accuracy: 0.9073, loss: 0.2436


0it [00:00, ?it/s]

[Epoch 25] eval accuracy: 0.8490, loss: 0.3788


0it [00:00, ?it/s]

[Epoch 26] train accuracy: 0.9105, loss: 0.2374


0it [00:00, ?it/s]

[Epoch 26] eval accuracy: 0.8502, loss: 0.3770


0it [00:00, ?it/s]

[Epoch 27] train accuracy: 0.9123, loss: 0.2333


0it [00:00, ?it/s]

[Epoch 27] eval accuracy: 0.8540, loss: 0.3777


0it [00:00, ?it/s]

[Epoch 28] train accuracy: 0.9146, loss: 0.2282


0it [00:00, ?it/s]

[Epoch 28] eval accuracy: 0.8548, loss: 0.3773


0it [00:00, ?it/s]

[Epoch 29] train accuracy: 0.9189, loss: 0.2223


0it [00:00, ?it/s]

[Epoch 29] eval accuracy: 0.8506, loss: 0.3781


0it [00:00, ?it/s]

[Epoch 30] train accuracy: 0.9208, loss: 0.2161


0it [00:00, ?it/s]

[Epoch 30] eval accuracy: 0.8528, loss: 0.3850


0it [00:00, ?it/s]

[Epoch 31] train accuracy: 0.9218, loss: 0.2120


0it [00:00, ?it/s]

[Epoch 31] eval accuracy: 0.8568, loss: 0.3777


0it [00:00, ?it/s]

[Epoch 32] train accuracy: 0.9239, loss: 0.2069


0it [00:00, ?it/s]

[Epoch 32] eval accuracy: 0.8534, loss: 0.3912


0it [00:00, ?it/s]

[Epoch 33] train accuracy: 0.9261, loss: 0.2033


0it [00:00, ?it/s]

[Epoch 33] eval accuracy: 0.8546, loss: 0.3745


0it [00:00, ?it/s]

[Epoch 34] train accuracy: 0.9279, loss: 0.1996


0it [00:00, ?it/s]

[Epoch 34] eval accuracy: 0.8518, loss: 0.3894


0it [00:00, ?it/s]

[Epoch 35] train accuracy: 0.9301, loss: 0.1948


0it [00:00, ?it/s]

[Epoch 35] eval accuracy: 0.8552, loss: 0.3803


0it [00:00, ?it/s]

[Epoch 36] train accuracy: 0.9325, loss: 0.1900


0it [00:00, ?it/s]

[Epoch 36] eval accuracy: 0.8574, loss: 0.3826


0it [00:00, ?it/s]

[Epoch 37] train accuracy: 0.9322, loss: 0.1881


0it [00:00, ?it/s]

[Epoch 37] eval accuracy: 0.8500, loss: 0.3996


0it [00:00, ?it/s]

[Epoch 38] train accuracy: 0.9364, loss: 0.1811


0it [00:00, ?it/s]

[Epoch 38] eval accuracy: 0.8522, loss: 0.3955


0it [00:00, ?it/s]

[Epoch 39] train accuracy: 0.9371, loss: 0.1794


0it [00:00, ?it/s]

[Epoch 39] eval accuracy: 0.8598, loss: 0.3824


0it [00:00, ?it/s]

[Epoch 40] train accuracy: 0.9383, loss: 0.1751


0it [00:00, ?it/s]

[Epoch 40] eval accuracy: 0.8540, loss: 0.4073


0it [00:00, ?it/s]

[Epoch 41] train accuracy: 0.9381, loss: 0.1763


0it [00:00, ?it/s]

[Epoch 41] eval accuracy: 0.8518, loss: 0.4198


0it [00:00, ?it/s]

[Epoch 42] train accuracy: 0.9408, loss: 0.1691


0it [00:00, ?it/s]

[Epoch 42] eval accuracy: 0.8524, loss: 0.3916


0it [00:00, ?it/s]

[Epoch 43] train accuracy: 0.9426, loss: 0.1645


0it [00:00, ?it/s]

[Epoch 43] eval accuracy: 0.8602, loss: 0.4099


0it [00:00, ?it/s]

[Epoch 44] train accuracy: 0.9458, loss: 0.1593


0it [00:00, ?it/s]

[Epoch 44] eval accuracy: 0.8602, loss: 0.4260


0it [00:00, ?it/s]

[Epoch 45] train accuracy: 0.9478, loss: 0.1559


0it [00:00, ?it/s]

[Epoch 45] eval accuracy: 0.8506, loss: 0.4152


0it [00:00, ?it/s]

[Epoch 46] train accuracy: 0.9479, loss: 0.1539


0it [00:00, ?it/s]

[Epoch 46] eval accuracy: 0.8530, loss: 0.3918


0it [00:00, ?it/s]

[Epoch 47] train accuracy: 0.9494, loss: 0.1508


0it [00:00, ?it/s]

[Epoch 47] eval accuracy: 0.8616, loss: 0.4351


0it [00:00, ?it/s]

[Epoch 48] train accuracy: 0.9517, loss: 0.1456


0it [00:00, ?it/s]

[Epoch 48] eval accuracy: 0.8600, loss: 0.4376


0it [00:00, ?it/s]

[Epoch 49] train accuracy: 0.9534, loss: 0.1411


0it [00:00, ?it/s]

[Epoch 49] eval accuracy: 0.8518, loss: 0.4141


0it [00:00, ?it/s]

[Epoch 50] train accuracy: 0.9555, loss: 0.1370


0it [00:00, ?it/s]

[Epoch 50] eval accuracy: 0.8498, loss: 0.4330


0it [00:00, ?it/s]

[Epoch 51] train accuracy: 0.9564, loss: 0.1345


0it [00:00, ?it/s]

[Epoch 51] eval accuracy: 0.8594, loss: 0.4140
Best validation accuracy: 0.8616


In [43]:
### You can make a plot using matplotlib with logs
%tensorboard --logdir runs

Reusing TensorBoard on port 6006 (pid 20232), started 15:52:10 ago. (Use '!kill 20232' to kill it.)

# Language model and sentence generation

In [46]:
### Hyperparameters for training (previously defined in FLAGS)
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0
BATCH_SIZE = 4096
EPOCHS = 10

### Hyperparameters for sentence analysis model
EMBEDDING_DIM = 256
HIDDEN_SIZE = 512
RNN_MODULE = 'gru'
HISTORY_LENGTH = 100

### Hyperparameters for generating new sentence
GENERATION_LENGTH = 2000
START_STRING = 'JULIET'
TEMPERATURE = 1.0

In [30]:
class ShakespeareDataset(Dataset):

  def __init__(self, encoded_text, history_length):
    self.encoded_text = encoded_text
    self.history_length = history_length

    raw_text = self.encoded_text.strip().decode(encoding='utf-8')

    self.vocab = sorted(set(raw_text))
    self.char2index = {x: i for (i, x) in enumerate(self.vocab)}
    self.index2char = {i: x for (i, x) in enumerate(self.vocab)}

    self.data = [(raw_text[i:i + history_length], raw_text[i + history_length])
                 for i in range(len(raw_text) - history_length)]
    return

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    history, label = self.data[index]
    history = np.array([self.char2index[x] for x in history])
    label = self.char2index[label]
    return history, label

  def get_vocabulary(self):
    return self.vocab

In [31]:
class SentenceGeneration(nn.Module):

  def __init__(self,
               vocabulary_size,
               embedding_dim,
               rnn_module,
               hidden_size,
               bias=False):
    super().__init__()
    self.vocabulary_size = vocabulary_size
    self.rnn_module = rnn_module
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.bias = bias

    self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                  embedding_dim=embedding_dim,
                                  padding_idx=PADDING_TOKEN)
    self.rnn_model = self.rnn_module(input_size=embedding_dim,
                                     hidden_size=hidden_size,
                                     bias=bias)
    self.classifier = nn.Linear(hidden_size, vocabulary_size)
    return

  def forward(self, batch_reviews, state=None):
    data = self.embedding(batch_reviews)

    batch_size, total_steps, _ = data.shape
    for step in range(total_steps):
      next_state = self.rnn_model(data[:, step, :], state)
      if isinstance(next_state, tuple):
        h, c = next_state
        outputs = h
      else:
        outputs = next_state
      state = next_state

    logits = self.classifier(outputs)
    return logits, state

  def reset_parameters(self):
    with torch.no_grad:
      for param in self.parameters():
        param.reset_parameters()
    return

In [32]:
def shakespeare_trainer(batch_size, epochs):
  train_dataset = ShakespeareDataset(encoded_text=shakespeare_text,
                                     history_length=HISTORY_LENGTH)

  print('Train dataset: %d' % len(train_dataset))

  train_loader = DataLoader(train_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=0)
  vocabulary = train_dataset.get_vocabulary()

  best_model = None
  best_loss = 0.0
  full_loss = []

  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  model = SentenceGeneration(vocabulary_size=len(vocabulary),
                             embedding_dim=EMBEDDING_DIM,
                             rnn_module=RNN_MODULES[RNN_MODULE],
                             hidden_size=HIDDEN_SIZE)
  model.to(device)

  print('Model Architecture:\n%s' % model)

  criterion = nn.CrossEntropyLoss(reduction='mean')
  optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

  for epoch in range(epochs):
    model.train()
    dataset = train_dataset
    data_loader = train_loader

    progress_bar = tqdm(enumerate(data_loader), total=len(data_loader))
    for step, (sequences, labels) in progress_bar:
      total_step = epoch * len(data_loader) + step
      sequences = sequences.to(device)
      labels = labels.to(device)

      optimizer.zero_grad()

      outputs, _ = model(sequences)
      _, preds = torch.max(outputs, 1)
      loss = criterion(outputs, labels)
      corrects = torch.sum(preds == labels.data)

      loss.backward()
      optimizer.step()

      progress_bar.set_description(
          'Loss: %.4f, Accuracy: %.4f' %
          (loss.item(), corrects.item() / len(labels)))
      full_loss.append(loss.item())

  state_dict = {"model": model.cpu().state_dict(),
                "vocabulary": vocabulary}

  return state_dict, full_loss

In [33]:
final_model, loss = shakespeare_trainer(batch_size=BATCH_SIZE,
                                        epochs=EPOCHS)

Train dataset: 1115293
Model Architecture:
SentenceGeneration(
  (embedding): Embedding(65, 256, padding_idx=0)
  (rnn_model): GRUCell(input_size=256, hidden_size=512, bias=True)
  (classifier): Linear(in_features=512, out_features=65, bias=True)
)


  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

In [None]:
### You can make a plot using matplotlib with loss

In [34]:
def sample_next_char_id(predicted_logits):
  next_char_id = categorical.Categorical(logits=predicted_logits).sample()
  return next_char_id

In [41]:
def shakespeare_writer(state_dict, start_string):
  """Generates new sentences using trained language model."""
  device = 'cpu'

  vocabulary = state_dict['vocabulary']

  char2index = {x: i for (i, x) in enumerate(vocabulary)}
  index2char = {i: x for (i, x) in enumerate(vocabulary)}

  inputs = torch.tensor([char2index[x] for x in start_string])
  inputs = inputs.view(1, -1)

  model = SentenceGeneration(vocabulary_size=len(vocabulary),
                             embedding_dim=EMBEDDING_DIM,
                             rnn_module=RNN_MODULES[RNN_MODULE],
                             hidden_size=HIDDEN_SIZE)

  model.load_state_dict(state_dict['model'])
  model.eval()

  generated_chars = []
  hidden = None
  #####################################################################
  # Implement here for generating new sentence                        #
  # Specifically, you need to iterate through the history and predict #
  # next character; then you could take the predicted history as part #
  # of history then repeat the process. The generation should be      #
  # repeated for FLAGS.generation_length times.
  #####################################################################
  # Convert start string to tensor
  inputs = inputs.to(device)

  with torch.no_grad():
      for _ in range(GENERATION_LENGTH):
          output, hidden = model(inputs, hidden)
          # Adjust indexing based on actual output dimensions
          predicted_logits = output.squeeze()  # Removing sequence length dimension since it's likely 1
          predicted_char_id = sample_next_char_id(predicted_logits).item()
          generated_chars.append(index2char[predicted_char_id])
          # Update inputs for the next iteration
          inputs = torch.tensor([[predicted_char_id]], dtype=torch.long, device=device)

  return start_string + ''.join(generated_chars)

In [44]:
generated_text_romeo = shakespeare_writer(final_model, START_STRING)

In [45]:
print(generated_text_romeo)

ROMEO:
No, now, to fail them will away his pace.

DUKE VINCENTIO:
Go thou: 'tis but body to go you apparel in
The hole word. Your zoundly dust I have said,
It wounds you in the morning's nor question gross;
Our father remove your worship in as he
thess, stand and fongues and terrions in this.

GRUMIO:
How do your father cast? Perchiard, I
may not poor Henry for Rome.

VIRGILIA:
In that! I had rememperous the head of all
walting hate, carries a dream--

BRUTUS:
Sirrah, tell you, let us share guilty hearing.
He bears the battle and makes their waters
To want thee shall I to be put am.

RIVINE:
Been in my great creysure of the earth
Distipline. Your worship.

ESCALUS:
Which was the church is reddel is lord!

GRUMIO:
I tell you, my inchard-bed, fie! But, dear days:
Clumb Siges no withood he stands most one
Of a king as, breast of reople? Thou confess'd,
If you, if this, have obeybed at it.
Anon, brother, I pray you: here being it;
Your graced freely do is for even lies:
Speak with you; I w

In [47]:
generated_text_juliet = shakespeare_writer(final_model, START_STRING)

In [48]:
print(generated_text_juliet)

JULIET:
Call off is it bight our three again; I word,
What answer Bolingbroke's to the Towem.
Here's to yaunt?
O Tybalt, wouldst law before thy book!
O day, cannot speak, shout on thee till I
And say this throw their voices
Till her sacrus. I warraguide another,
When apellion whom could end our beards, and breathe
very straight: to my followers make a tormently
and our bacely issue: I will tell him there
of forside this fellow:
And was not her; even some spirits upon
When I have not suspect to enmite
Our swords to this inglight to her back:
What to the son, that you shall be move,
Like a foul wrecks for me, or heard him pack'd
Your cortalieve on me. I am my encrease
Or no more; spoke her wants to counsel a house
And leave it thus bestrid thy throat,
And yet put thee to her justice: the queen's chaste,
And let my shadow hands with warlike out.

PETERILA:
Is it fall louded, sir:
But that say you shall requite my tent:
Down and reasons, I will sets on her:
Shall I be of our worscian, whos