In [None]:
import torch
import random
import zipfile

# Read the lyrics file directly out of the zip archive and decode it as UTF-8.
with zipfile.ZipFile('jaychou_lyrics.txt.zip') as zin, zin.open('jaychou_lyrics.txt') as f:
  corpus_chars = f.read().decode('utf-8')
# Cell output: preview of the first 40 characters.
corpus_chars[:40]

In [262]:
# Turn newlines and carriage returns into spaces, then keep only the first
# 10000 characters of the corpus.
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')[:10000]

In [None]:
# Build the vocabulary. The distinct characters are sorted so that the
# char <-> index mapping is the same on every run: iterating a bare set of
# strings yields an order that can change between interpreter sessions
# (string hashing is randomized), which would re-number the vocabulary after
# each kernel restart.
idx_to_char = sorted(set(corpus_chars))
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
vocab_size # 1027

In [None]:
# Encode every character of the corpus as its vocabulary index, then show the
# first 20 positions both as text and as indices.
corpus_indices = [char_to_idx[ch] for ch in corpus_chars]
sample = corpus_indices[:20]
sample_text = ''.join(idx_to_char[idx] for idx in sample)
print('chars:', sample_text)
print('indices:', sample)

In [265]:
def load_data_jay_lyrics(zip_path='jaychou_lyrics.txt.zip', max_chars=10000):
  """Load the Jay Chou lyrics dataset as a sequence of character indices.

  The member 'jaychou_lyrics.txt' is read from the zip archive, decoded as
  UTF-8, newlines and carriage returns are replaced by spaces, and the text is
  truncated to its first `max_chars` characters.

  Args:
    zip_path: path of the zip archive (defaults to the file in the working
      directory, as before).
    max_chars: number of leading characters to keep; None keeps everything.

  Returns:
    (corpus_indices, char_to_idx, idx_to_char, vocab_size) where
    corpus_indices is the corpus as a list of int indices, char_to_idx maps a
    character to its index, idx_to_char is the list of characters ordered by
    index, and vocab_size is the number of distinct characters.
  """
  with zipfile.ZipFile(zip_path) as zin:
    with zin.open('jaychou_lyrics.txt') as f:
      corpus_chars = f.read().decode('utf-8')
  corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
  corpus_chars = corpus_chars[:max_chars]
  # Sort the distinct characters: iterating a set of strings gives an order
  # that can differ between interpreter runs, which would make the index
  # mapping (and anything saved against it) irreproducible.
  idx_to_char = sorted(set(corpus_chars))
  char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
  vocab_size = len(char_to_idx)
  corpus_indices = [char_to_idx[char] for char in corpus_chars]
  return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [None]:
import time
import math
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Reload the dataset through the helper so the cell does not depend on the
# exploratory cells that built the same variables by hand.
corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()
vocab_size

In [267]:
num_hiddens = 256
# Alternative recurrent layer (already tested):
# rnn_layer = nn.LSTM(input_size=vocab_size, hidden_size=num_hiddens)
rnn_layer = nn.RNN(
    input_size=vocab_size,
    hidden_size=num_hiddens,
)

In [None]:
# Smoke-test the recurrent layer on a random tensor laid out as
# (num_steps, batch_size, vocab_size), with no initial state supplied.
# Note: num_steps defined here is reused by the training cell further down.
num_steps, batch_size = 35, 2
state = None
X = torch.rand(num_steps, batch_size, vocab_size)
Y, state_new = rnn_layer(X, state)
print(Y.shape, len(state_new), state_new[0].shape)

In [269]:
def one_hot(x, n_class, dtype=torch.float32):
  """One-hot encode a batch of class indices.

  Args:
    x: tensor of shape (batch,) holding class indices (cast to int64 here).
    n_class: number of classes, i.e. the width of each output row.
    dtype: dtype of the returned tensor.

  Returns:
    Tensor of shape (batch, n_class) on the same device as `x`, all zeros
    except for a 1 at column x[r] of every row r.
  """
  indices = x.long()
  n_rows = indices.shape[0]
  encoded = torch.zeros(n_rows, n_class, dtype=dtype, device=indices.device)
  # scatter_ along dim 1 writes the value 1 at the column named by each index.
  encoded.scatter_(1, indices.view(-1, 1), 1)
  return encoded

def to_onehot(X, n_class):
  """One-hot encode a batch of index sequences, one time step at a time.

  Args:
    X: tensor of shape (batch, seq_len) holding class indices.
    n_class: number of classes.

  Returns:
    A list with seq_len entries; entry t is the (batch, n_class) one-hot
    encoding of column t of `X`.
  """
  encoded_steps = []
  for step in range(X.shape[1]):
    encoded_steps.append(one_hot(X[:, step], n_class))
  return encoded_steps

In [270]:
class RNNModel(nn.Module):
  """Character-level language model: a recurrent layer plus a linear decoder.

  Args:
    rnn_layer: recurrent module exposing `hidden_size` and `bidirectional`
      and callable as rnn_layer(input, state) -> (output, new_state).
    vocab_size: number of distinct characters; used both as the one-hot input
      width and as the number of output classes.
  """

  def __init__(self, rnn_layer, vocab_size):
    super().__init__()
    self.rnn = rnn_layer
    # The decoder input width is doubled when the layer is bidirectional.
    num_directions = 2 if rnn_layer.bidirectional else 1
    self.hidden_size = num_directions * rnn_layer.hidden_size
    self.vocab_size = vocab_size
    self.dense = nn.Linear(self.hidden_size, vocab_size)
    # Most recent recurrent state returned by forward().
    self.state = None

  def forward(self, inputs, state):
    """Run the model on a batch of index sequences.

    Args:
      inputs: tensor of shape (batch, seq_len) holding character indices.
      state: state handed to the recurrent layer (None is passed through as is).

    Returns:
      (output, state): output has shape (seq_len * batch, vocab_size), with
      rows ordered time step first; state is the recurrent layer's new state.
    """
    # One (batch, vocab_size) one-hot tensor per time step, stacked into a
    # single (seq_len, batch, vocab_size) input.
    steps = to_onehot(inputs, self.vocab_size)
    rnn_input = torch.stack(steps)
    Y, self.state = self.rnn(rnn_input, state)

    # Merge the leading dimensions so the linear layer scores every
    # (time step, sample) position in one call.
    flat = Y.view(-1, Y.shape[-1])
    logits = self.dense(flat)
    return logits, self.state


In [271]:
def predict_rnn_pytorch(prefix, num_chars, model, vocab_size, device, idx_to_char,
                        char_to_idx):
  """Generate text by greedy decoding, starting from `prefix`.

  The prefix is fed through the model one character at a time to warm up the
  recurrent state; afterwards the highest-scoring character of each step is
  appended and fed back in as the next input.

  Args:
    prefix: non-empty string of characters that all appear in `char_to_idx`.
    num_chars: number of characters to generate after the prefix.
    model: callable as model(X, state) -> (Y, state), where X is a (1, 1)
      index tensor and Y holds a single row of scores.
    vocab_size: unused here; kept so existing callers keep working.
    device: device on which the input tensor (and any carried state) is placed.
    idx_to_char: sequence mapping an index to its character.
    char_to_idx: mapping from a character to its index.

  Returns:
    The prefix followed by the `num_chars` generated characters.

  Raises:
    IndexError: if `prefix` is empty.
    KeyError: if a prefix character is missing from `char_to_idx`.
  """
  state = None
  # output records the prefix plus everything generated so far (as indices).
  output = [char_to_idx[prefix[0]]]
  # Generation is inference only. Without no_grad, every step would extend an
  # autograd graph through the carried hidden state for no benefit.
  with torch.no_grad():
    for t in range(num_chars + len(prefix) - 1):
      X = torch.tensor([output[-1]], device=device).view(1, 1)
      if state is not None:
        if isinstance(state, tuple):  # LSTM, state: (h, c)
          state = tuple(s.to(device) for s in state)
        else:
          state = state.to(device)

      Y, state = model(X, state)
      if t < len(prefix) - 1:
        # Still consuming the prefix: the next input is dictated by it.
        output.append(char_to_idx[prefix[t + 1]])
      else:
        output.append(int(Y.argmax(dim=1).item()))
  return ''.join(idx_to_char[i] for i in output)


In [None]:
# Wrap the recurrent layer in the language model and, before any training,
# generate 10 characters after the prefix.
model = RNNModel(rnn_layer, vocab_size).to(device)
predict_rnn_pytorch(prefix='分开', num_chars=10, model=model, vocab_size=vocab_size,
                    device=device, idx_to_char=idx_to_char, char_to_idx=char_to_idx)

In [277]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
  """Yield (X, Y) minibatches by consecutive sampling.

  The corpus is cut into `batch_size` equally long rows (trailing leftovers
  are dropped) and each row is read left to right in windows of `num_steps`,
  so window k+1 of a row continues exactly where window k stopped.

  Args:
    corpus_indices: sequence of integer indices.
    batch_size: number of rows, i.e. sequences per minibatch.
    num_steps: number of time steps per minibatch.
    device: target device; when None, CUDA is used if available, else CPU.

  Yields:
    (X, Y): two float32 tensors of shape (batch_size, num_steps); Y is X
    shifted one position to the right in the corpus.
  """
  if device is None:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  data = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
  batch_len = len(data) // batch_size
  rows = data[:batch_size * batch_len].view(batch_size, batch_len)
  # The "- 1" leaves room for the label, which sits one position after the input.
  epoch_size = (batch_len - 1) // num_steps
  for start in range(0, epoch_size * num_steps, num_steps):
    stop = start + num_steps
    yield rows[:, start:stop], rows[:, start + 1:stop + 1]


In [275]:
def grad_clipping(params, theta, device):
  """Rescale gradients in place so that their global L2 norm is at most theta.

  Args:
    params: iterable of parameters (a one-shot iterator such as
      model.parameters() is fine). Parameters without a gradient are skipped.
    theta: maximum allowed L2 norm of all gradients taken together.
    device: device on which the norm accumulator is created; it must match
      the device of the gradients.
  """
  # Materialize once. Callers commonly pass model.parameters(), which is a
  # generator: the norm loop below would exhaust it and the rescaling loop
  # would then have nothing to iterate over, so no clipping would happen.
  params = [p for p in params if p.grad is not None]
  norm = torch.tensor([0.0], device=device)
  for param in params:
    norm += (param.grad.data ** 2).sum()

  norm = norm.sqrt().item()
  if norm > theta:
    for param in params:
      param.grad.data *= (theta / norm)


In [278]:
def train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                  corpus_indices, idx_to_char, char_to_idx,
                                  num_epochs, num_steps, lr, clipping_theta,
                                  batch_size, pred_period, pred_len, prefixes):
  """Train `model` with consecutive sampling and periodically print samples.

  Args:
    model: module callable as model(X, state) -> (output, state).
    num_hiddens: unused here; kept so existing callers keep working.
    vocab_size: forwarded to predict_rnn_pytorch.
    device: device the model is moved to and the batches are created on.
    corpus_indices: the corpus as a sequence of integer indices.
    idx_to_char: sequence mapping an index to its character.
    char_to_idx: mapping from a character to its index.
    num_epochs: number of passes over the corpus.
    num_steps: number of time steps per minibatch.
    lr: learning rate for the Adam optimizer.
    clipping_theta: maximum global gradient norm.
    batch_size: number of sequences per minibatch.
    pred_period: print perplexity and samples every `pred_period` epochs.
    pred_len: number of characters generated per sample.
    prefixes: iterable of prefix strings to generate from.

  Raises:
    ValueError: if the corpus is too short to produce a single minibatch.
  """
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  model.to(device)
  for epoch in range(num_epochs):
    l_sum, n, start = 0.0, 0, time.time()
    # Each epoch starts reading again from the beginning of every batch row,
    # so the state left over from the end of the previous epoch does not
    # belong to the text that follows; start from a fresh state instead.
    state = None
    data_iter = data_iter_consecutive(corpus_indices, batch_size, num_steps, device)  # consecutive sampling
    for X, Y in data_iter:
      if state is not None:
        # Detach the hidden state from the computation graph so that the
        # gradient of this minibatch does not flow back into earlier
        # minibatches (which would make backward ever more expensive).
        if isinstance(state, tuple):  # LSTM, state: (h, c)
          state = (state[0].detach(), state[1].detach())
        else:
          state = state.detach()

      # output shape: (num_steps * batch_size, vocab_size)
      (output, state) = model(X, state)

      # Y is (batch_size, num_steps). After transposing and flattening it is a
      # vector of length num_steps * batch_size whose order matches the rows
      # of output.
      y = torch.transpose(Y, 0, 1).contiguous().view(-1)
      loss = criterion(output, y.long())

      optimizer.zero_grad()
      loss.backward()
      # Gradient clipping. A list is passed rather than the model.parameters()
      # generator because grad_clipping walks the parameters twice; a
      # generator would be exhausted after the first pass.
      grad_clipping(list(model.parameters()), clipping_theta, device)
      optimizer.step()
      # criterion returns the mean loss, so weight it by the number of targets.
      l_sum += loss.item() * y.shape[0]
      n += y.shape[0]

    if n == 0:
      raise ValueError(
          'corpus is too short to form a single minibatch with '
          'batch_size=%d and num_steps=%d' % (batch_size, num_steps))
    try:
      perplexity = math.exp(l_sum / n)
    except OverflowError:
      perplexity = float('inf')
    if (epoch + 1) % pred_period == 0:
      print('epoch %d, perplexity %f, time %.2f sec' % (
          epoch + 1, perplexity, time.time() - start))
      for prefix in prefixes:
        print(' -', predict_rnn_pytorch(
            prefix, pred_len, model, vocab_size, device, idx_to_char,
            char_to_idx))


In [None]:
# note the learning rate setting here
num_epochs = 250
batch_size = 32
lr = 1e-3
clipping_theta = 1e-2
# Every pred_period epochs, generate pred_len characters after each prefix.
pred_period = 50
pred_len = 50
prefixes = ['分开', '不分开']
# num_hiddens and num_steps are the values assigned in earlier cells.
train_and_predict_rnn_pytorch(
    model, num_hiddens, vocab_size, device,
    corpus_indices, idx_to_char, char_to_idx,
    num_epochs, num_steps, lr, clipping_theta,
    batch_size, pred_period, pred_len, prefixes)
