<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/seq_to_seq/char_level_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# upload kaggle API key from your local machine
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"shazzadraihan","key":"da63bbe0f8dcb3bd7fb35034046ca758"}'}

In [2]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# use API command to download the dataset
!kaggle datasets download -d wanderdust/anna-karenina-book

Downloading anna-karenina-book.zip to /content
100% 739k/739k [00:00<00:00, 898kB/s]
100% 739k/739k [00:00<00:00, 897kB/s]


In [4]:
# uncompress the dataset
!unzip -qq anna-karenina-book.zip

In [5]:
# import required libraries
import torch
import numpy as np

In [6]:
# check if cuda is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
  print("CUDA is not available")
else:
  print("CUDA is available")

device = torch.device('cuda') if train_on_gpu else torch.device('cpu')

CUDA is available


In [7]:
# open text file and read dataset
with open("/content/anna.txt", "r") as f:
  text = f.read()

text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

### Pre-process the dataset

In [8]:
# tokenization

chars = tuple(set(text))
# map each int to char
int_to_char = dict(enumerate(chars))
# map each char to int
char_to_int = {ch:idx for idx, ch in int_to_char.items()}

# encode 
encoded = np.array([char_to_int[ch] for ch in text])

encoded[:100]

array([62, 52, 47, 75, 81,  3, 70, 55, 61, 22, 22, 22, 76, 47, 75, 75, 68,
       55,  1, 47,  4, 71,  7, 71,  3, 51, 55, 47, 70,  3, 55, 47,  7,  7,
       55, 47,  7, 71, 53,  3, 28, 55,  3, 10,  3, 70, 68, 55, 72, 30, 52,
       47, 75, 75, 68, 55,  1, 47,  4, 71,  7, 68, 55, 71, 51, 55, 72, 30,
       52, 47, 75, 75, 68, 55, 71, 30, 55, 71, 81, 51, 55, 34, 13, 30, 22,
       13, 47, 68, 12, 22, 22, 16, 10,  3, 70, 68, 81, 52, 71, 30])

In [9]:
def one_hot_encode(arr, n_labels):
  # initialize the encoded array with zeros
  one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
  # fill with ones where appropriate
  one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
  # reshape to get back to the original array
  one_hot = one_hot.reshape((*arr.shape, n_labels))
  
  return one_hot

In [10]:
def get_batches(arr, batch_size, seq_length):
  """returns batches of size batch_size * seq_length"""
  total_batch_size = batch_size * seq_length
  # total number of batches
  n_batches = len(arr)//total_batch_size
  
  # keep enough characters to make full batches
  arr = arr[:n_batches * total_batch_size]
  # reshape into batch_size rows
  arr = arr.reshape((batch_size, -1))
  
  for n in range(0, arr.shape[1], seq_length):
      # features
      x = arr[:, n:n+seq_length]
      # targets, shifted by one
      y = np.zeros_like(x)
      try:
          y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
      except IndexError:
          y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
      yield x, y


In [11]:
# split the dataset
valid_size = 0.1

valid_idx = int(len(encoded)*(1-valid_size))
train_data, valid_data = encoded[:valid_idx], encoded[valid_idx:]

### Model

In [12]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class RNN(nn.Module):
  def __init__(self, tokens, n_hidden, n_layers, drop_prob=0.5, lr=0.001):
    super().__init__()
    self.n_layers = n_layers 
    self.n_hidden = n_hidden 
    self.drop_prob = drop_prob
    self.lr = lr 

    # create character dictionaries
    self.chars = tokens 
    self.int_to_char = dict(enumerate(self.chars))
    self.char_to_int = {ch:idx for idx, ch in self.int_to_char.items()}

    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
    self.dropout = nn.Dropout(drop_prob)
    self.fc = nn.Linear(n_hidden, len(self.chars))


  def forward(self, x, hidden):
    out, hidden = self.lstm(x, hidden)
    out = self.dropout(out)
    out = out.contiguous().view(-1, self.n_hidden)
    out = self.fc(out)
    return out, hidden

  def initialize_hidden(self, batch_size):
    weight = next(self.parameters()).data

    # initialize hidden state and cell state of LSTM with zeros (n_layers * batch_size * n_hidden)
    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
    
    return hidden

In [13]:
n_hidden = 512 
n_layers = 2 
drop_prob=0.5
lr=0.001

model = RNN(chars, n_hidden, n_layers, drop_prob, lr)
print(model)
model.to(device)

RNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


RNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)

### Train

In [14]:
def train(model, data, epochs, batch_size, seq_length, criterion, optimizer, lr, clip, print_every=10):

  model.train()

  counter = 0
  n_chars = len(model.chars)
  for epoch in range(epochs):

    # initialize the hidden state
    h = model.initialize_hidden(batch_size)

    for inputs, targets in get_batches(data, batch_size, seq_length):
      counter += 1 
      # one-hot encode the data
      inputs = one_hot_encode(inputs, n_chars)
      # make torch tensor
      inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
      # move the tensors to the right device
      inputs, targets = inputs.to(device), targets.to(device)

      # create new variable for the hidden state to avoid backprop through the 
      # entire training history
      h = tuple([each.data for each in h])

      # clear the gradients of all optimized variables
      model.zero_grad()
      # forward pass
      output, h = model(inputs, h)
      # calculate the loss
      loss = criterion(output, targets.view(batch_size * seq_length).long())
      # backprob
      loss.backward()
      # prevent exploding gradients problem in rnn/lstm
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      # update parameters
      optimizer.step()

      # ------------ validate the model -----------------
      if counter % print_every == 0:
        # initialize the hidden state
        valid_h = model.initialize_hidden(batch_size)

        valid_losses = []

        # set the model to evaluation mode
        model.eval()
        for inputs, targets in get_batches(valid_data, batch_size, seq_length):
          # one-hot encode the inputs
          inputs = one_hot_encode(inputs, n_chars)
          # make torch tensor
          inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
          # create new variable for the hidden state to avoid backprop through the 
          # entire training history 
          valid_h = tuple([each for each in valid_h])
          # move the tensor to the right device
          inputs, targets = inputs.to(device), targets.to(device)
          # forward pass
          output, valid_h = model(inputs, valid_h)
          # calculate the batch loss
          valid_loss = criterion(output, targets.view(batch_size * seq_length).long())

          valid_losses.append(valid_loss.item())

        # reset to train mode
        model.train()

        print("Epochs: {} \tStep: {} \tTraining loss: {:.6f} \tValidation loss: {:.6f}".format(epoch+1, 
                                                                                               counter, 
                                                                                               loss.item(), 
                                                                                               np.mean(valid_losses)))

In [15]:
epochs = 50
batch_size = 128
seq_length = 200
lr=0.001
clip = 5
print_every=10

# define an optimizer
optimizer = optim.Adam(model.parameters(), lr = lr)
# define a loss function
criterion = nn.CrossEntropyLoss()

# train the model
train(model, encoded, epochs, batch_size, seq_length, criterion, optimizer, lr, clip, print_every)

Epochs: 1 	Step: 10 	Training loss: 3.253586 	Validation loss: 3.184217
Epochs: 1 	Step: 20 	Training loss: 3.171566 	Validation loss: 3.114517
Epochs: 1 	Step: 30 	Training loss: 3.134019 	Validation loss: 3.100145
Epochs: 1 	Step: 40 	Training loss: 3.137975 	Validation loss: 3.096791
Epochs: 1 	Step: 50 	Training loss: 3.101921 	Validation loss: 3.092978
Epochs: 1 	Step: 60 	Training loss: 3.133132 	Validation loss: 3.092322
Epochs: 1 	Step: 70 	Training loss: 3.123014 	Validation loss: 3.089268
Epochs: 2 	Step: 80 	Training loss: 3.101340 	Validation loss: 3.083387
Epochs: 2 	Step: 90 	Training loss: 3.089215 	Validation loss: 3.071867
Epochs: 2 	Step: 100 	Training loss: 3.067941 	Validation loss: 3.037249
Epochs: 2 	Step: 110 	Training loss: 3.011845 	Validation loss: 2.970101
Epochs: 2 	Step: 120 	Training loss: 2.910742 	Validation loss: 2.847421
Epochs: 2 	Step: 130 	Training loss: 2.810991 	Validation loss: 2.745389
Epochs: 2 	Step: 140 	Training loss: 2.709513 	Validation lo

### Test

In [16]:
def predict(model, char, h=None, top_k=None):
  """Given an input character, returns the predicted next character and hidden state"""

  x = np.array([[model.char_to_int[char]]])
  x = one_hot_encode(x, len(model.chars))
  inputs = torch.from_numpy(x)
  inputs = inputs.to(device)

  # detach hidden state from history
  h = tuple([each.data for each in h])
  # output of the model
  out, h = model(inputs, h)

  # character probabilities
  p = F.softmax(out, dim=1).data 

  if train_on_gpu: p = p.cpu()

  # get top charactors
  if top_k is None:
    top_ch = np.arange(len(model.chars))
  else:
    p, top_ch = p.topk(top_k)
    top_ch = top_ch.numpy().squeeze()

  # randomly select the probable next characters
  p = p.numpy().squeeze()
  char = np.random.choice(top_ch, p = p/p.sum())

  return model.int_to_char[char], h

In [17]:
def sample(model, size, prime="The", top_k=None):

  model.to(device)

  model.eval()
  # run through the prime characters
  chars = [ch for ch in prime]
  h = model.initialize_hidden(1)
  for ch in prime:
    char, h = predict(model, ch, h, top_k=top_k)

  chars.append(char)

  for i in range(size):
    char, h = predict(model, chars[-1], h, top_k=top_k)
    chars.append(char)

  return "".join(chars)

In [18]:
print(sample(model, 2000, prime="Anna", top_k=10))

Anna had
been still done; he wlated himself and going out of his
heart.

About his
place in haste to take his head his caresses and smalleways, and the only
mere more dividing his face
with shooting,
later of the same terms with it
almost in the little principle to the pausen, who degined, for that with his land of tasting complete letter of the
men after the lip was the probanch from half home and had
been put a leg when one in which she was not
sincere, and that it would be in his minute, and was in annoyons in the same
warms mean to
be seen that it was he did not asked off the considerity
of his character, all the principles of the direction of which the while was all
time to come about him, though he would be subject that she were coming again, but
they lift. Sergey Ivanovitch had long been
brought for the dishanges when there
were painting
for
what he was so saying. So that if they was sevired from the
peolle of the prince.... To act at the better of her son and his mace at the fa

In [19]:
# save the trained model
model_name = "char_rnn.model"

checkpoint = {"n_hidden": model.n_hidden,
              "n_layers": model.n_layers,
              "state_dict": model.state_dict(),
              "tokens": model.chars}

with open(model_name, "wb") as f:
  torch.save(checkpoint, f)

In [20]:
# load saved model
with open("char_rnn.model", "rb") as f:
  checkpoint = torch.load(f)

# sample using trained model
loaded = RNN(checkpoint['tokens'], n_hidden=checkpoint["n_hidden"], n_layers=checkpoint["n_layers"])
loaded.load_state_dict(checkpoint["state_dict"])
print(sample(loaded, size = 2000, prime="And Levin said", top_k=10))

And Levin said this.

"Well, we'll
dean of it."

Alexey Alexandrovitch
had not mentioned. At suches for some corridig he, and considering what was he did some to have been fancied for this at the child
and in the whole
crowd would say to herself his capital education on love of love. When the child before he began helpatilly, with a letter she had the rut that he had been hair that wished the sound of
his capable, she sent to the
detail at the sight of the bed had been fif all
circles, and standing and coath up their starss on the
big great children. And while
it she was definitely, he saw that he thought it since the party
thought that he had been to talk about, on the picture till
he had not been in love with the matter as no day,
and
felt
that the country
means of all her hossess, of coldness. She heard
what the men were stupid, the shriek of
changal in the supporitions that
always did to say anything a tring, woman she had chores. To definite her harry with words) of what she shoul