<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/seq_to_seq/char_level_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# upload kaggle API key from your local machine
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle.json': b'{"username":"shazzadraihan","key":"da63bbe0f8dcb3bd7fb35034046ca758"}'}

In [17]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [18]:
# use API command to download the dataset
!kaggle datasets download -d wanderdust/anna-karenina-book

anna-karenina-book.zip: Skipping, found more recently modified local copy (use --force to force download)


In [19]:
# uncompress the dataset
!unzip -qq anna-karenina-book.zip

replace anna.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [20]:
# open text file and read in some data as text
with open("/content/anna.txt", "r") as f:
  text = f.read()

text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [21]:
# import required libraries
import torch
import numpy as np

In [22]:
# check if cuda is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
  print("CUDA is not available")
else:
  print("CUDA is available")

device = torch.device('cuda') if train_on_gpu else torch.device('cpu')

CUDA is available


### Pre-process the dataset

In [23]:
# tokenization

chars = tuple(set(text))
# map each int to char
int_to_char = dict(enumerate(chars))
# map each char to int
char_to_int = {ch:idx for idx, ch in int_to_char.items()}

# encode 
encoded = np.array([char_to_int[ch] for ch in text])
encoded[:100]

array([15, 67,  6, 78, 37, 11, 19, 43, 27, 20, 20, 20, 47,  6, 78, 78, 39,
       43, 70,  6, 10, 77,  1, 77, 11, 12, 43,  6, 19, 11, 43,  6,  1,  1,
       43,  6,  1, 77, 80, 11, 21, 43, 11, 18, 11, 19, 39, 43,  8, 29, 67,
        6, 78, 78, 39, 43, 70,  6, 10, 77,  1, 39, 43, 77, 12, 43,  8, 29,
       67,  6, 78, 78, 39, 43, 77, 29, 43, 77, 37, 12, 43, 30, 25, 29, 20,
       25,  6, 39, 50, 20, 20, 16, 18, 11, 19, 39, 37, 67, 77, 29])

In [24]:
def one_hot_encode(arr, n_labels):
    
    # initialize the the encoded array with zeros
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # fill with ones where appropriate
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # reshape to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [25]:
def get_batches(arr, batch_size, seq_length):
    total_batch_size = batch_size * seq_length
    # total number of batches
    n_batches = len(arr)//total_batch_size
    
    # keep enough characters to make full batches
    arr = arr[:n_batches * total_batch_size]
    # reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    for n in range(0, arr.shape[1], seq_length):
        # features
        x = arr[:, n:n+seq_length]
        # targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y
  

### Model

In [51]:
import torch.nn as nn
import torch.optim as optim

class RNN(nn.Module):
  def __init__(self, tokens, n_hidden, n_layers, drop_prob=0.5, lr=0.001):
    super().__init__()
    self.n_layers = n_layers 
    self.n_hidden = n_hidden 
    self.drop_prob = drop_prob
    self.lr = lr 

    # create character dictionaries
    self.chars = tokens 
    self.int_to_char = dict(enumerate(self.chars))
    self.char_to_int = {ch:idx for idx, ch in self.int_to_char.items()}

    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
    self.dropout = nn.Dropout(drop_prob)
    self.fc = nn.Linear(n_hidden, len(self.chars))


  def forward(self, x, hidden):
    out, hidden = self.lstm(x, hidden)
    out = self.dropout(out)
    out = out.contiguous().view(-1, self.n_hidden)
    out = self.fc(out)
    return out, hidden

  def initialize_hidden(self, batch_size):
    weight = next(self.parameters()).data

    # initialize hidden state and cell state of LSTM with zeros (n_layers * batch_size * n_hidden)
    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
    
    return hidden

In [52]:
n_hidden = 512 
n_layers = 2 
drop_prob=0.5
lr=0.001

model = RNN(chars, n_hidden, n_layers, drop_prob, lr)
print(model)

RNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


### Train

In [28]:
def train(model, data, epochs, batch_size, seq_length, lr, clip, valid_size, print_every=10):

  model.train()

  params = model.parameters()
  optimizer = optim.Adam(params, lr = lr)
  criterion = nn.CrossEntropyLoss()

  # split the dataset
  valid_idx = int(len(data)*(1-valid_size))
  train_data, valid_data = data[:valid_idx], data[valid_idx:]

  model.to(device)

  counter = 0
  n_chars = len(model.chars)
  for epoch in range(epochs):

    # initialize the hidden state
    h = model.initialize_hidden(batch_size)

    for inputs, targets in get_batches(data, batch_size, seq_length):
      counter += 1 
      # one-hot encode the data
      inputs = one_hot_encode(inputs, n_chars)
      # make torch tensor
      inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
      # move the tensors to the right device
      inputs, targets = inputs.to(device), targets.to(device)

      # create new variable for the hidden state to avoid backprop through the 
      # entire training history
      h = tuple([each.data for each in h])

      # clear the gradients of all optimized variables
      model.zero_grad()
      # forward pass
      output, h = model(inputs, h)
      # calculate the loss
      loss = criterion(output, targets.view(batch_size * seq_length).long())
      # backprob
      loss.backward()
      # prevent exploding gradients problem in rnn/lstm
      nn.utils.clip_grad_norm(model.parameters(), clip)
      # update parameters
      optimizer.step()

      # ------------ validate the model -----------------
      if counter % print_every == 0:
        # initialize the hidden state
        valid_h = model.init_hidden(batch_size)

        valid_losses = []

        # set the model to evaluation mode
        model.eval()
        for inputs, targets in get_batches(valid_data, batch_size, seq_length):
          # one-hot encode the inputs
          inputs = one_hot_encode(inputs, n_chars)
          # make torch tensor
          inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
          # create new variable for the hidden state to avoid backprop through the 
          # entire training history 
          valid_h = tuple([each for each in valid_h])
          # move the tensor to the right device
          inputs, targets = inputs.to(device), targets.to(device)
          # forward pass
          output, valid_h = model(inputs, valid_h)
          # calculate the batch loss
          valid_loss = criterion(output, targets.view(batch_size * seq_length).long())

          valid_losses.append(valid_loss.item())
        
        # reset to train mode
        model.train()

        print("Epochs: {} \tStep: {} \tTraining loss: {:.6f} \tValidation loss: {:.6f}".format(epoch+1, 
                                                                                               counter, 
                                                                                               loss.item(), 
                                                                                               np.mean(valid_losses)))

In [39]:
epochs = 50
batch_size = 128
seq_length = 100
n_epochs = 20
lr=0.001
clip = 5
val_size=0.1
print_every=10

# train the model
train(model, encoded, epochs, batch_size, seq_length, lr, clip, val_size, print_every=10)

  nn.utils.clip_grad_norm(model.parameters(), clip)


Epochs: 1 	Step: 10 	Training loss: 1.642606 	Validation loss: 1.573425
Epochs: 1 	Step: 20 	Training loss: 1.626478 	Validation loss: 1.551778
Epochs: 1 	Step: 30 	Training loss: 1.634040 	Validation loss: 1.539947
Epochs: 1 	Step: 40 	Training loss: 1.611581 	Validation loss: 1.534872
Epochs: 1 	Step: 50 	Training loss: 1.622869 	Validation loss: 1.527410
Epochs: 1 	Step: 60 	Training loss: 1.569574 	Validation loss: 1.521552
Epochs: 1 	Step: 70 	Training loss: 1.630716 	Validation loss: 1.516182
Epochs: 1 	Step: 80 	Training loss: 1.585607 	Validation loss: 1.512136
Epochs: 1 	Step: 90 	Training loss: 1.586133 	Validation loss: 1.507111
Epochs: 1 	Step: 100 	Training loss: 1.611381 	Validation loss: 1.500932
Epochs: 1 	Step: 110 	Training loss: 1.610609 	Validation loss: 1.495279
Epochs: 1 	Step: 120 	Training loss: 1.582781 	Validation loss: 1.490929
Epochs: 1 	Step: 130 	Training loss: 1.572485 	Validation loss: 1.485762
Epochs: 1 	Step: 140 	Training loss: 1.583190 	Validation lo

In [40]:
import torch.nn.functional as F

def predict(model, char, h=None, top_k=None):

  x = np.array([[model.char_to_int[char]]])
  x = one_hot_encode(x, len(model.chars))
  inputs = torch.from_numpy(x)

  inputs = inputs.to(device)

  h = tuple([each.data for each in h])
  out, h = model(inputs, h)

  p = F.softmax(out, dim=1).data 

  if train_on_gpu:
    p = p.cpu()

  if top_k is None:
    top_ch = np.arange(len(model.chars))
  else:
    p, top_ch = p.topk(top_k)
    top_ch = top_ch.numpy().squeeze()

  p = p.numpy().squeeze()
  char = np.random.choice(top_ch, p = p/p.sum())
  return model.int_to_char[char], h

In [41]:
def sample(model, size, prime="The", top_k=None):
  model.to(device)
  
  model.eval()
  
  chars = [ch for ch in prime]
  h = model.init_hidden(1)
  for ch in prime:
    char, h = predict(model, ch, h, top_k=top_k)

  chars.append(char)

  for ii in range(size):
    char, h = predict(model, chars[-1], h, top_k=top_k)
    chars.append(char)

  return "".join(chars)

In [42]:
print(sample(model, 2000, prime="Anna", top_k=5))

Anna,
how it was the same teacher, was that he must think about her own
sort. And it was so successful. This in an old peinters would not
high and corserve himself that, and with an exasperated partners.

The liver was still he could not be asking an old particular wither
of an interest in her acquaintances, and that there was nothing so like
a shirt, to be disappointed. As he had such an answer. She set the
hall was gown away, and as she would have come to stop it," said Anna, smiling,
with a smile. "It's a meaning face, as it was better than all the same."

"Yes, it was not tired."

"It's so," she thought, "that I don't stretch her in the hand of it.
That is to get in," she added, so it was still more angry at the
simple and election, and went out.

And the sick man he had been continually staying with his father,
who had not spent, and with the happiest consciousness
of her sisters, and this was, so he was seeing him to be
clearly, and that he had a continual gaily cheek in a stort.

In [43]:
model_name = "char_rnn.model"

checkpoint = {"n_hidden": model.n_hidden,
              "n_layers": model.n_layers,
              "state_dict": model.state_dict(),
              "tokens": model.chars}

with open(model_name, "wb") as f:
  torch.save(checkpoint, f)

In [44]:
with open("char_rnn.model", "rb") as f:
  checkpoint = torch.load(f)

loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint["n_hidden"], n_layers=checkpoint["n_layers"])
loaded.load_state_dict(checkpoint["state_dict"])
print(sample(loaded, size = 2000, prime="And Levin said", top_k=5))

And Levin said:

"You must give him a girl and would bucht on the beginning!"

"Why do you want?"

"I haven't the same," said Levin.

"Oh, this woman has told me a step defect on all the particular of
his position and so much affected by a sort of men of a treather, and
to manage to answer the professor."

"Why is it you had a chesce to the problem? What do you say, I describe
myself on that story."

"What a sense of teachers think of it. He doesn't been in this
position in which, and I drive up with her into the wedding
set to see her. They don't but have any, but he's so delighted."

Alexey Alexandrovitch got up, when he was all open was despising in her arms
in his corners, and shook her face, and settling at the same time in
all with which he had already sent the politicial committee on his
son, who had not must be dull say, and a signs of conversation with which
he came the hall--who should say that he was at the same time. Anna with
his handsome, station in his handsome waters, s