<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/seq_to_seq/char_level_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# upload kaggle API key from your local machine
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"shazzadraihan","key":"da63bbe0f8dcb3bd7fb35034046ca758"}'}

In [2]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# use API command to download the dataset
!kaggle datasets download -d wanderdust/anna-karenina-book

Downloading anna-karenina-book.zip to /content
  0% 0.00/739k [00:00<?, ?B/s]
100% 739k/739k [00:00<00:00, 134MB/s]


In [4]:
# uncompress the dataset
!unzip -qq anna-karenina-book.zip

In [5]:
# open text file and read in some data as text
with open("/content/anna.txt", "r") as f:
  text = f.read()

text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [6]:
# import required libraries
import torch
import numpy as np

In [7]:
# check if cuda is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
  print("CUDA is not available")
else:
  print("CUDA is available")

device = torch.device('cuda') if train_on_gpu else torch.device('cpu')

CUDA is available


### Pre-process the dataset

In [8]:
# tokenization

chars = tuple(set(text))
# map each int to char
int_to_char = dict(enumerate(chars))
# map each char to int
char_to_int = {ch:idx for idx, ch in int_to_char.items()}

# encode 
encoded = np.array([char_to_int[ch] for ch in text])
encoded[:100]

array([16, 67,  3, 10, 25, 37, 21, 82,  7, 20, 20, 20, 11,  3, 10, 10, 73,
       82, 33,  3, 18, 50, 13, 50, 37, 51, 82,  3, 21, 37, 82,  3, 13, 13,
       82,  3, 13, 50, 63, 37, 49, 82, 37, 52, 37, 21, 73, 82, 40, 60, 67,
        3, 10, 10, 73, 82, 33,  3, 18, 50, 13, 73, 82, 50, 51, 82, 40, 60,
       67,  3, 10, 10, 73, 82, 50, 60, 82, 50, 25, 51, 82, 31, 54, 60, 20,
       54,  3, 73, 35, 20, 20,  8, 52, 37, 21, 73, 25, 67, 50, 60])

In [9]:
# split the dataset
valid_size = 0.1

valid_idx = int(len(encoded)*(1-valid_size))
train_data, valid_data = encoded[:valid_idx], encoded[valid_idx:]

In [10]:
def one_hot_encode(arr, n_labels):
    
    # initialize the the encoded array with zeros
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # fill with ones where appropriate
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # reshape to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [11]:
def get_batches(arr, batch_size, seq_length):
    total_batch_size = batch_size * seq_length
    # total number of batches
    n_batches = len(arr)//total_batch_size
    
    # keep enough characters to make full batches
    arr = arr[:n_batches * total_batch_size]
    # reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    for n in range(0, arr.shape[1], seq_length):
        # features
        x = arr[:, n:n+seq_length]
        # targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y
  

### Model

In [12]:
import torch.nn as nn
import torch.optim as optim

class RNN(nn.Module):
  def __init__(self, tokens, n_hidden, n_layers, drop_prob=0.5, lr=0.001):
    super().__init__()
    self.n_layers = n_layers 
    self.n_hidden = n_hidden 
    self.drop_prob = drop_prob
    self.lr = lr 

    # create character dictionaries
    self.chars = tokens 
    self.int_to_char = dict(enumerate(self.chars))
    self.char_to_int = {ch:idx for idx, ch in self.int_to_char.items()}

    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
    self.dropout = nn.Dropout(drop_prob)
    self.fc = nn.Linear(n_hidden, len(self.chars))


  def forward(self, x, hidden):
    out, hidden = self.lstm(x, hidden)
    out = self.dropout(out)
    out = out.contiguous().view(-1, self.n_hidden)
    out = self.fc(out)
    return out, hidden

  def initialize_hidden(self, batch_size):
    weight = next(self.parameters()).data

    # initialize hidden state and cell state of LSTM with zeros (n_layers * batch_size * n_hidden)
    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
    
    return hidden

In [13]:
n_hidden = 512 
n_layers = 2 
drop_prob=0.5
lr=0.001

model = RNN(chars, n_hidden, n_layers, drop_prob, lr)
print(model)
model.to(device)

RNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


RNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)

### Train

In [14]:
def train(model, data, epochs, batch_size, seq_length, criterion, optimizer, lr, clip, print_every=10):

  model.train()

  counter = 0
  n_chars = len(model.chars)
  for epoch in range(epochs):

    # initialize the hidden state
    h = model.initialize_hidden(batch_size)

    for inputs, targets in get_batches(data, batch_size, seq_length):
      counter += 1 
      # one-hot encode the data
      inputs = one_hot_encode(inputs, n_chars)
      # make torch tensor
      inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
      # move the tensors to the right device
      inputs, targets = inputs.to(device), targets.to(device)

      # create new variable for the hidden state to avoid backprop through the 
      # entire training history
      h = tuple([each.data for each in h])

      # clear the gradients of all optimized variables
      model.zero_grad()
      # forward pass
      output, h = model(inputs, h)
      # calculate the loss
      loss = criterion(output, targets.view(batch_size * seq_length).long())
      # backprob
      loss.backward()
      # prevent exploding gradients problem in rnn/lstm
      nn.utils.clip_grad_norm(model.parameters(), clip)
      # update parameters
      optimizer.step()

      # ------------ validate the model -----------------
      if counter % print_every == 0:
        # initialize the hidden state
        valid_h = model.initialize_hidden(batch_size)

        valid_losses = []

        # set the model to evaluation mode
        model.eval()
        for inputs, targets in get_batches(valid_data, batch_size, seq_length):
          # one-hot encode the inputs
          inputs = one_hot_encode(inputs, n_chars)
          # make torch tensor
          inputs, targets = torch.from_numpy(inputs), torch.from_numpy(targets)
          # create new variable for the hidden state to avoid backprop through the 
          # entire training history 
          valid_h = tuple([each for each in valid_h])
          # move the tensor to the right device
          inputs, targets = inputs.to(device), targets.to(device)
          # forward pass
          output, valid_h = model(inputs, valid_h)
          # calculate the batch loss
          valid_loss = criterion(output, targets.view(batch_size * seq_length).long())

          valid_losses.append(valid_loss.item())

        # reset to train mode
        model.train()

        print("Epochs: {} \tStep: {} \tTraining loss: {:.6f} \tValidation loss: {:.6f}".format(epoch+1, 
                                                                                               counter, 
                                                                                               loss.item(), 
                                                                                               np.mean(valid_losses)))

In [15]:
epochs = 50
batch_size = 128
seq_length = 100
n_epochs = 20
lr=0.001
clip = 5
print_every=10

optimizer = optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()

# train the model
train(model, encoded, epochs, batch_size, seq_length, criterion, optimizer, lr, clip, print_every=10)

  nn.utils.clip_grad_norm(model.parameters(), clip)


Epochs: 1 	Step: 10 	Training loss: 3.287554 	Validation loss: 3.229743
Epochs: 1 	Step: 20 	Training loss: 3.173907 	Validation loss: 3.140636
Epochs: 1 	Step: 30 	Training loss: 3.138572 	Validation loss: 3.126788
Epochs: 1 	Step: 40 	Training loss: 3.125086 	Validation loss: 3.118804
Epochs: 1 	Step: 50 	Training loss: 3.118336 	Validation loss: 3.118044
Epochs: 1 	Step: 60 	Training loss: 3.109410 	Validation loss: 3.114468
Epochs: 1 	Step: 70 	Training loss: 3.108942 	Validation loss: 3.111491
Epochs: 1 	Step: 80 	Training loss: 3.127609 	Validation loss: 3.102601
Epochs: 1 	Step: 90 	Training loss: 3.120541 	Validation loss: 3.085549
Epochs: 1 	Step: 100 	Training loss: 3.059779 	Validation loss: 3.044197
Epochs: 1 	Step: 110 	Training loss: 2.977266 	Validation loss: 2.953002
Epochs: 1 	Step: 120 	Training loss: 2.881481 	Validation loss: 2.843322
Epochs: 1 	Step: 130 	Training loss: 2.779883 	Validation loss: 2.786189
Epochs: 1 	Step: 140 	Training loss: 2.687161 	Validation lo

In [16]:
import torch.nn.functional as F

def predict(model, char, h=None, top_k=None):

  x = np.array([[model.char_to_int[char]]])
  x = one_hot_encode(x, len(model.chars))
  inputs = torch.from_numpy(x)

  inputs = inputs.to(device)

  h = tuple([each.data for each in h])
  out, h = model(inputs, h)

  p = F.softmax(out, dim=1).data 

  if train_on_gpu:
    p = p.cpu()

  if top_k is None:
    top_ch = np.arange(len(model.chars))
  else:
    p, top_ch = p.topk(top_k)
    top_ch = top_ch.numpy().squeeze()

  p = p.numpy().squeeze()
  char = np.random.choice(top_ch, p = p/p.sum())
  return model.int_to_char[char], h

In [20]:
def sample(model, size, prime="The", top_k=None):
  model.to(device)
  
  model.eval()
  
  chars = [ch for ch in prime]
  h = model.initialize_hidden(1)
  for ch in prime:
    char, h = predict(model, ch, h, top_k=top_k)

  chars.append(char)

  for ii in range(size):
    char, h = predict(model, chars[-1], h, top_k=top_k)
    chars.append(char)

  return "".join(chars)

In [21]:
print(sample(model, 2000, prime="Anna", top_k=10))

Anna (what she had gone, and he, finishing his
difficulty. Only the way of those people had flown over, said himself
in the same still more often.

"No, I meant," she said to him, a steppe--obviously changing to her.
"But I suppose I'm one man say it's approaching for such the shriek of
article.... No, I'm going to the peasants), but I, supposing
that it's all there to me," he said.

He listened to his wound an offer to sound something in the rest of
her cain over the position. "Something ill was said.

"Here's the reading solitude. What would he has? This is!... I've
liked to come home!" Kitty shouted to him. This glance
with a gained humble watch was a letter with a low wife. He walked
away with him.

"Yes, yes," she went on, never being sank having from bringing his
gun in the reflection and looking at the saboves of the
carriage and white and smile that had smiled business, and saw
down on the doorway when he caught sight of his spring article.

"Yes, I shall be the pure, and all d

In [22]:
model_name = "char_rnn.model"

checkpoint = {"n_hidden": model.n_hidden,
              "n_layers": model.n_layers,
              "state_dict": model.state_dict(),
              "tokens": model.chars}

with open(model_name, "wb") as f:
  torch.save(checkpoint, f)

In [23]:
with open("char_rnn.model", "rb") as f:
  checkpoint = torch.load(f)

loaded = RNN(checkpoint['tokens'], n_hidden=checkpoint["n_hidden"], n_layers=checkpoint["n_layers"])
loaded.load_state_dict(checkpoint["state_dict"])
print(sample(loaded, size = 2000, prime="And Levin said", top_k=10))

And Levin said to
Vronsky, who had changed, trying to prove theretime than ever. The tempte
of the peasants with poscible, horse-gasted, as wondering stations.

"I know this, I'll do, that's the sound to anybody."

The meaning of the subject.

"But that's only this subject."

"Anna Arkadyevna," Kitty answered, with a lawyer's eyes sitting,
held it to the door.



Chapter 18


And the conversation was not as a fascinating day to stem serious prechous
court, of words were so much as a day before. He tried to think about
the people who did not recognize his lips, and he remembered the
most plight-defeneed matters that she could not love, and went forward to
him. In the lest minutes as to the baby all in right, in all
reposts, he felt now with a warm conduct of his own thoughts, the
fearful children, that she could not sleep for the counting house, but
a decidering, which stronger his movements, and atcompressed their
dress, and of their fearful painting his brother adoining them,
watched 