<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/seq_to_seq/char_level_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API key from your local machine.
# The next cell copies a file named kaggle.json from the working directory,
# so the uploaded file should have that name.
from google.colab import files
files.upload()

In [None]:
# make a kaggle dir (-p: do nothing if it already exists, so the cell can be
# re-run), copy the API key to it
# and make sure the file is only readable by yourself (chmod 600)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Use the Kaggle CLI to download the dataset archive.
# The following cell expects the archive as anna-karenina-book.zip in the
# current working directory.
!kaggle datasets download -d wanderdust/anna-karenina-book

In [None]:
# uncompress the dataset
# -qq: quiet; -o: overwrite existing files without prompting, so re-running
# the cell does not block on an interactive "replace?" question
!unzip -qq -o anna-karenina-book.zip

In [None]:
# Open the text file and read the whole book into memory as one string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
with open("/content/anna.txt", "r", encoding="utf-8") as f:
  text = f.read()

# preview the first 100 characters
text[:100]

In [None]:
# import required libraries
import torch
import numpy as np

In [None]:
# Detect whether a CUDA device can be used and report the result.
train_on_gpu = torch.cuda.is_available()
print("CUDA is available" if train_on_gpu else "CUDA is not available")

# Device handle used by the rest of the notebook.
device = torch.device('cuda' if train_on_gpu else 'cpu')

### Pre-process the dataset

In [None]:
# tokenization

# Unique characters of the corpus. They are sorted so that the char <-> int
# mapping is identical on every run; iterating a set of strings directly can
# give a different order in each interpreter session.
chars = tuple(sorted(set(text)))
# map each int to char
int_to_char = dict(enumerate(chars))
# map each char to int
char_to_int = {ch: idx for idx, ch in int_to_char.items()}

# encode the whole text as an array of integer ids
encoded = np.array([char_to_int[ch] for ch in text])
encoded[:100]

In [None]:
def one_hot_encode(arr, n_labels):
    """One-hot encode an integer array.

    Args:
        arr: numpy array of integer class ids (any shape).
        n_labels: number of classes, i.e. the size of the new last axis.

    Returns:
        float32 array of shape ``arr.shape + (n_labels,)`` holding 1.0 at the
        position given by each id and 0.0 everywhere else.
    """
    # work on a flat list of ids: one output row per input element
    flat_ids = arr.reshape(-1)
    n_rows = flat_ids.shape[0]

    encoded_rows = np.zeros((n_rows, n_labels), dtype=np.float32)
    # set the column selected by each id in its own row
    encoded_rows[np.arange(n_rows), flat_ids] = 1.0

    # restore the input's shape, with the label axis appended
    return encoded_rows.reshape(arr.shape + (n_labels,))

In [None]:
def get_batches(arr, batch_size, seq_length):
    """Yield (features, targets) mini-batches cut from a 1-D encoded array.

    Args:
        arr: 1-D numpy array of encoded characters.
        batch_size: number of rows (parallel sequences) per batch.
        seq_length: number of time steps per batch.

    Yields:
        Tuples ``(x, y)`` of shape ``(batch_size, seq_length)``. ``y`` is ``x``
        shifted left by one step; its last column is the column that follows
        the window, or the first column of the data for the final window.
    """
    chars_per_batch = batch_size * seq_length
    full_batches = len(arr) // chars_per_batch

    # drop the tail that does not fill a whole batch, then lay the data out
    # as batch_size rows
    grid = arr[:full_batches * chars_per_batch].reshape((batch_size, -1))
    n_cols = grid.shape[1]

    for start in range(0, n_cols, seq_length):
        stop = start + seq_length
        # features
        x = grid[:, start:stop]
        # targets, shifted by one
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # the final window has no following column: wrap to the first one
        y[:, -1] = grid[:, stop] if stop < n_cols else grid[:, 0]
        yield x, y
  

### Model

In [None]:
import torch.nn as nn
import torch.optim as optim

class CharRNN(nn.Module):
  """Character-level LSTM model: LSTM -> dropout -> linear layer over the vocabulary."""

  def __init__(self, tokens, n_hidden, n_layers, drop_prob, lr):
    """Build the network and the character lookup tables.

    Args:
      tokens: sequence of the distinct characters; its length is the input
        and output size of the network.
      n_hidden: number of hidden units per LSTM layer.
      n_layers: number of stacked LSTM layers.
      drop_prob: dropout probability, used inside the LSTM and on its output.
      lr: learning rate; only stored on the instance.
    """
    super().__init__()
    self.drop_prob = drop_prob
    self.n_layers = n_layers
    self.n_hidden = n_hidden
    self.lr = lr

    # character vocabulary and lookup tables in both directions
    self.chars = tokens
    self.int_to_char = dict(enumerate(self.chars))
    self.char_to_int = {ch: idx for idx, ch in self.int_to_char.items()}

    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
    self.dropout = nn.Dropout(drop_prob)
    self.fc = nn.Linear(n_hidden, len(self.chars))


  def forward(self, x, hidden):
    """Run one batch through the network.

    Args:
      x: input tensor fed to the batch-first LSTM.
      hidden: (h, c) LSTM state tuple, e.g. from ``init_hidden``.

    Returns:
      ``(output, hidden)`` where ``output`` has one row of ``len(self.chars)``
      scores per (sequence, time step) pair and ``hidden`` is the new state.
    """
    output, hidden = self.lstm(x, hidden)
    output = self.dropout(output)
    # stack all time steps of all sequences into rows for the linear layer
    output = output.contiguous().view(-1, self.n_hidden)
    output = self.fc(output)
    return output, hidden

  def init_hidden(self, batch_size):
    """Create a zeroed (h, c) LSTM state for ``batch_size`` sequences.

    The tensors take their dtype and device from the model's own parameters,
    so the state always lives on the same device as the model and no
    notebook-level ``device`` variable is needed.

    Returns:
      Tuple of two tensors of shape ``(n_layers, batch_size, n_hidden)``.
    """
    weight = next(self.parameters())
    shape = (self.n_layers, batch_size, self.n_hidden)

    hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

    return hidden

In [None]:
# model hyperparameters
n_hidden = 512
n_layers = 2
drop_prob = 0.5
lr = 0.001

# build the network over the character vocabulary and show its layers
model = CharRNN(tokens=chars, n_hidden=n_hidden, n_layers=n_layers, drop_prob=drop_prob, lr=lr)
print(model)

### Train

In [None]:
def train(model, data, epochs, batch_size, seq_length, lr, clip, valid_size, print_every=10):
  """Train ``model`` on encoded text, periodically reporting validation loss.

  Args:
    model: network exposing ``chars``, ``init_hidden(batch_size)`` and a
      forward call returning ``(output, hidden)``.
    data: 1-D numpy array of encoded characters.
    epochs: number of passes over the training split.
    batch_size: number of sequences per mini-batch.
    seq_length: number of time steps per mini-batch.
    lr: learning rate for the Adam optimizer.
    clip: maximum gradient norm used for gradient clipping.
    valid_size: fraction of ``data`` (taken from the end) held out for validation.
    print_every: report losses every this many training steps.
  """
  model.train()

  optimizer = optim.Adam(model.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()

  # hold out the last `valid_size` fraction of the data for validation
  valid_idx = int(len(data) * (1 - valid_size))
  train_data, valid_data = data[:valid_idx], data[valid_idx:]

  model.to(device)

  counter = 0
  n_chars = len(model.chars)
  for epoch in range(epochs):
    h = model.init_hidden(batch_size)
    # iterate over the training split only, so the held-out data stays unseen
    for inputs, targets in get_batches(train_data, batch_size, seq_length):
      counter += 1
      inputs = torch.from_numpy(one_hot_encode(inputs, n_chars)).to(device)
      targets = torch.from_numpy(targets).to(device)

      # detach the state so backprop does not reach into earlier batches
      h = tuple(each.detach() for each in h)

      model.zero_grad()
      output, h = model(inputs, h)
      loss = criterion(output, targets.reshape(batch_size * seq_length).long())
      loss.backward()
      # in-place variant; the un-suffixed clip_grad_norm is deprecated
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      optimizer.step()

      if counter % print_every == 0:
        valid_h = model.init_hidden(batch_size)
        valid_losses = []

        model.eval()
        # no gradients are needed for evaluation; this also keeps the carried
        # hidden state from accumulating an autograd graph across batches
        with torch.no_grad():
          for val_inputs, val_targets in get_batches(valid_data, batch_size, seq_length):
            val_inputs = torch.from_numpy(one_hot_encode(val_inputs, n_chars)).to(device)
            val_targets = torch.from_numpy(val_targets).to(device)

            val_output, valid_h = model(val_inputs, valid_h)
            valid_loss = criterion(val_output, val_targets.reshape(batch_size * seq_length).long())

            valid_losses.append(valid_loss.item())
        model.train()

        # the validation split may be too small to form a single batch
        mean_valid_loss = np.mean(valid_losses) if valid_losses else float("nan")

        print("Epochs: {} \tStep: {} \tTraining loss: {:.6f} \tValidation loss: {:.6f}".format(epoch+1, counter, loss.item(), mean_valid_loss))

In [None]:
# training hyperparameters
epochs = 5
batch_size = 128
seq_length = 100
n_epochs = 20  # NOTE: not passed to train() below; `epochs` sets the number of passes
lr = 0.001
clip = 5
val_size = 0.1
print_every = 10

# train the model (print_every is passed through instead of a repeated literal)
train(model, encoded, epochs, batch_size, seq_length, lr, clip, val_size, print_every=print_every)