<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/seq_to_seq/char_level_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# upload kaggle API key from your local machine
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"shazzadraihan","key":"da63bbe0f8dcb3bd7fb35034046ca758"}'}

In [2]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# use API command to download the dataset
!kaggle datasets download -d wanderdust/anna-karenina-book

Downloading anna-karenina-book.zip to /content
  0% 0.00/739k [00:00<?, ?B/s]
100% 739k/739k [00:00<00:00, 145MB/s]


In [4]:
# uncompress the dataset
!unzip -qq anna-karenina-book.zip

In [5]:
# open text file and read in some data as text
with open("/content/anna.txt", "r") as f:
  text = f.read()

text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [6]:
# import required libraries
import torch
import numpy as np

In [7]:
# check if cuda is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
  print("CUDA is not available")
else:
  print("CUDA is available")

device = torch.device('cuda') if train_on_gpu else torch.device('cpu')

CUDA is available


### Pre-process the dataset

In [8]:
# tokenization

chars = tuple(set(text))
# map each int to char
int_to_char = dict(enumerate(chars))
# map each char to int
char_to_int = {ch:idx for idx, ch in int_to_char.items()}

# encode 
encoded = np.array([char_to_int[ch] for ch in text])
encoded[:100]

array([33, 24, 14, 52,  4, 39, 76,  8, 32,  2,  2,  2, 45, 14, 52, 52, 10,
        8, 68, 14, 21, 58, 60, 58, 39,  6,  8, 14, 76, 39,  8, 14, 60, 60,
        8, 14, 60, 58,  7, 39, 61,  8, 39, 11, 39, 76, 10,  8,  1, 27, 24,
       14, 52, 52, 10,  8, 68, 14, 21, 58, 60, 10,  8, 58,  6,  8,  1, 27,
       24, 14, 52, 52, 10,  8, 58, 27,  8, 58,  4,  6,  8, 55, 19, 27,  2,
       19, 14, 10, 25,  2,  2, 34, 11, 39, 76, 10,  4, 24, 58, 27])

In [9]:
def one_hot_encode(arr, n_labels):
    
    # initialize the the encoded array with zeros
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # fill with ones where appropriate
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # reshape to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [10]:
def get_batches(arr, batch_size, seq_length):
    total_batch_size = batch_size * seq_length
    # total number of batches
    n_batches = len(arr)//total_batch_size
    
    # keep enough characters to make full batches
    arr = arr[:n_batches * total_batch_size]
    # reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    for n in range(0, arr.shape[1], seq_length):
        # features
        x = arr[:, n:n+seq_length]
        # targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y
  

### Model

In [11]:
import torch.nn as nn
import torch.optim as optim

class CharRNN(nn.Module):
  def __init__(self, tokens, n_hidden, n_layers, drop_prob, lr):
    super().__init__()
    self.drop_prob = drop_prob 
    self.n_layers = n_layers 
    self.n_hidden = n_hidden 
    self.lr = lr 

    self.chars = tokens 
    self.int_to_char = dict(enumerate(self.chars))
    self.char_to_int = {ch:idx for idx, ch in self.int_to_char.items()}

    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
    self.dropout = nn.Dropout(drop_prob)
    self.fc = nn.Linear(n_hidden, len(self.chars))


  def forward(self, x, hidden):
    output, hidden = self.lstm(x, hidden)
    output = self.dropout(output)
    output = output.contiguous().view(-1, self.n_hidden)
    output = self.fc(output)
    return output, hidden

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data

    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
    
    return hidden

### Train

In [11]:
def train(model, data, epochs, batch_size, seq_length, lr, clip, valid_size, print_every=10):
  model.train()
  
  optimizer = optim.Adam(model.parameters(), lr = lr)
  criterion = nn.CrossEntropyLoss()

  valid_idx = int(len(data))*(1-valid_size)
  train_data, valid_data = data[:valid_idx], data[valid_idx:]

  model.to(device)