<a href="https://colab.research.google.com/github/prabhsuratsingh/Recurrent-Neural-Networks/blob/master/RNN_Generative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas matplotlib torch



In [2]:
!wget https://www.gutenberg.org/files/1268/1268-0.txt

--2025-12-01 17:23:29--  https://www.gutenberg.org/files/1268/1268-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1151796 (1.1M) [text/plain]
Saving to: ‘1268-0.txt’


2025-12-01 17:23:32 (836 KB/s) - ‘1268-0.txt’ saved [1151796/1151796]



In [3]:
import numpy as np

with open('1268-0.txt', 'r', encoding="utf8") as fp:
  text = fp.read()

start_idx = text.find("THE MYSTERIOUS ISLAND")
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]
char_set = set(text)
print(f'Total Length : {len(text)}')
print(f'Unique Characters : {len(char_set)}')

Total Length : 1112310
Unique Characters : 80


In [4]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)

print(f'Text Encoded shape : {text_encoded.shape}')

print(text[:15], '== Encoding ==>', text_encoded[:15])
print(text_encoded[15:21], '== Reverse ==>', ''.join(char_array[text_encoded[15:21]]))

Text Encoded shape : (1112310,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> ISLAND


In [5]:
for ex in text_encoded[:5]:
  print(f'{ex} -> {char_array[ex]}')

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [6]:
import torch
from torch.utils.data import Dataset

seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size + 1)]

class TextDataset(Dataset):
  def __init__(self, text_chunks):
    self.text_chunks = text_chunks

  def __len__(self):
    return len(self.text_chunks)

  def __getitem__(self, idx):
    text_chunk = self.text_chunks[idx]
    return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [7]:
for i, (seq, target) in enumerate(seq_dataset):
  print(f'Input (x) : {repr(''.join(char_array[seq]))}')
  print(f'Output (y) : {repr(''.join(char_array[target]))}')
  print()
  if i == 1:
    break

Input (x) : 'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
Output (y) : 'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

Input (x) : 'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
Output (y) : 'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [9]:
from torch.utils.data import DataLoader

batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)

In [16]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(
      self,
      vocab_size,
      embed_dim,
      rnn_hidden_size
  ):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.rnn_hidden_size = rnn_hidden_size
    self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
    self.fc = nn.Linear(rnn_hidden_size, vocab_size)\

  def forward(self, x, hidden, cell):
    out = self.embedding(x).unsqueeze(1)
    out, (hidden, cell) = self.rnn(out, (hidden, cell))
    out = self.fc(out).reshape(out.size(0), -1)

    return out, hidden, cell

  def init_hidden(self, batch_size):
    hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
    cell = torch.zeros(1, batch_size, self.rnn_hidden_size)

    return hidden, cell

In [17]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [18]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
num_epochs = 10000
torch.manual_seed(1)

for epoch in range(num_epochs):
  hidden, cell = model.init_hidden(batch_size)
  seq_batch, target_batch = next(iter(seq_dl))
  optimizer.zero_grad()
  loss = 0

  for c in range(seq_length):
    pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
    loss += loss_fn(pred, target_batch[:, c])
  loss.backward()
  optimizer.step()
  loss = loss.item() / seq_length
  if epoch % 500 == 0:
    print(f"Epoch {epoch} loss: {loss:.4f}")

Epoch 0 loss: 2.2958
Epoch 500 loss: 1.3883


In [None]:
from torch.distributions.categorical import Categorical

def sample(
    model,
    starting_str,
    len_generated_text=500,
    scale_factor=1.0
):
  encoded_input = torch.tensor(
      [char2int[s] for s in starting_str]
  )
  encoded_input = torch.reshape(
      encoded_input, (1, -1)
  )

  generated_str = starting_str

  model.eval()
  hidden, cell = model.init_hidden(1)
  for c in range(len(starting_str)-1):
    _, hidden, cell = model(
        encoded_input[:, c].view(1), hidden, cell
    )

  last_char = encoded_input[:, -1]
  for i in range(len_generated_text):
    logits, hidden, cell = model(
        last_char.view(1), hidden, cell
    )
    logits = torch.squeeze(logits, 0)
    scaled_logits = logits * scale_factor
    m = Categorical(logits=scaled_logits)
    last_char = m.sample()
    generated_str += str(char_array[last_char])

  return generated_str

In [None]:
torch.manual_seed(1)
print(sample(model, starting_str='The island'))