### Download the novel

In [36]:
import requests
# Plain-text edition of the novel that serves as the training corpus.
url = "https://gutenberg.org/cache/epub/73489/pg73489.txt"

# Without a timeout, requests waits indefinitely on an unresponsive server,
# which would freeze this cell (and a Restart & Run All) forever.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Store the raw bytes untouched; decoding happens when the file is read back.
    with open("book.txt", "wb") as file:
        file.write(response.content)
else:
    # Report the actual status: a non-200 reply is not necessarily a missing file.
    print(f"Download failed (HTTP status {response.status_code})")

### Determine the start and end indices of the text
#### Prepare the `char_to_idx` and `idx_to_char` dictionaries that will come in handy later

In [78]:
# The novel contains non-ASCII characters (curly quotes, '£', 'é', ...), so the
# file is decoded explicitly as UTF-8 instead of relying on the platform's
# default encoding, which differs between operating systems.
with open("book.txt", "r", encoding="utf-8") as fp:
    text = fp.read()

# Keep only the story itself: everything from its first sentence up to (but not
# including) the closing "THE END" marker. `str.index` raises ValueError if
# either marker is missing, which is the desired loud failure here.
start_idx = text.index("Once upon a time there was a little girl named Anne Wilbraham")
end_idx = text.index("THE END")
text = text[start_idx:end_idx]

# Sorted list of the distinct characters; a character's position in this list
# is its integer id.
char_set = sorted(set(text))
print(f"Total length of novel: {len(text)} \n")
print(f"Distinct characters: {char_set} \n")
print(f"Count of distinct characters: {len(char_set)} \n")

# Lookup tables in both directions: character -> id and id -> character.
char_to_idx = {char: idx for idx, char in enumerate(char_set)}
idx_to_char = dict(enumerate(char_set))

# Convert the original text to a list of integer ids, one per character.
text_encoded = [char_to_idx[char] for char in text]
print(len(text_encoded))




Total length of novel: 192221 

Distinct characters: ['\n', ' ', '!', '(', ')', '*', '+', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '£', '½', 'é', '‘', '’', '“', '”'] 

Count of distinct characters: 86 

192221


### Prepare a custom dataset

In [134]:
import torch
from torch.utils.data import Dataset
import numpy as np


class PrepareData(Dataset):
    """Sliding-window dataset for next-character prediction.

    Sample ``i`` is the pair ``(text_encoded[i : i + window_size],
    text_encoded[i + window_size])``: a window of character ids and the id of
    the character that follows it.

    Args:
        text_encoded: sequence of integer character ids.
        window_size: number of ids in each input window (must be >= 1).

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    """

    def __init__(self, text_encoded, window_size=50):
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        self.text_encoded = text_encoded
        self.window_size = window_size
        self.input, self.label = self.getdata()

    def __len__(self):
        """Number of (window, next id) samples."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return ``(window, label)`` as int64 tensors.

        Raises IndexError for an out-of-range ``idx``; plain ``for`` iteration
        over the dataset relies on that to stop.
        """
        # Windows are overlapping views into one shared buffer, so hand out
        # copies that callers can modify safely.
        return self.input[idx].clone(), self.label[idx].clone()

    def getdata(self):
        """Build all windows and labels.

        Returns:
            ``(windows, labels)`` where ``windows`` is an int64 tensor of shape
            ``(n_samples, window_size)`` and ``labels`` an int64 tensor of shape
            ``(n_samples,)``. Both are empty when the text is not longer than
            one window.
        """
        encoded = torch.as_tensor(self.text_encoded, dtype=torch.long)
        n_samples = encoded.numel() - self.window_size
        if n_samples <= 0:
            # Text too short to yield a single (window, next id) pair.
            return encoded.new_empty((0, self.window_size)), encoded.new_empty((0,))

        # `unfold` yields every window as a view, so memory stays O(len(text))
        # instead of O(len(text) * window_size). The final window has no
        # following character and is dropped.
        windows = encoded.unfold(0, self.window_size, 1)[:n_samples]
        labels = encoded[self.window_size:]
        return windows, labels
                
dataset = PrepareData(text_encoded)

# Peek at the first sample only, then stop iterating.
for i, j in dataset:
    print(i, "\n", j)
    # Decode the input window back into readable text.
    print("".join(idx_to_char[a.item()] for a in i))
    # Decode the label, i.e. the character the model should predict next.
    print(idx_to_char[j.item()])
    break

print(len(dataset))


tensor([38, 65, 54, 56,  1, 72, 67, 66, 65,  1, 52,  1, 71, 60, 64, 56,  1, 71,
        59, 56, 69, 56,  1, 74, 52, 70,  1, 52,  1, 63, 60, 71, 71, 63, 56,  1,
        58, 60, 69, 63,  1, 65, 52, 64, 56, 55,  1, 24, 65, 65]) 
 tensor(56)
Once upon a time there was a little girl named Ann
e
192171


In [135]:
from torch.utils.data import DataLoader
# Number of distinct characters; sizes the model's embedding and output layers.
vocab_size = len(char_set)

batch_size = 64

# Seed torch's global RNG before building the shuffling loader.
torch.manual_seed(42)
seq_dl = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,  # discard the final, smaller batch
)

In [150]:
import torch.nn as nn
class RNN(nn.Module):
    """Character-level LSTM that predicts the next character id.

    Pipeline: embedding -> single-layer LSTM -> linear projection of the last
    timestep onto the vocabulary.

    Args:
        vocab_size: number of distinct characters (embedding rows and logits).
        embed_dim: size of each character embedding.
        hidden_dim: size of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_dim=10, hidden_dim=16):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # (batch, seq) -> (batch, seq, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)  # -> (batch, seq, hidden_dim)
        self.linear = nn.Linear(hidden_dim, vocab_size)  # -> (batch, vocab_size)

    def forward(self, x, hidden=None):
        """Run the network on a batch of id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len) holding character ids.
            hidden: optional ``(h_0, c_0)`` LSTM state, typically the state
                returned by a previous call; ``None`` starts from zeros.

        Returns:
            ``(logits, state)`` where ``logits`` has shape (batch, vocab_size)
            and scores the character following the last timestep, and ``state``
            is the ``(h_n, c_n)`` tuple that can be passed back in as
            ``hidden`` to continue the sequence.
        """
        embedded = self.embedding(x)
        # Forward the caller's state to the LSTM; dropping it would make every
        # call start from a blank memory.
        output, state = self.lstm(embedded, hidden)
        # Only the last timestep is needed to predict the next character.
        logits = self.linear(output[:, -1, :])
        return logits, state
# Smoke test: take one batch from the loader, build the model, and display the
# shape of the logits it produces for that batch.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the notebook session.
input,label=next(iter(seq_dl))
rnn=RNN(vocab_size)
rnn(input)[0].shape

torch.Size([64, 86])

In [151]:
# Upper bound on training epochs; interrupt the kernel to stop earlier.
NUM_EPOCHS = 1000

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)

# The text-generation helper calls model.eval(); make sure the model is back in
# training mode whenever this cell is (re-)run.
rnn.train()

# Training loop
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    for inputs, labels in seq_dl:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: logits for the character following each window.
        outputs, hidden = rnn(inputs)

        # Cross-entropy between the logits and the true next-character ids.
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Print average loss per batch for this epoch
    print(f"Epoch [{epoch+1}], Loss: {total_loss / len(seq_dl):.4f}")

Epoch [1], Loss: 2.7632
Epoch [2], Loss: 2.3698
Epoch [3], Loss: 2.2650
Epoch [4], Loss: 2.2036
Epoch [5], Loss: 2.1644
Epoch [6], Loss: 2.1352
Epoch [7], Loss: 2.1127
Epoch [8], Loss: 2.0943
Epoch [9], Loss: 2.0787
Epoch [10], Loss: 2.0659
Epoch [11], Loss: 2.0548
Epoch [12], Loss: 2.0451
Epoch [13], Loss: 2.0367
Epoch [14], Loss: 2.0294
Epoch [15], Loss: 2.0226
Epoch [16], Loss: 2.0166
Epoch [17], Loss: 2.0115
Epoch [18], Loss: 2.0067
Epoch [19], Loss: 2.0027
Epoch [20], Loss: 1.9988
Epoch [21], Loss: 1.9951
Epoch [22], Loss: 1.9919
Epoch [23], Loss: 1.9890
Epoch [24], Loss: 1.9857
Epoch [25], Loss: 1.9829
Epoch [26], Loss: 1.9806
Epoch [27], Loss: 1.9784
Epoch [28], Loss: 1.9755
Epoch [29], Loss: 1.9730
Epoch [30], Loss: 1.9708
Epoch [31], Loss: 1.9689
Epoch [32], Loss: 1.9665
Epoch [33], Loss: 1.9648
Epoch [34], Loss: 1.9626
Epoch [35], Loss: 1.9607
Epoch [36], Loss: 1.9587
Epoch [37], Loss: 1.9570
Epoch [38], Loss: 1.9549
Epoch [39], Loss: 1.9535
Epoch [40], Loss: 1.9518
Epoch [41

KeyboardInterrupt: 

In [154]:
import numpy as np

def generate_next_characters(model, seed, char_to_index, index_to_char, max_length=100, temperature=1.0, context_size=None):
    """Sample text from a character-level model, starting from ``seed``, and print it.

    At every step the model is given the running context (seed plus everything
    generated so far) and the next character is sampled from its
    temperature-scaled output distribution. Re-feeding the context each step
    keeps generation conditioned on the preceding text even when the model
    does not carry recurrent state between calls.

    Args:
        model: module called as ``model(ids, hidden)`` with ``ids`` of shape
            (1, seq_len); must return ``(logits, state)`` with logits over the
            vocabulary for the next character.
        seed: non-empty starting string; every character must be a key of
            ``char_to_index`` (otherwise KeyError is raised).
        char_to_index: mapping from character to integer id.
        index_to_char: mapping from integer id to character.
        max_length: generation stops once the text (seed included) reaches
            this many characters; also the maximum number of sampling steps.
        temperature: positive softmax temperature; lower values make sampling
            more conservative, higher values more random.
        context_size: if given, only the last ``context_size`` characters are
            fed to the model at each step; ``None`` feeds the whole text so far.

    Raises:
        ValueError: if ``seed`` is empty or ``temperature`` is not positive.
    """
    if not seed:
        raise ValueError("seed must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Remember the current mode so the caller's model is not left in eval mode.
    was_training = model.training
    model.eval()  # Set the model to evaluation mode
    try:
        with torch.no_grad():
            # Ids of the seed followed by every character sampled so far.
            indices = [char_to_index[char] for char in seed]
            generated_text = seed

            for _ in range(max_length):
                window = indices[-context_size:] if context_size else indices
                context = torch.tensor([window], dtype=torch.long)

                # Forward pass from a fresh state over the running context.
                output, _state = model(context, None)

                # Sample the next character using temperature
                probabilities = torch.softmax(output.squeeze() / temperature, dim=0)
                next_index = torch.multinomial(probabilities, 1).item()

                # Convert the index to a character and extend the text.
                next_char = index_to_char[next_index]
                generated_text += next_char
                indices.append(next_index)

                # Stop at the end of a line or once the text is long enough.
                if next_char == '\n' or len(generated_text) >= max_length:
                    break
    finally:
        model.train(was_training)

    # Print the generated text
    print("Generated text:")
    print(generated_text)

# Prompt that the model is asked to continue.
seed = "I am mad"
generate_next_characters(
    model=rnn,
    seed=seed,
    char_to_index=char_to_idx,
    index_to_char=idx_to_char,
)


Generated text:
I am mad   5BheYiiRE1tukogheg,” LE). WheA,”    7 Kée. O,” 

