In [None]:
# !mkdir data/harry_potter_txt
!wget "https://raw.githubusercontent.com/priyammaz/PyTorch-Adventures/main/data/harry_potter_txt/Book%201%20-%20The%20Philosopher's%20Stone.txt"

--2025-04-29 13:01:47--  https://raw.githubusercontent.com/priyammaz/PyTorch-Adventures/main/data/harry_potter_txt/Book%201%20-%20The%20Philosopher's%20Stone.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 492161 (481K) [text/plain]
Saving to: ‘Book 1 - The Philosopher's Stone.txt’


2025-04-29 13:01:48 (28.4 MB/s) - ‘Book 1 - The Philosopher's Stone.txt’ saved [492161/492161]



In [None]:
import os
import sys
from pathlib import Path

import json

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [None]:
# The wget cell above saves the book into the current working directory, so the
# file is addressed relative to it instead of through a Colab-only absolute path.
path1 = "Book 1 - The Philosopher's Stone.txt"

# Accumulator for the cleaned corpus (a single book is appended here).
allText = ""

# The `with` block closes the file on exit; no explicit close() is needed.
with open(path1, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

# Drop every line that contains the marker "Page".
kept_lines = [line for line in raw_lines if "Page" not in line]

# Join the lines with spaces, strip the newline characters and lowercase.
text = " ".join(kept_lines).replace("\n", "").lower()

# Collapse runs of spaces: split on a single space and discard empty pieces.
text = " ".join(word for word in text.split(" ") if len(word) > 0)

allText += text


In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the index assignment is deterministic across runs.
unique_chars = sorted(set(allText))

# Forward (character -> index) and reverse (index -> character) lookup tables.
char2idx = dict(zip(unique_chars, range(len(unique_chars))))
idx2char = dict(enumerate(unique_chars))

In [None]:
class DataBuilder:
    """Randomly slice a text into fixed-length windows for next-character training.

    Each sample is a window of ``seq_len`` consecutive characters. The model
    input is the window without its last character and the label is the window
    without its first character, so both have length ``seq_len - 1`` and the
    label is the input shifted one position to the right.

    Characters are converted to integer ids through the module-level
    ``char2idx`` mapping.

    Args:
        seq_len: Number of characters in each sampled window (must be >= 2).
        text: Corpus to sample from. When ``None`` the module-level ``allText``
            is used, looked up at construction time so that re-running the
            data-loading cell is picked up without redefining this class.

    Raises:
        ValueError: If ``seq_len`` is smaller than 2 or ``text`` is shorter
            than ``seq_len``.
    """

    def __init__(self, seq_len=100, text=None):
        if text is None:
            # Resolved here rather than in the signature: a default argument
            # would be frozen to whatever `allText` was when the class was defined.
            text = allText

        if seq_len < 2:
            raise ValueError(f"seq_len must be at least 2, got {seq_len}")
        if len(text) < seq_len:
            raise ValueError(
                f"text has {len(text)} characters, which is fewer than seq_len={seq_len}"
            )

        self.seq_len = seq_len
        self.text = text
        self.file_length = len(text)

    def grab_random_sample(self):
        """Return one random ``(input, label)`` pair of 1-D integer tensors.

        Both tensors have length ``seq_len - 1``.
        """
        # randint's upper bound is exclusive, so +1 makes the final window
        # (the one ending on the last character of the text) reachable.
        start = np.random.randint(0, self.file_length - self.seq_len + 1)
        end = start + self.seq_len
        text_slice = self.text[start:end]

        input_text = text_slice[:-1]
        label = text_slice[1:]

        input_text = torch.tensor([char2idx[c] for c in input_text], dtype=torch.long)
        label = torch.tensor([char2idx[c] for c in label], dtype=torch.long)

        return input_text, label

    def grab_random_batch(self, batch_size):
        """Return ``batch_size`` random samples stacked along a new first dimension.

        Returns:
            A tuple ``(input_texts, labels)``, each of shape
            ``(batch_size, seq_len - 1)``.
        """
        samples = [self.grab_random_sample() for _ in range(batch_size)]

        input_texts = torch.stack([sample[0] for sample in samples])
        labels = torch.stack([sample[1] for sample in samples])

        return input_texts, labels

In [None]:
class LSTMForGeneration(nn.Module):
    """Character-level language model: embedding -> multi-layer LSTM -> linear head.

    Args:
        embedding_dim: Size of each character embedding.
        num_characters: Vocabulary size. When ``None`` it is taken from the
            module-level ``char2idx`` at construction time.
        hidden_size: Hidden size of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        device: Stored on the instance as ``self.device``. The module itself is
            not moved here; call ``.to(device)`` on the model for that.
    """

    def __init__(self, embedding_dim=128,
                 num_characters=None,
                 hidden_size=256,
                 n_layers=3,
                 device="cpu"):
        super().__init__()

        if num_characters is None:
            # Looked up at call time so the class does not freeze the
            # vocabulary size that existed when it was defined.
            num_characters = len(char2idx)

        self.embedding_dim = embedding_dim
        self.num_characters = num_characters  # vocabulary size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.device = device

        self.embedding = nn.Embedding(num_characters, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=n_layers,
                            batch_first=True)

        self.fc = nn.Linear(hidden_size, num_characters)

        # dim=-1 normalises over the vocabulary axis (one distribution per token).
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return unnormalised logits for every position of ``x``.

        Args:
            x: Integer tensor of character ids, shape ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, num_characters)``. No
            softmax is applied.
        """
        x = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        # op: (batch_size, seq_len, hidden_size)
        # hn, cn: (n_layers, batch_size, hidden_size) -- unused here
        op, (hn, cn) = self.lstm(x)
        logits = self.fc(op)  # (batch_size, seq_len, num_characters)
        return logits

    def write(self, text, max_characters, greedy=False):
        """Generate text that continues the prompt ``text``.

        The prompt is fed through the LSTM once to build up the recurrent
        state; afterwards only the most recently generated character is fed
        back in at each step. Character/id conversion uses the module-level
        ``char2idx`` and ``idx2char`` mappings.

        Args:
            text: Non-empty prompt; every character must be a key of ``char2idx``.
            max_characters: Number of characters to append to the prompt.
            greedy: If true, always pick the most likely next character;
                otherwise sample from the predicted distribution.

        Returns:
            The prompt followed by ``max_characters`` generated characters.

        Raises:
            ValueError: If ``text`` is empty.
        """
        if len(text) == 0:
            raise ValueError("text must contain at least one character to seed generation")

        # Follow the parameters' actual device/dtype so generation keeps working
        # after the model has been moved with .to(...).
        weight = self.embedding.weight
        idx = torch.tensor([char2idx[c] for c in text], dtype=torch.long, device=weight.device)
        # Unbatched recurrent state: (n_layers, hidden_size).
        hidden = torch.zeros(self.n_layers, self.hidden_size, dtype=weight.dtype, device=weight.device)
        cell = torch.zeros(self.n_layers, self.hidden_size, dtype=weight.dtype, device=weight.device)

        # Generation never backpropagates, so skip building the autograd graph.
        with torch.no_grad():
            for i in range(max_characters):
                # First step consumes the whole prompt, later steps only the newest id.
                selected_idx = idx if i == 0 else idx[-1:]

                x = self.embedding(selected_idx)                    # (steps, embedding_dim)
                out, (hidden, cell) = self.lstm(x, (hidden, cell))  # (steps, hidden_size)
                logits = self.fc(out[-1])                           # (num_characters,)

                if greedy:
                    # keepdim=True yields shape (1,), which can be concatenated
                    # onto idx just like the sampled id below.
                    idx_next = torch.argmax(logits, dim=-1, keepdim=True)
                else:
                    probs = self.softmax(logits)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (1,)

                idx = torch.cat([idx, idx_next])

        return "".join(idx2char[int(c)] for c in idx)


# Sanity check: sample 100 characters from a freshly initialised (untrained) model.
model = LSTMForGeneration()
text = "hello"
model.write(text, max_characters=100, greedy=False)


'hello(;/c(■”a4zs■bga‘y?•■gd05■!3vhsfx"e)•?3ydwrpiy—xrdygnvh’d1\\0•:zakls•:l(x"eut\\s,■j—? fay—mu■?i476fe0rh'

In [None]:
# --- Hyperparameters -------------------------------------------------------
iterations = 3000        # optimisation steps (use a small value such as 8 for a quick smoke test)
max_len = 300            # NOTE(review): not referenced in this cell; DataBuilder() below uses its default seq_len
evaluate_interval = 300  # print the loss and a sample every this many iterations
embedding_dim = 128
hidden_size = 256
n_layers = 3
lr = 0.003
batch_size = 128

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Model, optimiser, loss, data ------------------------------------------
model = LSTMForGeneration(embedding_dim, len(char2idx), hidden_size, n_layers, DEVICE).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

dataset = DataBuilder()

# --- Training loop ---------------------------------------------------------
for iteration in range(iterations):
    input_texts, labels = dataset.grab_random_batch(batch_size=batch_size)
    input_texts, labels = input_texts.to(DEVICE), labels.to(DEVICE)

    optimizer.zero_grad()
    output = model(input_texts)  # (batch, seq, vocab)
    # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq).
    output = output.transpose(1, 2)

    loss = loss_fn(output, labels)

    loss.backward()
    optimizer.step()

    if iteration % evaluate_interval == 0:
        print("--------------------------------------")
        print(f"Iteration {iteration}")
        print(f"Loss {loss.item()}")
        # Sampling is inference only; disabling autograd avoids building a
        # graph across all 200 generation steps that would never be used.
        with torch.no_grad():
            generated_text = model.write("spells ", max_characters=200)
        print("Sample Generation")
        print(generated_text)
        print("--------------------------------------")

--------------------------------------
Iteration 0
Loss 4.036876201629639
Sample Generation
spells iw"l“xb‘”16h”•a?t?,vph4u0apjl.•")q1hvi8;j’a/—■e9"y6";x'’(9fc‘hx(/‘rr■q00b“lrk:■/fqaw7' z■b5(::hyjhf!\/“\/5als2;’)'!x"8v;—-‘?,ad8‘6”“c39jw),orml1 6e'de6ti8e’vvcfk"y,h‘r•8:,c2jxgga?'’?9"68’ea f (!mw (n‘
--------------------------------------
--------------------------------------
Iteration 300
Loss 1.5681543350219727
Sample Generation
spells and there tofter were be,” ...” mavate as wall, on the diend have day comclassing of couldn’t a calling witereon a tall seven to a before conce enestioned out, could normiled lesside had: harry was gl
--------------------------------------
--------------------------------------
Iteration 600
Loss 1.2880799770355225
Sample Generation
spells the thind you-know i show. seak and fer again?” ben look. so retome his tobes weren’t na grange all mowk to cabby grand what sropped adring, the jatch that were here shout. it’ll keet sweak into the s
----------------