In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Exercise 8

<img src="./images/08.png" width=800>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.autonotebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from utils import train_network, View, set_seed, weight_reset
import mlflow
from torchinfo import summary
import os

  from tqdm.autonotebook import tqdm


In [None]:
# Keep the MLflow tracking store in a notebook-relative directory. The location
# is published through the environment first, then read back for MLflow itself.
os.environ['MLFLOW_TRACKING_URI'] = './mlruns07_8'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

In [None]:
# Make 'Exercise07_8' the active MLflow experiment for the runs started below.
mlflow.set_experiment('Exercise07_8')

2025/06/27 07:43:52 INFO mlflow.tracking.fluent: Experiment with name 'Exercise07_7' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_07/mlruns07_7/521635938366561066', creation_time=1750997632775, experiment_id='521635938366561066', last_update_time=1750997632775, lifecycle_stage='active', name='Exercise07_7', tags={}>

In [3]:
# Ask cuDNN to pick deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True
# Seed through the project helper imported from utils
# (presumably seeds torch / numpy / random — TODO confirm against utils.set_seed).
set_seed(42)

In [4]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset and DataLoader

In [5]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import re

# Not used by the cells visible here; kept in case a later cell relies on it.
all_data = []
# Download the Shakespeare sample (about 100k characters). The context manager
# closes the HTTP response even when reading raises, instead of leaving the
# connection open until garbage collection.
with urlopen("https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt") as resp:
    shakespear_100k = resp.read()
# Decode the raw bytes and lower-case the text so the character vocabulary stays small.
shakespear_100k = shakespear_100k.decode('utf-8').lower()

In [6]:
# Character -> integer id, i.e. the vocab $\Sigma$. dict.fromkeys keeps one entry
# per distinct character in first-seen order, so ids follow order of first appearance.
vocab2indx = {ch: idx for idx, ch in enumerate(dict.fromkeys(shakespear_100k))}

# Inverse mapping (integer id -> character) for turning indices back into text.
indx2vocab = {idx: ch for ch, idx in vocab2indx.items()}

print("Vocab Size: ", len(vocab2indx))
print("Total Characters:", len(shakespear_100k))

Vocab Size:  36
Total Characters: 99993


### Original Dataset: start at everywhere

In [None]:
class AutoRegressiveDataset(Dataset):
    """
    Autoregressive dataset cut from one long source string.

    The string is split into consecutive, non-overlapping chunks of MAX_CHUNK
    characters. Item idx is the pair (chunk, same chunk shifted one character
    to the right), both encoded as int64 tensors through the module-level
    vocab2indx mapping.
    """

    def __init__(self, large_string, MAX_CHUNK=500):
        """
        large_string: the long source sequence the chunks are extracted from
        MAX_CHUNK: the number of characters in every chunk
        """
        self.doc = large_string
        self.MAX_CHUNK = MAX_CHUNK

    def __len__(self):
        # Hold one character back so the final chunk still has a full shifted label.
        usable_chars = len(self.doc) - 1
        return usable_chars // self.MAX_CHUNK

    def __getitem__(self, idx):
        # Chunk idx covers the half-open character range [begin, end).
        begin = idx * self.MAX_CHUNK
        end = begin + self.MAX_CHUNK

        # Inputs are the chunk itself; labels are the same window moved one character ahead.
        inputs = [vocab2indx[ch] for ch in self.doc[begin:end]]
        labels = [vocab2indx[ch] for ch in self.doc[begin + 1:end + 1]]

        return torch.tensor(inputs, dtype=torch.int64), torch.tensor(labels, dtype=torch.int64)
#Caption: Creating a dataset for autoregressive problems from a large text corpus. We assume the corpus exists as one long string, and it is OK to concatenate multiple files together into one long string since our chunks are smaller than most documents are anyway. 

In [None]:
# Baseline data: fixed-stride chunks of 250 characters, served in shuffled batches.
batch_size = 128
autoRegData_everywhere = AutoRegressiveDataset(shakespear_100k, MAX_CHUNK=250)
autoReg_loader_everywhere = DataLoader(
    autoRegData_everywhere,
    batch_size=batch_size,
    shuffle=True,
)

### New Dataset: start at new lines

In [None]:
class NewLineAutoRegressiveDataset(Dataset):
    """
    Creates an autoregressive dataset from one single, long, source sequence by breaking it up into "chunks".
    Each chunk starts at the beginning of a line: either position 0 of the document
    (when it is not itself a newline) or the character right after a newline.
    Chunks may overlap, since every eligible line start yields its own chunk.
    Characters are encoded through the module-level vocab2indx mapping.
    """

    def __init__(self, large_string, MAX_CHUNK=500):
        """
        large_string: the original long source sequence that chunks will be extracted from
        MAX_CHUNK: the exact length of every chunk (inputs and labels alike).
        """
        self.doc = large_string
        self.MAX_CHUNK = MAX_CHUNK

        # A chunk starting at `s` reads doc[s : s + MAX_CHUNK] for x and
        # doc[s + 1 : s + MAX_CHUNK + 1] for y, so s + MAX_CHUNK + 1 characters
        # must exist. Anything later would give a label one element short, which
        # cannot be stacked with the other items of a batch.
        last_valid_start = len(self.doc) - (self.MAX_CHUNK + 1)

        self.valid_start_indices = []
        # The very first line has no preceding newline, so handle it explicitly.
        if last_valid_start >= 0 and self.doc[0] != '\n':
            self.valid_start_indices.append(0)
        for i, char in enumerate(self.doc):
            # i + 1 is the first character of the line that follows this newline.
            if char == '\n' and i + 1 <= last_valid_start:
                self.valid_start_indices.append(i + 1)

    def __len__(self):
        # One item per eligible line start.
        return len(self.valid_start_indices)

    def __getitem__(self, idx):
        """Return (x, y) int64 tensors of length MAX_CHUNK; y is x shifted by one character."""
        start_char_index = self.valid_start_indices[idx]

        # Grab the input sub-string for x (length MAX_CHUNK)
        sub_string_x = self.doc[start_char_index : start_char_index + self.MAX_CHUNK]
        # Grab the label sub-string for y (shifted by 1, same length)
        sub_string_y = self.doc[start_char_index + 1 : start_char_index + self.MAX_CHUNK + 1]

        # Convert to integers based on our vocab
        x = [vocab2indx[c] for c in sub_string_x]
        y = [vocab2indx[c] for c in sub_string_y]

        return torch.tensor(x, dtype=torch.int64), torch.tensor(y, dtype=torch.int64)

In [None]:
# Variant data: 250-character chunks that each begin at a line start, in shuffled batches.
batch_size = 128
autoRegData_newline = NewLineAutoRegressiveDataset(shakespear_100k, MAX_CHUNK=250)
autoReg_loader_newline = DataLoader(
    autoRegData_newline,
    batch_size=batch_size,
    shuffle=True,
)

## Model

In [None]:
class AutoRegressiveGRU(nn.Module):
    """
    Character-level autoregressive model: an embedding, a stack of GRU cells
    (each followed by LayerNorm), and a small MLP head that predicts the next
    token at every time step.
    """

    def __init__(self, num_embeddings, embd_size, hidden_size, layers=1):
        """
        num_embeddings: vocabulary size (also the number of output classes)
        embd_size: dimension of the token embeddings
        hidden_size: dimension of every GRU hidden state
        layers: number of stacked GRU cells
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embd = nn.Embedding(num_embeddings, embd_size)
        # The first cell consumes embeddings; the remaining ones consume hidden states.
        self.layers = nn.ModuleList([nn.GRUCell(embd_size, hidden_size)] +
                                    [nn.GRUCell(hidden_size, hidden_size) for i in range(layers-1)])
        self.norms = nn.ModuleList([nn.LayerNorm(hidden_size) for i in range(layers)])

        self.pred_class = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),  # (B, *, D)
            nn.LeakyReLU(),
            nn.LayerNorm(hidden_size),  # (B, *, D)
            nn.Linear(hidden_size, num_embeddings)  # (B, *, D) -> (B, *, VocabSize)
        )

    def initHiddenStates(self, B):
        """
        Creates an initial (all-zero) hidden state list for the RNN layers.

        B: the batch size for the hidden states.

        The states are allocated on the same device and with the same dtype as
        the model's own parameters, so the model keeps working after .to(...),
        .double(), etc. without relying on a module-level `device` variable.
        """
        reference = self.embd.weight
        return [torch.zeros(B, self.hidden_size, device=reference.device, dtype=reference.dtype)
                for _ in range(len(self.layers))]

    def step(self, x_in, h_prevs=None):
        """
        Runs a single time step through every layer.

        x_in: the input for this current time step and has shape (B) if the values need
            to be embedded, and (B, D) if they have already been embedded.

        h_prevs: a list of hidden state tensors each with shape (B, self.hidden_size) for each
            layer in the network. These contain the current hidden state of the RNN layers and
            will be updated (in place) by this call. When None, zero states are created.

        Returns (predictions of shape (B, VocabSize), the updated h_prevs list).
        """
        # Prep both arguments to be in their final form
        if len(x_in.shape) == 1:  # (B), we need to embed it
            x_in = self.embd(x_in)  # now (B, D)

        if h_prevs is None:
            h_prevs = self.initHiddenStates(x_in.shape[0])

        # Process the input: each layer's normalised output feeds the next layer
        for l in range(len(self.layers)):
            h_prev = h_prevs[l]
            h = self.norms[l](self.layers[l](x_in, h_prev))

            h_prevs[l] = h
            x_in = h
        # Make predictions about the token
        return self.pred_class(x_in), h_prevs

    def forward(self, input):
        """
        input: integer token ids with shape (B, T).

        Returns the per-step predictions with shape (B, T, VocabSize).
        """
        # What is the batch size?
        B = input.size(0)
        # What is the max number of time steps?
        T = input.size(1)

        x = self.embd(input)  # (B, T, D)

        # Initial hidden states
        h_prevs = self.initHiddenStates(B)

        last_activations = []
        for t in range(T):
            x_in = x[:, t, :]  # (B, D)
            preds, h_prevs = self.step(x_in, h_prevs)
            last_activations.append(preds)

        last_activations = torch.stack(last_activations, dim=1)  # (B, T, VocabSize)

        return last_activations

In [None]:
# Two stacked GRU layers with 32-dimensional character embeddings and 128 hidden units.
model = AutoRegressiveGRU(len(vocab2indx), 32, 128, layers=2).to(device)

# Gradient clipping by value: every gradient element is limited to [-2, 2]
# by a hook that runs when that parameter's gradient is computed.
for param in model.parameters():
    param.register_hook(lambda grad: grad.clamp(-2, 2))

## Training

In [13]:
def CrossEntLossTime(x, y):
    """
    Cross-entropy summed over the time dimension.

    x: output with shape (B, T, V)
    y: labels with shape (B, T)

    Each time step contributes its batch-averaged cross-entropy; the per-step
    values are added together (not averaged) to form the returned loss.
    """
    criterion = nn.CrossEntropyLoss()
    # One scalar per position in the sequence, accumulated starting from 0.
    per_step_losses = (criterion(x[:, t, :], y[:, t]) for t in range(x.size(1)))
    return sum(per_step_losses)

In [None]:
loss_func = CrossEntLossTime
epochs = 5
# Hyper-parameters logged to MLflow for every run.
params = {
    'device': device,
    # loss_func is a plain function here, whose class name is just 'function';
    # log its own name instead, and fall back to the class name for callables
    # (e.g. nn.Module instances) that have no __name__.
    'loss_func': getattr(loss_func, '__name__', loss_func.__class__.__name__),
    'epochs': epochs,
    'batch_size': batch_size,
    }

In [None]:
# Experiment name -> training DataLoader for that dataset variant.
dataloaders = dict(
    start_at_everywhere=autoReg_loader_everywhere,
    start_at_newlines=autoReg_loader_newline,
)

In [None]:
# Train the same architecture once per dataset variant, one MLflow run each.
# Results are kept per experiment so the variants can be compared afterwards
# (`results` still holds the most recent run, as before).
results_by_experiment = {}
for experiment, dataloader in dataloaders.items():
    print(experiment)
    # Apply the project's weight_reset helper to every sub-module
    # (presumably re-initialises parameters so runs do not share weights — TODO confirm).
    model.apply(weight_reset)
    optimizer = optim.AdamW(model.parameters())
    # torchinfo's keyword is `input_size`; the misspelt `inpt_size` was swallowed
    # by **kwargs, so no forward pass was traced. The model takes integer token
    # ids of shape (B, T), hence a long-typed (batch_size, 250) input.
    # The summary contains box-drawing characters, so write it as UTF-8 explicitly.
    with open('model_summary.txt', 'w', encoding='utf-8') as f:
        f.write(str(summary(model, input_size=(batch_size, 250), dtypes=[torch.long])))
    with mlflow.start_run(nested=True, run_name=experiment):
        params['optimizer'] = optimizer.defaults
        mlflow.log_artifact('model_summary.txt')
        mlflow.log_params(params)

        results = train_network(
            model=model,
            optimizer=optimizer,
            loss_func=loss_func,
            train_loader=dataloader,
            epochs=epochs,
            device=device,
        )
        results_by_experiment[experiment] = results

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 100%|██████████| 1/1 [00:33<00:00, 33.98s/it]


## Results

Yes, I absolutely think this change would significantly alter the characteristics of the generated output. Here's why:

Improved Coherence at the Start of Generations:

Original Dataset: The original dataset starts each chunk at a fixed multiple of the chunk size, so a chunk can begin anywhere in the text — often mid-word or mid-sentence — and the model is trained on inputs that open with fragments. When generating text, the model might therefore produce output that feels abruptly cut off or grammatically incomplete at the beginning of a generation, as it learned to complete fragments.
New Dataset: By forcing each training sequence to begin at the start of a new line, the model will learn to generate text that naturally flows from a "fresh" start. This is crucial for tasks like generating paragraphs, dialogue, or code snippets where new lines often signify new ideas or complete statements. The generated output is likely to be more grammatically sound and contextually appropriate from its very first characters.
Learning Line-Level Structure and Pacing:

Original Dataset: The original dataset didn't explicitly reinforce the concept of line breaks. The model might have learned some implicit patterns related to line breaks, but it wasn't a primary signal.
New Dataset: The new dataset emphasizes the importance of new lines. The model will be exposed to patterns of how sentences and phrases begin after a line break. This could lead to generated text that better respects line formatting, pacing, and the typical structure of written content (e.g., poetry, dialogue, prose where new lines indicate a new speaker or thought).
Potential Impact on Vocabulary Usage at Start of Lines:

Models often learn associations between starting words/characters and the context that follows. By consistently starting after a newline, the model might develop a stronger understanding of the typical words or phrases that begin a new line in the training data (e.g., common sentence starters, names in dialogue). This could subtly influence the vocabulary and phrasing at the beginning of generated sequences.
Slightly Different Statistical Distribution of Input:

Even though the overall character distribution remains the same, the sequences the model sees are now structurally different. The model will no longer see sequences like "ing the " or "s of th", but rather sequences that consistently start with the first character of a line (e.g., "The", "And", "Enter", a speaker's name, etc.). This change in the statistical distribution of the input data will inevitably affect the learned parameters of the model and, consequently, the generated output.
In summary, the NewLineAutoRegressiveDataset encourages the model to learn more about the structure and conventions of text at the line level. This should result in generated text that is more coherent, natural-sounding, and adheres better to typical textual formatting, especially at the beginning of generated passages.