In [37]:
# Re-import edited local modules (e.g. utils) automatically before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Exercise 9

<img src="./images/09.png" width=800>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.autonotebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from utils import train_network, View, set_seed
import mlflow
from torchinfo import summary
import os

  from tqdm.autonotebook import tqdm


In [None]:
# Store MLflow runs in a directory relative to this notebook; the location is
# published through the environment variable and then handed to MLflow.
os.environ['MLFLOW_TRACKING_URI'] = './mlruns07_8'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

In [None]:
# Select (or create) the MLflow experiment that the training run is logged under.
# NOTE(review): this notebook is Exercise 9 and the run below is named 'exercise_9',
# yet the experiment is 'Exercise07_8' — confirm whether that is intentional.
mlflow.set_experiment('Exercise07_8')

2025/06/27 07:43:52 INFO mlflow.tracking.fluent: Experiment with name 'Exercise07_7' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_07/mlruns07_7/521635938366561066', creation_time=1750997632775, experiment_id='521635938366561066', last_update_time=1750997632775, lifecycle_stage='active', name='Exercise07_7', tags={}>

In [3]:
# Reproducibility: request deterministic cuDNN kernels and fix the random seed.
torch.backends.cudnn.deterministic = True
set_seed(42)  # helper imported from utils; presumably seeds torch/numpy RNGs — verify

In [4]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset and DataLoader

In [5]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import re

all_data = []  # NOTE(review): not used by any visible cell; kept in case a later cell relies on it

# Download the corpus. The context manager closes the HTTP response as soon as
# the body has been read instead of leaving the connection open.
with urlopen("https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt") as resp:
    shakespear_100k = resp.read()

# Decode the raw bytes and lower-case the text to keep the character vocabulary small.
shakespear_100k = shakespear_100k.decode('utf-8').lower()

In [6]:
# The vocabulary $\Sigma$: every distinct character receives the next free
# integer id, handed out in order of first appearance in the corpus.
vocab2indx = {}
for ch in shakespear_100k:
    # setdefault only inserts when the character has not been seen before
    vocab2indx.setdefault(ch, len(vocab2indx))

# Inverse mapping, used to turn predicted ids back into characters.
indx2vocab = {idx: ch for ch, idx in vocab2indx.items()}

print("Vocab Size: ", len(vocab2indx))
print("Total Characters:", len(shakespear_100k))

Vocab Size:  36
Total Characters: 99993


### New Dataset: start at new lines

In [None]:
class NewLineAutoRegressiveDataset(Dataset):
    """
    Creates an autoregressive dataset from one single, long, source sequence by breaking it up into "chunks".
    Each chunk starts at the beginning of a line (position 0 or right after a newline).

    Every item is a pair (x, y) of int64 tensors of length MAX_CHUNK, where y is x shifted
    one character to the right (y[t] is the character that follows x[t]).
    """

    def __init__(self, large_string, MAX_CHUNK=500, vocab=None):
        """
        large_string: the original long source sequence that chunks will be extracted from
        MAX_CHUNK: the length of every chunk (both x and y).
        vocab: optional mapping character -> integer id. When omitted, the module-level
            `vocab2indx` is looked up at item-access time (the original behaviour).
        """
        self.doc = large_string
        self.MAX_CHUNK = MAX_CHUNK
        self.vocab = vocab

        # A chunk starting at s reads doc[s : s + MAX_CHUNK + 1]: MAX_CHUNK characters
        # for x plus one extra so that the shifted target y has the same length.
        # Hence the last admissible start is len(doc) - (MAX_CHUNK + 1).
        last_valid_start = len(self.doc) - (self.MAX_CHUNK + 1)

        self.valid_start_indices = []
        # The very first line has no preceding newline, so it is handled separately.
        if last_valid_start >= 0 and self.doc[0] != '\n':
            self.valid_start_indices.append(0)
        for i, char in enumerate(self.doc):
            # Start right after each newline, but only if a full x AND y window fits;
            # a shorter y would make the default batch collation fail.
            if char == '\n' and i + 1 <= last_valid_start:
                self.valid_start_indices.append(i + 1)

    def __len__(self):
        """Number of line starts from which a full chunk can be extracted."""
        return len(self.valid_start_indices)

    def __getitem__(self, idx):
        """Return the (input, target) id tensors for the idx-th valid line start."""
        start = self.valid_start_indices[idx]
        vocab = self.vocab if self.vocab is not None else vocab2indx

        # Encode the MAX_CHUNK + 1 character window once, then slice it into x and y.
        ids = [vocab[c] for c in self.doc[start : start + self.MAX_CHUNK + 1]]

        x = torch.tensor(ids[:-1], dtype=torch.int64)
        y = torch.tensor(ids[1:], dtype=torch.int64)
        return x, y

In [None]:
# Mini-batch size shared by the loader and the logged run parameters.
batch_size = 128

# Cut the corpus into 250-character windows that each begin at a line start,
# and serve them in shuffled mini-batches for training.
autoRegData_newline = NewLineAutoRegressiveDataset(shakespear_100k, MAX_CHUNK=250)
autoReg_loader_newline = DataLoader(
    dataset=autoRegData_newline,
    batch_size=batch_size,
    shuffle=True,
)

## Model

In [None]:
class AutoRegressiveGRU(nn.Module):
    """
    Auto-regressive sequence model built from a stack of GRU cells.

    Each token is embedded, passed through `layers` GRUCell + LayerNorm stages, and the
    top hidden state is mapped to logits over the vocabulary by a small MLP head.
    """

    def __init__(self, num_embeddings, embd_size, hidden_size, layers=1):
        """
        num_embeddings: vocabulary size (also the number of output classes)
        embd_size: dimension of the token embeddings
        hidden_size: dimension of every GRU hidden state
        layers: number of stacked GRU cells
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embd = nn.Embedding(num_embeddings, embd_size)
        # The first cell consumes embeddings; every later cell consumes the previous hidden state.
        self.layers = nn.ModuleList([nn.GRUCell(embd_size, hidden_size)] +
                                    [nn.GRUCell(hidden_size, hidden_size) for _ in range(layers - 1)])
        self.norms = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(layers)])

        self.pred_class = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),  # (B, *, D)
            nn.LeakyReLU(),
            nn.LayerNorm(hidden_size),  # (B, *, D)
            nn.Linear(hidden_size, num_embeddings)  # (B, *, D) -> (B, *, VocabSize)
        )

    def initHiddenStates(self, B):
        """
        Creates an initial hidden state list for the RNN layers.

        B: the batch size for the hidden states.

        Returns one zero tensor of shape (B, self.hidden_size) per layer, allocated on the
        same device and with the same dtype as the model's parameters, so the model does
        not rely on a module-level `device` variable.
        """
        reference = self.embd.weight
        return [reference.new_zeros(B, self.hidden_size) for _ in range(len(self.layers))]

    def step(self, x_in, h_prevs=None):
        """
        Advances the network by a single time step.

        x_in: the input for this current time step and has shape (B) if the values need
            to be embedded, and (B, D) if they have already been embedded.

        h_prevs: a list of hidden state tensors each with shape (B, self.hidden_size) for each
            layer in the network. These contain the current hidden state of the RNN layers and
            will be updated (in place, entry by entry) by this call. When None, zero states
            are created.

        Returns (logits of shape (B, VocabSize), the updated h_prevs list).
        """
        if x_in.dim() == 1:  # (B), we need to embed it
            x_in = self.embd(x_in)  # now (B, D)

        if h_prevs is None:
            h_prevs = self.initHiddenStates(x_in.shape[0])

        # Feed the input up through the stack; each layer's normalized hidden state
        # is both stored as its new state and used as the next layer's input.
        for l in range(len(self.layers)):
            h = self.norms[l](self.layers[l](x_in, h_prevs[l]))
            h_prevs[l] = h
            x_in = h

        # Make predictions about the token
        return self.pred_class(x_in), h_prevs

    def forward(self, input):
        """
        input: integer token ids of shape (B, T).

        Returns the logits for every time step, shape (B, T, VocabSize).
        """
        B = input.size(0)  # batch size
        T = input.size(1)  # number of time steps

        x = self.embd(input)  # (B, T, D)

        # Initial hidden states
        h_prevs = self.initHiddenStates(B)

        last_activations = []
        for t in range(T):
            x_in = x[:, t, :]  # (B, D)
            preds, h_prevs = self.step(x_in, h_prevs)
            last_activations.append(preds)

        return torch.stack(last_activations, dim=1)  # (B, T, VocabSize)

In [None]:
# Two-layer character model: 32-d embeddings feeding 128-d GRU hidden states.
model = AutoRegressiveGRU(
    num_embeddings=len(vocab2indx),
    embd_size=32,
    hidden_size=128,
    layers=2,
).to(device)

# Gradient clipping: every parameter's gradient is clamped element-wise to [-2, 2]
# as it is computed during the backward pass.
for param in model.parameters():
    param.register_hook(lambda grad: grad.clamp(min=-2, max=2))

## Training

In [13]:
def CrossEntLossTime(x, y):
    """
    Cross-entropy summed over the time dimension.

    x: output with shape (B, T, V)
    y: labels with shape (B, T)

    For each time step the batch-averaged cross-entropy is computed, and the
    per-step values are added together. Returns 0 when T is 0.
    """
    criterion = nn.CrossEntropyLoss()
    # One loss term per position in the sequence, accumulated left to right.
    return sum(criterion(x[:, step, :], y[:, step]) for step in range(x.size(1)))

In [None]:
loss_func = CrossEntLossTime
epochs = 5

# Hyper-parameters logged to MLflow alongside the run.
params = {
    'device': device,
    # A plain function's class is just `function`, so use the callable's own name;
    # fall back to the class name for loss objects such as nn.Module instances.
    'loss_func': getattr(loss_func, '__name__', type(loss_func).__name__),
    'epochs': epochs,
    'batch_size': batch_size,
    }

In [None]:
optimizer = optim.AdamW(model.parameters())

# Write an architecture summary to disk so it can be attached to the run.
# The model embeds integer token ids of shape (B, T), so the dummy input used by
# torchinfo must be 2-D and of integer dtype (torchinfo defaults to float input).
model_summary = summary(
    model,
    input_size=(batch_size, autoRegData_newline.MAX_CHUNK),
    dtypes=[torch.long],
    device=device,
)
# utf-8 because the rendered table contains non-ASCII rule characters.
with open('model_summary.txt', 'w', encoding='utf-8') as f:
    f.write(str(model_summary))

with mlflow.start_run(nested=True, run_name='exercise_9'):
    params['optimizer'] = optimizer.defaults
    mlflow.log_artifact('model_summary.txt')
    mlflow.log_params(params)

    # train_network comes from utils; its return value is kept for later inspection.
    results = train_network(
        model=model,
        optimizer=optimizer,
        loss_func=loss_func,
        train_loader=autoReg_loader_newline,
        epochs=epochs,
        device=device,
    )

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 100%|██████████| 1/1 [00:33<00:00, 33.98s/it]


## Feature Extracting with LastTimeStep

LastTimeStep class from chapter 04:

In [None]:
class LastTimeStep(nn.Module):
    """
    Pulls out the final-time-step hidden activations of the top RNN layer from
    the output produced by a PyTorch RNN module.
    """
    def __init__(self, rnn_layers=1, bidirectional=False):
        super().__init__()
        self.rnn_layers = rnn_layers
        self.num_driections = 2 if bidirectional else 1

    def forward(self, input):
        # The RNN result is (out, h_t) or, for LSTMs, (out, (h_t, c_t));
        # either way only h_t is needed.
        hidden = input[1]
        if type(hidden) is tuple:
            hidden = hidden[0]

        # Per the docs h_t is '(num_layers * num_directions, batch, hidden_size)'.
        n_batch = hidden.shape[1]

        # Separate layers from directions, then keep only the top layer.
        per_layer = hidden.view(self.rnn_layers, self.num_driections, n_batch, -1)
        top_layer = per_layer[self.rnn_layers - 1]

        # Put batch first and merge the direction and feature dimensions.
        return top_layer.permute(1, 0, 2).reshape(n_batch, -1)

In [None]:
last_timestep_extractor = LastTimeStep(rnn_layers=len(model.layers), bidirectional=False).to(device)

# Iterate in dataset order (no shuffling) so that row i of the feature matrix
# corresponds to chunk i of autoRegData_newline when inspecting clusters later.
feature_loader = DataLoader(autoRegData_newline, batch_size=batch_size, shuffle=False)

all_sentence_features = []
model.eval()
with torch.no_grad():
    for inputs, _ in tqdm(feature_loader, desc="Extracting features"):
        # .to() is not in-place for tensors: the result must be re-assigned.
        inputs = inputs.to(device)

        # forward() only returns logits, so unroll the sequence with step() to
        # obtain the per-layer hidden states after the final character.
        embedded = model.embd(inputs)  # (B, T, D)
        h_prevs = model.initHiddenStates(inputs.size(0))
        for t in range(inputs.size(1)):
            predictions, h_prevs = model.step(embedded[:, t, :], h_prevs)

        # Stack to (num_layers, B, hidden_size), the layout LastTimeStep expects for h_t.
        stacked_final_hidden_states = torch.stack(h_prevs, dim=0)
        sentence_features_batch = last_timestep_extractor((predictions, stacked_final_hidden_states))

        all_sentence_features.append(sentence_features_batch.cpu())

# One row per chunk, shape (len(autoRegData_newline), hidden_size).
final_feature_tensor = torch.cat(all_sentence_features, dim=0)

## Clustering

"Note: You may want to sub-sample a smaller number of sentences to make your clustering algorithm run faster."

Clustering algorithms, especially on high-dimensional data (your hidden_size could be 128 or more), can be computationally expensive.
If you have tens of thousands or hundreds of thousands of sentences, running clustering on all of them might take a very long time or consume too much memory.
Sub-sampling: Randomly select a subset of your sentences (e.g., 5,000 or 10,000) to perform the clustering on. The results from this subset should still give you a good indication of the overall clusters present in your data.

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to look for. TODO: tune (e.g. with an elbow or silhouette
# analysis); 10 is only a starting point.
N_CLUSTERS = 10

# Fixed random_state and explicit n_init keep the clustering reproducible
# across runs and scikit-learn versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
# scikit-learn works on NumPy arrays; the features are already on the CPU.
kmeans.fit(final_feature_tensor.numpy())
labels = kmeans.labels_

## Results

Why does this work? (Intuition)
The hidden states of an RNN (like a GRU) are often thought of as a "memory" of the sequence processed so far. By the time the model has processed an entire sentence, its final hidden state summarizes the information it has learned from that sentence. If the model is good at predicting the next character, it implies that its hidden states are rich and discriminative. Sentences that are "similar" in content, style, or structure will likely lead to similar final hidden states, which is what clustering algorithms look for. This process is a common way to derive "sentence embeddings" for various NLP tasks.

The overall goal is to use the learned internal representations of a generative model to perform an analytical task (clustering) to gain insights into the structure and patterns within the text data.