In [1]:
%load_ext autoreload
%autoreload 2

# Exercise 6

<img src="./images/06.png" width =800>

In [2]:
import requests, zipfile, io
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import os
import mlflow
import torch.nn as nn
import torch.optim as optim
from utils import (train_network, roc_auc_score_micro_wrapper, 
                accuracy_score_wrapper,f1_score_wrapper,
                weight_reset, set_seed)
from torchinfo import summary
from  sklearn.model_selection import train_test_split
import numpy as np

  from tqdm.autonotebook import tqdm


In [None]:
# Point MLflow at a local, file-based tracking directory. The value is stored in the
# environment variable first and then read back for set_tracking_uri.
# NOTE(review): the saved output of the set_experiment cell below reports an artifact
# location under 'mlruns_6', which does not match this URI — that output looks stale;
# re-run the notebook to refresh it.
os.environ['MLFLOW_TRACKING_URI'] = './mlruns04_6'
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI'))

In [4]:
# Route every run started later in this notebook to the 'Exercise_6' experiment.
mlflow.set_experiment('Exercise_6')

<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_04/mlruns_6/899091141742955243', creation_time=1749223413903, experiment_id='899091141742955243', last_update_time=1749223413903, lifecycle_stage='active', name='Exercise_6', tags={}>

In [5]:
# Download the PyTorch tutorial archive that contains data/names/<Language>.txt and keep
# it open as `z`; the dataset class below reads the name files straight from this handle.
zip_file_url = "https://download.pytorch.org/tutorial/data.zip"
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an immediate, readable failure
# instead of an opaque BadZipFile further down.
r = requests.get(zip_file_url, timeout=60)
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))
# Also unpack the archive into the current working directory.
z.extractall()
# Offline alternative: if data.zip has already been downloaded next to this notebook,
# replace the download above with  z = zipfile.ZipFile('./data.zip')

## Datasets and DataLoaders

In [6]:
# Module-level container for names grouped by language. It is not referenced by any other
# cell visible in this notebook (the dataset class keeps its own attribute of the same name).
namge_language_data = {}

# Names are folded to plain ASCII to keep processing simple,
# e.g. "Ślusàrski" becomes "Slusarski".
import unicodedata
import string

# Characters that survive ASCII folding: every ASCII letter plus a little punctuation.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# Position of each allowed character inside `all_letters`.
alphabet = {letter: index for index, letter in enumerate(all_letters)}

# ASCII-folding recipe, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` reduced to the characters in `all_letters`.

    The string is NFD-normalised so that accented letters split into a base letter
    followed by combining marks; combining marks (category 'Mn') and any character
    outside `all_letters` are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [
        c for c in decomposed
        if c in all_letters and unicodedata.category(c) != 'Mn'
    ]
    return ''.join(kept)

In [7]:
class LanguageNameDataset_inferred_unicode(Dataset):
    """Character-level dataset of (name, language) pairs read from a zip archive.

    Every ``data/names/<Language>.txt`` entry of the archive contributes one class;
    each line of such a file is one lower-cased sample.

    Args:
        zipfile: an open ``zipfile.ZipFile`` holding the ``data/names/*.txt`` entries.
        vocabulary: optional ``{character: index}`` mapping. When ``None`` the mapping
            is inferred from the characters that occur in the loaded names.
        unicode: when ``True`` names keep their original characters; when ``False``
            they are first folded to ASCII with ``unicodeToAscii``.

    Attributes:
        namge_language_data: ``{language: [names]}`` as loaded from the archive
            (attribute name kept as-is for compatibility).
        label_names: language names; the list position is the integer class label.
        data / labels: parallel lists of names and their integer labels.
        vocabulary: ``{character: index}`` used by ``string2inputvector``.
    """

    def __init__(self, zipfile, vocabulary=None, unicode=False):
        self.namge_language_data = {}
        self.unicode_or_not(z=zipfile, unicode=unicode)
        self.label_names = list(self.namge_language_data.keys())
        self.data = []
        self.labels = []
        for y, language in enumerate(self.label_names):
            for sample in self.namge_language_data[language]:
                self.data.append(sample)
                self.labels.append(y)
        if vocabulary is None:
            vocabulary_set = {char for name in self.data for char in name}
            # Sort before numbering: iteration order of a set of strings changes between
            # interpreter runs (string hash randomisation), which would make the
            # char -> index mapping, and any model trained on it, non-reproducible.
            vocabulary = {
                char: index for index, char in enumerate(sorted(vocabulary_set))
            }
        self.vocabulary = vocabulary

    def __len__(self):
        """Number of names in the dataset."""
        return len(self.data)

    def string2inputvector(self, input_string):
        """Encode a string as a 1-D LongTensor of vocabulary indices.

        Raises:
            KeyError: if a character is missing from ``self.vocabulary``.
        """
        T = len(input_string)
        name_vec = torch.zeros((T), dtype=torch.long)
        for pos, character in enumerate(input_string):
            name_vec[pos] = self.vocabulary[character]
        return name_vec

    def unicode_or_not(self, z, unicode=False):
        """Fill ``self.namge_language_data`` from the name files inside archive ``z``.

        Args:
            z: open ``zipfile.ZipFile``.
            unicode: keep original characters (``True``) or fold to ASCII (``False``).
        """
        for zip_path in z.namelist():
            if "data/names/" in zip_path and zip_path.endswith(".txt"):
                # Language name = file name without the directory prefix and ".txt".
                lang = zip_path[len("data/names/"):-len(".txt")]
                with z.open(zip_path) as myfile:
                    lines = str(myfile.read(), encoding='utf-8').strip().split("\n")
                if unicode:
                    lang_names = [line.lower() for line in lines]
                else:
                    lang_names = [unicodeToAscii(line).lower() for line in lines]
                self.namge_language_data[lang] = lang_names

    def __getitem__(self, index):
        """Return ``(encoded_name, label)`` for sample ``index``; both are LongTensors."""
        name = self.data[index]
        label = self.labels[index]

        label_tensor = torch.tensor(label, dtype=torch.long)
        return self.string2inputvector(name), label_tensor

In [8]:
# Build the dataset from the downloaded archive `z`. `unicode` is left at its default
# (False), so names are folded to ASCII, and the vocabulary is inferred from the data.
dataset = LanguageNameDataset_inferred_unicode(zipfile=z)
# Number of distinct characters found in the names.
print(len(dataset.vocabulary))

29


In [9]:
# Split sample indices 90% / 10%, stratified on the language label so both parts keep the
# class proportions; the fixed random_state makes the split repeatable.
train_idx, validation_idx = train_test_split(np.arange(len(dataset)),
                                            test_size=0.1,
                                            random_state=999,
                                            shuffle=True,
                                            stratify=dataset.labels)

# Subset dataset for train and val
# (the held-out 10% is named `test_dataset` and is passed as `valid_loader` during training)
train_dataset = Subset(dataset, train_idx)
test_dataset = Subset(dataset, validation_idx)

In [10]:
# Mini-batch size shared by every DataLoader below and logged with the run parameters.
batch_size = 64

### collate_fn=pad_and_pack

In [11]:
def pad_and_pack(batch):
    """Collate variable-length samples into a packed batch.

    Args:
        batch: sequence of ``(x, y)`` pairs, where ``x`` is a tensor whose first
            dimension is the sequence length and ``y`` is its label.

    Returns:
        ``(x_packed, y_batched)``: a ``PackedSequence`` built from the zero-padded,
        time-major inputs, and a LongTensor holding the labels in batch order.
    """
    sequences = [x for x, _ in batch]
    targets = [y for _, y in batch]
    seq_lengths = [seq.shape[0] for seq in sequences]

    rnn_utils = torch.nn.utils.rnn
    x_padded = rnn_utils.pad_sequence(sequences, batch_first=False)
    # enforce_sorted=False: samples arrive in arbitrary length order.
    x_packed = rnn_utils.pack_padded_sequence(
        x_padded, seq_lengths, batch_first=False, enforce_sorted=False
    )
    return x_packed, torch.as_tensor(targets, dtype=torch.long)

In [12]:
# DataLoaders for the baseline run: both use the full-batch collate function.
# Only the training loader is shuffled.
train_loader_batch = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_and_pack)
test_loader_batch = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_and_pack)

### collate_fn=pad_and_pack_half

In [13]:
import random
def pad_and_pack_half(batch):
    """Collate a random half of ``batch`` into a packed batch.

    ``len(batch) // 2`` samples are drawn without replacement and the rest are
    discarded. At least one sample is always kept, so a trailing batch of size one
    does not collapse to an empty batch (which cannot be padded or packed).

    Args:
        batch: non-empty list of ``(x, y)`` pairs, where ``x`` is a tensor whose first
            dimension is the sequence length and ``y`` is its label.

    Returns:
        ``(x_packed, y_batched)``: a ``PackedSequence`` of the selected, zero-padded,
        time-major inputs and a LongTensor with the matching labels.

    Raises:
        ValueError: if ``batch`` is empty (nothing to sample from).
    """
    input_tensors = []
    labels = []
    lengths = []
    # Floor of half the batch, but never fewer than one item.
    n_keep = max(1, len(batch) // 2)
    selected_batch = random.sample(batch, n_keep)
    for x, y in selected_batch:
        input_tensors.append(x)
        labels.append(y)
        lengths.append(x.shape[0])
    x_padded = torch.nn.utils.rnn.pad_sequence(input_tensors, batch_first=False)
    # enforce_sorted=False: the sampled items are in arbitrary length order.
    x_packed = torch.nn.utils.rnn.pack_padded_sequence(x_padded, lengths, batch_first=False, enforce_sorted=False)
    y_batched = torch.as_tensor(labels, dtype=torch.long)
    return x_packed, y_batched

In [14]:
# DataLoaders for the half-batch run. Only the training loader drops half of each batch;
# the evaluation loader keeps the full-batch collate function.
train_loader_half = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_and_pack_half)
test_loader_half = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_and_pack)

## Models

In [15]:
class LasttimeStep(nn.Module):
    """Select the final hidden state of the last recurrent layer.

    Meant to sit directly after a recurrent layer inside ``nn.Sequential``: it takes
    that layer's ``(output, hidden)`` result and returns one vector per batch item.

    Args:
        rnn_layer: number of stacked recurrent layers in the preceding module.
        bidirectional: whether the preceding module is bidirectional.
    """

    def __init__(self, rnn_layer=1, bidirectional=False):
        super().__init__()
        self.rnn_layer = rnn_layer
        if bidirectional:
            self.num_bidirectional = 2
        else:
            self.num_bidirectional = 1

    def forward(self, input):
        """Return a ``(batch, num_directions * hidden)`` tensor.

        Args:
            input: ``(output, hidden)`` pair; ``output`` is ignored. ``hidden`` is the
                hidden-state tensor, or a tuple whose first element is that tensor
                (e.g. an LSTM's ``(h_n, c_n)``). The tensor is read as
                ``(rnn_layer * num_directions, batch, hidden)``.
        """
        last_step = input[1]
        if isinstance(last_step, tuple):
            # Tuple hidden state: keep only its first element.
            last_step = last_step[0]
        batch_size = last_step.shape[1]
        # Separate the layer and direction axes, then keep only the top layer.
        last_step = last_step.view(self.rnn_layer, self.num_bidirectional, batch_size, -1)
        last_step = last_step[-1]
        # (num_directions, batch, hidden) -> (batch, num_directions, hidden).
        # Without this, the reshape below would interleave hidden states belonging to
        # different batch items whenever there is more than one direction.
        last_step = last_step.permute(1, 0, 2)
        return last_step.reshape(batch_size, -1)

In [16]:
class EmbeddingPackable(nn.Module):
    """Wrap an embedding layer so that it also accepts ``PackedSequence`` input.

    Args:
        embed_layer: the module applied to the index tensor (e.g. ``nn.Embedding``).
    """

    def __init__(self, embed_layer):
        super().__init__()
        self.embed_layer = embed_layer

    def forward(self, input):
        """Embed ``input``.

        A ``PackedSequence`` is unpacked to a padded batch-first tensor, embedded, and
        packed again with the same lengths, so the result is a ``PackedSequence`` too.
        Any other input is passed straight to the wrapped layer.
        """
        # isinstance rather than an exact type comparison, so subclasses of
        # PackedSequence take the packed path as well.
        if isinstance(input, torch.nn.utils.rnn.PackedSequence):
            # Unpack on the CPU; the padded tensor is moved back to the device of the
            # packed data before it is embedded.
            sequences, lengths = torch.nn.utils.rnn.pad_packed_sequence(
                input.cpu(),
                batch_first=True
            )
            sequences = self.embed_layer(sequences.to(input.data.device))
            # Re-pack with the original lengths; embeddings computed at padded
            # positions are dropped by the packing.
            return torch.nn.utils.rnn.pack_padded_sequence(
                sequences, lengths.cpu(),
                batch_first=True, enforce_sorted=False
            )
        return self.embed_layer(input)

In [17]:
D = 64  # embedding dimension (nn.Embedding output size and RNN input size)
vocab_size = len(dataset.vocabulary)  # number of distinct characters in the dataset
hidden_nodes = 256  # RNN hidden-state size
classes = len(dataset.label_names)  # number of languages, i.e. output classes

# Character indices -> embeddings -> RNN -> final hidden state -> class scores.
# LasttimeStep() is used with its defaults (rnn_layer=1, bidirectional=False); the nn.RNN
# here sets neither num_layers nor bidirectional, so the two are presumably consistent.
model = nn.Sequential(
    EmbeddingPackable(nn.Embedding(vocab_size, D)),
    nn.RNN(D, hidden_size=hidden_nodes, batch_first=True),
    LasttimeStep(),
    nn.Linear(hidden_nodes, classes),
)

## Training

In [21]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
loss_func = nn.CrossEntropyLoss()
# Metrics passed to the training helper, keyed by display name.
score_funcs = {"Accuracy": accuracy_score_wrapper}
epochs = 20
# Run-level settings that the training loop logs to MLflow; the loss is recorded by its
# class name rather than as an object.
params = {
    'device': device,
    'loss_func': loss_func.__class__.__name__,
    'epochs': epochs,
    'batch_size': batch_size
    }

In [22]:
# Parallel tuples: position 0 describes the full-batch baseline, position 1 the half-batch
# variant. The training loop below walks them by index.
# NOTE(review): train_network_half_batch lives in utils and is not visible here; per the
# write-up at the end of the notebook it presumably iterates the loader twice per epoch —
# confirm against utils.
from utils import train_network_half_batch
train_loaders = (train_loader_batch, train_loader_half)
test_loaders = (test_loader_batch, test_loader_half)
train_modes = (train_network, train_network_half_batch)
name_experiment = ('train_batch', 'train_half_batch')

In [23]:
# Train the same architecture once per experiment, each as its own MLflow run.
for run_name, train_mode, train_loader, test_loader in zip(
    name_experiment, train_modes, train_loaders, test_loaders
):
    print(run_name)

    # Start every experiment from freshly reset weights and a new optimizer.
    model.apply(weight_reset)
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # Extend the shared run parameters with the optimizer settings and the vocabulary.
    params.update(optimizer=optimizer.defaults, vocabulary=dataset.vocabulary)

    # Write the model summary to disk so it can be attached to the run as an artifact.
    with open('model_summary.txt', 'w') as f:
        f.write(str(summary(model)))

    with mlflow.start_run(nested=True, run_name=run_name):
        mlflow.log_artifact('model_summary.txt')
        mlflow.log_params(params)

        # The result of the most recent run is kept in `batch_train`.
        batch_train = train_mode(
            model=model,
            optimizer=optimizer,
            loss_func=loss_func,
            train_loader=train_loader,
            valid_loader=test_loader,
            epochs=epochs,
            score_funcs=score_funcs,
            device=device,
        )

train_batch


Epoch: 100%|██████████| 20/20 [01:18<00:00,  3.91s/it]


train_half_batch


Epoch: 100%|██████████| 20/20 [02:34<00:00,  7.74s/it]


<img src="./images/E6_train_acc.png">

<img src="./images/E6_train_loss.png">

<img src="./images/E6_valid_acc.png">

<img src="./images/E6_valid_loss.png">

<img src="./images/E6_time_epoch.png">

<img src="./images/E6_valid_acc_time.png">

No, training with two epochs using a collate function that removes half the items from each batch (`pad_and_pack_half`) does not obtain the same results as training with one epoch of a normal collate function.

Why Not?

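Even though the modified training loop iterates through the data twice per epoch, and therefore processes roughly as many samples as one normal epoch, **the training process is fundamentally different** due to several factors. (Note that it does not actually see every sample: each pass keeps a random half of every batch, so only about 75% of the distinct samples are expected to be seen at least once per epoch, and some are seen twice.)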

- **Regularization Effect**: The `pad_and_pack_half` collate function introduces a regularization effect by using **smaller, random** subsets of the data in each batch. This **noise** helps prevent the model from **overfitting** to the training data, potentially leading to better **generalization** (improved validation accuracy). A normal collate function doesn't have this regularization effect.

- **Optimizer Updates**: The optimizer updates the model parameters after each batch. With `pad_and_pack_half` and the double iteration, **the optimizer takes twice as many steps per epoch**, each using gradients calculated on a smaller, randomized subset of the data. This alters the optimization trajectory compared to a normal collate function with fewer, larger updates.

- **Randomness**: The `random.sample` function in `pad_and_pack_half` introduces additional randomness. Each half-batch is a random subset of the original batch, leading to different gradient estimates and parameter updates compared to a normal collate function.

In summary: While both methods process roughly the same number of samples per epoch, the way the data is presented to the model (smaller, randomized batches vs. full batches) and the frequency of parameter updates lead to different training dynamics and, therefore, different results. The `pad_and_pack_half` strategy introduces a form of regularization that can improve generalization but also changes the optimization process.