In [1]:
%load_ext autoreload
%autoreload 2

# Exercise 6

<img src="./images/06.png" width=800>

In [None]:
import requests, zipfile, io
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import os
import mlflow
import torch.nn as nn
import torch.optim as optim
from utils import (train_network, roc_auc_score_micro_wrapper, 
                accuracy_score_wrapper,f1_score_wrapper,
                weight_reset, set_seed)
from torchinfo import summary
from  sklearn.model_selection import train_test_split
import numpy as np

  from tqdm.autonotebook import tqdm


In [None]:
# Store the tracking location in the environment, then read it back and hand
# it to MLflow. The value is a relative path, so it resolves against the
# notebook's working directory.
os.environ['MLFLOW_TRACKING_URI'] = './mlruns06_6'
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI'))

In [None]:
# Make 'Exercise06_6' the active MLflow experiment for the runs started later.
# NOTE(review): the saved output of this cell mentions 'Exercise_7' and a
# 'mlruns_7' directory, which do not match the names used here — it looks
# stale; re-run the cell to refresh it.
mlflow.set_experiment('Exercise06_6')

2025/06/07 09:32:51 INFO mlflow.tracking.fluent: Experiment with name 'Exercise_7' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_04/mlruns_7/451609404090313110', creation_time=1749276171051, experiment_id='451609404090313110', last_update_time=1749276171051, lifecycle_stage='active', name='Exercise_7', tags={}>

In [None]:
# Download the PyTorch tutorial archive and keep it open in memory as `z`;
# the dataset is later constructed directly from this ZipFile object.
zip_file_url =  "https://download.pytorch.org/tutorial/data.zip"
r = requests.get(zip_file_url)
# Fail fast on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a misleading BadZipFile exception.
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))
# Also unpack the archive into the current working directory.
z.extractall()
# Offline alternative: open a previously downloaded copy instead of fetching.
# z = zipfile.ZipFile('./data.zip')
# z.extractall()

## Datasets and DataLoaders

In [None]:
namge_language_data = {}

# Helpers for stripping accents so names can be handled as plain ASCII,
# e.g. "Ślusàrski" becomes "Slusarski".
import unicodedata
import string

# Characters that survive the conversion: ASCII letters plus a little punctuation.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# Map every allowed character to its position inside `all_letters`.
alphabet = {letter: position for position, letter in enumerate(all_letters)}

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with accents removed and any character outside `all_letters` dropped.

    The string is NFD-normalized so that accented characters split into a base
    character plus combining marks; combining marks (Unicode category 'Mn') are
    then discarded, as is anything not listed in `all_letters`.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [
        c for c in decomposed
        if unicodedata.category(c) != 'Mn' and c in all_letters
    ]
    return ''.join(kept)

In [None]:
class LanguageNameDataset_inferred_unicode(Dataset):
    """Dataset of (name, language) pairs read from a zip archive of name lists.

    Every archive member whose path contains ``data/names/`` and ends in
    ``.txt`` is treated as one language; each line of the file is one name.
    Names are lower-cased, and unless ``unicode`` is true they are first
    passed through ``unicodeToAscii``.

    Args:
        zipfile: An open ``zipfile.ZipFile``-like object (needs ``namelist()``
            and ``open()``).
        vocabulary: Optional mapping from character to integer id. When
            ``None`` the vocabulary is inferred from the characters that
            occur in the loaded names.
        unicode: If true, keep the names' original characters; otherwise
            convert them with ``unicodeToAscii``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, zipfile, vocabulary=None, unicode=False):
        # language -> list of names (attribute name kept as-is for compatibility)
        self.namge_language_data = {}
        self.unicode_or_not(z=zipfile, unicode=unicode)
        # Class names, in the order the languages were found in the archive.
        self.label_names = list(self.namge_language_data.keys())
        # Flat, parallel lists: data[i] is a name, labels[i] its language index.
        self.data = []
        self.labels = []
        for y, language in enumerate(self.label_names):
            names = self.namge_language_data[language]
            self.data.extend(names)
            self.labels.extend([y] * len(names))
        if vocabulary is None:
            # Sort the character set before numbering it. Enumerating the raw
            # set would assign ids in hash order, which changes between
            # interpreter runs and makes trained models / logged vocabularies
            # impossible to reproduce.
            characters = sorted({char for name in self.data for char in name})
            vocabulary = {char: index for index, char in enumerate(characters)}
        self.vocabulary = vocabulary

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.data)

    def string2inputvector(self, input_string):
        """Encode a string as a 1-D ``torch.long`` tensor of vocabulary ids.

        Raises:
            KeyError: if a character is not present in ``self.vocabulary``.
        """
        return torch.tensor(
            [self.vocabulary[character] for character in input_string],
            dtype=torch.long,
        )

    def unicode_or_not(self, z, unicode=False):
        """Fill ``self.namge_language_data`` from the archive ``z``.

        Args:
            z: Archive object providing ``namelist()`` and ``open()``.
            unicode: If false, each line is converted with ``unicodeToAscii``
                before being lower-cased.
        """
        for zip_path in z.namelist():
            if "data/names/" in zip_path and zip_path.endswith(".txt"):
                # Language name = path without the directory prefix and the
                # extension. NOTE(review): the slice assumes the path *starts*
                # with "data/names/" although the test above only checks that
                # it contains it — confirm against the archive layout.
                lang = zip_path[len("data/names/"):-len(".txt")]
                with z.open(zip_path) as myfile:
                    lines = str(myfile.read(), encoding='utf-8').strip().split("\n")
                if not unicode:
                    lines = [unicodeToAscii(line) for line in lines]
                self.namge_language_data[lang] = [line.lower() for line in lines]

    def __getitem__(self, index):
        """Return ``(encoded_name, label)`` for the sample at ``index``.

        ``encoded_name`` is the output of ``string2inputvector``; ``label`` is
        a 0-dimensional ``torch.long`` tensor holding the language index.
        """
        name = self.data[index]
        label_tensor = torch.tensor(self.labels[index], dtype=torch.long)
        return self.string2inputvector(name), label_tensor

In [None]:
# Build the dataset from the downloaded archive with the default settings
# (unicode=False, vocabulary inferred from the data), then show how many
# distinct characters the inferred vocabulary contains.
dataset = LanguageNameDataset_inferred_unicode(zipfile=z)
print(len(dataset.vocabulary))

29


In [None]:
# Split the sample indices 90/10, shuffled with a fixed random_state and
# stratified on the language labels.
train_idx, validation_idx = train_test_split(np.arange(len(dataset)),
                                            test_size=0.1,
                                            random_state=999,
                                            shuffle=True,
                                            stratify=dataset.labels)

# Wrap the index arrays as Subset views of the full dataset.
# Note: the held-out indices are called `validation_idx` but the resulting
# subset is named `test_dataset`; both names refer to the same 10% split.
train_dataset = Subset(dataset, train_idx)
test_dataset = Subset(dataset, validation_idx)

In [None]:
def pad_and_pack(batch):
    """Collate variable-length samples into a packed batch.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs, where each sequence is
            a tensor whose first dimension is its length.

    Returns:
        A tuple ``(x_packed, y_batched)``: the sequences padded (time-major)
        and packed into a ``PackedSequence`` without requiring the batch to be
        sorted by length, and the labels as a ``torch.long`` tensor.
    """
    rnn_utils = torch.nn.utils.rnn
    sequences = [sample for sample, _ in batch]
    targets = [target for _, target in batch]
    seq_lengths = [sample.shape[0] for sample in sequences]

    padded = rnn_utils.pad_sequence(sequences, batch_first=False)
    packed = rnn_utils.pack_padded_sequence(
        padded, seq_lengths, batch_first=False, enforce_sorted=False
    )
    return packed, torch.as_tensor(targets, dtype=torch.long)

In [None]:
# One sample per batch. Both loaders use pad_and_pack as the collate function;
# only the training loader shuffles.
batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_and_pack)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_and_pack)

## Models

In [None]:
class LasttimeStep(nn.Module):
    """Extract the final hidden state of the last recurrent layer.

    Intended to sit directly after an RNN/LSTM/GRU inside ``nn.Sequential``:
    it takes the module's ``(output, h_n)`` (or ``(output, (h_n, c_n))``)
    tuple and returns a 2-D tensor of shape ``(batch, directions * hidden)``.

    Args:
        rnn_layer: Number of stacked layers in the preceding recurrent module.
        bidirectional: Whether the preceding recurrent module is bidirectional.
    """

    def __init__(self, rnn_layer=1, bidirectional=False):
        super().__init__()
        self.rnn_layer = rnn_layer
        self.num_bidirectional = 2 if bidirectional else 1

    def forward(self, input):
        """Return the last layer's final hidden state, one row per batch item.

        Args:
            input: Tuple whose second element is either the hidden-state
                tensor ``h_n`` of shape ``(layers * directions, batch, hidden)``
                or a tuple whose first element is that tensor (LSTM style).
        """
        last_step = input[1]
        # An LSTM returns (h_n, c_n); only the hidden state h_n is used.
        if isinstance(last_step, tuple):
            last_step = last_step[0]
        batch_size = last_step.shape[1]
        # Separate the fused (layers * directions) dimension.
        last_step = last_step.view(self.rnn_layer, self.num_bidirectional, batch_size, -1)
        # Keep only the top layer: (directions, batch, hidden).
        last_step = last_step[-1]
        # Move batch first BEFORE flattening. Without this, reshaping
        # (directions, batch, hidden) straight to (batch, -1) interleaves the
        # states of different samples whenever directions > 1 and batch > 1.
        last_step = last_step.permute(1, 0, 2)
        return last_step.reshape(batch_size, -1)

In [None]:
class EmbeddingPackable(nn.Module):
    """Wrap an embedding layer so it also accepts ``PackedSequence`` input.

    Plain tensors are forwarded to the wrapped layer unchanged. A packed
    sequence is unpacked to a padded batch, embedded, and packed again.

    Args:
        embed_layer: The module applied to the (padded) index tensor.
    """

    def __init__(self, embed_layer):
        super().__init__()
        self.embed_layer = embed_layer

    def forward(self, input):
        """Embed ``input``; returns a ``PackedSequence`` iff one was given."""
        rnn_utils = torch.nn.utils.rnn

        # Ordinary tensor input: nothing special to do.
        if type(input) != rnn_utils.PackedSequence:
            return self.embed_layer(input)

        # Remember where the data lives; unpacking is done on a CPU copy and
        # the padded indices are moved back before the embedding lookup.
        target_device = input.data.device
        padded, seq_lengths = rnn_utils.pad_packed_sequence(
            input.cpu(),
            batch_first=True,
        )
        embedded = self.embed_layer(padded.to(target_device))
        return rnn_utils.pack_padded_sequence(
            embedded,
            seq_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

In [None]:
# Model hyperparameters shared by the networks built below.
D = 64  # embedding dimension (width of nn.Embedding and LSTM input)
vocab_size = len(dataset.vocabulary)  # number of distinct characters
hidden_nodes = 256  # LSTM hidden size
classes = len(dataset.label_names)  # number of languages to predict

def lstm_3layer(bidirectional: bool):
    """Build the character-level classifier: embedding -> 3-layer LSTM -> linear.

    Args:
        bidirectional: Whether the LSTM runs in both directions. When true the
            final linear layer's input width is doubled accordingly.

    Returns:
        An ``nn.Sequential`` mapping a packed batch of character ids to
        per-class scores of width ``classes``.
    """
    num_layers = 3
    # 1 for a unidirectional LSTM, 2 for a bidirectional one.
    num_directions = bidirectional + 1

    embedding = EmbeddingPackable(nn.Embedding(vocab_size, D))
    recurrent = nn.LSTM(
        D,
        hidden_size=hidden_nodes,
        batch_first=True,
        num_layers=num_layers,
        bidirectional=bidirectional,
    )
    final_state = LasttimeStep(rnn_layer=num_layers, bidirectional=bidirectional)
    classifier = nn.Linear(hidden_nodes * num_directions, classes)
    return nn.Sequential(embedding, recurrent, final_state, classifier)

## Training

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
loss_func = nn.CrossEntropyLoss()
# Metrics handed to train_network, keyed by display name.
score_funcs = {"Accuracy": accuracy_score_wrapper}
epochs = 10
# Run settings that are logged to MLflow by the training cell; the loss is
# recorded by class name rather than as an object.
params = {
    'device': device,
    'loss_func': loss_func.__class__.__name__,
    'epochs': epochs,
    'batch_size': batch_size
    }

In [None]:
# Train the 3-layer LSTM twice — unidirectional (0) and bidirectional (1) —
# logging each variant as its own MLflow run.
# NOTE(review): no random seed is set in this notebook although `set_seed` is
# imported; results may differ between executions — confirm whether intended.
for bidirectional in [0, 1]:
    model = lstm_3layer(bool(bidirectional))
    # Gradient clipping: every parameter's gradient is clamped elementwise to
    # [-5, 5] as it is computed during the backward pass.
    for p in model.parameters():
        p.register_hook(lambda grad: torch.clamp(grad, -5, 5))
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    # `params` is the shared module-level dict; these keys are overwritten on
    # each iteration before being logged.
    params['optimizer'] = optimizer.defaults
    params['vocabulary'] = dataset.vocabulary
    # Write the torchinfo summary to a file in the working directory so it can
    # be attached to the run as an artifact (the file is overwritten per variant).
    with open('model_summary.txt', 'w') as f:
        f.write(str(summary(model)))
    # NOTE(review): nested=True is passed but no enclosing parent run is
    # started in the visible notebook — confirm whether nesting is intended.
    with mlflow.start_run(nested=True, run_name=f'bidirectional={bool(bidirectional)}'):
        mlflow.log_artifact('model_summary.txt')
        mlflow.log_params(params)
    
        # train_network comes from the local `utils` module; presumably it
        # trains for `epochs` epochs and evaluates on `valid_loader` — verify
        # against its definition. The returned value is kept in `batch_train`
        # but not used further in this notebook.
        batch_train = train_network(
            model=model,
            optimizer=optimizer,
            loss_func=loss_func,
            train_loader=train_loader,
            valid_loader=test_loader,
            epochs=epochs,
            score_funcs=score_funcs,
            device=device,
        )

<img src="./images/E6_train_acc.png">

<img src="./images/E6_train_loss.png">

<img src="./images/E6_valid_acc.png">

<img src="./images/E6_valid_loss.png">

<img src="./images/E6_time.png">