In [1]:
%load_ext autoreload
%autoreload 2

# Exercise 3

<img src="./images/03.png" width=800>

In [2]:
import requests, zipfile, io
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import os
import mlflow
import torch.nn as nn
import torch.optim as optim
from utils import (train_network, roc_auc_score_micro_wrapper, 
                accuracy_score_wrapper,f1_score_wrapper,
                weight_reset, set_seed)
from torchinfo import summary

  from tqdm.autonotebook import tqdm


In [None]:
# Point MLflow at a local tracking directory, and export the same path
# through the MLFLOW_TRACKING_URI environment variable.
os.environ.update({'MLFLOW_TRACKING_URI': './mlruns04_3'})
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

In [4]:
# Select the experiment that the runs below are recorded under. As the last
# expression of the cell, the returned Experiment object is displayed.
mlflow.set_experiment(experiment_name='Exercise_3')

<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_04/mlruns_3/807985352384486332', creation_time=1749204861344, experiment_id='807985352384486332', last_update_time=1749204861344, lifecycle_stage='active', name='Exercise_3', tags={}>

In [5]:
# Download the names archive, keep it open in memory as `z` (the dataset code
# below reads from it), and extract its contents into the working directory.
zip_file_url = "https://download.pytorch.org/tutorial/data.zip"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP failure here instead of letting an error
# page reach ZipFile and fail as a confusing BadZipFile.
r = requests.get(zip_file_url, timeout=60)
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
# Offline alternative: open a previously downloaded archive instead.
# z = zipfile.ZipFile('./data.zip')
# z.extractall()

## Datasets and DataLoaders

In [6]:
# Module-level store mapping a language name to its list of names.
namge_language_data = {}

# Accents are stripped so that names can be handled with a small ASCII
# alphabet, e.g. "Ślusàrski" is converted to "Slusarski".
import unicodedata
import string

# Characters that survive the ASCII conversion: letters plus a little punctuation.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# Each allowed character mapped to its position inside `all_letters`.
alphabet = {letter: position for position, letter in enumerate(all_letters)}
    
# Unicode -> plain ASCII conversion, based on https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` reduced to the characters listed in `all_letters`.

    The string is NFD-normalized so that accented letters split into a base
    letter plus combining marks; the combining marks (category 'Mn') are
    dropped, and so is every remaining character that is not in the
    module-level `all_letters`.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

def unicode_or_not(unicode=False):
    """Fill the module-level `namge_language_data` from the open archive `z`.

    Every archive entry whose path contains "data/names/" and ends in ".txt"
    is read as UTF-8 and split into lines; the language name is the part of
    the path between that prefix and the extension. With `unicode=True` the
    names are only lower-cased; otherwise they go through `unicodeToAscii`
    first. The language and its number of names are printed for each entry.
    """
    prefix, suffix = "data/names/", ".txt"
    for entry in z.namelist():
        if prefix not in entry or not entry.endswith(suffix):
            continue
        lang = entry[len(prefix):-len(suffix)]
        with z.open(entry) as handle:
            raw_lines = str(handle.read(), encoding='utf-8').strip().split("\n")
            if unicode:
                lang_names = [line.lower() for line in raw_lines]
            else:
                lang_names = [unicodeToAscii(line).lower() for line in raw_lines]
            namge_language_data[lang] = lang_names
        print(lang, ": ", len(lang_names))  # one summary line per language

In [7]:
from collections import defaultdict
class LanguageNameDataset(Dataset):
    """Character-level dataset of names labelled by language.

    Names are read from every ``data/names/<language>.txt`` entry of an open
    zip archive. Each item is a pair ``(indices, label)``: a 1-D ``torch.long``
    tensor with one vocabulary index per character of the name, and a scalar
    ``torch.long`` tensor holding the position of the language in
    ``label_names``.

    Args:
        zipfile: Open archive object providing ``namelist()`` and ``open()``.
        vocabulary: Optional ready-made ``{character: index}`` mapping. When
            omitted, a vocabulary is built from the loaded names.
        unicode: If True, names are only lower-cased; otherwise they are first
            passed through the module-level ``unicodeToAscii``.
        min_count: Minimum number of occurrences a character needs in order to
            get its own index when the vocabulary is built. Characters seen
            fewer than ``min_count`` times share the single ``"UNK"`` entry,
            so the default of 1 keeps every character.

    Attributes:
        label_names: Language names, in archive order; the label is the index.
        data / labels: Parallel lists of names and integer labels.
        vocabulary_count: Number of occurrences of each character in ``data``.
        vocabulary: The ``{character: index}`` mapping in use.
        vocabulary_unk_detected: ``{character: count}`` for the characters of
            ``data`` that have no index of their own in ``vocabulary``.
    """

    def __init__(self, zipfile, vocabulary=None, unicode=False, min_count=1):
        self.namge_language_data = {}
        self.unicode_or_not(z=zipfile, unicode=unicode)
        self.label_names = list(self.namge_language_data)
        self.min_count = min_count

        self.data = []
        self.labels = []
        for y, language in enumerate(self.label_names):
            for sample in self.namge_language_data[language]:
                self.data.append(sample)
                self.labels.append(y)

        # Counted unconditionally so that `vocabulary_count` and
        # `vocabulary_unk_detected` exist even when a vocabulary is supplied.
        self.vocabulary_count = defaultdict(int)
        for name in self.data:
            for char in name:
                self.vocabulary_count[char] += 1

        if vocabulary is None:
            # `>=`: a character occurring exactly `min_count` times is kept.
            kept = [
                char for char, count in self.vocabulary_count.items()
                if count >= self.min_count
            ]
            vocabulary = {char: idx for idx, char in enumerate(kept)}
            # "UNK" is only reserved when at least one character was filtered out.
            if len(kept) < len(self.vocabulary_count):
                vocabulary["UNK"] = len(kept)
        self.vocabulary = vocabulary
        self.vocabulary_unk_detected = {
            char: count for char, count in self.vocabulary_count.items()
            if char not in vocabulary
        }

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.data)

    def string2inputvector(self, input_string):
        """Encode a string as a 1-D ``torch.long`` tensor of vocabulary indices.

        Characters missing from the vocabulary are encoded with the ``"UNK"``
        index.

        Raises:
            KeyError: If a character is missing and the vocabulary has no
                ``"UNK"`` entry. Falling back to index 0 would silently alias
                the character with whichever real character owns that index.
        """
        unk_index = self.vocabulary.get("UNK")
        indices = []
        for character in input_string:
            index = self.vocabulary.get(character, unk_index)
            if index is None:
                raise KeyError(
                    f"character {character!r} is not in the vocabulary "
                    "and the vocabulary has no 'UNK' entry"
                )
            indices.append(index)
        return torch.tensor(indices, dtype=torch.long)

    def unicode_or_not(self, z, unicode=False):
        """Load the names of every language in archive `z` into ``self.namge_language_data``.

        An entry is used when its path contains "data/names/" and ends with
        ".txt"; the language name is the text between that prefix and the
        extension. Files are decoded as UTF-8 and split on newlines.
        """
        for zip_path in z.namelist():
            if "data/names/" in zip_path and zip_path.endswith(".txt"):
                lang = zip_path[len("data/names/"):-len(".txt")]
                with z.open(zip_path) as myfile:
                    lines = str(myfile.read(), encoding='utf-8').strip().split("\n")
                if unicode:
                    lang_names = [line.lower() for line in lines]
                else:
                    lang_names = [unicodeToAscii(line).lower() for line in lines]
                self.namge_language_data[lang] = lang_names

    def __getitem__(self, index):
        """Return ``(character index tensor, label tensor)`` for the name at `index`."""
        name = self.data[index]
        label = self.labels[index]

        label_tensor = torch.tensor(label, dtype=torch.long)
        return self.string2inputvector(name), label_tensor

In [8]:
def pad_and_pack(batch):
    """Collate ``(sequence, label)`` pairs into a packed batch.

    The sequences are padded to a common length in time-major layout
    (``batch_first=False``) and packed with their true lengths, without
    requiring the batch to be sorted by length. The labels are returned as a
    single ``torch.long`` tensor.

    Returns:
        tuple: ``(PackedSequence, label tensor)``.
    """
    sequences, targets = [], []
    for sequence, target in batch:
        sequences.append(sequence)
        targets.append(target)
    seq_lengths = [sequence.shape[0] for sequence in sequences]

    rnn_utils = torch.nn.utils.rnn
    padded = rnn_utils.pad_sequence(sequences, batch_first=False)
    packed = rnn_utils.pack_padded_sequence(
        padded, seq_lengths, batch_first=False, enforce_sorted=False
    )
    return packed, torch.as_tensor(targets, dtype=torch.long)

## Model

In [10]:
class LasttimeStep(nn.Module):
    """Extract the final hidden state of the last recurrent layer.

    Meant to follow a recurrent layer inside ``nn.Sequential``: it receives
    that layer's ``(output, hidden)`` tuple and returns one row per batch
    element, of shape ``(batch, num_directions * hidden_size)``.

    Args:
        rnn_layer: Number of stacked layers in the preceding recurrent module.
        bidirectional: Whether the preceding recurrent module is bidirectional.
    """

    def __init__(self, rnn_layer=1, bidirectional=False):
        super().__init__()
        self.rnn_layer = rnn_layer
        self.num_bidirectional = 2 if bidirectional else 1

    def forward(self, input):
        # input[0] is the per-step output (unused); input[1] is the final hidden state.
        last_step = input[1]
        # A tuple here is unpacked to its first element — presumably the LSTM
        # (h_n, c_n) pair, of which only h_n is wanted.
        if isinstance(last_step, tuple):
            last_step = last_step[0]
        batch_size = last_step.shape[1]
        # (layers * directions, B, H) -> (layers, directions, B, H)
        last_step = last_step.reshape(
            self.rnn_layer, self.num_bidirectional, batch_size, -1
        )
        last_step = last_step[-1]  # last layer: (directions, B, H)
        # Bring the batch axis to the front before flattening. Reshaping
        # (directions, B, H) straight to (B, -1) would mix hidden states from
        # different batch elements whenever directions == 2.
        last_step = last_step.permute(1, 0, 2)
        return last_step.reshape(batch_size, -1)

In [9]:
class EmbeddingPackable(nn.Module):
    """Wrap an embedding layer so that it also accepts a ``PackedSequence``.

    A packed input is unpacked into a padded batch-first tensor, embedded, and
    packed again; any other input is passed straight to the wrapped layer.
    """

    def __init__(self, embed_layer):
        super().__init__()
        self.embed_layer = embed_layer

    def forward(self, input):
        rnn_utils = torch.nn.utils.rnn
        if type(input) != rnn_utils.PackedSequence:
            return self.embed_layer(input)

        # Unpack on the CPU, then move the padded indices back to the device
        # the packed data came from before embedding them.
        target_device = input.data.device
        padded, lengths = rnn_utils.pad_packed_sequence(input.cpu(), batch_first=True)
        embedded = self.embed_layer(padded.to(target_device))
        return rnn_utils.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

## Training

In [11]:
# Settings shared by every run of the experiment; `params` collects the
# values that are later logged to MLflow.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_func = nn.CrossEntropyLoss()
score_funcs = dict(Accuracy=accuracy_score_wrapper)
epochs = 20
params = dict(
    device=device,
    loss_func=type(loss_func).__name__,  # log the class name, not the module object
    epochs=epochs,
)

In [12]:
# Train one RNN classifier per `min_count` setting and log each as an MLflow run.
D = 64               # embedding dimension
hidden_nodes = 256   # RNN hidden size
batch_size = 256
SEED = 42            # fixed so that runs differ only in `min_count`
TEST_SIZE = 300      # number of names held out for validation
for min_count in [1, 5, 10, 100, 300]:
    print(f'min_count: {min_count}')
    # Same seed for every setting: model initialisation and batch shuffling
    # start from the same random state.
    torch.manual_seed(SEED)
    dataset = LanguageNameDataset(zipfile=z, min_count=min_count)

    # An explicitly seeded generator gives every `min_count` run the same
    # train/validation split, so validation scores are comparable across runs.
    train_data, test_data = random_split(
        dataset,
        (len(dataset) - TEST_SIZE, TEST_SIZE),
        generator=torch.Generator().manual_seed(SEED),
    )
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=pad_and_pack)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=pad_and_pack)
    vocab_size = len(dataset.vocabulary)
    classes = len(dataset.label_names)
    model = nn.Sequential(
        EmbeddingPackable(nn.Embedding(vocab_size, D)),
        nn.RNN(D, hidden_size=hidden_nodes, batch_first=True),
        LasttimeStep(),
        nn.Linear(hidden_nodes, classes),
    )

    optimizer = optim.SGD(model.parameters(), lr=0.001)
    params['optimizer'] = optimizer.defaults
    params['batch_size'] = batch_size
    params['min_count'] = min_count
    params['seed'] = SEED
    params['vocabulary'] = dataset.vocabulary
    params['vocabulary_unk_detected'] = dataset.vocabulary_unk_detected
    params['number_count_under_min_count'] = len(dataset.vocabulary_unk_detected)
    # The summary is written to disk first because MLflow logs artifacts from files.
    with open('model_summary.txt', 'w') as f:
        f.write(str(summary(model)))
    with mlflow.start_run(nested=True, run_name=f'min_count= {min_count}'):
        mlflow.log_artifact('model_summary.txt')
        mlflow.log_params(params)

        results = train_network(
            model=model,
            optimizer=optimizer,
            loss_func=loss_func,
            train_loader=train_loader,
            valid_loader=test_loader,
            epochs=epochs,
            score_funcs=score_funcs,
            device=device,
        )

min_count: 1


Epoch: 100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


min_count: 5


Epoch: 100%|██████████| 20/20 [01:14<00:00,  3.74s/it]


min_count: 10


Epoch: 100%|██████████| 20/20 [01:15<00:00,  3.80s/it]


min_count: 100


Epoch: 100%|██████████| 20/20 [01:23<00:00,  4.18s/it]


min_count: 300


Epoch: 100%|██████████| 20/20 [01:17<00:00,  3.85s/it]


<img src="./images/E3_train_acc.png">

<img src="./images/E3_train_loss.png">

<img src="./images/E3_valid_acc.png">

<img src="./images/E3_valid_loss.png">

<img src="./images/E3_time.png">

<img src="./images/E3_valid_acc_time.png">

**Results and Downstream Effects:**

- Model Input Representation: Since many characters will be replaced with "UNK", the model will receive less fine-grained input data. This can hurt performance if rare characters carry important distinguishing information.
- Generalization vs. Specificity: On the other hand, filtering out very rare characters can help in reducing noise and possibly overfitting, but only when the loss in information is acceptable.
- Error Propagation: In languages or names where rare characters play a key role, the higher min_count threshold may cause misinterpretations by the model since different rare characters are now conflated into one token.

Essentially, choosing `min_count=300` is a trade-off. It shrinks the vocabulary, and with it the input space the model has to learn, but it also discards detail: names that differ only in their less frequent characters become indistinguishable to the model.