In [None]:
%load_ext autoreload
%autoreload 2

# Exercise 1

<img src="./images/01.png" width=800>

In [1]:
import requests, zipfile, io
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import os
import mlflow
import torch.nn as nn
import torch.optim as optim
from utils import (train_network, roc_auc_score_micro_wrapper, 
                accuracy_score_wrapper,f1_score_wrapper,
                weight_reset, set_seed)
from torchinfo import summary

  from tqdm.autonotebook import tqdm


In [None]:
# Keep this exercise's runs in a dedicated local MLflow store. The location is
# written to the environment first and then read back from it, so the
# environment variable and the active tracking URI always agree.
os.environ['MLFLOW_TRACKING_URI'] = './mlruns04_1'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

In [3]:
# Select 'Exercise_1' as the active MLflow experiment; the returned Experiment
# object is the cell's last expression and is therefore displayed.
mlflow.set_experiment('Exercise_1')

2025/06/03 16:12:37 INFO mlflow.tracking.fluent: Experiment with name 'Exercise_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_04/mlruns_1/369693178547761732', creation_time=1748954557678, experiment_id='369693178547761732', last_update_time=1748954557678, lifecycle_stage='active', name='Exercise_1', tags={}>

In [4]:
# Download the PyTorch tutorial data archive and unpack it into the current
# working directory. The open archive is kept in `z` because the name files
# are read directly from it further down.
zip_file_url = "https://download.pytorch.org/tutorial/data.zip"
# A timeout prevents the cell from hanging forever on a stalled connection.
r = requests.get(zip_file_url, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
r.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
# Offline alternative: open a previously downloaded copy instead.
# z = zipfile.ZipFile('./data.zip')
# z.extractall()

### Dataset and DataLoader

In [5]:
namge_language_data = {}

# Names are reduced to plain ASCII before use (e.g. "Ślusàrski" becomes
# "Slusarski"), which keeps the set of possible characters small and fixed.
import unicodedata
import string

# The permitted characters: ASCII letters plus a handful of punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# Lookup table from each permitted character to its position in `all_letters`.
alphabet = {letter: position for position, letter in enumerate(all_letters)}
    
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` reduced to the characters contained in `all_letters`.

    The string is NFD-normalised, which splits accented letters into a base
    letter followed by combining marks. Combining marks (Unicode category
    'Mn') and every character that is not in `all_letters` are then dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent/diacritic: discard, keep only the base letter.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# Walk the archive and, for every per-language name file, read its names (one
# per line), normalise them to lowercase ASCII and store them under the
# language name taken from the file name.
names_prefix, names_suffix = "data/names/", ".txt"
for zip_path in z.namelist():
    # The prefix must sit at the very start of the path: the slice below strips
    # exactly len(names_prefix) leading characters, so a mere substring match
    # elsewhere in the path would yield a mangled language name.
    if not (zip_path.startswith(names_prefix) and zip_path.endswith(names_suffix)):
        continue
    lang = zip_path[len(names_prefix):-len(names_suffix)]
    with z.open(zip_path) as myfile:
        raw_lines = myfile.read().decode('utf-8').strip().split("\n")
    lang_names = [unicodeToAscii(line).lower() for line in raw_lines]
    namge_language_data[lang] = lang_names
    print(lang, ": ", len(lang_names)) #Print out the name of each language too. 

Arabic :  2000
Chinese :  268
Czech :  519
Dutch :  297
English :  3668
French :  277
German :  724
Greek :  203
Irish :  232
Italian :  709
Japanese :  991
Korean :  94
Polish :  139
Portuguese :  74
Russian :  9408
Scottish :  100
Spanish :  298
Vietnamese :  73


In [6]:
class LanguageNameDataset_inferred(Dataset):
    """Dataset of (name, language) pairs encoded as integer tensors.

    Args:
        lang_name_dict: mapping from language name to a list of name strings.
            The position of each key in the mapping becomes its class label.
        vocabulary: optional mapping from character to integer index. When
            omitted, it is inferred from the characters present in the data.

    Attributes:
        label_names: language names; the list index is the class label.
        data: all names, flattened across languages.
        labels: class label of the name at the same position in `data`.
        vocabulary: the character -> index mapping used for encoding.
    """

    def __init__(self, lang_name_dict, vocabulary=None):
        self.label_names = list(lang_name_dict.keys())
        self.data = []
        self.labels = []
        for y, language in enumerate(self.label_names):
            for sample in lang_name_dict[language]:
                self.data.append(sample)
                self.labels.append(y)
        if vocabulary is None:
            # Sort the characters before numbering them: iterating a set of
            # strings has no stable order across interpreter runs, so without
            # sorting the character -> index mapping would change from one
            # kernel session to the next and runs would not be reproducible.
            characters = sorted({char for name in self.data for char in name})
            vocabulary = {char: index for index, char in enumerate(characters)}
        self.vocabulary = vocabulary

    def __len__(self):
        """Return the total number of names over all languages."""
        return len(self.data)

    def string2inputvector(self, input_string):
        """Encode a string as a 1-D long tensor of vocabulary indices.

        Raises:
            KeyError: if a character of `input_string` is not in the vocabulary.
        """
        return torch.tensor(
            [self.vocabulary[character] for character in input_string],
            dtype=torch.long,
        )

    def __getitem__(self, index):
        """Return `(encoded_name, label)` for the sample at `index`.

        `encoded_name` has one entry per character of the name; `label` is a
        scalar long tensor holding the language's class index.
        """
        name = self.data[index]
        label = self.labels[index]

        label_tensor = torch.tensor(label, dtype=torch.long)
        return self.string2inputvector(name), label_tensor

In [7]:
# Two encodings of the same names: `dataset` uses the fixed, predefined
# `alphabet`; `dataset_inferred` lets the dataset infer its vocabulary from
# the characters that actually occur in the data.
dataset = LanguageNameDataset_inferred(namge_language_data, vocabulary=alphabet)
dataset_inferred = LanguageNameDataset_inferred(namge_language_data, vocabulary=None)

In [8]:
# Compare vocabulary sizes: the first number (printed) belongs to the dataset
# built with the predefined alphabet, the second (the cell's displayed value)
# to the dataset whose vocabulary was inferred from the data.
print(len(dataset.vocabulary))
len(dataset_inferred.vocabulary)

57


29

In [9]:
# Hold out 300 names for testing. The split uses an explicitly seeded
# generator so the same samples end up in the test set on every run.
train_data, test_data = random_split(
    dataset,
    (len(dataset) - 300, 300),
    generator=torch.Generator().manual_seed(42),
)
# batch_size=1: each name is encoded as a tensor as long as the name itself
# and no padding/collate function is supplied, so samples of different
# lengths cannot be stacked into a larger batch.
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [10]:
# Hold out 300 names of the inferred-vocabulary dataset for testing. The split
# uses an explicitly seeded generator so it is reproducible on every run.
train_data_inferred, test_data_inferred = random_split(
    dataset_inferred,
    (len(dataset_inferred) - 300, 300),
    generator=torch.Generator().manual_seed(42),
)
# batch_size=1: each name is encoded as a tensor as long as the name itself
# and no padding/collate function is supplied, so samples of different
# lengths cannot be stacked into a larger batch.
train_loader_inferred = DataLoader(train_data_inferred, batch_size=1, shuffle=True)
test_loader_inferred = DataLoader(test_data_inferred, batch_size=1, shuffle=False)

### Model

In [11]:
class LasttimeStep(nn.Module):
    """Extract the final hidden state of the last recurrent layer.

    Intended to follow a recurrent layer inside ``nn.Sequential``: it receives
    the recurrent layer's ``(output, h_n)`` pair and returns one row per
    sample, so a ``nn.Linear`` can be applied next.

    Args:
        rnn_layer: number of stacked layers in the preceding recurrent module.
        bidirectional: whether the preceding recurrent module is bidirectional.
    """

    def __init__(self, rnn_layer=1, bidirectional=False):
        super().__init__()
        self.rnn_layer = rnn_layer
        self.num_bidirectional = 2 if bidirectional else 1

    def forward(self, input):
        """Return a `(batch, directions * hidden)` tensor from `(output, h_n)`.

        If the second element is itself a tuple (e.g. the `(h_n, c_n)` pair an
        LSTM returns), its first entry is used as the hidden state.
        """
        last_step = input[1]
        if isinstance(last_step, tuple):
            last_step = last_step[0]
        batch_size = last_step.shape[1]
        # (layers * directions, batch, hidden) -> (layers, directions, batch, hidden)
        last_step = last_step.reshape(self.rnn_layer, self.num_bidirectional, batch_size, -1)
        # Keep only the last stacked layer: (directions, batch, hidden).
        last_step = last_step[-1]
        # Bring the batch dimension to the front BEFORE flattening; otherwise
        # reshaping (directions, batch, hidden) to (batch, -1) would mix values
        # from different samples whenever directions > 1 and batch > 1.
        last_step = last_step.permute(1, 0, 2)
        return last_step.reshape(batch_size, -1)

In [12]:
# Hyper-parameters of the classifier that uses the predefined alphabet.
D = 64              # size of each character embedding
hidden_nodes = 256  # size of the recurrent hidden state
vocab_size = len(dataset.vocabulary)
classes = len(dataset.label_names)

# Character indices -> embeddings -> recurrent layer -> final hidden state
# -> one score per language.
rnn = nn.Sequential(
    nn.Embedding(num_embeddings=vocab_size, embedding_dim=D),
    nn.RNN(input_size=D, hidden_size=hidden_nodes, batch_first=True),
    LasttimeStep(),
    nn.Linear(in_features=hidden_nodes, out_features=classes),
)

In [13]:
# Hyper-parameters of the classifier that uses the inferred vocabulary.
# NOTE: this rebinds the notebook-level `vocab_size` and `classes`.
D = 64              # size of each character embedding
hidden_nodes = 256  # size of the recurrent hidden state
vocab_size = len(dataset_inferred.vocabulary)
classes = len(dataset_inferred.label_names)

# Character indices -> embeddings -> recurrent layer -> final hidden state
# -> one score per language.
rnn_inferred = nn.Sequential(
    nn.Embedding(num_embeddings=vocab_size, embedding_dim=D),
    nn.RNN(input_size=D, hidden_size=hidden_nodes, batch_first=True),
    LasttimeStep(),
    nn.Linear(in_features=hidden_nodes, out_features=classes),
)

### Training

In [14]:
# Shared training configuration for both experiments.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_func = nn.CrossEntropyLoss()
score_funcs = {"Accuracy": accuracy_score_wrapper}
epochs = 20
# Values logged to MLflow; the loss is recorded by its class name.
params = dict(
    device=device,
    loss_func=type(loss_func).__name__,
    epochs=epochs,
)

In [15]:
# Parallel tuples describing the two experiments: position 0 is the
# predefined-alphabet variant, position 1 the inferred-vocabulary variant.
datasets = (dataset, dataset_inferred)
train_loaders = (train_loader ,train_loader_inferred)
test_loaders = (test_loader ,test_loader_inferred)
models = (rnn, rnn_inferred)
# Run names under which each variant is logged.
name_experiment = ('assumed_vocabulary', 'inferred_vocabulary')

In [21]:
# Train one model per vocabulary variant and log each as its own MLflow run.
#
# The per-run loaders are bound to `run_*` names on purpose: rebinding the
# notebook-level `train_loader` / `test_loader` here would leave them pointing
# at the last variant's loaders, and re-running the cell that builds
# `train_loaders` / `test_loaders` would then silently pair both models with
# the same data.
training_results = {}  # run name -> value returned by train_network
for run_name, run_dataset, model, run_train_loader, run_test_loader in zip(
    name_experiment, datasets, models, train_loaders, test_loaders
):
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    # `params` is updated in place so the logged values describe this run.
    params['optimizer'] = optimizer.defaults
    params['vocabulary'] = run_dataset.vocabulary

    # Store the model summary as a run artifact.
    with open('model_summary.txt', 'w') as f:
        f.write(str(summary(model)))
    with mlflow.start_run(nested=True, run_name=run_name):
        mlflow.log_artifact('model_summary.txt')
        mlflow.log_params(params)

        batch_one_train = train_network(
            model=model,
            optimizer=optimizer,
            loss_func=loss_func,
            train_loader=run_train_loader,
            valid_loader=run_test_loader,
            epochs=epochs,
            score_funcs=score_funcs,
            device=device,
        )
        # Keep every run's result; `batch_one_train` alone only holds the last.
        training_results[run_name] = batch_one_train

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch: 100%|██████████| 20/20 [21:56<00:00, 65.81s/it]
Epoch: 100%|██████████| 20/20 [18:10<00:00, 54.51s/it]


<img src="./images/E1_train_acc.png">

<img src="./images/E1_train_loss.png">

<img src="./images/E1_valid_acc.png">

<img src="./images/E1_valid_loss.png">