# Exercise: Classification of Occupational Descriptions

1. Reproduce the HISCO classification model from the slides by downloading and loading the datasets and running the associated code (see [Exercise 1](#exercise-1))
1. Try changing the `nn.GRU` layer to an `nn.RNN` layer and then to an `nn.LSTM` layer and train your new models
1. Try changing the optimizer to something other than `torch.optim.AdamW`. A list of optimizers is available [here](https://pytorch.org/docs/stable/optim.html#algorithms)
1. Experiment with, e.g., the type or number of layers in your model, the choice of optimizer and learning rate, or the number of epochs. How high an accuracy can you achieve on the test split?

## Data

Start by downloading the datasets `toy_data_train.csv` and `toy_data_test.csv`.

Alternatively, if you have `histocc` installed, you can run the code below to prepare the data:

In [None]:
from histocc import DATASETS
from sklearn.model_selection import train_test_split


def download_and_prepare_data():
    """Build the train/test CSV files from the `histocc` toy data.

    Each row's `hisco_1` value is looked up in the `hisco` -> `code` table
    taken from the `keys` dataset and stored as `label`. The `occ1` and
    `label` columns are then split 90/10 (fixed seed 42) and written to
    `./toy_data_train.csv` and `./toy_data_test.csv` without the index.
    """
    hisco_key = DATASETS['keys']()
    code_by_hisco = dict(hisco_key[['hisco', 'code']].values)

    def to_label(hisco):
        return code_by_hisco[hisco]

    frame = DATASETS['toydata']()
    frame['label'] = frame['hisco_1'].transform(to_label)

    splits = train_test_split(frame[['occ1', 'label']], test_size=0.1, random_state=42)
    targets = ('./toy_data_train.csv', './toy_data_test.csv')

    # `train_test_split` returns the train part first, then the test part
    for split, path in zip(splits, targets):
        split.to_csv(path, index=False)


# Executed when the cell runs: writes toy_data_train.csv and toy_data_test.csv to the working directory
download_and_prepare_data()   

## Setup

#### Dependencies

In [None]:
from functools import partial

import torch

from torch import Tensor, nn
from torch.utils.data import Dataset, DataLoader

import pandas as pd

#### `Dataset` class

In [None]:
# Every distinct character (= token) that occurs in the toy dataset
CHARS_IN_TOYDATA = [
    ' ', '"', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', ';', '?', '@', '[', ']', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '{', '¢', '£', '©', '¬', 'Â', 'Ã', 'â', 'œ', 'š', 'ž', '‚', '„', '€',
]
# Character -> token index. Numbering starts at 2 because `tokenize` uses
# 0 for characters missing from this table and 1 for padding.
MAP_CHAR_IDX = dict(zip(CHARS_IN_TOYDATA, range(2, 2 + len(CHARS_IN_TOYDATA))))

def tokenize(hisco: str, max_len: int) -> list[int]:
    """Encode a string as a list of exactly `max_len` token indices.

    Args:
        hisco: Text to encode, one token per character. (Despite the name,
            `HISCODataset` passes the `occ1` description here.)
        max_len: Length of the returned list; longer inputs are truncated,
            shorter ones are padded at the end.

    Returns:
        Token indices from `MAP_CHAR_IDX`, with 0 for any character not in
        the table and 1 in every padding position.
    """
    token_ids = [MAP_CHAR_IDX.get(symbol, 0) for symbol in hisco][:max_len]
    padding = [1] * (max_len - len(token_ids))

    return token_ids + padding


class HISCODataset(Dataset):
    def __init__(self, dataset: pd.DataFrame):
        super().__init__()

        self.dataset = dataset
        self.tokenizer = partial(tokenize, max_len=32)

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, item: int) -> dict[str, str | Tensor]:
        record = self.dataset.iloc[item]
        encoded = self.tokenizer(record.occ1)

        package = {
            'occ1': record.occ1,
            'encoded': torch.tensor(encoded, dtype=torch.long),
            'label': torch.tensor(record.label, dtype=torch.long),
        }

        return package

#### Load and prepare data

In [None]:
# NOTE: Make sure to adjust file paths
# Training split: CSV -> dataset -> batches of 32 (iterated in file order)
train_data = pd.read_csv('path/to/toy_data_train.csv')
train_dataset = HISCODataset(train_data)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=32)

# Test split: same pipeline
test_data = pd.read_csv('path/to/toy_data_test.csv')
test_dataset = HISCODataset(test_data)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=32)

In [None]:
# (Optional) load in label <-> HISCO code mapping. Requires `histocc`
from histocc import DATASETS

# Key table from `histocc`; only its `hisco` and `code` columns are used here
keys = DATASETS['keys']()
# HISCO code -> class label (built the same way as `mapping` in `download_and_prepare_data`)
map_hisco_label = dict(keys[['hisco', 'code']].values)
# Inverse lookup: class label -> HISCO code
map_label_hisco = {v: k for k, v in map_hisco_label.items()}

## Exercise 1

In [None]:
class HISCOClassifierGRU(nn.Module):
    """Character-level classifier: embedding -> single GRU layer -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table; must be larger
            than the largest token index fed to the model.
        hidden_size: Width of both the embeddings and the GRU hidden state.
        num_classes: Number of output logits. Defaults to 1919, the value
            that was previously hard-coded in the classifier head.
    """

    def __init__(self, vocab_size: int = 100, hidden_size: int = 128, num_classes: int = 1919):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_seq: Tensor) -> Tensor:
        """Map a `(batch, seq_len)` tensor of token indices to `(batch, num_classes)` logits."""
        out = self.embedding(input_seq)
        out, _ = self.gru(out)
        # Keep only the GRU output at the final sequence position.
        # NOTE(review): `tokenize` pads on the right, so for short inputs this
        # position is a padding token rather than the last real character.
        out = out[:, -1, :]
        out = self.classifier(out)

        return out


In [None]:
# Cross-entropy over the class logits; reused by the training loops below
loss_fn = nn.CrossEntropyLoss()

# Baseline: GRU classifier with default sizes, optimised with AdamW
model_baseline = HISCOClassifierGRU()
optimizer_baseline = torch.optim.AdamW(params=model_baseline.parameters(), lr=0.01)

In [None]:
def train_epoch(model, optimizer, data_loader, loss_fn):
    """Run one pass over `data_loader`, updating `model` after every batch.

    Args:
        model: Module called on each batch's `'encoded'` tensor.
        optimizer: Optimizer holding the parameters of `model`.
        data_loader: Iterable of dicts with `'encoded'` and `'label'` entries.
        loss_fn: Callable taking `(predictions, labels)` and returning a
            scalar loss tensor.

    Returns:
        None. The model is left in training mode.
    """
    model.train()

    for batch in data_loader:
        # Gradients accumulate across `backward()` calls, so clear them first
        optimizer.zero_grad()

        predictions = model(batch['encoded'])
        batch_loss = loss_fn(predictions, batch['label'])

        batch_loss.backward()
        optimizer.step()

In [None]:
# Called form `torch.no_grad()`: the bare `@torch.no_grad` decorator is only
# accepted by newer PyTorch releases, while the called form works on all.
@torch.no_grad()
def evaluate(model, data_loader):
    """Return the accuracy of `model` over all samples in `data_loader`.

    Args:
        model: Module called on each batch's `'encoded'` tensor; expected to
            return one row of class scores per sample.
        data_loader: Iterable of dicts with `'encoded'` and `'label'` entries.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.
        The model is left in evaluation mode.

    Raises:
        ZeroDivisionError: If `data_loader` yields no samples.
    """
    model.eval()

    total_correct = 0  # correct predictions so far
    total_count = 0  # predictions made so far

    for batch in data_loader:
        labels = batch['label']
        predictions = model(batch['encoded']).argmax(1)

        total_correct += (predictions == labels).sum().item()
        total_count += labels.size(0)

    return total_correct / total_count

In [None]:
# Train the baseline for 10 epochs, measuring accuracy after each one.
# NOTE: the message says "Validation", but the score is computed on `test_data_loader`.
for epoch in range(1, 11):
    train_epoch(model_baseline, optimizer_baseline, train_data_loader, loss_fn)
    acc = evaluate(model_baseline, test_data_loader)

    print(f'Trained for {epoch} epochs. Validation accuracy: {100 * acc}%')

## Exercise 2

In [None]:
# Exercise template: same structure as `HISCOClassifierGRU`, with the recurrent layer left to fill in.
class HISCOClassifierSimpleRNN(nn.Module):
    def __init__(self, vocab_size: int = 100, hidden_size: int = 128):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # TODO(exercise): replace `??` with the layer class asked for in the task list (`nn.RNN`)
        self.rnn = nn.??(hidden_size, hidden_size, batch_first=True)
        self.classifier = nn.Linear(hidden_size, 1919)

    def forward(self, input_seq: Tensor) -> Tensor:
        out = self.embedding(input_seq)
        out, _ = self.rnn(out)
        # Keep only the recurrent output at the final sequence position
        out = out[:, -1, :]
        out = self.classifier(out)

        return out


# Exercise template: same structure as `HISCOClassifierGRU`, with the recurrent layer left to fill in.
class HISCOClassifierLSTM(nn.Module):
    def __init__(self, vocab_size: int = 100, hidden_size: int = 128):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # TODO(exercise): replace `??` with the layer class asked for in the task list (`nn.LSTM`)
        self.lstm = nn.??(hidden_size, hidden_size, batch_first=True)
        self.classifier = nn.Linear(hidden_size, 1919)

    def forward(self, input_seq: Tensor) -> Tensor:
        out = self.embedding(input_seq)
        # Only the first element returned by the recurrent layer is used
        out, _ = self.lstm(out)
        # Keep only the recurrent output at the final sequence position
        out = out[:, -1, :]
        out = self.classifier(out)

        return out

In [None]:
# Same optimizer settings as the baseline, so only the recurrent layer differs
model_simple = HISCOClassifierSimpleRNN()
optimizer_simple = torch.optim.AdamW(model_simple.parameters(), lr=0.01)

In [None]:
# Same optimizer settings as the baseline, so only the recurrent layer differs
model_lstm = HISCOClassifierLSTM()
optimizer_lstm = torch.optim.AdamW(model_lstm.parameters(), lr=0.01)

In [None]:
# Train the SimpleRNN model for 10 epochs; accuracy is computed on `test_data_loader` after each epoch
for epoch in range(1, 11):
    train_epoch(model_simple, optimizer_simple, train_data_loader, loss_fn)
    acc = evaluate(model_simple, test_data_loader)

    print(f'Trained "SimpleRNN" model for {epoch} epochs. Validation accuracy: {100 * acc}%')

In [None]:
# TODO(exercise): fill in the `??` placeholders, mirroring the SimpleRNN loop
# but using `model_lstm` and `optimizer_lstm`
for epoch in range(1, 11):
    train_epoch(??)
    acc = ??

    print(f'Trained "LSTM" model for {epoch} epochs. Validation accuracy: {100 * acc}%')

## Exercise 3

In [None]:
# A fresh LSTM model, so the comparison with the AdamW run starts from untrained weights
model_lstm_new_optimizer = HISCOClassifierLSTM()
# TODO(exercise): replace `??` with an optimizer other than AdamW, constructed from
# `model_lstm_new_optimizer.parameters()` (the training loop's message mentions RMSprop)
optimizer_lstm_new_optimizer = torch.optim.??

In [None]:
# TODO(exercise): fill in the `??` placeholders using `model_lstm_new_optimizer`
# and `optimizer_lstm_new_optimizer`
for epoch in range(1, 11):
    train_epoch(??)
    acc = ??

    print(f'Trained "LSTM" (RMSprop) model for {epoch} epochs. Validation accuracy: {100 * acc}%')

## Exercise 4

In [None]:
# Exercise template for a free-form model: embedding -> recurrent layer(s) -> dropout -> classifier.
# Every `??` is a placeholder to be filled in.
class FancyHISCOClassifier(nn.Module):
    def __init__(self, vocab_size: int = 100, hidden_size: int = 128, dropout: float = 0.0):
        super().__init__()

        # TODO(exercise): embedding layer mapping token indices to vectors
        self.embedding = ??
        self.?? = nn.?? # your recurrent layer(s) here
        # Applied to the last-position recurrent output in `forward`; the default of 0.0 disables it
        self.dropout = nn.Dropout(dropout)
        # TODO(exercise): output layer producing the class logits
        self.classifier = ??

    def forward(self, input_seq: Tensor) -> Tensor:
        out = self.embedding(input_seq)
        # TODO(exercise): use the attribute name chosen for the recurrent layer in `__init__`
        out, _ = self.??(out)
        # Keep only the recurrent output at the final sequence position
        out = out[:, -1, :]
        out = self.dropout(out)
        out = self.classifier(out)

        return out

In [None]:
device = 'cpu' # if GPU available, change this to 'cuda' for faster training

# TODO(exercise): choose the hidden size, dropout rate, optimizer, learning rate and scheduler
# The model is moved to `device` before its parameters are handed to the optimizer
model_fancy = FancyHISCOClassifier(hidden_size=??, dropout=0.??).to(device)
optimizer_fancy = torch.optim.??(model_fancy.parameters(), lr=??)
scheduler_fancy = torch.optim.lr_scheduler.??(optimizer_fancy)

In [None]:
def train_epoch_select_device(model, optimizer, data_loader, loss_fn, device):
    """Run one training pass over `data_loader`, moving each batch to `device`.

    Args:
        model: Module called on each batch's `'encoded'` tensor.
        optimizer: Optimizer holding the parameters of `model`.
        data_loader: Iterable of dicts with `'encoded'` and `'label'` tensors.
        loss_fn: Callable taking `(predictions, labels)` and returning a
            scalar loss tensor.
        device: Target passed to `Tensor.to` for both inputs and labels.

    Returns:
        None. The model is left in training mode.
    """
    model.train()

    for batch in data_loader:
        # Gradients accumulate across `backward()` calls, so clear them first
        optimizer.zero_grad()

        inputs = batch['encoded'].to(device)
        predictions = model(inputs)

        targets = batch['label'].to(device)
        batch_loss = loss_fn(predictions, targets)

        batch_loss.backward()
        optimizer.step()

In [None]:
# Called form `torch.no_grad()`: the bare `@torch.no_grad` decorator is only
# accepted by newer PyTorch releases, while the called form works on all.
@torch.no_grad()
def evaluate_select_device(model, data_loader, device):
    """Return the accuracy of `model` over `data_loader`, running the model on `device`.

    Args:
        model: Module called on each batch's `'encoded'` tensor; expected to
            return one row of class scores per sample.
        data_loader: Iterable of dicts with `'encoded'` and `'label'` tensors.
        device: Target passed to `Tensor.to` for the inputs.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.
        The model is left in evaluation mode.

    Raises:
        ZeroDivisionError: If `data_loader` yields no samples.
    """
    model.eval()

    total_correct = 0  # correct predictions so far
    total_count = 0  # predictions made so far

    for batch in data_loader:
        labels = batch['label']
        # Predictions are moved back to the CPU for the comparison; the labels
        # are used as delivered by the loader and are never moved to `device`.
        predictions = model(batch['encoded'].to(device)).argmax(1).cpu()

        total_correct += (predictions == labels).sum().item()
        total_count += labels.size(0)

    return total_correct / total_count

In [None]:
# TODO(exercise): replace `??` with the end of the epoch range (`range(1, N + 1)` trains for N epochs)
for epoch in range(1, ??):
    train_epoch_select_device(model_fancy, optimizer_fancy, train_data_loader, loss_fn, device)
    acc = evaluate_select_device(model_fancy, test_data_loader, device)

    print(f'Trained "fancy" model for {epoch} epochs. Validation accuracy: {100 * acc}%')

    # The scheduler is stepped once per epoch, after training and evaluation
    scheduler_fancy.step()