In [None]:
from pathlib import Path

from vectormesh.data.cache import VectorCache

artefacts = Path("../artefacts")


def _find_cache_path(pattern: str) -> Path:
    """Return the first path (in sorted order) under ``artefacts`` matching ``pattern``.

    Sorting makes the choice deterministic when several caches match, and an
    explicit error replaces the bare ``StopIteration`` that ``next()`` on an
    empty glob would raise.

    Raises:
        FileNotFoundError: if no path matches ``pattern``.
    """
    matches = sorted(artefacts.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f"No cache matching {pattern!r} found in {artefacts.resolve()}"
        )
    return matches[0]


# Locate the BERT train/valid caches on disk and load them.
trainpath = _find_cache_path("*bert*train/")
validpath = _find_cache_path("*bert*valid/")
traincache = VectorCache.load(path=trainpath)
validcache = VectorCache.load(path=validpath)

We load the cache

In [None]:
# Display both loaded caches as the cell output.
traincache, validcache

For this notebook, let's create a subset: 1024 items for training and 1024 for validation.

In [None]:
# Both splits are cut down to the items with indices 0..1023 to keep the notebook fast.
subset_indices = range(1024)
train = traincache.select(subset_indices)
valid = validcache.select(subset_indices)

In [None]:
# Display the two subsets as the cell output.
train, valid

Check what a single item looks like.

In [None]:
# First item of the training subset.
train[0]

In [None]:
# Name of the column holding the embedding vectors; the batching cells below
# index each batch with this key.
column_name = "legal_dutch"  # the vector we want to use

If we iterate a batch, we get a list of tensors, where every tensor has a different chunk size (because the texts have different lengths). 

In [None]:
# Look at the first batch only: print the shape of every embedding tensor in it.
for batch in train.iter(batch_size=16):
    emb = batch[column_name]
    print("Checking the shapes of the embeddings in the batch:")
    for e in emb:
        print(e.shape)
    break  # one batch is enough for this check
# `emb` still refers to the embeddings of that first batch here.
print(f" Note that the type of emb is {type(emb)}\n")

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

# Count, over the whole training subset, how many tensors have each size
# along dimension 0.
cnt = Counter(
    tensor.shape[0]
    for batch in train.iter(batch_size=64)
    for tensor in batch[column_name]
)

shapes = sorted(cnt)
frequencies = [cnt[s] for s in shapes]

# Explicit figure/axes interface; plt.show() at the end keeps the cell from
# echoing the repr of the last matplotlib call.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(shapes, frequencies, color="skyblue", edgecolor="black", alpha=0.8)

ax.set_xlabel("Tensor Shape (Dimension 0)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Distribution of Tensor Shapes", fontsize=14)
plt.show()

If we want to batch, we need to pad them. Let's use a FixedPadding, such that every tensor has shape (chunks, dim) with the same number of chunks. Note that this means we lose data for some documents!

(question: which type of model can handle (chunk dim) tensors where every batch has a different chunk size?)

In [None]:
from vectormesh.components import FixedPadding

padder = FixedPadding(max_chunks=30)
# change the max_chunks, or use DynamicPadding if your model can handle dynamic sizes

# Print the padded shape of the first two batches, then stop.
for i, batch in enumerate(train.iter(batch_size=16), start=1):
    emb = padder(batch[column_name])
    print(emb.shape)
    if i == 2:
        break

There are models that handle 3D tensors well. 

However, we can also aggregate the 3D tensors; see vectormesh.components.aggregation for a few examples. Let's just use the simplest: a mean aggregation over the chunk dimension.

In [None]:
from vectormesh.components import MeanAggregator

aggregator = MeanAggregator()
padder = FixedPadding(max_chunks=30)
# Pad the first batch, aggregate it, print the resulting shape and stop.
for batch in train.iter(batch_size=16):
    emb = padder(batch[column_name])
    # presumably (batch, dim) after averaging over the chunk dimension - TODO confirm
    agg = aggregator(emb)
    print(agg.shape)
    break

We can wrap this `MeanAggregator` in a `Serial` pipeline. This will process the components sequentially. 
Because we have just one component, this is exactly the same as just using the `MeanAggregator` directly.

In [None]:
from vectormesh.components import Serial

# A Serial pipeline holding a single component: the mean aggregator.
pipeline = Serial([MeanAggregator()])

In [None]:
padder = FixedPadding(max_chunks=30)
# Run the first padded batch through the current `pipeline` and print the output shape.
for batch in train.iter(batch_size=16):
    emb = padder(batch[column_name])
    output = pipeline(emb)
    print(output.shape)
    break  # only the first batch is needed

But the advantage of `Serial` is that we can easily add more components.

In [None]:
from vectormesh.components import NeuralNet

# Aggregate over chunks, then feed the result to a NeuralNet.
# out_size=32 equals the num_classes used for the one-hot targets further down;
# hidden_size=768 presumably matches the embedding dimension - TODO confirm.
pipeline = Serial([MeanAggregator(), NeuralNet(hidden_size=768, out_size=32)])

In [None]:
padder = FixedPadding(max_chunks=30)
# Run the first padded batch through the extended `pipeline` and print the output shape.
for batch in train.iter(batch_size=16):
    emb = padder(batch[column_name])
    # presumably (batch, out_size) now that the NeuralNet is appended - TODO confirm
    output = pipeline(emb)
    print(output.shape)
    break  # only the first batch is needed

We want to run predictions on this output. Currently, we have a model that does:

1. Input: (batch, chunks, dim)
2. Aggregation over chunks -> (batch, dim)
3. Feed to a Linear layer -> (batch, num_classes)

All we need is the label to do supervised machine learning.

Let's turn the labels into one-hot encoded vectors.


In [None]:
import torch
from pydantic import BaseModel


class OneHot(BaseModel):
    """
    Turns integer class label(s) into a one-hot (or multi-hot) float vector.

    Fields:
        num_classes: length of the encoded vector.
        label_col: key of the observation that holds the class index or indices.
        target_col: key under which the encoded vector is returned.
    """

    num_classes: int
    label_col: str
    target_col: str

    def __call__(self, observation):
        """
        Encode the label(s) of a single observation.

        Args:
            observation: mapping that contains ``label_col``; its value may be a
                single integer or a sequence of integers.

        Returns:
            A dict ``{target_col: vec}`` where ``vec`` is a float32 tensor of
            length ``num_classes`` with 1.0 at every labelled position.

        Raises:
            IndexError: if any label lies outside ``[0, num_classes)``. Negative
                labels are rejected explicitly, because plain tensor indexing
                would silently wrap them around to the last classes.
        """
        # Normalise scalar / list / tensor labels to a flat long index tensor.
        labels = torch.as_tensor(
            observation[self.label_col], dtype=torch.long
        ).reshape(-1)
        invalid = (labels < 0) | (labels >= self.num_classes)
        if bool(invalid.any()):
            raise IndexError(
                f"label(s) {labels[invalid].tolist()} in column {self.label_col!r} "
                f"are outside the valid range [0, {self.num_classes})"
            )
        vec = torch.zeros(self.num_classes, dtype=torch.float32)
        vec[labels] = 1.0
        return {self.target_col: vec}


# num_classes=32 matches the out_size of the NeuralNet in the pipeline, so the
# targets and the model outputs have the same width.
onehot = OneHot(num_classes=32, label_col="labels", target_col="onehot")

In [None]:
# Apply the one-hot encoder to both subsets; presumably `map` stores the result
# under the "onehot" key next to the existing columns - TODO confirm.
train_oh = train.map(onehot)
valid_oh = valid.map(onehot)

In [None]:
# Inspect the first item after the one-hot mapping.
train_oh[0]

The main issue we now have is how to batch the dictionaries into padded tensors, such that we can feed them into our pipeline and directly use `mltrainer`.

In [None]:
from typing import Callable


class Collate(BaseModel):
    """
    Collate function: converts a list of dataset items into an ``(X, y)`` pair.

    Fields:
        embedding_col: item key holding the embedding of each item.
        target_col: item key holding the target tensor of each item.
        padder: callable applied to the list of embeddings to build ``X``.
    """

    embedding_col: str
    target_col: str
    padder: Callable

    def __call__(self, batch):
        """
        Build the model input and the targets for one batch.

        Args:
            batch: iterable of items, each indexable by ``embedding_col`` and
                ``target_col``.

        Returns:
            Tuple ``(X, y)``: the padder's output for the embeddings, and the
            stacked targets cast to float.
        """
        padded = self.padder([row[self.embedding_col] for row in batch])
        targets = torch.stack([row[self.target_col] for row in batch])
        return padded, targets.float()


# Reuse `column_name` instead of repeating the literal, so the collate function
# always reads the same embedding column as the rest of the notebook.
# "onehot" is the target column configured on the OneHot mapper.
collate_fn = Collate(
    embedding_col=column_name,
    target_col="onehot",
    padder=FixedPadding(max_chunks=30),
)

We can now connect the `collate_fn` to the `DataLoader`, and we will get batched tensors.

In [None]:
from torch.utils.data import DataLoader

# Both loaders share the batch size and the collate function; only the
# training data is shuffled.
loader_kwargs = {"batch_size": 32, "collate_fn": collate_fn}
trainloader = DataLoader(train_oh, shuffle=True, **loader_kwargs)
validloader = DataLoader(valid_oh, shuffle=False, **loader_kwargs)

Let us check the shapes of the tensors

In [None]:
# Pull one collated batch from the training loader and show the shapes of
# the inputs and the targets.
X, y = next(iter(trainloader))
X.shape, y.shape

In [None]:
from pathlib import Path

from mltrainer import ReportTypes, TrainerSettings

from vectormesh.components.metrics import F1Score

# Absolute path of the log directory, reused by the cleanup cell at the end.
log_dir = Path("demo").absolute()

# Step counts equal the loader lengths; reports go to TensorBoard and TOML.
report_types = [ReportTypes.TENSORBOARD, ReportTypes.TOML]
settings = TrainerSettings(
    logdir=log_dir,
    epochs=50,
    train_steps=len(trainloader),
    valid_steps=len(validloader),
    metrics=[F1Score()],
    reporttypes=report_types,
)
settings

We have multilabel targets, so we use binary cross-entropy as the loss function.

In [None]:
# BCEWithLogitsLoss combines a sigmoid with binary cross-entropy, so the model
# should output raw logits rather than probabilities.
loss_fn = torch.nn.BCEWithLogitsLoss()

In [None]:
import torch.optim as optim
from mltrainer import Trainer

from vectormesh.data.vectorizers import detect_device

# Pick the compute device and report it.
device = detect_device()
print(f"Using device: {device}")

# Optimizer and scheduler are passed as classes, not instances.
trainer = Trainer(
    # what to train and where
    model=pipeline,
    device=device,
    # data
    traindataloader=trainloader,
    validdataloader=validloader,
    # optimisation
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    # run configuration
    settings=settings,
)

In [None]:
# Start training with the trainer built in the previous cell.
trainer.loop()

In [None]:
import shutil

# Remove the directories produced by this demo run; missing ones are ignored.
for leftover_dir in (log_dir, Path("logs"), Path("tmp")):
    shutil.rmtree(leftover_dir, ignore_errors=True)