In [None]:
from pathlib import Path

from vectormesh.data.cache import VectorCache

assets = Path("../artefacts")
# Take the first match of the "aktes*/" pattern; change the pattern (or pick
# another match) if you don't want the first folder.
trainpath = next(assets.glob("aktes*/"), None)
if trainpath is None:
    # A bare next() on an empty glob would raise an uninformative StopIteration.
    raise FileNotFoundError(f"No folder matching 'aktes*/' found in {assets}")
tag = trainpath.name
cache = VectorCache.load(path=trainpath)
# Demo-sized slices: the first 1024 items and the following 1024 items.
train = cache.select(range(1024))
valid = cache.select(range(1024, 2048))
column_name = "legal_dutch"

We load a sample of the full dataset for this demo.
For the exam, obviously, don't do that: use the full 15k documents!

The next piece of code will load the `RegexVectorizer` and fit it; after that, we add it to the cache.

In [None]:
from vectormesh import RegexVectorizer
from vectormesh.data.vectorizers import (
    build_legal_reference_pattern,
    harmonize_legal_reference,
)

# Gather the vectorizer settings first, then initialize & fit in one go.
regex_settings = {
    "col_name": "regex",  # the vectors end up under this key
    "pattern_builder": build_legal_reference_pattern,
    "harmonizer": harmonize_legal_reference,
    "min_doc_frequency": 15,
    "max_features": 200,
    "device": "cpu",
    # fit on the texts of the whole cache (all 15k), not just the demo sample
    "training_texts": cache["text"],
}
regexvectorizer = RegexVectorizer(**regex_settings)

With the regexvectorizer fitted, we can extend our dataset.

In [None]:
# Directory handed to VectorCache.create as cache_dir for the extended cache.
extended_dir = Path("tmp/artefacts")

# Combine the existing dataset with the new regex vectorizer; the tag is
# passed along so existing metadata.json is checked.
extended_cache = VectorCache.create(
    cache_dir=extended_dir,
    vectorizer=regexvectorizer,
    dataset=cache.dataset,
    dataset_tag=tag,
)

We loaded the vectorized cache from artefacts, and now we have added the onehot vectors created with the regexes. See `vectormesh.data.vectorizers` for more details.

Let's have a look at a concrete example.

In [None]:
# Show the "regex" vector of the first item, together with its dtype.
extended_cache[0]["regex"], extended_cache[0]["regex"].dtype

What we will do now is feed the model two vectors instead of just one:

- one vector is the 2D tensor from the huggingface model, shaped `(chunks, dim)`
- the other is a 1D binary vector created with regexes. 

Let's call them `X1` and `X2` respectively.

We now want to feed the model a tuple `(X1, X2)` as input, and the label `y`.

In [None]:
from typing import Callable

import torch
from pydantic import BaseModel


class CollateParallel(BaseModel):
    """
    Collate a batch of Dataset items into a two-part model input plus target.

    Calling the instance on a batch returns ``((X1, X2), y)``:

    - ``X1``: the ``vec1_col`` entries of all items, combined by ``padder``
    - ``X2``: the ``vec2_col`` entries, stacked and cast to float
    - ``y``: the ``target_col`` entries, stacked and cast to float
    """

    # column holding the 2D tensors (chunks, dim) that need padding
    vec1_col: str
    # column holding the 1D tensors (dim,) that can be stacked directly
    vec2_col: str
    # column holding the target tensors
    target_col: str
    # callable that receives the list of vec1_col tensors and pads them
    padder: Callable

    def __call__(self, batch):
        def column(name):
            # collect a single column across every item in the batch
            return [sample[name] for sample in batch]

        chunked = column(self.vec1_col)
        flat = column(self.vec2_col)

        # padding turns the list of 2D tensors into a 3D (batch, chunks, dim)
        padded = self.padder(chunked)
        # equally sized 1D vectors only need stacking, no padding
        stacked = torch.stack(flat).float()
        targets = torch.stack(column(self.target_col)).float()
        return (padded, stacked), targets

You can find this `CollateParallel` class in `vectormesh.data.dataset`, but I show it here for clarity.

Let's check all the inputs we have:

In [None]:
# Inspect the features of the extended cache.
extended_cache.features

We can now apply everything:

In [None]:
from torch.utils.data import DataLoader

from vectormesh.components import FixedPadding
from vectormesh.data import OneHot

# Collate function: pads the "legal_dutch" chunks, stacks the "regex" vectors
# and uses the "onehot" column as target.
collate_fn = CollateParallel(
    vec1_col="legal_dutch",
    vec2_col="regex",
    target_col="onehot",
    padder=FixedPadding(max_chunks=30),
)
# Mapping that reads "labels" and writes "onehot" (32 classes).
onehot = OneHot(num_classes=32, label_col="labels", target_col="onehot")

# Same demo slices as before, now taken from the extended cache.
train, valid = [
    extended_cache.select(range(start, stop))
    for start, stop in ((0, 1024), (1024, 2048))
]
train_oh, valid_oh = train.map(onehot), valid.map(onehot)

# Both loaders share everything except shuffling.
loader_kwargs = {"batch_size": 32, "collate_fn": collate_fn}
trainloader = DataLoader(train_oh, shuffle=True, **loader_kwargs)
validloader = DataLoader(valid_oh, shuffle=False, **loader_kwargs)

In [None]:
# Pull a single batch from the training loader and inspect the type of X.
X, y = next(iter(trainloader))
type(X)

X is a tuple, exactly as we wanted!

In [None]:
# Shapes of the two inputs inside the tuple X.
X[0].shape, X[1].shape

One is a 3D tensor, the other a 2D tensor.

So, how can we create a model?

We can use the `Parallel` pipeline, see `vectormesh.components.pipelines`

In [None]:
from vectormesh.components import (
    Concatenate2D,
    MeanAggregator,
    NeuralNet,
    Parallel,
    Serial,
)

# Branch for X1: (batch, chunks, dims) -> (batch, dims) -> (batch, 32)
text_branch = Serial([MeanAggregator(), NeuralNet(hidden_size=768, out_size=32)])
# Branch for X2: (batch, dims) -> (batch, 32)
regex_branch = Serial([NeuralNet(hidden_size=123, out_size=32)])

# (X1, X2) -> (batch, 32), (batch, 32)
parallel = Parallel([text_branch, regex_branch])

stages = [
    parallel,
    # join both branch outputs: (batch, 32), (batch, 32) -> (batch, 64)
    Concatenate2D(),
    # final network on the joined vector: (batch, 64) -> (batch, 32)
    NeuralNet(hidden_size=64, out_size=32),
]
pipeline = Serial(stages)

The first branch takes in the 3D tensor, we apply `MeanAggregator` to get a 2D tensor, and then a `NeuralNet` to get the final output. Check `vectormesh.components.neural` for more models.

The second branch doesn't need an aggregator: we already have 2D tensors, so we can directly apply the `NeuralNet`.

We want to combine the two outputs of the parallel pipeline, and we can do that with `Concatenate2D`, see `vectormesh.components.connectors`. Because we had two (batch, 32) tensors, after concatenation we have (batch, 64) vectors, so we apply a final `NeuralNet` to get the final output.

In [None]:
# Feed the batch through the pipeline and check the output shape.
yhat = pipeline(X)
yhat.shape

In [None]:
import torch.optim as optim
from mltrainer import ReportTypes, Trainer, TrainerSettings

from vectormesh.components.metrics import F1Score
from vectormesh.data.vectorizers import detect_device

device = detect_device()
print(f"Using device: {device}")

# Absolute path of the folder passed to the trainer as logdir.
log_dir = Path("demo").absolute()

settings = TrainerSettings(
    epochs=3,
    metrics=[F1Score()],
    logdir=log_dir,
    train_steps=len(trainloader),
    # one pass over the validation loader (was len(trainloader))
    valid_steps=len(validloader),
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.TOML],
)

# The targets are float one-hot vectors, so we use a binary cross entropy
# loss that takes the raw model outputs (logits).
loss_fn = torch.nn.BCEWithLogitsLoss()

trainer = Trainer(
    model=pipeline,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainloader,
    # validate on the held-out split, not on the training data
    validdataloader=validloader,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    device=device,
)

trainer.loop()

And, finally, let's have a look at `vectormesh.components.gating`. I have implemented a few variations of skip/gating; here is a demo of `Skip`.

In [None]:
from vectormesh.components import Projection, Skip

# Branch for X1: (batch, chunks, dims) -> (batch, dims) -> (batch, 32)
text_branch = Serial([MeanAggregator(), NeuralNet(hidden_size=768, out_size=32)])
# Branch for X2: (batch, dims) -> (batch, 32)
regex_branch = Serial([NeuralNet(hidden_size=123, out_size=32)])

# (X1, X2) -> (batch, 32), (batch, 32)
parallel = Parallel([text_branch, regex_branch])

stages = [
    parallel,
    # join both branch outputs: (batch, 32), (batch, 32) -> (batch, 64)
    Concatenate2D(),
    # bring the joined vector back to 32 dims: (batch, 64) -> (batch, 32)
    Projection(in_size=64, out_size=32),
    # Skip wraps a NeuralNet as its transform
    Skip(
        transform=NeuralNet(hidden_size=32, out_size=32),
        in_size=32,
    ),
]
pipeline = Serial(stages)

In [None]:
# Train the Skip pipeline with the same settings and loss as before.
trainer = Trainer(
    model=pipeline,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainloader,
    # validate on the held-out split, not on the training data
    validdataloader=validloader,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    device=device,
)

trainer.loop()