## Train an ORIGAMI model on the Dungeons dataset


In [None]:
import json

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from origami.datasets.dungeons import generate_data
from origami.preprocessing import (
    DocTokenizerPipe,
    PadTruncTokensPipe,
    SchemaParserPipe,
    TargetFieldPipe,
    TokenEncoderPipe,
    docs_to_df,
)

# Settings for the Dungeons dataset generator (see origami/datasets/dungeons.py)
dungeon_settings = {
    "num_instances": 10_000,
    "num_doors_range": (5, 10),
    "num_colors": 3,
    "with_monsters": True,
    "num_treasures": 5,
}
data = generate_data(**dungeon_settings)

# Show the first generated example, pretty-printed as JSON
first_example = data[0]
print(json.dumps(first_example, indent=2))

# Convert the examples to a dataframe, then hold out 20% (shuffled) as the test split
df = docs_to_df(data)
train_docs_df, test_docs_df = train_test_split(df, shuffle=True, test_size=0.2)

# Field that is handed to TargetFieldPipe here and to the Predictor later on
TARGET_FIELD = "treasure"

# Preprocessing steps, keyed by name so the fitted steps can be looked up
# again afterwards. Insertion order is the order the steps run in.
pipes = {
    "schema": SchemaParserPipe(),
    "target": TargetFieldPipe(TARGET_FIELD),
    "tokenizer": DocTokenizerPipe(path_in_field_tokens=True),
    "padding": PadTruncTokensPipe(length="max"),
    "encoder": TokenEncoderPipe(),
}

# A single pipeline is shared by the train and test splits
pipeline = Pipeline(list(pipes.items()))

# Fit on the train split only; the test split is transformed with the
# already-fitted steps
train_df = pipeline.fit_transform(train_docs_df)
test_df = pipeline.transform(test_docs_df)

# Pull out the state held by the fitted steps
block_size = pipes["padding"].length
encoder = pipes["encoder"].encoder
schema = pipes["schema"].schema

# Report data statistics
print(f"len train: {len(train_df)}, len test: {len(test_df)}")
print(f"vocab size {encoder.vocab_size}")
print(f"block size {block_size}")

In [None]:
# create datasets, VPDA and model

from origami.model import ORIGAMI
from origami.model.vpda import DocumentVPDA
from origami.preprocessing import DFDataset
from origami.utils import ModelConfig, TrainConfig

# Wrap the preprocessed train/test dataframes as datasets
train_dataset = DFDataset(train_df)
test_dataset = DFDataset(test_df)

# Model config: start from the "medium" preset, then override the values that
# come from the preprocessing above (vocabulary size and block size)
model_config = ModelConfig.from_preset("medium")
model_config.vocab_size = encoder.vocab_size
model_config.block_size = block_size
model_config.position_encoding = "NONE"

# Train config: defaults, except for learning rate and number of warmup batches
train_config = TrainConfig()
train_config.n_warmup_batches = 1000
train_config.learning_rate = 1e-3

# The VPDA is built from the encoder and the schema and passed to the model
vpda = DocumentVPDA(encoder, schema)
model = ORIGAMI(model_config, train_config, vpda=vpda)


In [None]:
from origami.inference import Predictor
from origami.utils.guild import print_guild_scalars

# create a predictor from the model, the token encoder taken from the
# preprocessing pipeline and the name of the target field; it is used below
# to compute test loss, test accuracy and predictions
predictor = Predictor(model, encoder, TARGET_FIELD)


# model callback during training, prints training and test metrics
def progress_callback(model):
    """Print training and test metrics every ``train_config.eval_every`` batches.

    Does nothing on batches whose number is not a multiple of
    ``train_config.eval_every``. Otherwise it evaluates the predictor on a
    100-item sample of the test dataset and prints the scalars through
    ``print_guild_scalars``.

    Args:
        model: the model being trained. Its ``batch_num``, ``epoch_num``,
            ``batch_dt``, ``loss`` and ``learning_rate`` attributes are read.
    """
    if model.batch_num % train_config.eval_every != 0:
        return

    # Sample the test data once so that the reported loss and accuracy are
    # measured on the same items (previously each metric drew its own sample).
    test_sample = test_dataset.sample(n=100)

    print_guild_scalars(
        step=f"{int(model.batch_num / train_config.eval_every)}",
        epoch=model.epoch_num,
        batch_num=model.batch_num,
        batch_dt=f"{model.batch_dt*1000:.2f}",
        batch_loss=f"{model.loss:.4f}",
        test_loss=f"{predictor.ce_loss(test_sample):.4f}",
        test_acc=f"{predictor.accuracy(test_sample):.4f}",
        lr=f"{model.learning_rate:.2e}",
    )


# Register the progress reporter for the "on_batch_end" event, then train
NUM_TRAIN_BATCHES = 5000
model.set_callback("on_batch_end", progress_callback)
model.train_model(train_dataset, batches=NUM_TRAIN_BATCHES)


In [None]:
# Accuracy over the whole test dataset (with a progress display)
acc = predictor.accuracy(test_dataset, show_progress=True)
print(f"Test accuracy: {acc:.4f}")

# The predictions themselves are available through `predict()`; show the
# first ten next to the first ten values of the "target" column
predictions = predictor.predict(test_dataset)
correct_labels = test_dataset.df["target"].to_list()
print("Model predictions: ", predictions[:10])
print("Correct labels: ", correct_labels[:10])