## Train an ORIGAMI model on the Dungeons dataset


In [1]:
import json

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from origami.datasets.dungeons import generate_data
from origami.preprocessing import (
    DocTokenizerPipe,
    PadTruncTokensPipe,
    SchemaParserPipe,
    TargetFieldPipe,
    TokenEncoderPipe,
    docs_to_df,
)

# Seed for the train/test split so that Restart & Run All reproduces the same partition.
# NOTE(review): generate_data() presumably draws random values as well; if it exposes a
# seed argument, pass SEED there too -- confirm against origami/datasets/dungeons.py.
SEED = 42

# generate Dungeons dataset (see origami/datasets/dungeons.py)
data = generate_data(
    num_instances=10_000,
    num_doors_range=(5, 10),
    num_colors=3,
    with_monsters=True,
    num_treasures=5,
    shuffle_doors=True,
)

# print the first generated document as an example
print(json.dumps(data[0], indent=2))

# load documents into a dataframe and hold out 20% of the rows as the test set
df = docs_to_df(data)
train_docs_df, test_docs_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=SEED,  # without this the split differs on every run
)

# name of the field the model is trained to predict
TARGET_FIELD = "treasure"

# Pipeline steps, kept in a dict so that the fitted (stateful) steps can be looked up
# by name after fitting.
pipes = {
    "schema": SchemaParserPipe(),
    "target": TargetFieldPipe(TARGET_FIELD),
    "tokenizer": DocTokenizerPipe(path_in_field_tokens=True),
    "padding": PadTruncTokensPipe(length="max"),
    "encoder": TokenEncoderPipe(),
}

# a single pipeline; the step order is spelled out explicitly
pipeline = Pipeline(
    [(name, pipes[name]) for name in ("schema", "target", "tokenizer", "padding", "encoder")]
)

# fit the pipeline on the training split only, then apply the fitted steps to the test split
train_df = pipeline.fit_transform(train_docs_df)
test_df = pipeline.transform(test_docs_df)

# get stateful objects from the fitted steps (used below for the model and the VPDA)
schema = pipes["schema"].schema
encoder = pipes["encoder"].encoder
# NOTE(review): presumably resolved from "max" to a concrete length during fit -- confirm
block_size = pipes["padding"].length

# print data stats
print(f"len train: {len(train_df)}, len test: {len(test_df)}")
print(f"vocab size {encoder.vocab_size}")
print(f"block size {block_size}")

{
  "door": 4,
  "key_color": "red",
  "corridor": [
    {
      "monsters": [
        "orc",
        "troll"
      ],
      "door_no": 0,
      "red_key": "spellbooks",
      "blue_key": "spellbooks",
      "green_key": "gold"
    },
    {
      "door_no": 1,
      "red_key": "artifacts",
      "blue_key": "gold",
      "green_key": "artifacts"
    },
    {
      "monsters": [
        "troll"
      ],
      "door_no": 2,
      "red_key": "diamonds",
      "blue_key": "spellbooks",
      "green_key": "spellbooks"
    },
    {
      "door_no": 3,
      "red_key": "artifacts",
      "blue_key": "spellbooks",
      "green_key": "gemstones"
    },
    {
      "monsters": [
        "wolf"
      ],
      "door_no": 4,
      "red_key": "spellbooks",
      "blue_key": "gold",
      "green_key": "gemstones"
    },
    {
      "door_no": 5,
      "red_key": "diamonds",
      "blue_key": "diamonds",
      "green_key": "diamonds"
    },
    {
      "monsters": [
        "wolf",
        "troll"
   

In [2]:
# create datasets, VPDA and model

from origami.model import ORIGAMI
from origami.model.vpda import ObjectVPDA
from origami.preprocessing import DFDataset
from origami.utils import ModelConfig, TrainConfig

def _with_overrides(config, **overrides):
    """Set every keyword argument as an attribute on ``config`` and return ``config``."""
    for attr_name, attr_value in overrides.items():
        setattr(config, attr_name, attr_value)
    return config


# model config: start from the "medium" preset, then adapt it to this dataset
model_config = _with_overrides(
    ModelConfig.from_preset("medium"),
    position_encoding="NONE",
    vocab_size=encoder.vocab_size,  # vocabulary size taken from the fitted token encoder
    block_size=block_size,  # sequence length taken from the padding step
)

# train config: default TrainConfig with a few overrides
train_config = _with_overrides(
    TrainConfig(),
    learning_rate=1e-3,
    n_warmup_batches=1000,
    print_every=100,
    eval_every=1000,
)

# dataset wrappers around the processed train/test dataframes
train_dataset = DFDataset(train_df)
test_dataset = DFDataset(test_df)

# the VPDA is built from the fitted encoder and the parsed schema and handed to the model
vpda = ObjectVPDA(encoder, schema)
model = ORIGAMI(model_config, train_config, vpda=vpda)


In [3]:
from origami.inference import Predictor
from origami.utils import make_progress_callback

# number of batches to train for, and the model event the progress callback is attached to
NUM_TRAIN_BATCHES = 5000
BATCH_END_EVENT = "on_batch_end"

# predictor bound to the model, the fitted token encoder and the target field
predictor = Predictor(model, encoder, TARGET_FIELD)

# build the progress callback from the train config, both datasets and the predictor ...
progress_callback = make_progress_callback(
    train_config,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    predictor=predictor,
)
# ... and register it on the model for the batch-end event
model.set_callback(BATCH_END_EVENT, progress_callback)

# train model (train and test accuracy should start to go towards 1.0 after ~3000 batches as loss drops below 0.6)
model.train_model(train_dataset, batches=NUM_TRAIN_BATCHES)

|  step: 0  |  epoch: 0  |  batch_num: 0  |  batch_dt: 0.00  |  batch_loss: 2.8099  |  lr: 1.01e-06  |  train_acc: 0.0000  |  test_loss: 2.7963  |  test_acc: 0.0000  |
|  step: 1  |  epoch: 1  |  batch_num: 100  |  batch_dt: 69.12  |  batch_loss: 1.2363  |  lr: 1.01e-04  |
|  step: 2  |  epoch: 2  |  batch_num: 200  |  batch_dt: 72.31  |  batch_loss: 0.7872  |  lr: 2.01e-04  |
|  step: 3  |  epoch: 3  |  batch_num: 300  |  batch_dt: 75.93  |  batch_loss: 0.6798  |  lr: 3.01e-04  |
|  step: 4  |  epoch: 5  |  batch_num: 400  |  batch_dt: 77.41  |  batch_loss: 0.6413  |  lr: 4.01e-04  |
|  step: 5  |  epoch: 6  |  batch_num: 500  |  batch_dt: 77.41  |  batch_loss: 0.6531  |  lr: 5.01e-04  |
|  step: 6  |  epoch: 7  |  batch_num: 600  |  batch_dt: 80.36  |  batch_loss: 0.6199  |  lr: 6.01e-04  |
|  step: 7  |  epoch: 8  |  batch_num: 700  |  batch_dt: 80.23  |  batch_loss: 0.6177  |  lr: 7.01e-04  |
|  step: 8  |  epoch: 10  |  batch_num: 800  |  batch_dt: 80.93  |  batch_loss: 0.6192  | 

In [4]:
# how many predictions / labels to show side by side
NUM_PREVIEW = 10

# accuracy of the predictor on the held-out test dataset (with a progress bar)
acc = predictor.accuracy(test_dataset, show_progress=True)
print(f"Test accuracy: {acc:.4f}")

# the raw predictions are available through the `predict()` method
predictions = predictor.predict(test_dataset)
# correct labels come from the "target" column of the test dataframe
correct_labels = test_dataset.df["target"].to_list()

print("Model predictions: ", predictions[:NUM_PREVIEW])
print("Correct labels: ", correct_labels[:NUM_PREVIEW])

Predicting:   0%|          | 0/85 [00:00<?, ?it/s]

Test accuracy: 0.9815
Model predictions:  ['spellbooks', 'gold', 'gold', 'spellbooks', 'spellbooks', 'gold', 'artifacts', 'spellbooks', 'gold', 'gemstones']
Correct labels:  ['spellbooks', 'gold', 'gold', 'spellbooks', 'spellbooks', 'gold', 'artifacts', 'spellbooks', 'gold', 'gemstones']
