## Training an ORiGAMi model on the Dungeons dataset

The Dungeons dataset is a challenging, dungeons-themed synthetic dataset for supervised classification on
semi-structured data.

Each instance contains a corridor array with several rooms. Each room has a door number and contains multiple
treasure chests, each opened by a different-colored key. All but one of the treasures are fake, though.

The goal is to find the correct room number and key color in each dungeon based on some clues and return the
only non-fake treasure.

The clues are given at the top level of the object, in the fields `door` and `key_color`.

To make it even harder, the `corridor` array may be shuffled, and room objects may have a number of monsters as
their first field, shifting the token positions of the serialized object by a variable amount.

The following dictionary represents one example JSON instance:

```json
{
  "door": 1, // clue which door is the correct one
  "key_color": "blue", // clue which key is the correct one
  "corridor": [
    {
      "monsters": ["troll", "wolf"], // optional monsters in front of the door
      "door_no": 1, // door number in the corridor
      "red_key": "gemstones", // different keys return different treasures,
      "blue_key": "spellbooks", // but only one is real, the others are fake
      "green_key": "artifacts"
    },
    {
      // another room
      "door_no": 0, // rooms can be shuffled, here room 0 comes after 1
      "red_key": "diamonds",
      "blue_key": "gold",
      "green_key": "gemstones"
    }
    // ... more doors ...
  ],
  "treasure": "spellbooks" // correct treasure (target label)
}
```

The correct answer for this instance is "spellbooks", because the `door` is 1 and the `key_color` is "blue".


In [1]:
import json

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from origami.datasets.dungeons import generate_data
from origami.preprocessing import (
    DocTokenizerPipe,
    PadTruncTokensPipe,
    SchemaParserPipe,
    TargetFieldPipe,
    TokenEncoderPipe,
    docs_to_df,
)

# Synthesize the Dungeons documents (generator lives in origami/datasets/dungeons.py).
data = generate_data(
    num_instances=10_000,
    num_doors_range=(5, 10),
    num_colors=3,
    num_treasures=5,
    # monsters shift the token positions by a variable amount, which makes the task harder
    with_monsters=True,
    # rooms come in random order, which makes the task harder
    shuffle_rooms=True,
)

# Show the first generated document as pretty-printed JSON.
print(json.dumps(data[0], indent=2))

# Convert the documents to a DataFrame, then hold out 20% of the rows as the test set.
df = docs_to_df(data)
train_docs_df, test_docs_df = train_test_split(df, test_size=0.2, shuffle=True)

# Field the model is asked to predict.
TARGET_FIELD = "treasure"

# Preprocessing steps, keyed by name so the fitted ones can be looked up afterwards.
# The dict's insertion order is the order in which the pipeline applies the steps.
pipes = {
    "schema": SchemaParserPipe(),
    "target": TargetFieldPipe(TARGET_FIELD),
    "tokenizer": DocTokenizerPipe(path_in_field_tokens=True),
    "padding": PadTruncTokensPipe(length="max"),
    "encoder": TokenEncoderPipe(),
}
pipeline = Pipeline(list(pipes.items()))

# Fit the pipeline on the training split only; the test split is merely transformed.
train_df = pipeline.fit_transform(train_docs_df)
test_df = pipeline.transform(test_docs_df)

# Pull the fitted state out of the individual steps; later cells use these names.
schema = pipes["schema"].schema
encoder = pipes["encoder"].encoder
block_size = pipes["padding"].length

# Summary of the processed data.
print(f"len train: {len(train_df)}, len test: {len(test_df)}")
print(f"vocab size {encoder.vocab_size}")
print(f"block size {block_size}")

{
  "door": 1,
  "key_color": "green",
  "corridor": [
    {
      "door_no": 0,
      "red_key": "gemstones",
      "blue_key": "spellbooks",
      "green_key": "diamonds"
    },
    {
      "monsters": [
        "dragon",
        "orc"
      ],
      "door_no": 1,
      "red_key": "gemstones",
      "blue_key": "diamonds",
      "green_key": "gold"
    },
    {
      "monsters": [
        "troll"
      ],
      "door_no": 2,
      "red_key": "artifacts",
      "blue_key": "diamonds",
      "green_key": "spellbooks"
    },
    {
      "monsters": [
        "goblin"
      ],
      "door_no": 3,
      "red_key": "artifacts",
      "blue_key": "gemstones",
      "green_key": "spellbooks"
    },
    {
      "door_no": 4,
      "red_key": "gemstones",
      "blue_key": "gemstones",
      "green_key": "spellbooks"
    }
  ],
  "treasure": "gold"
}
len train: 8000, len test: 2000
vocab size 53
block size 148


In [2]:
# create datasets, VPDA and model

from origami.model import ORIGAMI
from origami.model.vpda import ObjectVPDA
from origami.preprocessing import DFDataset
from origami.utils import ModelConfig, TrainConfig

# Start from the "medium" preset (presets are defined in origami/utils/config.py)
# and size the model to the encoder's vocabulary and the padded sequence length.
model_config = ModelConfig.from_preset("medium")
model_config.vocab_size = encoder.vocab_size
model_config.block_size = block_size

# Default training settings, with the learning rate set to 1e-3.
train_config = TrainConfig()
train_config.learning_rate = 1e-3

# Wrap the preprocessed train and test DataFrames as datasets.
train_dataset, test_dataset = DFDataset(train_df), DFDataset(test_df)

# Build the VPDA from the encoder and schema, then create the model with both configs.
vpda = ObjectVPDA(encoder, schema)
model = ORIGAMI(model_config, train_config, vpda=vpda)


In [3]:
from origami.inference import Predictor
from origami.utils import make_progress_callback

# Predictor for the target field, built on the model and encoder.
predictor = Predictor(model, encoder, TARGET_FIELD)

# Build the progress callback and register it to run at the end of every batch.
progress_callback = make_progress_callback(
    train_config,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    predictor=predictor,
)
model.set_callback("on_batch_end", progress_callback)

# Train for 20000 batches. Train and test accuracy should start to go towards 1.0
# after ~3000 batches, as the loss drops below 0.6.
model.train_model(train_dataset, batches=20000)

|  step: 0  |  epoch: 0  |  batch_num: 0  |  batch_dt: 0.00  |  batch_loss: 2.5856  |  lr: 1.01e-06  |  train_acc: 0.2100  |  test_loss: 2.5782  |  test_acc: 0.1600  |
|  step: 1  |  epoch: 1  |  batch_num: 100  |  batch_dt: 122.21  |  batch_loss: 1.1130  |  lr: 1.01e-04  |
|  step: 2  |  epoch: 2  |  batch_num: 200  |  batch_dt: 130.67  |  batch_loss: 0.8074  |  lr: 2.01e-04  |
|  step: 3  |  epoch: 3  |  batch_num: 300  |  batch_dt: 127.39  |  batch_loss: 0.7669  |  lr: 3.01e-04  |
|  step: 4  |  epoch: 5  |  batch_num: 400  |  batch_dt: 115.76  |  batch_loss: 0.7419  |  lr: 4.01e-04  |
|  step: 5  |  epoch: 6  |  batch_num: 500  |  batch_dt: 119.28  |  batch_loss: 0.7220  |  lr: 5.01e-04  |
|  step: 6  |  epoch: 7  |  batch_num: 600  |  batch_dt: 116.48  |  batch_loss: 0.7229  |  lr: 6.01e-04  |
|  step: 7  |  epoch: 8  |  batch_num: 700  |  batch_dt: 118.20  |  batch_loss: 0.7161  |  lr: 7.01e-04  |
|  step: 8  |  epoch: 10  |  batch_num: 800  |  batch_dt: 117.42  |  batch_loss: 0.

In [None]:
# Accuracy on the held-out test set.
acc = predictor.accuracy(test_dataset, show_progress=True)
print(f"Test accuracy: {acc:.4f}")

# `predict()` gives access to the predictions themselves; show the first 10
# next to the first 10 entries of the dataset's "target" column.
predictions = predictor.predict(test_dataset)
true_labels = test_dataset.df["target"].to_list()
print("Model predictions: ", predictions[:10])
print("Correct labels: ", true_labels[:10])