# Learning MongoDB queries and index choices

## 1. Setup and Imports

In [1]:
import json
import random
from pathlib import Path

import torch

from origami import DataConfig, ModelConfig, OrigamiConfig, OrigamiPipeline, TrainingConfig

# Seed every random number generator used below so reruns are repeatable
SEED = 42
for _seed_rng in (random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Record the library version alongside the results
print(f"PyTorch version: {torch.__version__}")

PyTorch version: 2.9.1


## 2. Load and Explore the Data

The MongoDB workload dataset contains nested JSON query objects (`filter`, `sort`, `limit`, `projection`), each labeled with an `index_id` field that the model learns to predict.

In [12]:
# Load the train/test splits from JSONL files (one JSON document per line)
TRAIN_LIMIT = 4000  # only the first TRAIN_LIMIT training records are used


def load_jsonl(path, limit=None):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file.
        limit: Maximum number of records to return; ``None`` reads the whole file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Stop at the cap instead of parsing the remainder of the file.
            if limit is not None and len(records) >= limit:
                break
            # Tolerate blank lines (e.g. a stray newline at the end of the file).
            if line.strip():
                records.append(json.loads(line))
    return records


train_path = Path("../datasets/mongodb_workload_train.jsonl")
train_data = load_jsonl(train_path, limit=TRAIN_LIMIT)

test_path = Path("../datasets/mongodb_workload_test.jsonl")
test_data = load_jsonl(test_path)

print(f"Loaded {len(train_data)} train records")
print(f"Loaded {len(test_data)} test records")

print("\nSample record:")
print(json.dumps(train_data[0], indent=2))

Loaded 4000 train records
Loaded 2000 test records

Sample record:
{
  "filter": {
    "SCHEDULED_ARRIVAL": {
      "$lte": "2118"
    },
    "ORIGIN_AIRPORT": {
      "$gte": "TVC"
    },
    "SCHEDULED_DEPARTURE": {
      "$lte": "0534"
    }
  },
  "sort": {
    "SCHEDULED_DEPARTURE": 1
  },
  "limit": 819,
  "projection": {
    "SCHEDULED_ARRIVAL": 1,
    "CANCELLED": 1,
    "_id": 0
  },
  "index_id": 3
}


## 3. Training with Custom Configuration

`OrigamiConfig` with nested `ModelConfig`, `TrainingConfig`, and `DataConfig` lets you customize model architecture, training parameters, and preprocessing options.

In [21]:
from origami.training import accuracy

# Build each sub-configuration separately, then assemble them into one OrigamiConfig

# Model architecture
model_config = ModelConfig(
    d_model=128,  # Embedding dimension
    n_heads=4,  # Attention heads (must divide d_model)
    n_layers=4,  # Transformer layers
    d_ff=512,  # Feed-forward dimension
    dropout=0.0,  # Dropout rate
    use_continuous_head=True,
    # NOTE(review): a negative loss weight is unusual; confirm whether -1.0 is a
    # sentinel value understood by ModelConfig.
    continuous_loss_weight=-1.0,
)

# Optimisation schedule and in-training evaluation
training_config = TrainingConfig(
    batch_size=100,
    learning_rate=1e-3,
    num_epochs=50,
    warmup_steps=1000,
    shuffle_keys=True,  # Data augmentation via key order shuffling
    eval_strategy="steps",
    eval_steps=100,
    eval_metrics={"acc": accuracy},
    target_key="index_id",  # The label field of each workload record
    eval_sample_size=100,
    eval_on_train=True,  # Evaluate on training data as well
)

# Preprocessing
data_config = DataConfig(
    # NOTE(review): the previous comment here described a car dataset; this workload
    # has numeric fields such as `limit`, so confirm "scale" is the intended mode.
    numeric_mode="scale",
    max_vocab_size=2000,
)

custom_config = OrigamiConfig(
    model=model_config,
    training=training_config,
    data=data_config,
)

# Echo a few key settings as a sanity check
print("Custom configuration:")
print(f"  d_model: {custom_config.model.d_model}")
print(f"  n_layers: {custom_config.model.n_layers}")
print(f"  batch_size: {custom_config.training.batch_size}")
print(f"  shuffle_keys: {custom_config.training.shuffle_keys}")

Custom configuration:
  d_model: 128
  n_layers: 4
  batch_size: 100
  shuffle_keys: True


In [14]:
from origami.training import TableLogCallback

# Instantiate the pipeline from the custom configuration and fit it on the training split.
# Note: the test split is passed as `eval_data`, so the val_* metrics logged during
# training are computed on the same records that are evaluated after training.
pipeline = OrigamiPipeline(custom_config)
progress_table = TableLogCallback(print_every=10)
pipeline.fit(train_data, eval_data=test_data, callbacks=[progress_table])

# Report the size of the trained model
num_parameters = pipeline._model.get_num_parameters()
print("\nTraining complete!")
print(f"Model parameters: {num_parameters:,}")

| step: 10 | epoch: 0 | lr: 1.00e-05 | batch_dt:   86ms | loss: 4.9749 |
| step: 20 | epoch: 0 | lr: 2.00e-05 | batch_dt:   90ms | loss: 4.8637 |
| step: 30 | epoch: 0 | lr: 3.00e-05 | batch_dt:   86ms | loss: 4.7389 |
| step: 40 | epoch: 0 | lr: 4.00e-05 | batch_dt:   84ms | loss: 4.5851 |
| step: 50 | epoch: 1 | lr: 5.00e-05 | batch_dt:   87ms | loss: 4.3660 |
| step: 60 | epoch: 1 | lr: 6.00e-05 | batch_dt:   78ms | loss: 4.1497 |
| step: 70 | epoch: 1 | lr: 7.00e-05 | batch_dt:   86ms | loss: 3.9410 |
| step: 80 | epoch: 1 | lr: 8.00e-05 | batch_dt:   90ms | loss: 3.7794 |
| step: 90 | epoch: 2 | lr: 9.00e-05 | batch_dt:   84ms | loss: 3.5506 |
| step: 100 | epoch: 2 | lr: 1.00e-04 | batch_dt:   87ms | loss: 3.3752 | train_acc: 0.1600 | train_loss: 3.3737 | val_acc: 0.1200 | val_loss: 3.3927 |
| step: 110 | epoch: 2 | lr: 1.10e-04 | batch_dt:   82ms | loss: 3.1924 |
| step: 120 | epoch: 2 | lr: 1.20e-04 | batch_dt:   87ms | loss: 3.0812 |
| step: 130 | epoch: 3 | lr: 1.30e-04 | bat

In [15]:
# Score the trained pipeline on `test_data` using the accuracy metric
final_metrics = {"acc": accuracy}
pipeline.evaluate(test_data, metrics=final_metrics)

{'loss': 1.5894116503851754, 'acc': 0.868}

In [19]:
# Pass the whole test split to the trained pipeline's `embed_batch`.
# NOTE(review): the type/shape of the returned embeddings is not visible here; confirm before use.
embeddings = pipeline.embed_batch(test_data)