In [None]:
from pathlib import Path
from etl.canon import build_training_data

# Diagnostic dataset snapshot to convert. The project root is taken to be the
# parent of the current working directory, so this cell depends on where the
# kernel was started.
project_root = Path.cwd().parent
input_file = project_root.joinpath(
    "output", "2025-11-03", "diagnostic_dataset_2025-11-03_20-59-20.json"
)

print(f"Building training data from: {input_file}\n")
stats = build_training_data(str(input_file))

# Per-split sample counts reported by the ETL step.
print(f"✓ Created training data from {stats['total']} samples")
print(
    "\n".join(
        f"  - {label}: {stats[key]} samples"
        for label, key in (("Train", "train"), ("Val", "val"), ("Test", "test"))
    )
)

# Size of each vocabulary reported by the ETL step.
print("\n✓ Vocabulary sizes:")
print(
    "\n".join(
        f"  - {label}: {stats['vocab_sizes'][key]}"
        for label, key in (
            ("Symptoms", "symptoms"),
            ("Families", "families"),
            ("Subtypes", "subtypes"),
            ("Brands", "brands"),
            ("Diagnostics", "diagnostics"),
        )
    )
)

# Where the artifacts were written, as reported in `stats`.
print(f"\n✓ Output files written to: {stats['output_dir']}/")
print("  - train.jsonl, val.jsonl, test.jsonl")
print("  - vocabs.json")


## Optional: Run ETL Pipeline

Uncomment and run the code below if you need to rebuild the training data from the raw diagnostic dataset:

```python
# Example: Build training data from diagnostic dataset
# INPUT_FILE = "../../output/2025-11-03/diagnostic_dataset_2025-11-03_20-59-20.json"
# stats = build_training_data(INPUT_FILE, output_dir="data")
# 
# print(f"✓ Created training data from {stats['total']} samples")
# print(f"  - Train: {stats['train']} | Val: {stats['val']} | Test: {stats['test']}")
# print(f"  - Vocab sizes: {stats['vocab_sizes']}")
```


## Load Processed Data and Setup Dimensions


In [None]:
import json
import torch
from torch.utils.data import DataLoader
from dataloader import HVACDataset

# Locations of the ETL artifacts (data files are in etl/data/). The paths are
# relative, so they resolve against the kernel's current working directory.
VOCAB_PATH = "etl/data/vocabs.json"
TRAIN_PATH = "etl/data/train.jsonl"
VAL_PATH   = "etl/data/val.jsonl"
TEST_PATH  = "etl/data/test.jsonl"

# Load the vocabularies. The context manager closes the file handle as soon as
# parsing finishes instead of leaving an open handle for the garbage collector.
with open(VOCAB_PATH) as vocab_file:
    v = json.load(vocab_file)

# Each dimension is the number of entries in the corresponding vocabulary.
SYM_DIM = len(v["symptom2id"])
FAM_DIM = len(v["family2id"])
SUB_DIM = len(v["subtype2id"])
BR_DIM  = len(v["brand2id"])

# Total input width is the sum of the four feature vocabularies.
# NOTE(review): presumably the feature vector has one segment per vocabulary —
# confirm against HVACDataset.
INPUT_DIM = SYM_DIM + FAM_DIM + SUB_DIM + BR_DIM

# Number of prediction targets: one per diagnostic in the vocabulary.
NUM_CLASSES = len(v["diag2id"])

print(f"INPUT_DIM={INPUT_DIM}  NUM_CLASSES={NUM_CLASSES}")




In [None]:
# Wrap each JSONL split in an HVACDataset; all three share the vocab mapping `v`.
# The generator is consumed left to right, so the splits are built in the order
# train, val, test.
train_ds, val_ds, test_ds = (
    HVACDataset(split_path, v)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

print(f"rows: train={len(train_ds)} val={len(val_ds)} test={len(test_ds)}")

# Batch iterators for training and validation. Only the training loader
# shuffles; no loader is created for the test split in this cell.
train_dl = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
)
val_dl = DataLoader(
    val_ds,
    batch_size=64,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
)

## Create PyTorch Datasets and DataLoaders


In [None]:
# Pull the first mini-batch from the training loader and display the shape and
# dtype of its features (xb) and targets (yb) as the cell's output.
xb, yb = next(iter(train_dl))
(xb.shape, yb.shape, xb.dtype, yb.dtype)


INPUT_DIM=132  NUM_CLASSES=12


## Inspect Batch Shape and Dtype


rows: train=24 val=7 test=4


(torch.Size([24, 132]), torch.Size([24, 12]), torch.float32, torch.float32)