In [13]:
import sys
import numpy as np
from pathlib import Path
import torchopt
import posteriors

# Add paths for importing utilities and models: the working directory, its
# parent, and the parent's "baselines" folder.
# Each entry is appended only when it is not already on sys.path, so re-running
# this cell does not keep growing sys.path with duplicate entries.
current_dir = Path.cwd()
for _import_root in (current_dir, current_dir.parent, current_dir.parent / "baselines"):
    _import_root = str(_import_root)
    if _import_root not in sys.path:
        sys.path.append(_import_root)


from src.nanogpt_utils import load_model, load_tokenizer, encode, decode
from src.bayesian_utils import create_training_batches, run_bayesian_pipeline
from config import CONFIG, MODEL_PATH, META_PATH, DATA_DIR

# Fail fast when a required artifact is missing.
# An explicit exception is raised instead of using `assert`, so the check is
# not stripped when Python runs with -O (e.g. if the notebook is exported to a
# script). Each entry is wrapped in Path() so the check works whether config
# exposes Path objects or plain strings; later cells wrap MODEL_PATH and
# META_PATH in Path() the same way.
for path in [MODEL_PATH, META_PATH, DATA_DIR]:
    if not Path(path).exists():
        raise FileNotFoundError(f"Path {path} does not exist.")

#### Load pre-trained baseline model

In [14]:
# Restore the pre-trained baseline; load_model returns the model together with
# its checkpoint object.
model, checkpoint = load_model(Path(MODEL_PATH))

# Tokenizer lookup tables (presumably string->index and index->string — confirm
# against load_tokenizer); the vocabulary size is the number of entries in `itos`.
stoi, itos = load_tokenizer(Path(META_PATH))
vocab_size = len(itos)

# Name -> parameter mapping of the model, extracted for use with posteriors.
params = {name: param for name, param in model.named_parameters()}

# A vocabulary of exactly 65 entries is reported as the character-level model.
if vocab_size == 65:
    print("Running character-level model")

Model arguments: {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'block_size': 256, 'bias': False, 'vocab_size': 65, 'dropout': 0.2}
number of parameters: 10.65M
Model loaded successfully!
Number of parameters: 10,745,088
Running character-level model


#### Prepare training data

In [15]:
# Open the training split as a read-only memory map of uint16 values.
train_data_path = DATA_DIR / 'train.bin'
data = np.memmap(str(train_data_path), dtype=np.uint16, mode='r')

# Build the batches used for Bayesian inference from the configured batch size,
# sequence length, and sample count.
training_batches = create_training_batches(
    data, CONFIG['batch_size'], CONFIG['max_seq_length'], CONFIG['train_samples']
)

# Report how many batches were built and the shapes of the first batch's
# two leading elements (inputs and targets).
print(
    f"Created {len(training_batches)} training batches",
    f"Batch shape: {training_batches[0][0].shape}",
    f"Target shape: {training_batches[0][1].shape}",
    sep="\n",
)

# Number of data points for posteriors.
# NOTE(review): this is the configured sample count, not a value derived from
# `training_batches`. The recorded output shows 32 batches of 16 alongside
# 500 samples — confirm the two agree.
num_data = CONFIG['train_samples']
print(f"Total training samples: {num_data}")
    

Created 32 training batches
Batch shape: torch.Size([16, 128])
Target shape: torch.Size([16, 1])
Total training samples: 500


#### Set Up Variational Inference

In [None]:
# Minimal smoke-test settings: a copy of CONFIG limited to one epoch over
# 32 samples (2 batches if batch_size is 16, as the earlier batch-shape output
# suggests).
# NOTE(review): neither `test_config` nor `test_batches` is consumed by any
# cell visible in this notebook; the pipeline call that follows uses
# `training_batches`. Confirm whether this cell is still needed.
test_config = CONFIG.copy()
test_config.update({'num_epochs': 1, 'train_samples': 32})

# Build the reduced batch set from the same memory-mapped data.
test_batches = create_training_batches(
    data, test_config['batch_size'], test_config['max_seq_length'], test_config['train_samples']
)

print(f"Test run with {len(test_batches)} batches, {test_config['num_epochs']} epoch")
print("This should complete quickly if the fix is working...")

In [26]:
# Run the Bayesian pipeline with the 'vi' method on `training_batches`.
# use_wandb=False presumably disables Weights & Biases logging — confirm
# against run_bayesian_pipeline.
state_vi, metrics_vi, eval_vi = run_bayesian_pipeline(
    training_batches,
    'vi',
    use_wandb=False,
)


Setting up VI sampler
VI configured with:
- Learning rate: 5e-06
- Temperature: 0.001
- Samples per update: 1

Starting Bayesian Training with VI
Configuration:
  - Epochs: 3
  - Batches per epoch: 32
  - Total iterations: 96

Epoch 1/3
------------------------------------------------------------
VI configured with:
- Learning rate: 5e-06
- Temperature: 0.001
- Samples per update: 1

Starting Bayesian Training with VI
Configuration:
  - Epochs: 3
  - Batches per epoch: 32
  - Total iterations: 96

Epoch 1/3
------------------------------------------------------------
NLL computed successfully: 67.66502380371094
NLL computed successfully: 67.66502380371094
Log prior computed successfully: -15255060.0
Log posterior computed successfully: -30577.783203125
Log prior computed successfully: -15255060.0
Log posterior computed successfully: -30577.783203125
NLL computed successfully: 1.8471720218658447
NLL computed successfully: 1.8471720218658447
Log prior computed successfully: -9881134.0
L

KeyboardInterrupt: 

#### Run Bayesian Training

#### 8. Compare Deterministic vs Bayesian Predictions