# MOAI-LLM Basic Usage

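This notebook demonstrates basic usage of MOAI-LLM:
1. Creating a model configuration
2. Initializing a model from scratch
3. Running a test forward pass
4. Comparing loss functions
5. Packing sequences
6. Running inference (requires a trained model and tokenizer)
7. Saving and loading a model

Tokenizer training, pre-training, and evaluation are run from scripts rather than in this notebook; see "Next steps" at the end.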

## Setup

In [None]:
import torch
from moai_llm.config import MoaiConfig, MoaiConfig3B
from moai_llm.modeling.model import MoaiForCausalLM
from moai_llm.losses import create_loss_function
from moai_llm.data import HierarchicalBalancePacker
from moai_llm.inference import MoaiInferencePipeline

# Report the library and accelerator details for this kernel session.
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Only the device at index 0 is described here.
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 1. Create Model Configuration

In [None]:
# Option 1: start from the predefined 3B configuration.
config = MoaiConfig3B()

# Option 2: build a custom configuration instead (disabled by default).
# config = MoaiConfig(
#     vocab_size=128000,
#     hidden_size=3840,
#     num_hidden_layers=28,
#     num_attention_heads=28,
#     num_key_value_heads=4,
# )

# Each row pairs the printed label with the config attribute it reports.
config_fields = (
    ("Hidden size", "hidden_size"),
    ("Layers", "num_hidden_layers"),
    ("Attention heads", "num_attention_heads"),
    ("KV heads", "num_key_value_heads"),
    ("Vocab size", "vocab_size"),
    ("Max sequence", "max_position_embeddings"),
)

print("Model Configuration:")
for field_label, field_name in config_fields:
    print(f"{field_label}: {getattr(config, field_name)}")

## 2. Initialize Model

In [None]:
# Instantiate the model described by `config`.
model = MoaiForCausalLM(config)

# Tally parameter counts in a single pass over the model's parameters.
total_params = 0
trainable_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print(f"Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
print(f"Trainable parameters: {trainable_params:,}")

# Print the module tree as the model summary.
print("\nModel architecture:")
print(model)

## 3. Test Forward Pass

In [None]:
# Build a dummy batch of random token ids for a smoke-test forward pass.
batch_size = 2
seq_length = 128

# A dedicated, seeded generator makes the sampled ids identical on every run
# without changing the global RNG state used by other cells.
# NOTE(review): no seed is set before the model is created, so the printed
# loss may still vary between runs if the weights are randomly initialized.
input_rng = torch.Generator().manual_seed(0)
input_ids = torch.randint(
    0, config.vocab_size, (batch_size, seq_length), generator=input_rng
)
# The same ids are passed as labels; the loss is read from the output below.
labels = input_ids.clone()

# Forward pass in evaluation mode, with gradient tracking disabled.
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, labels=labels)

print(f"Loss: {outputs.loss.item():.4f}")
print(f"Logits shape: {outputs.logits.shape}")
print(f"Expected shape: (batch_size={batch_size}, seq_len={seq_length}, vocab_size={config.vocab_size})")

## 4. Test Loss Functions

In [None]:
def report_loss(title, spec, sample_logits, sample_labels):
    """Build the loss described by `spec`, apply it to the samples, and print it.

    Args:
        title: Label printed in front of the loss value.
        spec: Configuration dict passed to `create_loss_function`.
        sample_logits: Logits handed to the loss as its first argument.
        sample_labels: Labels handed to the loss as its second argument.

    Returns:
        A `(loss_fn, value)` pair: the constructed loss and its result.
    """
    loss_fn = create_loss_function(spec)
    value = loss_fn(sample_logits, sample_labels)
    print(f"{title}: {value.item():.4f}")
    return loss_fn, value


# Random logits and labels, one row per token of the earlier dummy batch
# (`batch_size` and `seq_length` come from the forward-pass cell).
vocab_size = 1000
logits = torch.randn(batch_size * seq_length, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size * seq_length,))

# Evaluate every loss variant on the same logits/labels.
print("Loss Function Comparison:")
print("="*50)

# Cross-entropy
ce_loss, ce_value = report_loss(
    "Cross-Entropy",
    {"type": "cross_entropy"},
    logits,
    labels,
)

# Focal loss
focal_loss, focal_value = report_loss(
    "Focal Loss (γ=2.0)",
    {"type": "focal", "params": {"gamma": 2.0}},
    logits,
    labels,
)

# Label smoothing
smooth_loss, smooth_value = report_loss(
    "Label Smoothing (α=0.1)",
    {"type": "label_smoothing", "params": {"smoothing": 0.1}},
    logits,
    labels,
)

# Multi-objective: weighted mix configured through the three weights below.
multi_objective_params = {
    "ce_weight": 0.6,
    "focal_weight": 0.3,
    "smooth_weight": 0.1,
}
multi_loss, multi_value = report_loss(
    "Multi-Objective",
    {"type": "multi_objective", "params": multi_objective_params},
    logits,
    labels,
)

## 5. Test Sequence Packing

In [None]:
# Token budget of one packed batch. The packer and every statistic below read
# this single value so they cannot drift apart when it is changed.
MAX_SEQ_LENGTH = 512

# Create sample sequences of varying lengths
sequences = [
    list(range(100, 200)),      # Length 100
    list(range(200, 450)),      # Length 250
    list(range(450, 600)),      # Length 150
    list(range(600, 1100)),     # Length 500
    list(range(1100, 1300)),    # Length 200
    list(range(1300, 1450)),    # Length 150
    list(range(1450, 1850)),    # Length 400
    list(range(1850, 1950)),    # Length 100
]

# Computed once; reused for the summary and the efficiency figure.
total_tokens = sum(len(s) for s in sequences)

print(f"Original sequences: {len(sequences)}")
print(f"Sequence lengths: {[len(s) for s in sequences]}")
print(f"Total tokens: {total_tokens}")

# Create packer
packer = HierarchicalBalancePacker(
    max_seq_length=MAX_SEQ_LENGTH,
    num_bins=4,
)

# Pack sequences
packed = packer.pack(sequences, return_tensors=True)

# Capacity assumes every packed batch offers MAX_SEQ_LENGTH token slots.
total_capacity = len(packed) * MAX_SEQ_LENGTH

print(f"\nPacked batches: {len(packed)}")
print(f"Total capacity: {total_capacity}")
print(f"Efficiency: {total_tokens / total_capacity:.2%}")

# Show packing details
for batch_number, pack in enumerate(packed, start=1):
    used_tokens = sum(pack.sequence_lengths)
    print(f"\nBatch {batch_number}:")
    print(f"  Sequences packed: {pack.num_sequences}")
    print(f"  Sequence lengths: {pack.sequence_lengths}")
    print(f"  Total tokens: {used_tokens}")
    print(f"  Padding: {MAX_SEQ_LENGTH - used_tokens}")

## 6. Inference Example

Note: This cell requires a trained model and tokenizer. Uncomment the code below once both are available.

In [None]:
# This cell is intentionally disabled: it loads a model and tokenizer from disk.
# Uncomment when you have a trained model, and point `model_path` /
# `tokenizer_path` at your own artifacts.

# # Initialize inference pipeline
# pipeline = MoaiInferencePipeline(
#     model_path="outputs/moai-3b/final_model",
#     tokenizer_path="tokenizers/moai_tokenizer",
#     device="cuda" if torch.cuda.is_available() else "cpu",
# )

# # Simple generation
# text = pipeline.generate(
#     "Once upon a time",
#     max_new_tokens=100,
#     temperature=0.7,
# )
# print("Generated text:")
# print(text)

# # Chat example
# messages = [
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "What is machine learning?"},
# ]
# response = pipeline.chat(messages, max_new_tokens=200)
# print("\nChat response:")
# print(response)

## 7. Save and Load Model

In [None]:
import os
from pathlib import Path

# Create output directory (including missing parents, if the path is nested).
output_dir = Path("test_model")
output_dir.mkdir(parents=True, exist_ok=True)

# Save model and its configuration side by side in the same directory.
model.save_pretrained(output_dir)
config.save_pretrained(output_dir)
print(f"Model saved to: {output_dir}")

# Load model
loaded_model = MoaiForCausalLM.from_pretrained(output_dir)
print(f"Model loaded from: {output_dir}")

# Verify loaded model: first the cheap check on the parameter count.
loaded_params = sum(p.numel() for p in loaded_model.parameters())
print(f"Loaded model parameters: {loaded_params:,}")
print(f"Match original: {loaded_params == total_params}")

# Matching counts cannot reveal weights that were re-initialized or altered
# during the round trip, so also compare every tensor in the state dicts.
original_state = model.state_dict()
loaded_state = loaded_model.state_dict()
weights_match = set(original_state) == set(loaded_state) and all(
    original_state[name].shape == loaded_state[name].shape
    and original_state[name].dtype == loaded_state[name].dtype
    # Compare on CPU so tensors living on different devices do not raise.
    and torch.equal(original_state[name].cpu(), loaded_state[name].cpu())
    for name in original_state
)
print(f"Weights match: {weights_match}")

## Summary

This notebook demonstrated:
- ✅ Creating model configuration
- ✅ Initializing MOAI-LLM model
- ✅ Testing forward pass
- ✅ Comparing loss functions
- ✅ Using sequence packing
- ✅ Saving and loading models

Next steps:
1. Train a tokenizer: `python scripts/train_tokenizer.py`
2. Pre-train the model: `python scripts/pretrain.py`
3. Evaluate: `python scripts/evaluate.py`
4. Use inference pipeline for generation