# miDiKompanion ML Model Training

This notebook trains the five-model pipeline for miDiKompanion on synthetically generated data:

1. **EmotionRecognizer**: Audio features (128-dim) → Emotion embedding (64-dim)
2. **MelodyTransformer**: Emotion embedding (64-dim) → MIDI probabilities (128-dim)
3. **HarmonyPredictor**: Context (128-dim) → Chord probabilities (64-dim)
4. **DynamicsEngine**: Intensity (32-dim) → Expression parameters (16-dim)
5. **GroovePredictor**: Arousal (64-dim) → Groove parameters (32-dim)

## Instructions

1. Set runtime to GPU: **Runtime → Change runtime type → GPU**
2. Run all cells in order
3. Download the trained models from the `models/onnx` folder

Estimated training time: **8-14 hours** on Colab T4 GPU

In [None]:
# Install dependencies
# The leading "!" hands each line to the shell from the notebook; "-q" keeps
# pip's output quiet.
!pip install -q torch torchvision torchaudio
!pip install -q onnx onnxruntime
!pip install -q numpy pandas scikit-learn tqdm

print("Dependencies installed!")

In [None]:
# Check GPU availability
import torch

# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of device 0.
has_cuda = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Model configurations
from dataclasses import dataclass
from typing import List

@dataclass
class ModelConfig:
    """Architecture and training hyperparameters for one model of the pipeline."""

    # Display name; also selects the synthetic target generator and, lowercased,
    # names the checkpoint and ONNX files.
    name: str
    # Width of the input vector (in_features of the first Linear layer).
    input_size: int
    # Width of the output vector (out_features of the last Linear layer).
    output_size: int
    # Widths of the hidden Linear layers, in order from input to output.
    hidden_sizes: List[int]
    # Name of the activation function; defaults to 'relu'.
    activation: str = 'relu'
    # Dropout probability applied after each hidden layer.
    dropout: float = 0.1
    # Learning rate handed to the AdamW optimizer.
    learning_rate: float = 0.001
    # Batch size used for the training and validation DataLoaders.
    batch_size: int = 32

# One row per pipeline stage:
# (registry key, display name, input width, output width,
#  hidden-layer widths, dropout probability).
# Fields not listed here keep the ModelConfig defaults.
MODEL_CONFIGS = {
    registry_key: ModelConfig(
        name=display_name,
        input_size=n_inputs,
        output_size=n_outputs,
        hidden_sizes=hidden_widths,
        dropout=dropout_rate,
    )
    for registry_key, display_name, n_inputs, n_outputs, hidden_widths, dropout_rate in (
        ('emotion_recognizer', 'EmotionRecognizer', 128, 64, [256, 128, 96], 0.2),
        ('melody_transformer', 'MelodyTransformer', 64, 128, [128, 192, 160], 0.15),
        ('harmony_predictor', 'HarmonyPredictor', 128, 64, [192, 128, 96], 0.1),
        ('dynamics_engine', 'DynamicsEngine', 32, 16, [64, 48, 32], 0.1),
        ('groove_predictor', 'GroovePredictor', 64, 32, [96, 64, 48], 0.1),
    )
}

# Echo each model's input/output widths as a quick sanity check.
print("Model configurations loaded!")
for key, config in MODEL_CONFIGS.items():
    print(f"  {config.name}: {config.input_size} → {config.output_size}")

In [None]:
# Define model architecture
import torch.nn as nn

class MiDiKompanionModel(nn.Module):
    """Fully connected network shared by every model in the pipeline.

    For each entry of ``config.hidden_sizes`` the network stacks
    ``Linear -> BatchNorm1d -> activation -> Dropout``; a final ``Linear``
    layer then projects to ``config.output_size``. No activation is applied
    to the output.

    The activation is chosen by ``config.activation`` (case-insensitive);
    see ``_ACTIVATIONS`` for the supported names. Activation modules carry no
    parameters, so the ``state_dict`` layout does not depend on the choice.

    Note: ``BatchNorm1d`` cannot normalise a batch containing a single sample
    while the module is in training mode; call ``eval()`` before running
    single-sample inference.

    Args:
        config: Object exposing ``input_size``, ``output_size``,
            ``hidden_sizes``, ``activation`` and ``dropout``.

    Raises:
        ValueError: If ``config.activation`` is not a supported name.
    """

    # Activation name -> module class.
    _ACTIVATIONS = {
        'relu': nn.ReLU,
        'leaky_relu': nn.LeakyReLU,
        'elu': nn.ELU,
        'gelu': nn.GELU,
        'tanh': nn.Tanh,
        'sigmoid': nn.Sigmoid,
    }

    def __init__(self, config: "ModelConfig"):
        super().__init__()
        self.config = config

        activation_name = str(config.activation).lower()
        try:
            activation_cls = self._ACTIVATIONS[activation_name]
        except KeyError:
            supported = ', '.join(sorted(self._ACTIVATIONS))
            raise ValueError(
                f"Unsupported activation {config.activation!r}; "
                f"expected one of: {supported}"
            ) from None

        layers = []
        prev_size = config.input_size

        for hidden_size in config.hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.BatchNorm1d(hidden_size))
            layers.append(activation_cls())
            layers.append(nn.Dropout(config.dropout))
            prev_size = hidden_size

        # Output projection: raw values, no activation.
        layers.append(nn.Linear(prev_size, config.output_size))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, output_size)``."""
        return self.network(x)

# Test model creation
# The smoke test feeds a single sample. BatchNorm1d cannot compute batch
# statistics from one sample in training mode (PyTorch raises ValueError),
# so switch to eval mode and skip gradient tracking for this check.
test_model = MiDiKompanionModel(MODEL_CONFIGS['emotion_recognizer'])
test_model.eval()
test_input = torch.randn(1, 128)
with torch.no_grad():
    test_output = test_model(test_input)
print(f"Test: {test_input.shape} → {test_output.shape}")
print("Model architecture OK!")

In [None]:
# Synthetic dataset generation
import numpy as np
from torch.utils.data import Dataset, DataLoader

class SyntheticDataset(Dataset):
    """Synthetic (input, target) pairs for one model of the pipeline.

    Inputs are standard-normal vectors of length ``config.input_size``.
    Targets are derived from the inputs by a rule selected through
    ``config.name``; an unrecognised name gets standard-normal noise targets.

    All random draws come from a private ``np.random.RandomState(seed)``, so
    constructing a dataset is reproducible and does not disturb NumPy's
    global random state. The stream is the same one that
    ``np.random.seed(seed)`` followed by the legacy ``np.random.*`` functions
    would produce.

    Args:
        config: Object exposing ``name``, ``input_size`` and ``output_size``.
        num_samples: Number of (input, target) pairs to generate.
        seed: Seed for the private random generator.

    Attributes:
        inputs: float32 tensor of shape ``(num_samples, config.input_size)``.
        targets: float32 tensor of shape ``(num_samples, config.output_size)``.
    """

    def __init__(self, config: "ModelConfig", num_samples: int = 10000, seed: int = 42):
        self.config = config
        self.num_samples = num_samples

        # Private generator: avoids reseeding the process-wide NumPy RNG.
        self._rng = np.random.RandomState(seed)
        self.inputs, self.targets = self._generate_data()

    def _generate_data(self):
        """Draw the inputs and build the matching targets as float32 tensors."""
        inputs = self._rng.randn(self.num_samples, self.config.input_size).astype(np.float32)

        generators = {
            'EmotionRecognizer': self._generate_emotion_targets,
            'MelodyTransformer': self._generate_melody_targets,
            'HarmonyPredictor': self._generate_harmony_targets,
            'DynamicsEngine': self._generate_dynamics_targets,
            'GroovePredictor': self._generate_groove_targets,
        }
        generator = generators.get(self.config.name)
        if generator is None:
            # Unknown model name: fall back to pure-noise targets.
            targets = self._rng.randn(self.num_samples, self.config.output_size).astype(np.float32)
        else:
            targets = generator(inputs)

        return (
            torch.as_tensor(inputs, dtype=torch.float32),
            torch.as_tensor(targets, dtype=torch.float32),
        )

    def _generate_emotion_targets(self, inputs):
        """Targets in [-1, 1]: four base values from input-block means, the rest derived.

        Columns 0-3 are tanh of the mean over input columns 0-31, 32-63,
        64-95 and 96+ (column 3 additionally takes the absolute value).
        Every later column is a base column scaled by ``sin(i * 0.5)`` plus
        Gaussian noise with standard deviation 0.1.
        """
        targets = np.zeros((self.num_samples, self.config.output_size), dtype=np.float32)
        targets[:, 0] = np.tanh(np.mean(inputs[:, :32], axis=1))
        targets[:, 1] = np.tanh(np.mean(inputs[:, 32:64], axis=1))
        targets[:, 2] = np.tanh(np.mean(inputs[:, 64:96], axis=1))
        targets[:, 3] = np.abs(np.tanh(np.mean(inputs[:, 96:], axis=1)))
        for i in range(4, self.config.output_size):
            noise = self._rng.randn(self.num_samples) * 0.1
            targets[:, i] = targets[:, i % 4] * np.sin(i * 0.5) + noise
        return np.clip(targets, -1, 1)

    def _generate_melody_targets(self, inputs):
        """Row-normalised Gaussian bump over output indices, one bump per sample.

        Input column 0 sets the bump centre (``60 + 12 * x``, clipped to
        [48, 84]) and column 1 its width (``6 + 12 * x``, clipped to
        [6, 24]); both are truncated to integers.
        """
        valence = inputs[:, 0]
        arousal = inputs[:, 1]
        # astype(int64) truncates toward zero, like int() on each element.
        centers = np.clip(60 + (valence * 12), 48, 84).astype(np.int64)
        spreads = np.clip(6 + arousal * 12, 6, 24).astype(np.int64)

        # Broadcast (num_samples, 1) against (1, output_size) instead of
        # looping over every sample/note pair in Python.
        notes = np.arange(self.config.output_size)
        dist = notes[np.newaxis, :] - centers[:, np.newaxis]
        targets = np.exp(-(dist ** 2) / (2.0 * spreads[:, np.newaxis] ** 2)).astype(np.float32)
        return targets / (targets.sum(axis=1, keepdims=True) + 1e-8)

    def _generate_harmony_targets(self, inputs):
        """Row-normalised uniform noise whose two halves are scaled by input column 0.

        For a positive value ``v`` the first half is scaled by
        ``0.5 + 0.5 * v`` and the second by ``0.5 - 0.3 * v``; otherwise the
        scales are ``0.5 + 0.3 * v`` and ``0.5 - 0.5 * v``.
        """
        # NOTE(review): for |v| > 5/3 one of the scales is negative, so the
        # normalised rows can contain negative entries - confirm whether the
        # scales should be clipped at zero.
        targets = np.zeros((self.num_samples, self.config.output_size), dtype=np.float32)
        half = self.config.output_size // 2
        valence = inputs[:, 0].astype(np.float64)
        positive = valence > 0

        first_scale = np.where(positive, 0.5 + valence * 0.5, 0.5 + valence * 0.3)
        second_scale = np.where(positive, 0.5 - valence * 0.3, 0.5 - valence * 0.5)

        # A single row-major draw gives each row its first-half values
        # followed by its second-half values, i.e. the same draw order as
        # sampling the two halves one sample at a time.
        noise = self._rng.rand(self.num_samples, self.config.output_size)
        targets[:, :half] = noise[:, :half] * first_scale[:, np.newaxis]
        targets[:, half:] = noise[:, half:] * second_scale[:, np.newaxis]
        return targets / (targets.sum(axis=1, keepdims=True) + 1e-8)

    def _generate_dynamics_targets(self, inputs):
        """Targets in [0, 1] driven by the mean of each input row.

        Column 0 is ``0.3 + 0.5 * mean`` and column 1 is
        ``0.1 + 0.2 * |mean|``; column ``i >= 2`` repeats column ``i % 2``
        scaled by ``1 - i / 20``. Clipping to [0, 1] happens last.
        """
        targets = np.zeros((self.num_samples, self.config.output_size), dtype=np.float32)
        intensity = np.mean(inputs, axis=1)
        targets[:, 0] = 0.3 + intensity * 0.5
        targets[:, 1] = 0.1 + np.abs(intensity) * 0.2
        for i in range(2, self.config.output_size):
            targets[:, i] = targets[:, i % 2] * (1 - i / 20)
        return np.clip(targets, 0, 1)

    def _generate_groove_targets(self, inputs):
        """Targets in [0, 1] driven by the mean of the first 32 input columns.

        Column 0 is ``0.5 + 0.3 * mean`` and column 1 is
        ``0.2 + 0.3 * |mean|``; every later column is uniform noise scaled by
        ``0.3 + 0.4 * |mean|``.
        """
        targets = np.zeros((self.num_samples, self.config.output_size), dtype=np.float32)
        arousal = np.mean(inputs[:, :32], axis=1)
        targets[:, 0] = 0.5 + arousal * 0.3
        targets[:, 1] = 0.2 + np.abs(arousal) * 0.3
        noise_scale = 0.3 + np.abs(arousal) * 0.4
        for i in range(2, self.config.output_size):
            targets[:, i] = self._rng.rand(self.num_samples) * noise_scale
        return np.clip(targets, 0, 1)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

print("Dataset class defined!")

In [None]:
# Training function
import torch.optim as optim
from tqdm import tqdm
import time

def train_model(model, config, train_loader, val_loader, epochs, device):
    """Train ``model`` with AdamW, cosine LR annealing and MSE loss.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    validation loss improves, the weights are saved to
    ``'<config.name lowercased>_best.pt'`` in the current working directory.
    The first epoch always writes that file, so the checkpoint loaded at the
    end is guaranteed to come from this run even if the validation loss is
    never finite. On return the model holds the checkpointed weights.

    Epoch losses are averaged per sample (each batch's mean loss is weighted
    by its batch size), so a short final batch does not skew the result.

    Args:
        model: The ``nn.Module`` to train; moved to ``device`` in place.
        config: Object exposing ``name`` and ``learning_rate``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of passes over ``train_loader``; must be >= 1.
        device: Device on which training runs.

    Returns:
        Dict with keys ``'train_loss'`` and ``'val_loss'``, each a list with
        one float per epoch.

    Raises:
        ValueError: If ``epochs`` < 1 or either loader yields no batches.
    """
    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0.01)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.MSELoss()
    checkpoint_path = f'{config.name.lower()}_best.pt'

    history = {'train_loss': [], 'val_loss': []}
    best_val_loss = float('inf')
    checkpoint_written = False

    model.to(device)

    for epoch in range(epochs):
        # Training
        model.train()
        train_total, train_count = 0.0, 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            batch_len = inputs.size(0)
            train_total += loss.item() * batch_len
            train_count += batch_len

        if train_count == 0:
            raise ValueError("train_loader produced no samples")

        scheduler.step()

        # Validation
        model.eval()
        val_total, val_count = 0.0, 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                batch_len = inputs.size(0)
                val_total += loss.item() * batch_len
                val_count += batch_len

        if val_count == 0:
            raise ValueError("val_loader produced no samples")

        train_loss = train_total / train_count
        val_loss = val_total / val_count

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

        # A NaN validation loss compares False here, so it never becomes
        # the "best" value.
        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss
        if improved or not checkpoint_written:
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True

        if (epoch + 1) % 10 == 0:
            print(f'  Epoch {epoch+1}/{epochs}: train={train_loss:.6f}, val={val_loss:.6f}')

    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return history

print("Training function defined!")

In [None]:
# Train all models
import os

# Exported ONNX files are written to this directory later in the notebook.
os.makedirs('models/onnx', exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Training on: {device}")

EPOCHS = 100  # Increase for production training
NUM_SAMPLES = 50000
VAL_SPLIT = 0.1  # fraction of each dataset held out for validation

# Per-model summary: final/best losses and wall-clock training time.
results = {}

for model_key, config in MODEL_CONFIGS.items():
    print(f"\n{'='*50}")
    print(f"Training {config.name}...")
    print(f"  Input: {config.input_size}, Output: {config.output_size}")

    # Create dataset
    dataset = SyntheticDataset(config, num_samples=NUM_SAMPLES)

    val_size = int(len(dataset) * VAL_SPLIT)
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # BatchNorm1d raises in training mode when a batch holds a single sample,
    # so drop the trailing training batch only when it would have size 1.
    # With any other remainder the loader behaves exactly as without drop_last.
    lone_sample_batch = train_size % config.batch_size == 1
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=lone_sample_batch,
    )
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

    # Create and train model
    model = MiDiKompanionModel(config)
    start_time = time.time()

    history = train_model(model, config, train_loader, val_loader, epochs=EPOCHS, device=device)

    train_time = time.time() - start_time
    print(f"  Training time: {train_time:.1f}s")

    results[model_key] = {
        'final_train_loss': history['train_loss'][-1],
        'final_val_loss': history['val_loss'][-1],
        'best_val_loss': min(history['val_loss']),
        'train_time': train_time
    }

print("\n" + "="*50)
print("Training complete!")

In [None]:
# Export to ONNX
print("Exporting models to ONNX...")

for model_key, config in MODEL_CONFIGS.items():
    # Rebuild the architecture on CPU and restore the best checkpoint
    # written during training.
    model = MiDiKompanionModel(config)
    best_weights = torch.load(f'{config.name.lower()}_best.pt', map_location='cpu')
    model.load_state_dict(best_weights)
    model.eval()

    # A single-row example input for the export call; the batch axis is
    # declared dynamic below.
    dummy_input = torch.randn(1, config.input_size)
    onnx_path = f'models/onnx/{config.name.lower()}.onnx'

    torch.onnx.export(
        model,
        dummy_input,
        onnx_path,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
        opset_version=11,
        do_constant_folding=True
    )

    size_kb = os.path.getsize(onnx_path) / 1024
    print(f"  {config.name}: {size_kb:.1f} KB")

# Total size
exported_paths = [f'models/onnx/{c.name.lower()}.onnx' for c in MODEL_CONFIGS.values()]
total_size = sum(map(os.path.getsize, exported_paths))
print(f"\nTotal size: {total_size/1024:.1f} KB ({total_size/1024/1024:.2f} MB)")
print("✓ Export complete!")

In [None]:
# Verify models
import onnx
from onnxruntime import InferenceSession

print("Verifying exported models...")

for model_key, config in MODEL_CONFIGS.items():
    onnx_path = f'models/onnx/{config.name.lower()}.onnx'

    # Load and check
    onnx_model = onnx.load(onnx_path)
    onnx.checker.check_model(onnx_model)

    # Run inference
    session = InferenceSession(onnx_path)
    input_name = session.get_inputs()[0].name

    test_input = np.random.randn(1, config.input_size).astype(np.float32)

    # Warm-up call, kept out of the timings because the first run may
    # include one-time setup work.
    output = session.run(None, {input_name: test_input})

    # Benchmark
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can be too coarse for
    # sub-millisecond intervals.
    times = []
    for _ in range(100):
        start = time.perf_counter()
        output = session.run(None, {input_name: test_input})
        times.append((time.perf_counter() - start) * 1000)

    avg_time = np.mean(times)
    # Flag models whose average single-sample latency is 10 ms or more.
    status = "✓" if avg_time < 10 else "⚠"

    print(f"  {status} {config.name}: {avg_time:.3f}ms (output shape: {output[0].shape})")

print("\n✓ All models verified!")

In [None]:
# Download models
from google.colab import files
import zipfile

# Bundle every exported ONNX file at the root of one archive, then hand the
# archive to Colab's download helper.
archive_name = 'midikompanion_models.zip'

with zipfile.ZipFile(archive_name, 'w') as zipf:
    for model_key, config in MODEL_CONFIGS.items():
        member_name = f'{config.name.lower()}.onnx'
        onnx_path = f'models/onnx/{member_name}'
        # Store under the bare file name so the archive has no directories.
        zipf.write(onnx_path, member_name)

print("Models packaged in midikompanion_models.zip")
print("Downloading...")

files.download(archive_name)

## Results Summary

After training, unzip the downloaded archive into the `models/onnx` directory of your miDiKompanion installation:

```bash
unzip midikompanion_models.zip -d /path/to/miDiKompanion/models/onnx/
```

The models will be automatically loaded when you run miDiKompanion.