In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

In [3]:
def build_coconet_model(time_steps, pitches, num_instruments, num_layers=64, num_channels=128):
    inputs = layers.Input(shape=(time_steps, pitches, num_instruments * 2))  # Includes mask as extra channels
    x = inputs

    for i in range(num_layers):
        x = layers.Conv2D(
            filters=num_channels,
            kernel_size=(3, 3),
            padding="same",
            activation="relu"
        )(x)
        if i % 2 == 1:  # Add residual connections every second layer
            x = layers.Add()([x, inputs if i == 1 else prev_x])
        prev_x = x

    # Final layer to predict pitch probabilities
    x = layers.Conv2D(
        filters=num_instruments, 
        kernel_size=(1, 1), 
        padding="same", 
        activation="softmax"
    )(x)

    model = models.Model(inputs, x)
    return model

In [4]:
def mask_piano_roll(piano_roll, mask_prob=0.5):
    """
    Applies random masking to a piano roll.
    """
    mask = np.random.rand(*piano_roll.shape[:-1]) > mask_prob
    masked_roll = np.where(mask[..., None], piano_roll, 0)
    return masked_roll, mask[..., None]  # Return masked roll and mask


In [5]:
def preprocess_data(piano_rolls, mask_prob):
    """
    Prepares input and target tensors with random masking.
    """
    inputs, targets = [], []
    for piano_roll in piano_rolls:
        masked_roll, mask = mask_piano_roll(piano_roll, mask_prob)
        # Concatenate mask and masked roll for input
        inputs.append(np.concatenate([masked_roll, mask], axis=-1))
        targets.append(piano_roll)
    return np.array(inputs), np.array(targets)

In [6]:
# Training Loop
def train_coconet(model, piano_rolls, epochs, batch_size, mask_prob_start, mask_prob_end):
    optimizer = Adam()
    loss_fn = CategoricalCrossentropy(from_logits=False)  # Using probabilities
    steps_per_epoch = len(piano_rolls) // batch_size

    for epoch in range(epochs):
        # Linearly anneal masking probability
        mask_prob = mask_prob_start + (mask_prob_end - mask_prob_start) * (epoch / epochs)
        print(f"Epoch {epoch+1}/{epochs} - Masking Probability: {mask_prob:.2f}")

        for step in range(steps_per_epoch):
            # Get batch
            batch = piano_rolls[step * batch_size: (step + 1) * batch_size]

            # Prepare inputs and targets
            inputs, targets = preprocess_data(batch, mask_prob)

            # Training step
            with tf.GradientTape() as tape:
                predictions = model(inputs, training=True)
                loss = loss_fn(targets, predictions)
            
            # Backpropagation
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            # Logging
            print(f"Step {step+1}/{steps_per_epoch}, Loss: {loss.numpy():.4f}")
        print("-" * 50)

In [7]:
# Example Dataset (Simulated)
def generate_dummy_data(num_samples, time_steps, pitches, num_instruments):
    """
    Generate a dummy piano roll dataset for testing.
    """
    return np.random.randint(
        0, 2, size=(num_samples, time_steps, pitches, num_instruments), dtype=np.int32
    )

In [8]:
# Hyperparameters
TIME_STEPS = 128      # Number of time steps (T)
PITCHES = 53          # MIDI pitches 36 to 88
NUM_INSTRUMENTS = 4   # Four-part harmony
NUM_LAYERS = 64       # Total number of convolutional layers
NUM_CHANNELS = 128    # Number of channels per layer
EPOCHS = 10           # Number of epochs
BATCH_SIZE = 32       # Batch size
MASK_PROB_START = 0.75  # Initial masking probability
MASK_PROB_END = 0.25    # Final masking probability
NUM_SAMPLES = 1000    # Number of dummy samples

# Generate dummy data
piano_rolls = generate_dummy_data(NUM_SAMPLES, TIME_STEPS, PITCHES, NUM_INSTRUMENTS)

# Build the model
model = build_coconet_model(TIME_STEPS, PITCHES, NUM_INSTRUMENTS, NUM_LAYERS, NUM_CHANNELS)
model.compile(optimizer="adam", loss="categorical_crossentropy")
model.summary()

# Train the model
train_coconet(model, piano_rolls, EPOCHS, BATCH_SIZE, MASK_PROB_START, MASK_PROB_END)

I0000 00:00:1731904765.429210   77665 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1133 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


ValueError: Inputs have incompatible shapes. Received shapes (128, 53, 128) and (128, 53, 8)