In [None]:
# # %%
# !rm -rf /content/*

# # %%
# import gc
# import torch
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
!pip install music21 --quiet

In [None]:
!git clone https://github.com/jukedeck/nottingham-dataset.git --quiet

In [None]:
!pip install torch scikit-learn --quiet

---
# Task 1 symbolic conditioned generation (Nottingham)
---

## 1. Multi-instrument REMI modeling

In [None]:
import os
from music21 import converter, note, chord
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
import time

def parse_multi_instrument_sequences(folder_path, max_files=14):
    """Parse ABC tunes into aligned melody / chord-root / bass note-name sequences.

    Files ending in ``.abc`` are visited in sorted order until ``max_files`` tunes
    have been accepted. A tune is accepted when, after truncating the three streams
    to their common length, at least 16 events remain.

    Args:
        folder_path: Directory containing the ``.abc`` files.
        max_files: Maximum number of accepted tunes to return.

    Returns:
        Tuple ``(melody_seqs, chord_seqs, bass_seqs)`` of equally long lists; entry
        ``k`` of each list belongs to the same tune and has the same length.
    """
    melody_seqs, chord_seqs, bass_seqs = [], [], []
    files = sorted([f for f in os.listdir(folder_path) if f.endswith(".abc")])
    print(f"Found {len(files)} files. Parsing up to {max_files} files...")

    for f in files:
        if len(melody_seqs) >= max_files:
            break
        try:
            start_time = time.time()
            score = converter.parse(os.path.join(folder_path, f))
            # The check runs after parsing finished: it discards slow files, it does
            # not interrupt the parse itself.
            if time.time() - start_time > 20:
                print(f"⏱️ Skipping {f} — parsing took too long")
                continue

            melody = [n.nameWithOctave for n in score.flat.notes if isinstance(n, note.Note)]

            # Chordify once and reuse the result for both chord roots and bass notes
            # (the chordified score was previously rebuilt for each of the two lists).
            chord_events = list(score.chordify().flat.getElementsByClass('Chord'))
            chords = [c.root().name for c in chord_events]
            bass = []
            for c in chord_events:
                lowest = c.bass()
                # Fall back to the root when no bass pitch is reported.
                bass.append((lowest if lowest else c.root()).nameWithOctave)

            min_len = min(len(melody), len(chords), len(bass))
            if min_len >= 16:
                melody_seqs.append(melody[:min_len])
                chord_seqs.append(chords[:min_len])
                bass_seqs.append(bass[:min_len])
            else:
                print(f"⚠️ {f} skipped: not enough notes")
        except Exception as e:
            # Best effort: one unparsable tune must not abort the whole corpus scan.
            print(f"⚠️ Error parsing {f}: {e}")
            continue

    print(f"✅ Parsed {len(melody_seqs)} sequences.")
    return melody_seqs, chord_seqs, bass_seqs

In [None]:
melody_seqs, chord_seqs, bass_seqs = parse_multi_instrument_sequences("/content/nottingham-dataset/ABC_cleaned")

In [None]:
# Data augmentation: transpose sequences up/down 1 or 2 semitones
from music21 import pitch
def transpose_sequence(seq, semitones):
    """Return a copy of ``seq`` with every pitch name moved by ``semitones``.

    Items that cannot be converted or transposed are passed through unchanged,
    so the output always has the same length as the input.
    """
    def _shift(name):
        # Best effort per item: anything that fails stays as it was.
        try:
            return pitch.Pitch(name).transpose(semitones).nameWithOctave
        except Exception:
            return name

    return [_shift(name) for name in seq]

In [None]:
# Build four transposed copies (-2, -1, +1, +2 semitones) of every parsed tune.
aug_melody, aug_chords, aug_bass = [], [], []
for m_seq, c_seq, b_seq in zip(melody_seqs, chord_seqs, bass_seqs):
    for st in [-2, -1, 1, 2]:
        aug_melody.append(transpose_sequence(m_seq, st))
        # Chord roots are stored without an octave, so a placeholder octave ('4') is
        # attached to build a Pitch and only the transposed name is kept. Unlike
        # transpose_sequence, this line has no per-item error handling.
        aug_chords.append([pitch.Pitch(ch + '4').transpose(st).name for ch in c_seq])
        aug_bass.append(transpose_sequence(b_seq, st))

In [None]:
# Combine original + augmented
# NOTE: `+=` extends the existing lists in place, so running this cell a second time
# appends the augmented copies again. Re-run the parsing cell first to start clean.
melody_seqs += aug_melody
chord_seqs += aug_chords
bass_seqs += aug_bass

In [None]:
print(f"✅ After augmentation: {len(melody_seqs)} sequences")

## 2. REMI token modeling and training

In [None]:
def build_token_sequence(melody_seqs, chord_seqs, bass_seqs):
    """Interleave aligned melody/chord/bass sequences into REMI-like token lists.

    Every tune becomes ``["Bar_0", ...]`` followed, for each step ``i``, by
    ``Position_i, Track_Chords, Chord_<root>, Track_Melody, Note_<melody>,
    Track_Bass, Note_<bass>``. Tunes and steps are paired with ``zip``, so the
    shortest input decides how many are emitted.

    Returns:
        A list with one token list (of strings) per tune.
    """
    def _tokenize(melody, chords, basses):
        toks = ["Bar_0"]
        for step, (mel, ch, ba) in enumerate(zip(melody, chords, basses)):
            toks.extend((
                f"Position_{step}",
                "Track_Chords", f"Chord_{ch}",
                "Track_Melody", f"Note_{mel}",
                "Track_Bass", f"Note_{ba}",
            ))
        return toks

    return [
        _tokenize(melody, chords, basses)
        for melody, chords, basses in zip(melody_seqs, chord_seqs, bass_seqs)
    ]

In [None]:
token_seqs = build_token_sequence(melody_seqs, chord_seqs, bass_seqs)

In [None]:
from collections import Counter
# Flatten all tunes into one token list so the distinct tokens can be collected.
flat_tokens = [tok for seq in token_seqs for tok in seq]
token_counts = Counter(flat_tokens)
# Ids follow the sorted (lexicographic) order of the distinct tokens, so the mapping
# does not depend on the order in which tunes were parsed.
vocab = {tok: idx for idx, tok in enumerate(sorted(token_counts))}
# Reverse lookup (id -> token string) used when decoding generated ids.
inv_vocab = {idx: tok for tok, idx in vocab.items()}

In [None]:
encoded_token_seqs = [[vocab[tok] for tok in seq] for seq in token_seqs]

In [None]:
print(f"Total unique tokens: {len(vocab)}")
print(f"Example tokenized sequence: {encoded_token_seqs[0][:20]}")

In [None]:
class TokenDataset(Dataset):
    """Sliding-window next-token dataset built eagerly from encoded sequences.

    Each sample is a pair ``(inputs, targets)`` of ``torch.long`` tensors of length
    ``seq_len``, where ``targets`` is ``inputs`` shifted one token to the right.
    Sequences with ``seq_len`` tokens or fewer contribute no samples.
    """

    def __init__(self, token_seqs, seq_len=64):
        self.seq_len = seq_len
        self.samples = []
        for ids in token_seqs:
            n_windows = len(ids) - seq_len
            if n_windows <= 0:
                continue
            for start in range(n_windows):
                stop = start + seq_len
                inputs = torch.tensor(ids[start:stop], dtype=torch.long)
                targets = torch.tensor(ids[start + 1:stop + 1], dtype=torch.long)
                self.samples.append((inputs, targets))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [None]:
# Debug shortcut
remi_dataset = TokenDataset(encoded_token_seqs, seq_len=64)
# Keep only the first 2000 windows. Windows are stored in sequence order, so this
# subset comes from the earliest sequences only, not from a random sample.
remi_dataset.samples = remi_dataset.samples[:2000]
remi_loader = DataLoader(remi_dataset, batch_size=8, shuffle=True)

In [None]:
import torch.nn as nn
class MiniTransformer(nn.Module):
    """Small causal Transformer language model over token ids.

    Token embeddings plus a learned positional table feed an ``nn.TransformerDecoder``.
    The decoder's memory input is an all-zero tensor and a square subsequent mask is
    applied to the target, then a linear head maps every position to vocabulary logits.
    The positional table holds 512 positions, so longer inputs are not supported.
    """
    def __init__(self, vocab_size, emb_size=128, nhead=4, num_layers=2):
        """Create the embedding, positional table, decoder stack and output head.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and output logits).
            emb_size: Embedding / model width.
            nhead: Attention heads per decoder layer.
            num_layers: Number of stacked decoder layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        # Learned positional table, randomly initialised, for up to 512 positions.
        self.pos_encoder = nn.Parameter(torch.randn(1, 512, emb_size))
        decoder_layer = nn.TransformerDecoderLayer(d_model=emb_size, nhead=nhead)
        self.transformer = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(emb_size, vocab_size)
    def forward(self, x):
        """Return next-token logits for every position of ``x``.

        ``x`` is indexed as (batch, seq_len) token ids; the result keeps those two
        leading dimensions and adds a final dimension of size ``vocab_size``.
        """
        x = self.embedding(x) + self.pos_encoder[:, :x.size(1), :]
        # Swap to sequence-first layout for the decoder; swapped back after the call.
        x = x.transpose(0, 1)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(x.size(0)).to(x.device)
        # No encoder exists: the decoder is given a zero tensor as its memory input.
        memory = torch.zeros_like(x)
        out = self.transformer(x, memory, tgt_mask=tgt_mask)
        out = out.transpose(0, 1)
        return self.fc_out(out)

In [None]:
transformer_model = MiniTransformer(vocab_size=len(vocab)).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch.nn.functional as F
# Same device expression that was used when the model was created.
device = "cuda" if torch.cuda.is_available() else "cpu"
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

# === Original Transformer Training Loop (baseline, small subset) ===
# Trains for 10 epochs on remi_loader (the truncated debug subset) and prints the
# mean batch loss per epoch. No validation is done in this loop.
transformer_model.train()
for epoch in range(10):
    total_loss = 0
    for x, y in remi_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = transformer_model(x)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets for the loss.
        loss = criterion(out.view(-1, out.size(-1)), y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss / len(remi_loader):.4f}")

# === Fine-tuned Transformer (longer training, full data, LR scheduler) ===
# Continues training the same model on all sliding windows with a train/validation
# split, step LR decay and gradient clipping.
from sklearn.model_selection import train_test_split
# Rebuild the dataset from scratch (remi_dataset.samples may have been truncated above).
remi_dataset_full = TokenDataset(encoded_token_seqs, seq_len=64)
# Split at window level.
# NOTE(review): neighbouring windows overlap by all but one token and the model has
# already seen the first 2000 windows in the baseline loop, so the validation loss is
# an optimistic estimate. Splitting by tune would avoid the overlap.
train_samples, val_samples = train_test_split(remi_dataset_full.samples, test_size=0.2, random_state=42)
train_loader = DataLoader(train_samples, batch_size=8, shuffle=True)
val_loader = DataLoader(val_samples, batch_size=8)

optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
criterion = nn.CrossEntropyLoss()
num_epochs = 30
for epoch in range(num_epochs):
    # Re-enter training mode every epoch: validation below switches the model to eval.
    transformer_model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = transformer_model(x)
        loss = criterion(out.view(-1, out.size(-1)), y.view(-1))
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    # Validation must run in eval mode; otherwise dropout stays active and the
    # reported validation loss is noisy and inflated.
    transformer_model.eval()
    with torch.no_grad():
        val_loss = 0
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            out = transformer_model(x)
            loss = criterion(out.view(-1, out.size(-1)), y.view(-1))
            val_loss += loss.item()
        val_loss = val_loss / len(val_loader)
    print(f"[Fine-tune] Epoch {epoch+1} - Train Loss: {total_loss / len(train_loader):.4f} | Val Loss: {val_loss:.4f}")
torch.save(transformer_model.state_dict(), "fine_tuned_transformer_model.pt")

## 3. Feature-based modeling (pitch, duration, interval)

In [None]:
import time

def parse_feature_sequences(folder_path, max_files=14):
    """Parse ABC tunes into per-note pitch, duration and interval sequences.

    Files ending in ``.abc`` are visited in sorted order until ``max_files`` tunes
    have been accepted (a tune needs more than 10 notes). Files that are skipped or
    fail to parse do not count towards ``max_files``, which matches
    ``parse_multi_instrument_sequences``.

    Args:
        folder_path: Directory containing the ``.abc`` files.
        max_files: Maximum number of accepted tunes to return.

    Returns:
        Tuple ``(pitch_seqs, duration_seqs, interval_seqs)``. For each tune:
        pitch names with octave, quarter-length durations, and MIDI-number
        differences to the previous note (the first interval is 0).
    """
    pitch_seqs, duration_seqs, interval_seqs = [], [], []
    # Do not slice the file list up front: the loop guard below already enforces the
    # limit, and slicing first meant skipped files were never replaced.
    files = sorted([f for f in os.listdir(folder_path) if f.endswith(".abc")])
    print(f"Found {len(files)} files. Parsing up to {max_files}...")

    for f in files:
        if len(pitch_seqs) >= max_files:
            break
        try:
            start_time = time.time()
            score = converter.parse(os.path.join(folder_path, f))
            # Checked after parsing finished: slow files are discarded, not interrupted.
            if time.time() - start_time > 20:
                print(f"⏱️ Skipping {f} — parsing took too long")
                continue

            pitches, durations, midi_vals = [], [], []
            for el in score.flat.notes:
                if isinstance(el, note.Note):
                    pitches.append(el.pitch.nameWithOctave)
                    durations.append(el.quarterLength)
                    # Read the MIDI number from the parsed note directly instead of
                    # rebuilding a Note object from its name afterwards.
                    midi_vals.append(el.pitch.midi)

            if len(pitches) > 10:
                intervals = [0] + [nxt - cur for cur, nxt in zip(midi_vals, midi_vals[1:])]
                pitch_seqs.append(pitches)
                duration_seqs.append(durations)
                interval_seqs.append(intervals)
            else:
                print(f"⚠️ {f} skipped: not enough notes")
        except Exception as e:
            # Best effort: one unparsable tune must not abort the whole corpus scan.
            print(f"⚠️ Error parsing {f}: {e}")
            continue

    print(f"✅ Parsed {len(pitch_seqs)} sequences.")
    return pitch_seqs, duration_seqs, interval_seqs

In [None]:
def build_dataset(pitch_seqs, duration_seqs, interval_seqs, features, seq_length=8):
    """Turn per-note sequences into fixed-length windows with a next-pitch target.

    Pitches are label-encoded; durations and intervals are min-max scaled using
    scalers fitted on all tunes. For every tune longer than ``seq_length``, each
    window of ``seq_length`` consecutive notes becomes one input array whose columns
    are the selected features in the fixed order pitch, duration, interval. The
    target is the encoded pitch of the note that follows the window.

    Args:
        pitch_seqs, duration_seqs, interval_seqs: Parallel per-tune sequences.
        features: Container of feature names to include ('pitch', 'duration', 'interval').
        seq_length: Window length in notes.

    Returns:
        ``(input_features, target_classes, note_encoder, scaler_dur)``. The interval
        scaler is not returned.
    """
    note_encoder = LabelEncoder().fit([p for seq in pitch_seqs for p in seq])
    from sklearn.preprocessing import MinMaxScaler
    import numpy as np

    scaler_dur = MinMaxScaler()
    scaler_int = MinMaxScaler()
    scaler_dur.fit(np.concatenate(duration_seqs).reshape(-1, 1))
    scaler_int.fit(np.concatenate(interval_seqs).reshape(-1, 1))

    input_features, target_classes = [], []
    for p_seq, d_seq, i_seq in zip(pitch_seqs, duration_seqs, interval_seqs):
        if len(p_seq) <= seq_length:
            continue
        p_encoded = note_encoder.transform(p_seq)
        d_scaled = scaler_dur.transform(np.array(d_seq).reshape(-1, 1)).flatten()
        i_scaled = scaler_int.transform(np.array(i_seq).reshape(-1, 1)).flatten()

        # Insertion order of this dict fixes the column order of every window.
        columns = {'pitch': p_encoded, 'duration': d_scaled, 'interval': i_scaled}
        for start in range(len(p_seq) - seq_length):
            stop = start + seq_length
            window_feats = [values[start:stop] for key, values in columns.items() if key in features]
            input_features.append(np.stack(window_feats, axis=1))
            target_classes.append(p_encoded[stop])
    return input_features, target_classes, note_encoder, scaler_dur

In [None]:
# Prepare data for variable pitch/duration/interval modeling
pitch_seqs, dur_seqs, int_seqs = parse_feature_sequences("/content/nottingham-dataset/ABC_cleaned")
input_features, _, note_encoder, scaler_dur = build_dataset(pitch_seqs, dur_seqs, int_seqs, features=['pitch', 'duration', 'interval'])

In [None]:

 # === Helper for decoding feature sequences to MIDI with duration quantization ===
def quantize_duration(d):
    """Snap ``d`` to the nearest value in {0.5, 1.0, 1.5, 2.0, 3.0, 4.0}.

    No clamping happens here; callers that want a bounded range must clamp first.
    When two grid values are equally close, the smaller one wins.
    """
    grid = (0.5, 1.0, 1.5, 2.0, 3.0, 4.0)
    best = grid[0]
    for candidate in grid[1:]:
        # Strict comparison keeps the earlier (smaller) grid value on ties.
        if abs(candidate - d) < abs(best - d):
            best = candidate
    return best

def decode_feature_sequence_to_midi(pitches, durations, filename="feature_output.mid"):
    """Write a monophonic MIDI file from parallel pitch and duration sequences.

    Each duration is clamped to [0.5, 2.5] quarter lengths and then snapped with
    ``quantize_duration``. Notes are laid out back to back in a single part.

    Args:
        pitches: Pitch specifiers accepted by ``music21.note.Note``.
        durations: Raw quarter-length durations, paired with ``pitches`` via ``zip``.
        filename: Output MIDI path.
    """
    from music21 import stream, note as m21note

    score = stream.Score()
    part = stream.Part()
    cursor = 0.0
    for pitch_spec, raw_len in zip(pitches, durations):
        new_note = m21note.Note(pitch_spec)
        # Clamp first, then snap to the duration grid.
        bounded = max(0.5, min(raw_len, 2.5))
        new_note.quarterLength = quantize_duration(bounded)
        new_note.offset = cursor
        part.append(new_note)
        cursor += new_note.quarterLength
    score.append(part)
    score.write('midi', fp=filename)
    print(f"Saved feature-based MIDI to {filename}")

## 4. Symbolic Generation and Decoding

In [None]:
def generate_sequence(start_seq, model, vocab, inv_vocab, length=100, context_len=64):
    """Autoregressively sample ``length`` tokens after ``start_seq``.

    At every step the last ``context_len`` ids are fed to ``model`` and the next id
    is sampled from the softmax over the logits of the final position.

    Args:
        start_seq: Non-empty list of seed token ids (not modified).
        model: Module mapping a (1, seq) LongTensor to (1, seq, vocab) logits.
        vocab: Token -> id mapping. Unused here; kept for call compatibility.
        inv_vocab: Id -> token mapping used to decode the result.
        length: Number of tokens to sample.
        context_len: Maximum number of trailing ids shown to the model per step.

    Returns:
        List of token strings: the decoded seed followed by the sampled tokens.

    Raises:
        ValueError: If ``start_seq`` is empty.
    """
    if len(start_seq) == 0:
        raise ValueError("start_seq must contain at least one token id")

    # Run on whatever device the model lives on instead of relying on a notebook
    # global; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    generated = list(start_seq)
    with torch.no_grad():
        for _ in range(length):
            window = torch.tensor(
                generated[-context_len:], dtype=torch.long, device=run_device
            ).unsqueeze(0)
            logits = model(window)[0, -1]
            next_token = torch.multinomial(torch.softmax(logits, dim=-1), 1).item()
            generated.append(next_token)
    return [inv_vocab[i] for i in generated]

In [None]:
import random
start_tokens = random.choice(encoded_token_seqs)[:64]
generated_tokens = generate_sequence(start_tokens, transformer_model, vocab, inv_vocab, length=300)
print("Generated tokens:", generated_tokens[:30])

In [None]:
from music21 import note as m21note

# Helper function for pitch shifting symbolic tokens
def shift_note_token(token, semitone_shift):
    """Transpose a ``Note_<pitch>`` token by ``semitone_shift`` semitones.

    Tokens that do not start with ``Note_`` are returned unchanged, as are note
    tokens whose pitch cannot be parsed or transposed.

    Args:
        token: Token string such as ``"Note_C4"`` or ``"Chord_G"``.
        semitone_shift: Signed number of semitones.

    Returns:
        The transposed token string, or the original token.
    """
    if not token.startswith("Note_"):
        return token
    try:
        pitch_name = token.replace("Note_", "")
        shifted = m21note.Note(pitch_name).transpose(semitone_shift)
        return f"Note_{shifted.nameWithOctave}"
    except Exception:
        # Deliberate best effort, but limited to Exception so that KeyboardInterrupt
        # and SystemExit are no longer swallowed (the bare `except:` caught them too).
        return token
# Ensure correct transformer version to avoid read-only property errors in MusicGen
import pkg_resources
from subprocess import run

def check_transformers_version(min_version="4.35.0"):
    """Ensure ``transformers`` is installed at ``min_version`` or newer.

    Installs or upgrades the package with pip when it is missing or too old. pip is
    invoked through the running interpreter (``sys.executable -m pip``) so the
    package lands in the kernel's environment rather than in whichever ``pip``
    happens to be first on PATH.

    Args:
        min_version: Lowest acceptable version string.

    Raises:
        subprocess.CalledProcessError: If the pip command fails.
    """
    import sys

    pip_install = [sys.executable, "-m", "pip", "install"]

    # Only the version lookup is guarded: a failed upgrade below should surface
    # instead of silently triggering a second, different install attempt.
    try:
        current = pkg_resources.get_distribution("transformers").version
    except Exception as e:
        print("⚠️ Transformers not found or version check failed. Installing latest version...")
        print(f"   Reason: {e}")
        run(pip_install + ["transformers", "--upgrade"], check=True)
        return

    print(f"✅ Transformers version: {current}")
    if pkg_resources.parse_version(current) < pkg_resources.parse_version(min_version):
        print(f"⚠️ Updating transformers to >= {min_version} to avoid config errors...")
        run(pip_install + [f"transformers>={min_version}", "--upgrade"], check=True)
        print("✅ Transformers updated successfully. Please restart the kernel.")

check_transformers_version()

In [None]:
# === Regenerate full-length (1-minute) version for Task 1 ===
print("🎼 Regenerating 1-minute sequence for Task 1...")
# Seed with the first 64 tokens of a random training tune; the result holds the seed
# followed by 1000 sampled tokens.
start_tokens = random.choice(encoded_token_seqs)[:64]
long_tokens = generate_sequence(start_tokens, transformer_model, vocab, inv_vocab, length=1000)
# --- Patch symbolic generation for improved phrasing and realism ---
# Split into beginning, middle, ending for musical structure.
# The beginning slice is the seed itself; tokens from index 640 onwards are discarded.
beginning_tokens = long_tokens[:64]
middle_tokens = long_tokens[64:512]
ending_tokens = long_tokens[512:640]

# Middle: drop every token whose index is a multiple of 4 (whatever its type, so
# Position/Track/Chord tokens can be removed too) and shift the remaining notes up 5
# semitones. Ending: shift notes down 4 semitones, no thinning.
middle_tokens = [shift_note_token(tok, +5) for i, tok in enumerate(middle_tokens) if i % 4 != 0]
ending_tokens = [shift_note_token(tok, -4) for tok in ending_tokens]

# Stitch together
long_tokens = beginning_tokens + middle_tokens + ending_tokens

In [None]:
# === Structured symbolic generation for Task 1: intro → climax → resolution ===
print("🎼 Generating structured musical arc for Task 1 (intro → climax → resolution)")

# Section 1: calm intro
# intro_tokens holds the 64-token seed followed by 300 sampled tokens.
start_tokens = random.choice(encoded_token_seqs)[:64]
intro_tokens = generate_sequence(start_tokens, transformer_model, vocab, inv_vocab, length=300)

# Section 2: energetic middle (transpose notes up + drop every 4th token to thin)
mid_start = intro_tokens[-64:]
# generate_sequence returns token strings like "Note_A4" or "Chord_C", so they are
# mapped back to ids before being used as the next seed.
# vocab.get(tok, 0) silently replaces any token missing from the vocabulary with id 0.
mid_start_indices = [vocab.get(tok, 0) for tok in mid_start]
# The returned list starts with the seed again, so the last 64 intro tokens also
# appear at the start of middle_tokens (and therefore twice in full_tokens).
middle_tokens = generate_sequence(mid_start_indices, transformer_model, vocab, inv_vocab, length=300)

# Thin by index (tokens at positions 0, 4, 8, ... are dropped regardless of type),
# then shift the surviving note tokens up 5 semitones; other tokens pass through.
middle_tokens = [
    shift_note_token(tok, +5) if tok.startswith("Note_") else tok
    for i, tok in enumerate(middle_tokens) if i % 4 != 0 # Thinning
]


# Section 3: resolution (transpose down)
# The seed is taken from the already shifted middle section; shifted note names that
# are not in the vocabulary fall back to id 0 here.
end_start = middle_tokens[-64:]
end_start_indices = [vocab.get(tok, 0) for tok in end_start]
ending_tokens = generate_sequence(end_start_indices, transformer_model, vocab, inv_vocab, length=300)

# Shift note tokens down 4 semitones; other tokens pass through unchanged.
ending_tokens = [
    shift_note_token(tok, -4) if tok.startswith("Note_") else tok
    for tok in ending_tokens
]


# Concatenate full arc
full_tokens = intro_tokens + middle_tokens + ending_tokens

# === Predict expressive duration with MLP ===

# Extract features from REMI token sequences for duration prediction
def extract_duration_features(token_seqs):
    """Collect ``[track, position, pitch]`` rows with heuristic duration labels.

    Walks each token list, tracking the most recent bar, position and track. Every
    ``Note_`` token seen while a track is active yields one row; its label is 1.0
    for the Melody track and 0.5 for any other track.

    Returns:
        ``(X_raw, y)`` with one row and one label per note token.
    """
    X_raw, y = [], []
    for seq in token_seqs:
        bar, pos, track = 0, 0, None
        for tok in seq:
            if tok.startswith("Bar_"):
                bar = int(tok.split("_")[1])
                continue
            if tok.startswith("Position_"):
                pos = int(tok.split("_")[1])
                continue
            if tok.startswith("Track_"):
                track = tok.split("_")[1]
                continue
            if not (tok.startswith("Note_") and track):
                continue
            X_raw.append([track, pos, tok.replace("Note_", "")])
            y.append(1.0 if track == "Melody" else 0.5)  # heuristic durations
    return X_raw, y

# Train a simple MLPRegressor to predict expressive duration
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor

# Rows are [track, position, pitch]; labels are the heuristic 1.0 / 0.5 values from
# extract_duration_features, not durations measured from the data.
X_raw, y = extract_duration_features(token_seqs)
# NOTE(review): the encoder is created without handle_unknown, so transforming a
# track/position/pitch value not seen here presumably raises — confirm in sklearn docs.
encoder = OneHotEncoder(sparse_output=False).fit(X_raw)
X = encoder.transform(X_raw)

duration_model = MLPRegressor(hidden_layer_sizes=(64,), max_iter=1000)
duration_model.fit(X, y)
print("✅ Trained MLP to predict expressive duration.")


# === Predict expressive velocity with MLP ===
def extract_velocity_features(token_seqs):
    """Collect ``[track, position, pitch]`` rows with heuristic velocity labels.

    Every ``Note_`` token seen while a track is active yields one row, labelled 80
    when the active track is Melody and 60 otherwise.

    Returns:
        ``(X_raw, y)`` with one row and one label per note token.
    """
    melody_velocity, other_velocity = 80, 60  # Heuristic velocity

    def _field(token):
        return token.split("_")[1]

    X_raw, y = [], []
    for seq in token_seqs:
        bar, pos, track = 0, 0, None
        for tok in seq:
            if tok.startswith("Bar_"):
                bar = int(_field(tok))
            elif tok.startswith("Position_"):
                pos = int(_field(tok))
            elif tok.startswith("Track_"):
                track = _field(tok)
            elif tok.startswith("Note_") and track:
                X_raw.append([track, pos, tok.replace("Note_", "")])
                y.append(melody_velocity if track == "Melody" else other_velocity)
    return X_raw, y

# Fit a one-hot encoder and an MLP on [track, position, pitch] rows; the targets are
# the heuristic 80 / 60 velocities produced by extract_velocity_features.
Xv_raw, yv = extract_velocity_features(token_seqs)
velocity_encoder = OneHotEncoder(sparse_output=False).fit(Xv_raw)
Xv = velocity_encoder.transform(Xv_raw)

velocity_model = MLPRegressor(hidden_layer_sizes=(64,), max_iter=1000)
velocity_model.fit(Xv, yv)
print("✅ Trained MLP to predict expressive velocity.")

# === Predict articulation with MLP ===
def extract_articulation_features(token_seqs):
    """Collect ``[track, position, pitch]`` rows with heuristic articulation labels.

    Every ``Note_`` token seen while a track is active yields one row, labelled 1
    (legato) on the Melody track and 0 (staccato) on any other track.

    Returns:
        ``(X_raw, y)`` with one row and one integer label per note token.
    """
    rows, labels = [], []
    for seq in token_seqs:
        bar, pos, track = 0, 0, None
        for tok in seq:
            is_note = tok.startswith("Note_")
            if tok.startswith("Bar_"):
                bar = int(tok.split("_")[1])
            elif tok.startswith("Position_"):
                pos = int(tok.split("_")[1])
            elif tok.startswith("Track_"):
                track = tok.split("_")[1]
            elif is_note and track:
                rows.append([track, pos, tok.replace("Note_", "")])
                # Legato for melody, staccato otherwise
                labels.append(int(track == "Melody"))
    return rows, labels

# Fit a one-hot encoder and an MLP regressor on [track, position, pitch] rows; the
# targets are the heuristic 1 / 0 labels from extract_articulation_features, and the
# decoder thresholds the regressor's output at 0.5.
Xa_raw, ya = extract_articulation_features(token_seqs)
articulation_encoder = OneHotEncoder(sparse_output=False).fit(Xa_raw)
Xa = articulation_encoder.transform(Xa_raw)

articulation_model = MLPRegressor(hidden_layer_sizes=(32,), max_iter=1000)
articulation_model.fit(Xa, ya)
print("✅ Trained MLP to predict articulation (legato/staccato).")

# === Predict tempo curve (beats per minute) with MLP ===
def extract_tempo_features(token_seqs):
    """Collect ``[bar, position]`` rows with a simulated tempo label.

    Every ``Note_`` token seen while a track is active yields one row holding the
    current bar and position. The label is ``60 + (bar % 4) * 10``, i.e. a tempo
    that cycles through 60, 70, 80, 90 over four bars.

    Returns:
        ``(X_raw, y)`` with one row and one label per note token.
    """
    X_raw, y = [], []
    for seq in token_seqs:
        state = {"bar": 0, "pos": 0, "track": None}
        for tok in seq:
            if tok.startswith("Bar_"):
                state["bar"] = int(tok.split("_")[1])
            elif tok.startswith("Position_"):
                state["pos"] = int(tok.split("_")[1])
            elif tok.startswith("Track_"):
                state["track"] = tok.split("_")[1]
            elif tok.startswith("Note_") and state["track"]:
                X_raw.append([state["bar"], state["pos"]])
                y.append(60 + (state["bar"] % 4) * 10)  # Simulated tempo pattern
    return X_raw, y

# Rows are [bar, position]; targets are the simulated 60/70/80/90 cycle produced by
# extract_tempo_features.
Xt_raw, yt = extract_tempo_features(token_seqs)
from sklearn.preprocessing import StandardScaler
# tempo_scaler is read as a module-level global by decode_to_midi when a tempo model
# is supplied, so it must stay defined under this name.
tempo_scaler = StandardScaler()
Xt = tempo_scaler.fit_transform(Xt_raw)

tempo_model = MLPRegressor(hidden_layer_sizes=(32,), max_iter=1000)
tempo_model.fit(Xt, yt)
print("✅ Trained MLP to predict tempo curve.")

# Updated decode_to_midi with articulation prediction
from music21 import stream, note as m21note, chord as m21chord, tempo as m21tempo
def decode_to_midi(
    tokens,
    filename="multi_instrument_output.mid",
    duration_model=None,
    encoder=None,
    velocity_model=None,
    velocity_encoder=None,
    articulation_model=None,
    articulation_encoder=None,
    tempo_model=None
):
    """Render a REMI-like token list to a three-part MIDI file with expressive shaping.

    ``Bar_``/``Position_`` tokens set the current time, ``Track_`` selects the part,
    ``Note_`` adds a note to the Melody or Bass part and ``Chord_`` adds a
    root-in-three-octaves chord to the Chords part. Each optional model/encoder pair
    predicts one property from ``[track, position, pitch]``; when a pair is missing
    or cannot encode the row, the default is used (step duration, velocity 64,
    staccato).

    Events are placed with ``insert`` at their computed offset and the three parts
    are inserted at offset 0, so the token-derived timing is honoured and the parts
    sound together.

    Args:
        tokens: Iterable of token strings.
        filename: Output MIDI path.
        duration_model, encoder: Regressor and feature encoder for note length.
        velocity_model, velocity_encoder: Regressor and encoder for MIDI velocity.
        articulation_model, articulation_encoder: Regressor and encoder; a prediction
            above 0.5 means legato.
        tempo_model: Regressor for beats per minute per bar. Uses the module-level
            ``tempo_scaler`` to scale its ``[bar, 0]`` input.
    """
    def _predict(model, feature_encoder, row):
        """Return the model's prediction for ``row``, or None when unavailable."""
        if model is None or feature_encoder is None:
            return None
        try:
            return model.predict(feature_encoder.transform([row]))[0]
        except ValueError:
            # E.g. a transposed pitch the encoder never saw during fitting.
            return None

    s = stream.Score()
    parts = {"Melody": stream.Part(), "Chords": stream.Part(), "Bass": stream.Part()}
    # REMI-style bar/position time tracking
    bar_num = 0
    pos = 0  # defined up front so a Note token before any Position token is safe
    current_position = 0.0
    steps_per_bar = 16
    # Patch: double the duration per position for more realistic phrasing
    quarter_length_per_step = (4.0 / steps_per_bar) * 2.0
    track = None
    for tok in tokens:
        if tok.startswith("Bar_"):
            bar_num = int(tok.split("_")[1])
            # Insert predicted tempo at start of each bar if model is provided
            if tempo_model is not None:
                tempo_feat = tempo_scaler.transform([[bar_num, 0]])
                bpm = int(tempo_model.predict(tempo_feat)[0])
                s.insert(bar_num * 4.0, m21tempo.MetronomeMark(number=bpm))
        elif tok.startswith("Position_"):
            pos = int(tok.split("_")[1])
            current_position = bar_num * 4.0 + pos * quarter_length_per_step
        elif tok.startswith("Track_"):
            track = tok.split("_")[1]
        elif tok.startswith("Note_"):
            # Only Melody and Bass carry notes; skip before touching the encoders.
            if track not in ("Melody", "Bass"):
                continue
            pitch = tok.replace("Note_", "")
            row = [track, pos, pitch]

            predicted_len = _predict(duration_model, encoder, row)
            if predicted_len is None:
                qlen = quarter_length_per_step
            else:
                qlen = max(0.25, min(4.0, predicted_len))

            predicted_velocity = _predict(velocity_model, velocity_encoder, row)
            velocity = 64 if predicted_velocity is None else int(predicted_velocity)
            velocity = max(1, min(127, velocity))  # keep within the valid MIDI range

            predicted_articulation = _predict(articulation_model, articulation_encoder, row)
            legato = predicted_articulation is not None and predicted_articulation > 0.5

            n = m21note.Note(pitch)
            # Articulation shaping
            if legato:
                n.tie = m21note.Tie("start")
                n.quarterLength = qlen * 1.2
            else:
                n.quarterLength = qlen * 0.8

            # Swing / rubato are applied to this note's onset only. They are no
            # longer added to current_position, where they shifted later tracks.
            if track == "Melody":
                onset = current_position + (pos % 2) * 0.05  # swing: offset every other 8th
            else:
                onset = current_position + (bar_num % 3) * 0.02  # slow cyclic rubato
            n.volume.velocity = velocity
            parts[track].insert(onset, n)
        elif tok.startswith("Chord_"):
            root = tok.replace("Chord_", "")
            c = m21chord.Chord([root + "3", root + "4", root + "5"])
            c.quarterLength = quarter_length_per_step
            parts["Chords"].insert(current_position, c)
    for p in parts.values():
        # insert(0, ...) stacks the parts; append() would place them one after another.
        s.insert(0, p)
    s.write('midi', fp=filename)
    print(f"Saved MIDI to {filename}")
# Optional render
!fluidsynth -ni /usr/share/sounds/sf2/FluidR3_GM.sf2 symbolic_conditioned_structured.mid -F structured_output.wav -r 16000
print("✅ Saved musically structured Task 1 output to 'symbolic_conditioned_structured.mid' and 'structured_output.wav'")

In [None]:
from IPython.display import Audio, display

# Assuming 'structured_output.wav' was successfully created by the previous cell
# Check if the file exists before attempting to display the audio
import os
audio_file_path = "structured_output.wav"

if os.path.exists(audio_file_path):
    print(f"Attempting to play: {audio_file_path}")
    display(Audio(filename=audio_file_path))
else:
    print(f"Error: Audio file not found at {audio_file_path}. Please ensure it was created.")

In [None]:
# --- Step 3: Feature-based generation (variable pitch/duration/interval) ---
print("🎼 Generating 1-minute sequence with variable pitch/duration/interval (feature-based)...")
import numpy as np
import torch
from music21 import stream, note as m21note

# Seed the generator with a randomly chosen training window (at most
# feature_seq_len rows of it).
feature_seq_len = 16
feature_start_idx = np.random.randint(0, len(input_features))
current_input_seq = torch.tensor(input_features[feature_start_idx][:feature_seq_len], dtype=torch.float32).unsqueeze(0)
# Dummy model for demonstration (replace with your trained model)
class DummyFeatureModel(torch.nn.Module):
    def forward(self, x):
        # Simulate output: random scores over the label-encoded pitch classes
        batch, seq, feats = x.shape
        return torch.randn(batch, note_encoder.classes_.shape[0])
feature_model = DummyFeatureModel()

# Generation loop: predict one pitch per step and slide the input window forward
generated_pitches = []
generated_durations = []
max_length = 64
for step in range(max_length):
    out = feature_model(current_input_seq)
    next_pitch_idx = torch.argmax(out, dim=-1).item()
    generated_pitches.append(next_pitch_idx)
    # Duration is copied from the last row of the window (column 1 is the scaled
    # duration) and mapped back to quarter lengths.
    duration_scaled = current_input_seq[0, -1, 1].item()
    duration = float(scaler_dur.inverse_transform([[duration_scaled]])[0, 0])
    generated_durations.append(duration)
    # Next input row: same duration/interval as the last row, new pitch index.
    # Because the last row is cloned every step, all generated durations equal the
    # seed's final duration until a real duration model is plugged in.
    next_note = current_input_seq[0, -1, :].clone()
    next_note[0] = next_pitch_idx
    current_input_seq = torch.cat([current_input_seq[:, 1:, :], next_note.view(1, 1, -1)], dim=1)

# Decode feature-based sequence to MIDI.
# The generated values are LabelEncoder class indices, not MIDI numbers, so they are
# mapped back to pitch names before building notes. Durations come from
# generated_durations, the values recorded during generation.
generated_pitch_names = note_encoder.inverse_transform(generated_pitches)
s = stream.Stream()
for pitch_name, duration in zip(generated_pitch_names, generated_durations):
    n = m21note.Note(str(pitch_name))
    n.quarterLength = duration
    s.append(n)
s.write('midi', fp="feature_based_output.mid")
print("✅ Saved variable-duration output to 'feature_based_output.mid'")

In [None]:
!pip install fluidsynth
# NOTE(review): the `fluidsynth` Python module imported here is not used anywhere in
# this cell; rendering below goes through the command-line tool.
import fluidsynth
import os
import IPython.display as ipd

# Install the fluidsynth command-line tool via apt (always runs, no existence check)
print("Installing fluidsynth...")
!apt-get update -qq && apt-get install fluidsynth -qq -y
print("fluidsynth installation complete.")

# Verify fluidsynth is now in the PATH
print("Checking fluidsynth path:")
!which fluidsynth

# Convert MIDI to audio using the fluidsynth command-line tool
# Ensure the output directory exists if needed
output_dir = "/content"
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the generation cell writes "feature_based_output.mid" relative to the
# working directory; this absolute path assumes the working directory is /content.
midi_file = "/content/feature_based_output.mid" # Use the correct MIDI file name

output_wav = os.path.join(output_dir, "output.wav")
soundfont_path = "/usr/share/sounds/sf2/FluidR3_GM.sf2"  # default SoundFont

# Check that the SoundFont exists. A missing file is only reported; the render
# command below is still executed.
if not os.path.exists(soundfont_path):
    print(f"Error: Soundfont not found at {soundfont_path}. Please check installation.")
    # Handle the missing soundfont error appropriately, e.g., install soundfont
    # !apt-get install fluid-soundfont-gm -y
    # You might need to check if the soundfont path is correct for the installed package

# NOTE(review): this command selects ALSA audio/MIDI drivers, unlike the earlier
# fluidsynth call in this notebook that renders with only `-ni ... -F ...`; confirm
# the drivers are needed (or available) when rendering to a file.
command = f"fluidsynth -ni -a alsa -m alsa_seq {soundfont_path} {midi_file} -F {output_wav}"

# Execute the command
# The '!' prefix in a Jupyter notebook executes the command in the shell
print(f"Executing command: {command}") # Add print for debugging
# Use !command to execute the shell command. Output or errors from the command
# itself will be displayed in the notebook output.
!{command}

# Check if the output file was created
if os.path.exists(output_wav):
    print(f"✅ Successfully created: {output_wav}")
    # Play the result
    # Pass the filename explicitly to ipd.Audio
    print(f"Attempting to play: {output_wav}") # Add print for debugging
    # `display` is not imported in this cell; it comes from the earlier
    # `from IPython.display import Audio, display` cell or the notebook environment.
    display(ipd.Audio(filename=output_wav)) # Use display to ensure it renders in notebook
else:
    print(f"❌ Error: Output WAV file was not created at {output_wav}.")
    print("Please check the fluidsynth command output above for errors.")

## 5. Audio rendering

In [None]:
from music21 import stream, note as m21note, chord as m21chord
def decode_to_midi(tokens, filename="multi_instrument_output.mid",
                   duration_model=None, encoder=None,
                   velocity_model=None, velocity_encoder=None,
                   articulation_model=None, articulation_encoder=None,
                   tempo_model=None):
    """Decode a REMI-style token sequence into a three-part MIDI file.

    Recognised token prefixes:
      * ``Bar_<n>``       -- selects the current bar number.
      * ``Position_<k>``  -- selects the step inside the current bar.
      * ``Track_<name>``  -- selects the track that following ``Note_`` tokens
                             belong to (only "Melody" and "Bass" produce notes).
      * ``Note_<pitch>``  -- adds a note to the current Melody/Bass track.
      * ``Chord_<root>``  -- adds the root in octaves 3, 4 and 5 to the Chords part.
    Any other token is ignored.

    Args:
        tokens: iterable of token strings.
        filename: path of the MIDI file to write.
        duration_model, encoder, velocity_model, velocity_encoder,
        articulation_model, articulation_encoder, tempo_model: accepted so
            that callers passing these keyword arguments do not raise
            ``TypeError``; this decoder does not use them. Every event gets
            the fixed step duration.

    Returns:
        None. The score is written to ``filename`` and a confirmation is printed.
    """
    score = stream.Score()
    parts = {"Melody": stream.Part(), "Chords": stream.Part(), "Bass": stream.Part()}

    # REMI-style bar/position time tracking.
    steps_per_bar = 16
    # Patch: double the duration per position for more realistic phrasing.
    quarter_length_per_step = (4.0 / steps_per_bar) * 2.0
    # Derive the bar length from the step grid so that positions of one bar
    # never spill into the next bar (16 steps * 0.5 = 8.0 quarter notes).
    bar_length = steps_per_bar * quarter_length_per_step

    bar_num = 0
    current_position = 0.0
    track = None

    for tok in tokens:
        if tok.startswith("Bar_"):
            bar_num = int(tok.split("_")[1])
        elif tok.startswith("Position_"):
            pos = int(tok.split("_")[1])
            current_position = bar_num * bar_length + pos * quarter_length_per_step
        elif tok.startswith("Track_"):
            track = tok.split("_")[1]
        elif tok.startswith("Note_"):
            # Notes are only emitted for the Melody and Bass tracks.
            if track in ("Melody", "Bass"):
                n = m21note.Note(tok.replace("Note_", ""))
                n.quarterLength = quarter_length_per_step
                # insert() honours the decoded offset; append() would place
                # the note at the end of the part and discard the position.
                parts[track].insert(current_position, n)
        elif tok.startswith("Chord_"):
            root = tok.replace("Chord_", "")
            c = m21chord.Chord([root + "3", root + "4", root + "5"])
            c.quarterLength = quarter_length_per_step
            parts["Chords"].insert(current_position, c)

    # All parts start together at offset 0; appending them would lay the
    # parts end to end so they would play one after another.
    for p in parts.values():
        score.insert(0, p)

    score.write('midi', fp=filename)
    print(f"Saved MIDI to {filename}")

In [None]:
# Decode the long token sequence to MIDI, forwarding the duration, velocity,
# articulation and tempo models (with their encoders) as keyword arguments.
symbolic_decode_kwargs = {
    "duration_model": duration_model,
    "encoder": encoder,
    "velocity_model": velocity_model,
    "velocity_encoder": velocity_encoder,
    "articulation_model": articulation_model,
    "articulation_encoder": articulation_encoder,
    "tempo_model": tempo_model,
}
decode_to_midi(long_tokens, filename="symbolic_conditioned.mid", **symbolic_decode_kwargs)

In [None]:
!pip install fluidsynth
import fluidsynth
import os
import IPython.display as ipd

# Install fluidsynth if not already installed
# You can check for its existence first, but reinstalling is usually fine
print("Installing fluidsynth...")
!apt-get update -qq && apt-get install fluidsynth -qq -y
print("fluidsynth installation complete.")

# Verify fluidsynth is now in the PATH
print("Checking fluidsynth path:")
!which fluidsynth

# Convert MIDI to audio using the fluidsynth command-line tool
# Ensure the output directory exists if needed
output_dir = "/content"
os.makedirs(output_dir, exist_ok=True)
midi_file = "/content/symbolic_conditioned.mid" # Use the correct MIDI file name

output_wav = os.path.join(output_dir, "output.wav")
soundfont_path = "/usr/share/sounds/sf2/FluidR3_GM.sf2"  # default SoundFont

# Ensure the soundfont exists (already installed by fluid-soundfont-gm)
if not os.path.exists(soundfont_path):
    print(f"Error: Soundfont not found at {soundfont_path}. Please check installation.")
    # Handle the missing soundfont error appropriately, e.g., install soundfont
    # !apt-get install fluid-soundfont-gm -y
    # You might need to check if the soundfont path is correct for the installed package

command = f"fluidsynth -ni -a alsa -m alsa_seq {soundfont_path} {midi_file} -F {output_wav}"

# Execute the command
# The '!' prefix in a Jupyter notebook executes the command in the shell
print(f"Executing command: {command}") # Add print for debugging
# Use !command to execute the shell command. Output or errors from the command
# itself will be displayed in the notebook output.
!{command}

# Check if the output file was created
if os.path.exists(output_wav):
    print(f"✅ Successfully created: {output_wav}")
    # Play the result
    # Pass the filename explicitly to ipd.Audio
    print(f"Attempting to play: {output_wav}") # Add print for debugging
    display(ipd.Audio(filename=output_wav)) # Use display to ensure it renders in notebook
else:
    print(f"❌ Error: Output WAV file was not created at {output_wav}.")
    print("Please check the fluidsynth command output above for errors.")

In [None]:
print("🎼 Generating extended symbolic sequence for Task 1 (1-minute duration)...")

# Generate a long sequence (e.g., 1200 tokens)
start_tokens = random.choice(encoded_token_seqs)[:256]
long_tokens_task1_long = generate_sequence(start_tokens, transformer_model, vocab, inv_vocab, length=1800)

# Save to MIDI
decode_to_midi(
    long_tokens_task1_long,
    filename="extended_symbolic_task1_long.mid",
    duration_model=duration_model,
    encoder=encoder,
    velocity_model=velocity_model,
    velocity_encoder=velocity_encoder,
    articulation_model=articulation_model,
    articulation_encoder=articulation_encoder,
    tempo_model=tempo_model
)

# Convert to WAV
!fluidsynth -ni /usr/share/sounds/sf2/FluidR3_GM.sf2 extended_symbolic_task1_long.mid -F extended_task1_long.wav -r 16000
print("✅ Saved extended Task 1 symbolic music to 'extended_symbolic_task1_long.mid' and audio to 'extended_task1_long.wav'")

In [None]:
# Convert MIDI to audio using the fluidsynth command-line tool
# Ensure the output directory exists if needed
output_dir = "/content"
os.makedirs(output_dir, exist_ok=True)
midi_file = "/content/extended_symbolic_task1_long.mid" # Use the correct MIDI file name

output_wav = os.path.join(output_dir, "output.wav")
soundfont_path = "/usr/share/sounds/sf2/FluidR3_GM.sf2"  # default SoundFont

# Ensure the soundfont exists (already installed by fluid-soundfont-gm)
if not os.path.exists(soundfont_path):
    print(f"Error: Soundfont not found at {soundfont_path}. Please check installation.")
    # Handle the missing soundfont error appropriately, e.g., install soundfont
    # !apt-get install fluid-soundfont-gm -y
    # You might need to check if the soundfont path is correct for the installed package

command = f"fluidsynth -ni -a alsa -m alsa_seq {soundfont_path} {midi_file} -F {output_wav}"

# Execute the command
# The '!' prefix in a Jupyter notebook executes the command in the shell
print(f"Executing command: {command}") # Add print for debugging
# Use !command to execute the shell command. Output or errors from the command
# itself will be displayed in the notebook output.
!{command}

# Check if the output file was created
if os.path.exists(output_wav):
    print(f"✅ Successfully created: {output_wav}")
    # Play the result
    # Pass the filename explicitly to ipd.Audio
    print(f"Attempting to play: {output_wav}") # Add print for debugging
    display(ipd.Audio(filename=output_wav)) # Use display to ensure it renders in notebook
else:
    print(f"❌ Error: Output WAV file was not created at {output_wav}.")
    print("Please check the fluidsynth command output above for errors.")