# AudioPrompt: Prompt Generation Workflow

Use the toggles in the Config cell to switch between:
- Straight pink noise
- Pink noise with spectral focus (bass/guitar/vocal/custom)
- Scale-driven randomized melody imprint (with optional focus)

Then optionally prepend the prompt to your input track and save.

In [1]:
# Imports
import numpy as np
from pathlib import Path
from IPython.display import Audio, display
from scipy.signal import stft, istft, resample_poly
import soundfile as sf

# Optional: try to import your project loader
try:
    import audioprompt as ap
except Exception:
    ap = None


In [3]:
# Helper functions: noise, scales, melody, imprint, focus, gating

def midi_to_hz(m):
    m = np.asarray(m, dtype=float)
    return 440.0 * (2.0 ** ((m - 69.0) / 12.0))

def pink_noise(n, sr, seed=None):
    rng = np.random.default_rng(seed)
    X = np.fft.rfft(rng.standard_normal(n))
    f = np.fft.rfftfreq(n, 1/sr)
    X[1:] /= np.sqrt(np.maximum(f[1:], 1e-6))
    y = np.fft.irfft(X, n)
    y /= np.max(np.abs(y) + 1e-12)
    return y.astype(np.float32)

SCALES = {
    'major': [0,2,4,5,7,9,11],
    'natural_minor': [0,2,3,5,7,8,10],
    'dorian': [0,2,3,5,7,9,10],
    'mixolydian': [0,2,4,5,7,9,10],
    'pentatonic': [0,3,5,7,10],
    'harmonic_minor': [0,2,3,5,7,8,11],
    # Extras
    'minor_pentatonic': [0,3,5,7,10],
    'major_pentatonic': [0,2,4,7,9],
    'minor_blues': [0,3,5,6,7,10],
    'major_blues': [0,2,3,4,7,9],
    'lydian': [0,2,4,6,7,9,11],
    'phrygian': [0,1,3,5,7,8,10],
    'aeolian': [0,2,3,5,7,8,10],
    'locrian': [0,1,3,5,6,8,10],
    'melodic_minor': [0,2,3,5,7,9,11],
    'harmonic_major': [0,2,4,5,7,8,11],
    'double_harmonic': [0,1,4,5,7,8,11],
    'whole_tone': [0,2,4,6,8,10],
    'octatonic_whole_half': [0,2,3,5,6,8,9,11],
    'octatonic_half_whole': [0,1,3,4,6,7,9,10],
    'chromatic': list(range(12)),
}

NOTE_TO_MIDI = {
    'C':60,'C#':61,'Db':61,'D':62,'D#':63,'Eb':63,'E':64,'F':65,
    'F#':66,'Gb':66,'G':67,'G#':68,'Ab':68,'A':69,'A#':70,'Bb':70,'B':71
}

def build_scale_pitches(root='C', octave=4, scale='major', low_midi=48, high_midi=84):
    root_midi = NOTE_TO_MIDI[root] + 12*(octave-4)
    pattern = [p % 12 for p in SCALES[scale]]
    allowed = [m for m in range(low_midi, high_midi+1) if ((m - root_midi) % 12) in pattern]
    return np.array(allowed, dtype=int)

def generate_random_melody(duration_s, bpm=100, root='C', octave=4, scale='major',
                           low_midi=55, high_midi=79, step_bias=0.8,
                           leap_max_scale_steps=4, rest_prob=0.1,
                           durations_beats=(0.25, 0.5, 1.0), duration_probs=(0.25, 0.5, 0.25),
                           seed=None):
    rng = np.random.default_rng(seed)
    allowed = build_scale_pitches(root, octave, scale, low_midi, high_midi)
    if len(allowed) < 2:
        raise ValueError('Pitch set too small; adjust range/scale.')
    spb = 60.0 / bpm
    t, t_end = 0.0, duration_s
    events = []
    current_idx = rng.integers(0, len(allowed))
    while t < t_end - 1e-6:
        db = rng.choice(durations_beats, p=duration_probs)
        dur = min(db * spb, t_end - t)
        if rng.random() < rest_prob:
            events.append((t, t+dur, None))
        else:
            if rng.random() < step_bias:
                step = rng.choice([-1, 1])
                ni = np.clip(current_idx + step, 0, len(allowed)-1)
            else:
                leap = rng.integers(-leap_max_scale_steps, leap_max_scale_steps+1)
                if leap == 0: leap = rng.choice([-1, 1])
                ni = np.clip(current_idx + leap, 0, len(allowed)-1)
            current_idx = int(ni)
            events.append((t, t+dur, int(allowed[current_idx])))
        t += dur
    return events

def events_to_f0(events, sr, n_samples, glide_prob=0.25, glide_frac=0.35,
                 vibrato_hz=5.5, vibrato_depth=0.02, seed=None):
    rng = np.random.default_rng(seed)
    t = np.arange(n_samples)/sr
    f0 = np.zeros(n_samples, dtype=float)
    for i, (t0, t1, midi) in enumerate(events):
        s0, s1 = int(round(t0*sr)), int(round(t1*sr))
        if s0 >= n_samples: break
        s1 = min(s1, n_samples)
        if midi is None: continue
        f_curr = midi_to_hz(midi)
        next_midi = events[i+1][2] if (i+1) < len(events) else None
        if next_midi is not None and rng.random() < glide_prob:
            f_next = midi_to_hz(next_midi)
            g_len = max(1, int((s1 - s0) * glide_frac))
            if s0 + g_len <= s1:
                f0[s0:s0+g_len] = np.linspace(f_curr, f_next, g_len, endpoint=False)
                f0[s0+g_len:s1]   = f_next
            else:
                f0[s0:s1] = np.linspace(f_curr, f_next, s1 - s0, endpoint=False)
        else:
            f0[s0:s1] = f_curr
    mask = f0 > 0
    if np.any(mask):
        f0[mask] *= 1.0 + vibrato_depth * np.sin(2*np.pi*vibrato_hz * t[mask])
    if n_samples > 2048:
        win = np.hanning(513); win /= win.sum()
        f0 = np.convolve(f0, win, mode='same')
    return f0

FOCUS_BANDS = {
    'bass':   (40, 300),
    'guitar': (80, 6000),
    'vocal':  (120, 3200),
}

def _soft_band_envelope(f_hz, low_hz, high_hz, sharpness=12.0):
    f = np.asarray(f_hz, dtype=float)
    f_safe = np.maximum(f, 1e-6)
    lf = np.log2(f_safe)
    l0 = np.log2(max(low_hz, 1e-6))
    h0 = np.log2(max(high_hz, low_hz + 1e-6))
    lo = 1.0 / (1.0 + np.exp(-sharpness * (lf - l0)))
    hi = 1.0 / (1.0 + np.exp( sharpness * (lf - h0)))
    return lo * hi

def imprint_melody_focus(noise, sr, f0_hz, gain=8.0, harmonics=10, bw_frac=0.01,
                         focus=None, band_floor_db=-18.0, sharpness=12.0, n_fft=2048):
    hop = n_fft // 4
    freqs, times, Z = stft(noise, fs=sr, nperseg=n_fft, noverlap=n_fft-hop, boundary=None)
    mag, ph = np.abs(Z), np.angle(Z)
    if focus is not None:
        if isinstance(focus, str):
            if focus not in FOCUS_BANDS:
                raise ValueError(f'Unknown focus preset: {focus}')
            low_hz, high_hz = FOCUS_BANDS[focus]
        else:
            low_hz, high_hz = focus
        band = _soft_band_envelope(freqs, low_hz, high_hz, sharpness=sharpness)
        floor = 10.0 ** (band_floor_db / 20.0)
        eq_mask = floor + (1.0 - floor) * band
        mag *= eq_mask[:, None]
    if np.isscalar(f0_hz):
        f0_traj = np.full_like(times, float(f0_hz))
    else:
        f0_time = np.linspace(0, len(noise)/sr, num=len(f0_hz), endpoint=False)
        f0_traj = np.interp(times, f0_time, f0_hz)
    for i, f0 in enumerate(f0_traj):
        if f0 <= 0 or not np.isfinite(f0):
            continue
        mask = np.zeros_like(freqs)
        for k in range(1, harmonics + 1):
            fk = k * f0
            if fk > freqs[-1]: break
            bw = bw_frac * fk
            mask += np.exp(-0.5 * ((freqs - fk) / (bw + 1e-6))**2)
        if mask.max() > 0:
            mask = 1.0 + (gain * (mask / mask.max()))
            mag[:, i] *= mask
    _, y = istft(mag * np.exp(1j * ph), fs=sr, nperseg=n_fft, noverlap=n_fft-hop, boundary=None)
    y = y[:len(noise)]
    y /= np.max(np.abs(y) + 1e-12)
    return y.astype(np.float32)

def rhythmic_gate_from_events(events, sr, n_samples, attack=0.01, release=0.03):
    env = np.zeros(n_samples, dtype=float)
    for (t0, t1, midi) in events:
        s0 = int(np.round(t0*sr)); s1 = int(np.round(t1*sr))
        s0 = max(0, min(n_samples-1, s0)); s1 = max(0, min(n_samples, s1))
        if s1 <= s0: continue
        a = max(1, int(attack * sr)); r = max(1, int(release * sr))
        seg = np.ones(s1 - s0, dtype=float)
        seg[:min(a, len(seg))] *= np.linspace(0, 1, num=min(a, len(seg)), endpoint=False)
        if r < len(seg): seg[-r:] *= np.linspace(1, 0, num=r, endpoint=True)
        env[s0:s1] = np.maximum(env[s0:s1], seg)
    return env


## Config
Flip these switches to control the workflow.

In [None]:
# ---- Global config ----
INPUT_AUDIO = ''  # e.g., '/path/to/your/audio.wav'
SR = 48000
PROMPT_SECONDS = 4.0
PREPEND_SECONDS = PROMPT_SECONDS  # set a separate prepend length if desired

# Toggles
ENABLE_MELODY = True       # Use scale-driven randomized melody
ENABLE_FOCUS = False     # Emphasize a spectral band (bass/guitar/vocal/custom)
ENABLE_GATE = True        # Apply rhythmic gate from melody events
PREPEND_TO_TRACK = False     # Prepend prompt to INPUT_AUDIO if provided
SAVE_OUTPUT = True
SAVE_PROMPT = True            # Also save the standalone y_prompt
PROMPT_ONLY_SUFFIX = '_prompt'

# Focus preset or custom band
FOCUS_PRESET = 'vocal'     # 'bass' | 'guitar' | 'vocal' | None
FOCUS_BAND = (500, 2500)    # Only used if FOCUS_PRESET is None and ENABLE_FOCUS is True

# Melody parameters
MELODY_BPM = 96
MELODY_ROOT = 'A'
MELODY_SCALE = 'minor_blues'  # e.g., 'major', 'dorian', 'minor_blues'
MELODY_LOW_MIDI = 55          # G3
MELODY_HIGH_MIDI = 79         # G5
MELODY_STEP_BIAS = 0.8
MELODY_LEAP_STEPS = 4
MELODY_REST_PROB = 0.12
MELODY_DURS = (0.25, 0.5, 1.0)
MELODY_DUR_PROBS = (0.25, 0.5, 0.25)
MELODY_GLIDE_PROB = 0.25
MELODY_GLIDE_FRAC = 0.35
MELODY_VIBRATO_HZ = 5.5
MELODY_VIBRATO_DEPTH = 0.02
SEED = 7

# Imprint parameters
IMPRINT_GAIN = 8.0
IMPRINT_HARMONICS = 10
IMPRINT_BW_FRAC = 0.01
IMPRINT_NFFT = 2048
FOCUS_FLOOR_DB = -18.0
FOCUS_SHARPNESS = 12.0

# Prepend/save parameters
PROMPT_GAIN_DB = -3.0
FADE_IN_MS = 10
FADE_OUT_MS = 50
OUTPUT_SUFFIX = '_with_prompt'


## Generate Prompt
Creates `y_prompt` based on the toggles.

In [21]:
# Build base pink noise for the prompt window
n = int(SR * PROMPT_SECONDS)
x_pink = pink_noise(n, SR, seed=SEED)
events = None

# Determine focus argument
focus_arg = None
if ENABLE_FOCUS:
    if FOCUS_PRESET is None:
        focus_arg = FOCUS_BAND
    else:
        focus_arg = FOCUS_PRESET

# Melody-driven imprint or alternatives
if ENABLE_MELODY:
    events = generate_random_melody(
        duration_s=PROMPT_SECONDS, bpm=MELODY_BPM, root=MELODY_ROOT, octave=4, scale=MELODY_SCALE,
        low_midi=MELODY_LOW_MIDI, high_midi=MELODY_HIGH_MIDI, step_bias=MELODY_STEP_BIAS,
        leap_max_scale_steps=MELODY_LEAP_STEPS, rest_prob=MELODY_REST_PROB,
        durations_beats=MELODY_DURS, duration_probs=MELODY_DUR_PROBS, seed=SEED
    )
    f0 = events_to_f0(
        events, SR, n_samples=n, glide_prob=MELODY_GLIDE_PROB, glide_frac=MELODY_GLIDE_FRAC,
        vibrato_hz=MELODY_VIBRATO_HZ, vibrato_depth=MELODY_VIBRATO_DEPTH, seed=SEED
    )
    y_prompt = imprint_melody_focus(
        x_pink, SR, f0_hz=f0, gain=IMPRINT_GAIN, harmonics=IMPRINT_HARMONICS, bw_frac=IMPRINT_BW_FRAC,
        focus=focus_arg, band_floor_db=FOCUS_FLOOR_DB, sharpness=FOCUS_SHARPNESS, n_fft=IMPRINT_NFFT
    )
elif ENABLE_FOCUS:
    # No melody, apply only spectral focus shaping to pink noise
    y_prompt = imprint_melody_focus(
        x_pink, SR, f0_hz=0.0, gain=0.0, harmonics=0, bw_frac=IMPRINT_BW_FRAC,
        focus=focus_arg, band_floor_db=FOCUS_FLOOR_DB, sharpness=FOCUS_SHARPNESS, n_fft=IMPRINT_NFFT
    )
else:
    # Straight pink noise
    y_prompt = x_pink.astype(np.float32)

# Optional rhythmic gate from events
if ENABLE_GATE and events is not None:
    gate = rhythmic_gate_from_events(events, SR, n_samples=n, attack=0.01, release=0.03)
    y_prompt = (y_prompt * (0.15 + 0.85 * gate)).astype(np.float32)

# Normalize
peak = np.max(np.abs(y_prompt)) + 1e-12
if peak > 0:
    y_prompt = (y_prompt / peak).astype(np.float32)

print(f'Generated prompt: {len(y_prompt)} samples at {SR} Hz')
display(Audio(y_prompt, rate=SR))


Generated prompt: 192000 samples at 48000 Hz


  _, y = istft(mag * np.exp(1j * ph), fs=sr, nperseg=n_fft, noverlap=n_fft-hop, boundary=None)


## Prepend To Track & Save
Loads your input, prepends the prompt, and writes a WAV.

In [22]:
def apply_fades(y, sr, fade_in_ms=10, fade_out_ms=50):
    y = y.astype(np.float32, copy=True)
    fi = max(0, int(sr * fade_in_ms / 1000.0))
    fo = max(0, int(sr * fade_out_ms / 1000.0))
    if fi > 0:
        y[:fi] *= np.linspace(0.0, 1.0, fi, endpoint=False, dtype=np.float32)
    if fo > 0:
        y[-fo:] *= np.linspace(1.0, 0.0, fo, endpoint=True, dtype=np.float32)
    return y

if PREPEND_TO_TRACK:
    if not INPUT_AUDIO:
        raise RuntimeError('Set INPUT_AUDIO to a valid path before running this cell.')
    in_path = Path(str(INPUT_AUDIO))
    # Load mono float32 at SR
    if ap is not None:
        x, sr_in = ap._load_audio_mono(in_path, SR)
    else:
        data, sr_in = sf.read(str(in_path), always_2d=True, dtype='float32')
        x = data.mean(axis=1)
        if sr_in != SR:
            g = np.gcd(sr_in, SR)
            x = resample_poly(x, SR // g, sr_in // g).astype(np.float32)
    # Prepare prompt length/gain
    prompt = y_prompt.astype(np.float32, copy=False)
    target_len = int(round(PREPEND_SECONDS * SR))
    if len(prompt) >= target_len:
        prompt = prompt[:target_len]
    else:
        pad = np.zeros(target_len - len(prompt), dtype=np.float32)
        prompt = np.concatenate([prompt, pad], axis=0)
    # Gain + fades
    gain = 10 ** (PROMPT_GAIN_DB / 20.0)
    prompt = apply_fades(prompt * gain, SR, FADE_IN_MS, FADE_OUT_MS)
    # Concatenate
    combined = np.concatenate([prompt, x.astype(np.float32, copy=False)], axis=0)
    peak = np.max(np.abs(combined)) + 1e-12
    if peak > 0.999:
        combined = (combined / peak * 0.999).astype(np.float32)
    # Build descriptive suffix to avoid overwrites: include scale, focus, and seed
    scale_tag = MELODY_SCALE if ENABLE_MELODY else 'none'
    if ENABLE_FOCUS:
        if isinstance(FOCUS_PRESET, str):
            focus_tag = FOCUS_PRESET
        else:
            try:
                lo, hi = FOCUS_BAND
                focus_tag = f'band-{int(lo)}-{int(hi)}'
            except Exception:
                focus_tag = 'custom'
    else:
        focus_tag = 'none'
    suffix = f"{OUTPUT_SUFFIX}_scale-{scale_tag}_focus-{focus_tag}_seed-{SEED}"
    out_path = in_path.with_name(in_path.stem + suffix + in_path.suffix)
    if SAVE_OUTPUT:
        sf.write(str(out_path), combined, SR, subtype='PCM_16')
        print(f'Saved combined: {out_path}')
    print(f'Prompt: {len(prompt)/SR:.2f}s, Original: {len(x)/SR:.2f}s, Combined: {len(combined)/SR:.2f}s at {SR} Hz')
    display(Audio(combined, rate=SR))
else:
    print('Skipping prepend: PREPEND_TO_TRACK is False')

# Save standalone prompt (y_prompt) with tags if requested
if SAVE_PROMPT:
    scale_tag = MELODY_SCALE if ENABLE_MELODY else 'none'
    if ENABLE_FOCUS:
        if isinstance(FOCUS_PRESET, str):
            focus_tag = FOCUS_PRESET
        else:
            try:
                lo, hi = FOCUS_BAND
                focus_tag = f'band-{int(lo)}-{int(hi)}'
            except Exception:
                focus_tag = 'custom'
    else:
        focus_tag = 'none'
    if INPUT_AUDIO:
        base = Path(str(INPUT_AUDIO))
        prompt_out = base.with_name(base.stem + f"{PROMPT_ONLY_SUFFIX}_scale-{scale_tag}_focus-{focus_tag}_seed-{SEED}" + base.suffix)
    else:
        prompt_out = Path(f"prompt{PROMPT_ONLY_SUFFIX}_scale-{scale_tag}_focus-{focus_tag}_seed-{SEED}.wav")
    sf.write(str(prompt_out), y_prompt.astype(np.float32, copy=False), SR, subtype='PCM_16')
    print(f'Saved prompt: {prompt_out}')


Saved combined: /path/to/output_with_prompt_scale-none_focus-band-500-2500_seed-7.wav
Prompt: 4.00s, Original: 32.81s, Combined: 36.81s at 48000 Hz


Saved prompt: /path/to/output_prompt_scale-none_focus-band-500-2500_seed-7.wav


### Tips
- Change `MELODY_SCALE` to try 'major', 'dorian', 'minor_blues', etc.
- Use `ENABLE_FOCUS` with `FOCUS_PRESET` ('bass'/'guitar'/'vocal') or `FOCUS_BAND=(low, high)` in Hz.
- Set `ENABLE_MELODY=False` and `ENABLE_FOCUS=False` for straight pink noise prompts.
- Adjust `IMPRINT_GAIN`, `IMPRINT_HARMONICS`, and `IMPRINT_BW_FRAC` to tighten/loosen pitch bias.