In [1]:
from stable_audio_tools.models.pretrained import get_pretrained_model

model, model_config = get_pretrained_model("stabilityai/stable-audio-open-small")


No module named 'flash_attn'
flash_attn not installed, disabling Flash Attention


  WeightNorm.apply(module, name, dim)


In [2]:
import sys
sys.path.append("/workspace/stable-audio-tools")


In [3]:
!pwd

/workspace


In [4]:
from pathlib import Path

AUDIO_DIR = Path("/workspace/data3")
OUTPUT_DIR = Path("/workspace/data3_preencoded_overlap")
SAMPLE_RATE = 44100
SEGMENT_DURATION = 10.0  # seconds
SEGMENT_SAMPLES = int(SAMPLE_RATE * SEGMENT_DURATION)

In [4]:
import torchaudio
import numpy as np
import torch

torch.set_float32_matmul_precision('high')

# === VERIFY MODEL ===
assert hasattr(model, "pretransform") and model.pretransform is not None, \
    "❌ Model must have a `.pretransform` encoder"

model.pretransform.to("cuda").eval()

# === CONFIG ==

OVERLAP_RATIO = 0.25
STEP_SIZE = int(SEGMENT_SAMPLES * (1 - OVERLAP_RATIO))

MIN_RMS_THRESHOLD = 0.01  # skip near-silent segments

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("🔁 Encoding and saving pre-encoded latent segments...")

def is_non_silent(segment, threshold=MIN_RMS_THRESHOLD):
    rms = segment.pow(2).mean().sqrt().item()
    return rms > threshold

total_saved = 0

for file in AUDIO_DIR.glob("*.wav"):
    try:
        # === Load ===
        audio, sr = torchaudio.load(str(file))  # [C, T]
        if audio.numel() == 0:
            print(f"⚠️ Empty audio in {file.name}, skipping.")
            continue

        # === Resample if needed ===
        if sr != SAMPLE_RATE:
            audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(audio)

        # === Mono → Stereo padding ===
        if audio.shape[0] == 1:
            audio = audio.repeat(2, 1)  # [2, T]

        audio_len = audio.shape[1]
        seg_idx = 0

        # === Slice with overlap ===
        for start in range(0, audio_len - SEGMENT_SAMPLES + 1, STEP_SIZE):
            end = start + SEGMENT_SAMPLES
            segment = audio[:, start:end]  # [2, SEGMENT_SAMPLES]

            if not is_non_silent(segment):
                continue

            segment = segment.unsqueeze(0).to("cuda")  # [1, 2, T]

            # === Encode ===
            with torch.no_grad():
                latent = model.pretransform.encode(segment)  # [1, D, T']
                latent = latent.squeeze(0).cpu().numpy()     # [D, T']

            out_path = OUTPUT_DIR / f"{file.stem}_ov{seg_idx:03d}.npy"
            np.save(out_path, latent)

            seg_idx += 1
            total_saved += 1

    except Exception as e:
        print(f"❌ Error processing {file.name}: {e}")

print(f"✅ Done. Saved {total_saved} valid latent segments to: {OUTPUT_DIR}")


🔁 Encoding and saving pre-encoded latent segments...
✅ Done. Saved 431 valid latent segments to: /workspace/data3_preencoded_overlap


In [5]:
from stable_audio_tools.data.dataset import create_dataloader_from_config
import json

with open("/workspace/dataset_config.json") as f:
    dataset_config = json.load(f)

train_loader = create_dataloader_from_config(
    dataset_config,
    batch_size=1,
    sample_size=SAMPLE_RATE * SEGMENT_DURATION,
    sample_rate=44100,
    audio_channels=2,
    num_workers=8
)


Found 431 files


In [6]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [7]:
from stable_audio_tools.training.diffusion import DiffusionCondTrainingWrapper
import pytorch_lightning as pl

training_wrapper = DiffusionCondTrainingWrapper(
    model=model,
    lr=1e-4,
    pre_encoded=True
)

trainer = pl.Trainer(
    max_steps=1200,                     # <-- total steps to train
    accumulate_grad_batches=2,          # <-- simulate larger batch
    precision='16',                        # <-- mixed precision (faster)
    log_every_n_steps=600,               # <-- print loss every 10 steps
    enable_progress_bar=True,           
    enable_checkpointing=False,
    val_check_interval=None,
    strategy='auto',
    devices=1
)

trainer.fit(training_wrapper, train_dataloaders=train_loader)
training_wrapper.export_model("/workspace/stable-audio-tools/saved/final_model.pt")

with open("/workspace/stable-audio-tools/saved/final_model_config.json", "w") as f:
    json.dump(model_config, f, indent=2)



/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:565: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/amp.py:54: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for pe

Training: |                                                                                       | 0/? [00:00…

  with torch.cuda.amp.autocast(dtype=torch.float16) and torch.set_grad_enabled(self.enable_grad):
`Trainer.fit` stopped: `max_steps=1200` reached.


In [8]:
#import torch
#import json
#from stable_audio_tools.models.diffusion import create_diffusion_cond_from_config

# Load saved config
#with open("/workspace/stable-audio-tools/saved/final_model_config.json") as f:
    #config = json.load(f)

# Rebuild model from config
#model = create_diffusion_cond_from_config(config)
#ckpt = torch.load("/workspace/stable-audio-tools/saved/final_model.pt", map_location="cuda")
#model.load_state_dict(ckpt["state_dict"])
#model.to("cuda").eval()


In [9]:
print(torch.hub.get_dir())

/root/.cache/torch/hub


In [8]:
from stable_audio_tools.inference.generation import generate_diffusion_cond

conditioning = [{
    "prompt": "Trap remix with evolving piano, gliding 808s, and hard-hitting drums",
    "seconds_total": 90.0
}]

negative_conditioning = [{
    "prompt": "no ambient drone, no static loop, no fade out",
    "seconds_total": 90.0
}]



SAMPLE_RATE = 44100
DURATION_SEC = sum(p["seconds_total"] for p in conditioning)  # = 90
SAMPLE_SIZE = int(SAMPLE_RATE * DURATION_SEC)  # = 3,969,000


output = generate_diffusion_cond(
    model=model.to("cuda"),       # make 100% sure model is on GPU
    steps=1024,
    cfg_scale=6.5,
    conditioning=conditioning,
    negative_conditioning=negative_conditioning,
    sample_size=SAMPLE_SIZE,       # 9 seconds
    device="cuda"                 # force everything onto GPU
)


1088640747


1024it [19:27,  1.14s/it]


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import torchaudio
from einops import rearrange
import torch

# Rearrange: [B, C, T] -> [C, T] for saving
waveform = rearrange(output, "b c n -> c (b n)")

# Peak normalize and convert to 16-bit PCM
waveform = waveform.to(torch.float32).div(torch.max(torch.abs(waveform))).mul(32767).clamp(-32768, 32767).to(torch.int16).cpu()


In [None]:
torchaudio.save("remix_output_prompt_2048_steps.wav", waveform, sample_rate=44100)
print("✅ Saved to remix_output_prompt_2048_steps.wav")
