<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/audio/AudioLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation

In [None]:
!pip install diffusers transformers accelerate

Library import

In [None]:
import torch
from diffusers import AudioLDMPipeline

from transformers import AutoProcessor, ClapModel

Load model

In [None]:
# Select device and dtype: float16 when CUDA is available, float32 on CPU.
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load the diffusers pipeline and move it to the selected device.
repo_id = "cvssp/audioldm-m-full"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# torch.compile only exists in PyTorch >= 2.0; on older installs keep the
# uncompiled UNet instead of failing with AttributeError.
if hasattr(torch, "compile"):
    pipe.unet = torch.compile(pipe.unet)

# CLAP model and its processor (only required for automatic scoring of
# multiple candidates in score_waveforms).
clap_repo_id = "sanchit-gandhi/clap-htsat-unfused-m-full"
clap_model = ClapModel.from_pretrained(clap_repo_id).to(device)
processor = AutoProcessor.from_pretrained(clap_repo_id)

# Random generator created on the same device; it is seeded in the inference cell.
generator = torch.Generator(device)

Utility function

In [None]:
def score_waveforms(text, waveforms):
    """Pick the candidate waveform that the CLAP model scores highest for ``text``.

    Args:
        text: Prompt the candidates are compared against.
        waveforms: Indexable collection of candidate waveforms. It is handed
            to the processor as a list and indexed with the winning position.

    Returns:
        The entry of ``waveforms`` at the arg-max position of the audio-text
        probabilities. The arg-max is taken over the whole probability tensor,
        so this presumably expects a single prompt -- TODO confirm before
        passing a list of prompts.
    """
    encoded = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
    # Move every processor output onto the device the CLAP model lives on.
    model_inputs = {name: encoded[name].to(device) for name in encoded}
    with torch.no_grad():
        # logits_per_text holds the audio-text similarity scores.
        similarity = clap_model(**model_inputs).logits_per_text
        # Softmax over the last axis turns the scores into probabilities.
        probabilities = similarity.softmax(dim=-1)
        # Position of the most probable candidate.
        best_index = torch.argmax(probabilities)
    return waveforms[best_index]

Input settings

In [None]:
# Prompt describing the sound to generate, and the prompt to steer away from.
text = "A hammer is hitting a wooden surface"
negative_prompt = "low quality, average quality"

# Sampling controls consumed by the inference cell.
random_seed = 45      # seed applied to the generator before sampling
n_candidates = 1      # waveforms per prompt; more than one triggers CLAP scoring
guidance_scale = 2.5  # forwarded to the pipeline as guidance_scale
duration = 5          # forwarded to the pipeline as audio_length_in_s

Inference

In [None]:
# Run the pipeline with a freshly seeded generator and keep only the generated
# audio. A falsy n_candidates (0 / None) falls back to a single waveform.
waveforms = pipe(
    text,
    negative_prompt=negative_prompt,
    audio_length_in_s=duration,
    guidance_scale=guidance_scale,
    num_waveforms_per_prompt=n_candidates or 1,
    generator=generator.manual_seed(int(random_seed)),
)["audios"]

# With several candidates, let CLAP choose the best one; otherwise take the only one.
waveform = score_waveforms(text, waveforms) if len(waveforms) > 1 else waveforms[0]

Display and save audio

In [None]:
# Import the submodules explicitly: a bare `import scipy` / `import IPython`
# is not guaranteed to load `scipy.io.wavfile` / `IPython.display`.
import IPython.display
import scipy.io.wavfile

# Sample rate passed both to the WAV writer and to the inline player.
SAMPLE_RATE = 16000

# Write the selected waveform to disk, then show an audio player as the
# cell's output (the last expression is what the notebook renders).
scipy.io.wavfile.write("hammer.wav", rate=SAMPLE_RATE, data=waveform)
IPython.display.Audio(waveform, rate=SAMPLE_RATE)