In [1]:
!pip install -q audiocraft

In [2]:
!pip install -q laion_clap

In [3]:
!git clone https://huggingface.co/lukewys/laion_clap

fatal: destination path 'laion_clap' already exists and is not an empty directory.


In [4]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy.io.wavfile
import numpy as np
import librosa
import torch
import torchaudio
from scipy.signal import hilbert
from pathlib import Path
from audiocraft.metrics import CLAPTextConsistencyMetric

    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.1.0+cu118)
    Python  3.10.13 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [5]:
!pip install accelerate xformers transformers



In [6]:
!pip install git+https://github.com/huggingface/diffusers

Collecting git+https://github.com/huggingface/diffusers
  Cloning https://github.com/huggingface/diffusers to /tmp/pip-req-build-1_4lj3uh
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/diffusers /tmp/pip-req-build-1_4lj3uh
  Resolved https://github.com/huggingface/diffusers to commit 0a401b95b7f298b3d029576e1d65d99f06ed1228
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [7]:
from diffusers import AudioLDMPipeline

In [8]:
class MusicGenSmall:
    def __init__(self, model_name="facebook/musicgen-small"):
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)

    def generate_music(self, prompt):
        inputs = self.processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        )
        audio_values = self.model.generate(**inputs, max_new_tokens=1503)
        return audio_values[0, 0].numpy()

class MusicGenMedium:
    def __init__(self, model_name="facebook/musicgen-medium"):
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)

    def generate_music(self, prompt):
        inputs = self.processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        )
        audio_values = self.model.generate(**inputs, max_new_tokens=1503)
        print(audio_values[0, 0])
        return audio_values[0, 0].numpy()

class Diffusersldm:
    def __init__(self, model_name="cvssp/audioldm"):
        self.pipe = AudioLDMPipeline.from_pretrained(model_name, torch_dtype=torch.float16)

    def generate_music(self, prompt):
        self.pipe = self.pipe.to("cuda")
        audio_values = self.pipe(prompt, num_inference_steps=10, audio_length_in_s=30.0).audios[0]
        return audio_values

In [9]:
class WavFileWriter:
    def write_wav_file(self, audio_values, file_path, sampling_rate):
        scipy.io.wavfile.write(file_path, rate=sampling_rate, data=audio_values)
        print(file_path)
        return file_path

In [10]:
class MetricEvaluator:
    @staticmethod
    def calculate_snr(file_path):
        audio_signal, _ = librosa.load(file_path, sr=None)
        signal_power = np.sum(audio_signal**2)
        noise_power = np.sum(librosa.effects.preemphasis(audio_signal)**2)
        snr = 10 * np.log10(signal_power / noise_power)
        return snr

    @staticmethod
    def calculate_smoothness(file_path):
        audio_signal, _ = torchaudio.load(file_path)
        amplitude_envelope = torch.from_numpy(np.abs(hilbert(audio_signal[0].numpy())))
        smoothness = 0.0
        for i in range(1, len(amplitude_envelope)):
            smoothness += torch.abs((amplitude_envelope[i] - amplitude_envelope[i-1]) / (i - (i-1)))
        smoothness /= len(amplitude_envelope) - 1
        return smoothness.item()

    @staticmethod
    def calculate_consistency(file_path, text):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        clap_metric = CLAPTextConsistencyMetric('/content/laion_clap/630k-best.pt').to(device)
        def convert_audio(audio, from_rate, to_rate, to_channels):
          resampler = torchaudio.transforms.Resample(orig_freq=from_rate, new_freq=to_rate)
          audio = resampler(audio)
          if to_channels == 1:
              audio = audio.mean(dim=0, keepdim=True)
          return audio

        audio, sr = torchaudio.load(file_path)
        audio = convert_audio(audio, from_rate=sr, to_rate=48000, to_channels=1)

        clap_metric.update(audio.unsqueeze(0), [text], torch.tensor([audio.shape[1]]), torch.tensor([48000]))
        consistency_score = clap_metric.compute()
        return consistency_score

In [11]:
class MusicQualityEvaluator:
    def __init__(self, generator, writer, metric_evaluator):
        self.generator = generator
        self.writer = writer
        self.metric_evaluator = metric_evaluator

    def evaluate_music_quality(self, prompt, output_file_path, sample_rate=16000):
        audio_values = self.generator.generate_music(prompt)
        file_path = self.writer.write_wav_file(audio_values, output_file_path, sample_rate)

        snr_value = self.metric_evaluator.calculate_snr(file_path)
        print(f'SNR: {snr_value} dB')
        smoothness_score = self.metric_evaluator.calculate_smoothness(file_path)
        print(f'Smoothness Score: {smoothness_score}')
        consistency_score = self.metric_evaluator.calculate_consistency(file_path, prompt)
        print(f"Consistency Score: {consistency_score}")

        # Normalize scores and calculate aggregate score
        normalized_snr = snr_value / 20.0
        normalized_smoothness = smoothness_score
        normalized_consistency = consistency_score

        aggregate_score = (normalized_snr + normalized_smoothness + normalized_consistency) / 3.0
        print(f"Aggregate Score: {aggregate_score}")

In [12]:
model_name = "facebook/musicgen-small"
if model_name == "facebook/musicgen-small":
  generator = MusicGenSmall(model_name)
elif model_name == "facebook/musicgen-medium":
  generator = MusicGenMedium(model_name)
elif model_name == "cvssp/audioldm":
  generator = Diffusersldm(model_name)

writer = WavFileWriter()
metric_evaluator = MetricEvaluator()
output_file_path = "music_out.wav"

quality_evaluator = MusicQualityEvaluator(generator, writer, metric_evaluator)
prompt = "Imagine a pop song inspired by a whimsical and fantastical world. Use playful lyrics and a vibrant musical arrangement to transport the audience to a magical wonderland filled with joy and enchantment."
quality_evaluator.evaluate_music_quality(prompt, output_file_path)



music_out.wav
SNR: 11.478888988494873 dB
Smoothness Score: 0.00999988242983818


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Consistency Score: 0.4754309356212616
Aggregate Score: 0.35312508915861446
