# Simple, ready-to-use steps for generating music from text, using the Hugging Face model hub to download the MusicGen model.

In [1]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch
import numpy as np
import IPython.display as ipd
import scipy.io.wavfile as wavfile

In [18]:
%%time
# Load the processor and model
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

# Prepare the inputs
#text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"],
inputs = processor(
    #text=["80s pop track with bassy drums and synth"],
    text=["Soft flute spritual music"],
    padding=True,
    return_tensors="pt",
)

CPU times: user 9.26 s, sys: 1.54 s, total: 10.8 s
Wall time: 11.6 s


In [19]:
%%time
# Generate audio values
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

# Assuming audio_values is a tensor of shape (batch_size, 1, num_samples)
# Convert the tensor to a numpy array
audio_array = audio_values.detach().cpu().numpy()

# Debugging: Print the shape and range of the audio array
print("Audio array shape:", audio_array.shape)
print("Audio array min value:", np.min(audio_array))
print("Audio array max value:", np.max(audio_array))

Audio array shape: (1, 1, 161920)
Audio array min value: -0.24103218
Audio array max value: 0.24920416
CPU times: user 1min 53s, sys: 881 ms, total: 1min 54s
Wall time: 57.2 s


In [20]:
# Convert the generated waveform to 16-bit PCM samples for the WAV writer.
#
# Start from the untouched `audio_values` tensor instead of rewriting
# `audio_array` in place, so re-running this cell always gives the same
# result (clipping and re-scaling an already-int16 array would destroy it).
#
# [0, 0] picks the first clip of the batch and its single channel explicitly.
# A blanket squeeze() would leave a (batch, samples) array for batches > 1,
# which the WAV writer would interpret as (samples, channels).
waveform = audio_values[0, 0].detach().cpu().numpy()

# Clip to [-1, 1] so the scaling below cannot overflow the int16 range
waveform = np.clip(waveform, -1.0, 1.0)

# Scale to the int16 range and convert to 16-bit PCM
audio_array = (waveform * 32767).astype(np.int16)

CPU times: user 1.17 ms, sys: 1.04 ms, total: 2.21 ms
Wall time: 1.32 ms


In [21]:
# Save the audio as a WAV file at the rate the model actually generates.
# MusicGen's audio codec produces 32 kHz audio; read the rate from the model
# config rather than hardcoding it -- writing the file at 16000 Hz makes the
# clip play at half speed and an octave too low.
sample_rate = model.config.audio_encoder.sampling_rate
wavfile.write("generated_audio.wav", sample_rate, audio_array)

# Play the audio in the notebook (must stay the last expression of the cell
# so the player widget is displayed)
ipd.Audio("generated_audio.wav")

CPU times: user 1.54 ms, sys: 20 µs, total: 1.56 ms
Wall time: 1.17 ms
