# **UAS Deep Learning**

- Audrey Josephine (202000249)
- Jessica Ong (202000204)
- Samuel Marcellino (202000202)

In [12]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#Install Converter Audio
!apt-get install ffmpeg

In [None]:
!pip install --upgrade pip
!pip install --upgrade transformers scipy

In [15]:
!pip install Cython

[0m

# **MusicGen**
Generate high-quality music samples based on text descriptions or audio prompts

### How MusicGen works
1. Input text processed into a sequence of tokens
2. Use tokens to generate discrete audio tokens, done using Transformer architecture.
3. Audio tokens/codes are decoded to recover audio waveform (final audio output)
<br>

Tokens are processed and converted into a numerical format (token IDs). The model will identify patterns in the numbers. Token IDs are fed into the model and the model will generate new numbers.


In [16]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio
import torch

# processor : responsible for converting input data into a format the model can understand
# This involves tokenization, converting text into a sequence of tokens (smaller pieces of text)
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)



MusicgenForConditionalGeneration(
  (text_encoder): T5EncoderModel(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): L

In [33]:
inputs = processor(
    text=[
        "kpop music with mellow piano",
        "kpop music with electronic hip-hop and techno synths with fun melody"
    ],

    # Ensures that all inputs are the same length
    padding=True,

    # pt: PyTorch Tensors
    return_tensors="pt",
)
inputs.to(device)

# max_new_token: specifies the maximum length of the generated music
audio_values = model.generate(**inputs, max_new_tokens=1024)

In [23]:
sampling_rate = model.config.audio_encoder.sampling_rate
Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)

In [24]:
Audio(audio_values[1].cpu().numpy(), rate=sampling_rate)

## **Audio Tokens**

In [27]:
import librosa
import numpy as np

def load_and_tokenize_audio(file_path, token_size=1000):
    # Load audio file
    audio_data, sampling_rate = librosa.load(file_path)

    # Break down the audio into smaller chunks (audio tokens)
    audio_tokens = [audio_data[i:i+token_size] for i in range(0, len(audio_data), token_size)]

    # Convert each token to a numerical representation
    numerical_tokens = [np.mean(token) for token in audio_tokens]  # Using the mean as a simple representation

    return numerical_tokens, audio_data

# Example usage
file_path = '/content/gdrive/MyDrive/deep_learning/uas/input_music/heize-traveler.mp3'
audio_tokens, audio_sample = load_and_tokenize_audio(file_path, token_size=1000)

print("First few audio tokens:", audio_tokens[:5])

First few audio tokens: [0.0, 0.0, 0.0, 0.0, 0.0]


In [29]:
audio_sample.shape

(2998017,)

In [31]:
audio_sample[100:110]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [32]:
audio_tokens[100:110]

[0.0033245233,
 -0.008424482,
 0.0050359573,
 -0.003253939,
 0.008001808,
 -0.0030457806,
 -0.0070775985,
 -0.0004937063,
 -8.111358e-06,
 0.0018722011]