In [2]:
! pip install laion-clap



In [6]:
import numpy as np
import librosa
import torch
import laion_clap

In [4]:
# Relative path of the audio track that the audio-embedding cells load.
NAME = "audio/Lady Gaga-Die With A Smile.wav"

In [5]:
# One-off checkpoint download (kept commented out so it does not re-run).
# -O stores the file under the exact name that model.load_ckpt() is given;
# without it wget would presumably keep "?download=true" in the saved file name.
#! wget -O music_audioset_epoch_15_esc_90.14.pt "https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt?download=true"

In [16]:
# quantization
def int16_to_float32(x):
    """Divide ``x`` by 32767.0 and return the result cast to ``np.float32``.

    As the name suggests, this is meant for int16 sample values: +/-32767
    maps to +/-1.0. ``x`` must support ``/`` and the quotient must provide
    ``.astype`` (e.g. a NumPy array).
    """
    scaled = x / 32767.0
    return scaled.astype(np.float32)


def float32_to_int16(x):
    """Clip ``x`` to [-1.0, 1.0], scale by 32767 and cast to ``np.int16``.

    Values outside the unit range saturate at +/-32767. The cast truncates
    toward zero (e.g. 0.5 -> 16383), it does not round.
    """
    bounded = np.clip(x, -1.0, 1.0)
    return (bounded * 32767.0).astype(np.int16)

# Build the CLAP wrapper with feature fusion disabled and the "HTSAT-base"
# audio model, then load weights from a checkpoint file given as a relative
# path (so it is looked up in the current working directory).
# The load call prints a per-parameter "Loaded" status log (see cell output).
model = laion_clap.CLAP_Module(enable_fusion=False, amodel="HTSAT-base")
model.load_ckpt("music_audioset_epoch_15_esc_90.14.pt")


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load the specified checkpoint music_audioset_epoch_15_esc_90.14.pt from users.
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2

In [8]:


# Embed two example sentences with the CLAP text encoder and print the result
# and its shape twice: first as a NumPy array (use_tensor=False), then as a
# torch tensor (use_tensor=True).
text_data = ["I love the contrastive learning", "I love the pretrain model"]

for return_tensor in (False, True):
    text_embed = model.get_text_embedding(text_data, use_tensor=return_tensor)
    print(text_embed)
    print(text_embed.shape)


[[ 0.00123349  0.0132849  -0.03407464 ...  0.00113719 -0.04714473
  -0.0076841 ]
 [-0.06046472  0.01317478 -0.01736684 ... -0.01735987 -0.04660314
  -0.0555802 ]]
(2, 512)
tensor([[ 0.0012,  0.0133, -0.0341,  ...,  0.0011, -0.0471, -0.0077],
        [-0.0605,  0.0132, -0.0174,  ..., -0.0174, -0.0466, -0.0556]],
       grad_fn=<DivBackward0>)
torch.Size([2, 512])


In [9]:

# Embed the track at NAME with the CLAP audio encoder in four ways: from the
# file path and from an in-memory waveform, each returned first as a NumPy
# array and then as a torch tensor. Only the last 20 embedding dimensions are
# printed to keep the output short.

# Files to embed; further paths can be appended to this list.
audio_file = [
    NAME
]

# 1) From file paths -> NumPy array
audio_embed = model.get_audio_embedding_from_filelist(x=audio_file, use_tensor=False)
print(audio_embed[:, -20:])
print(audio_embed.shape)

# Decode the file once and reuse the waveform for both "from data" variants
# below, instead of decoding and resampling the same file a second time.
waveform, _ = librosa.load(NAME, sr=48000)  # sample rate should be 48000
waveform = waveform.reshape(1, -1)  # Make it (1,T) or (N,T)

# 2) From waveform data -> NumPy array
audio_data = waveform
audio_embed = model.get_audio_embedding_from_data(x=audio_data, use_tensor=False)
print(audio_embed[:, -20:])
print(audio_embed.shape)

# 3) From file paths -> torch tensor
audio_embed = model.get_audio_embedding_from_filelist(x=audio_file, use_tensor=True)
print(audio_embed[:, -20:])
print(audio_embed.shape)

# 4) From waveform data -> torch tensor.
# The waveform is round-tripped through int16 (float32 -> int16 -> float32)
# before it is handed to the model as a float tensor.
audio_data = torch.from_numpy(int16_to_float32(float32_to_int16(waveform))).float()
audio_embed = model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
print(audio_embed[:, -20:])
print(audio_embed.shape)




[[-0.01159139  0.04377364  0.02932273 -0.02094874  0.03698318  0.10338023
  -0.01713564 -0.01164776  0.08721245 -0.03339985  0.0123974   0.02579406
  -0.02455246 -0.03345013  0.01063808  0.01403836  0.09404798  0.04904974
  -0.0008765   0.02720405]]
(1, 512)
[[-0.01457467 -0.00046521  0.00349219 -0.00796316  0.04246774  0.07364035
   0.01973836 -0.01102181  0.10100918 -0.02696394  0.0183143   0.03078488
  -0.05095678 -0.03980389  0.01216598  0.01713993  0.0652664   0.04447147
   0.0154278  -0.00927463]]
(1, 512)
tensor([[-0.0178,  0.0013, -0.0306, -0.0027,  0.1069,  0.0490, -0.0499, -0.0061,
          0.0825, -0.1207,  0.0372,  0.0764, -0.0253, -0.0274, -0.0063,  0.0180,
          0.0992,  0.0226,  0.0165,  0.0375]], grad_fn=<SliceBackward0>)
torch.Size([1, 512])
tensor([[-0.0334,  0.0029, -0.0024,  0.0095,  0.0548,  0.0804,  0.0239, -0.0245,
          0.0862, -0.0267,  0.0127,  0.0389, -0.0646, -0.0466,  0.0261,  0.0192,
          0.0647,  0.0568,  0.0148,  0.0060]], grad_fn=<SliceBac

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Mood words to compare against, embedded with the CLAP text encoder.
# use_tensor=False requests the result as a NumPy array rather than a torch tensor.
emotions = ["happy", "sad", "calm"]
emotion_embeds = model.get_text_embedding(emotions, use_tensor=False)

In [None]:
# Scale each row of emotion_embeds to unit L2 norm (norm taken along axis 1,
# kept as a column so it broadcasts across the row).
emotion_norms = np.linalg.norm(emotion_embeds, axis=1, keepdims=True)
emotion_embeds_norm = emotion_embeds / emotion_norms


RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [None]:

# Score one track against a list of mood words by projecting its CLAP audio
# embedding onto each mood's text embedding and reporting the projection
# length relative to the length of the audio embedding.

# Path of the audio file to analyse
audio_file = "audio/Lady Gaga-Die With A Smile.wav"

# Define mood descriptors
emotions = ["happy", "sad", "calm", "romantic", "angry", "fearful", "excited", "relaxed", "tragic"]

# Get text embeddings for emotion descriptors (one row per descriptor)
emotion_embeds = model.get_text_embedding(emotions, use_tensor=False)

# Load audio embedding from file; [0] selects the single file's vector
audio_embed = model.get_audio_embedding_from_filelist(
    x=[audio_file],
    use_tensor=False
)[0]  # Shape: (embedding_dim,)

# Length of the audio embedding, used as the denominator of the ratio
audio_magnitude = np.linalg.norm(audio_embed)

# Compute projection magnitudes and ratios
for emotion, emotion_vector in zip(emotions, emotion_embeds):
    # Vector projection of the audio embedding onto the emotion embedding
    projection_scalar = np.dot(audio_embed, emotion_vector) / np.dot(emotion_vector, emotion_vector)
    projection_vector = projection_scalar * emotion_vector

    # Projection magnitude. NOTE: taking the norm discards the sign of
    # projection_scalar, so an embedding pointing away from the emotion scores
    # the same as one pointing toward it.
    proj_magnitude = np.linalg.norm(projection_vector)

    # Share of the audio embedding's length that lies along this emotion
    ratio = proj_magnitude / audio_magnitude

    print(f"Emotion: {emotion}")
    print(f"  Projection Magnitude: {proj_magnitude:.4f}")
    print(f"  Audio Vector Magnitude: {audio_magnitude:.4f}")
    print(f"  Ratio: {ratio:.4f}")
    print()

Emotion: happy
  Projection Magnitude: 0.2665
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2665

Emotion: sad
  Projection Magnitude: 0.2456
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2456

Emotion: calm
  Projection Magnitude: 0.1484
  Audio Vector Magnitude: 1.0000
  Ratio: 0.1484

Emotion: romantic
  Projection Magnitude: 0.2695
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2695

Emotion: angry
  Projection Magnitude: 0.2329
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2329

Emotion: fearful
  Projection Magnitude: 0.2743
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2743

Emotion: excited
  Projection Magnitude: 0.2438
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2438

Emotion: relaxed
  Projection Magnitude: 0.0737
  Audio Vector Magnitude: 1.0000
  Ratio: 0.0737

Emotion: tragic
  Projection Magnitude: 0.2945
  Audio Vector Magnitude: 1.0000
  Ratio: 0.2945

