In [1]:
import torch
import torch.nn.functional as F

from model import Song2Vec

input_shape = (1024, 2048, 3)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Song2Vec().to(DEVICE)

# Generate two random inputs
input1 = torch.randn(1, *input_shape).to(DEVICE)
input2 = torch.randn(1, *input_shape).to(DEVICE)

# Get embeddings
_, embed1 = model.encode(input1)
_, embed2 = model.encode(input2)

# Compare embeddings
print("Are embeddings equal?", torch.allclose(embed1, embed2))
print("Embedding cosine similarity:", F.cosine_similarity(embed1, embed2))


Are embeddings equal? False
Embedding cosine similarity: tensor([0.9717], device='cuda:0', grad_fn=<SumBackward1>)


In [2]:
# Replicate forward pass manually and check cosine similarity for intermediate outputs

def cosine_similarity(x1, x2):
    return F.cosine_similarity(x1.flatten(), x2.flatten(), dim=0)

x1 = input1.permute(0, 3, 2, 1)
x2 = input2.permute(0, 3, 2, 1)

# Encoder
x1 = model.encoder(x1)
x2 = model.encoder(x2)
print("After encoder:", cosine_similarity(x1, x2).item())

# Positional encoding
B, L, D = x1.shape
pos = model.w_pe(torch.arange(L, device=DEVICE)).unsqueeze(0).expand(B, -1, -1) * 0.01
context1 = x1 + pos
context2 = x2 + pos
print("After positional encoding:", cosine_similarity(context1, context2).item())

# Add query vector and apply transformer encoder
context_w_query1 = torch.cat([context1, model.query_vector.expand(B, -1, -1)], dim=1)
context_w_query2 = torch.cat([context2, model.query_vector.expand(B, -1, -1)], dim=1)
context_w_query1 = model.transformer_encoder(context_w_query1)
context_w_query2 = model.transformer_encoder(context_w_query2)
print("After transformer encoder:", cosine_similarity(context_w_query1, context_w_query2).item())

# Separate context and z
context1 = context_w_query1[:, :-1, :]
context2 = context_w_query2[:, :-1, :]
z1 = context_w_query1[:, -1, :]
z2 = context_w_query2[:, -1, :]
print("Context after separation:", cosine_similarity(context1, context2).item())
print("Z after separation:", cosine_similarity(z1, z2).item())

# Normalize z
z1 = F.normalize(z1.squeeze(1), p=2, dim=1)
z2 = F.normalize(z2.squeeze(1), p=2, dim=1)
print("After normalization (final embedding):", cosine_similarity(z1, z2).item())

# Decoder (optional, as it's not part of the embedding process)
x1 = model.transformer_decoder(context1)
x2 = model.transformer_decoder(context2)
print("After transformer decoder:", cosine_similarity(x1, x2).item())

x1 = model.decoder(x1)
x2 = model.decoder(x2)
print("After CNN decoder:", cosine_similarity(x1, x2).item())

x1 = x1.permute(0, 3, 2, 1)
x2 = x2.permute(0, 3, 2, 1)
print("Final output:", cosine_similarity(x1, x2).item())


After encoder: 0.7035765647888184
After positional encoding: 0.70637047290802
After transformer encoder: 0.4619770050048828
Context after separation: 0.46099618077278137
Z after separation: 0.9641786813735962
After normalization (final embedding): 0.9641786813735962
After transformer decoder: 0.6729239225387573
After CNN decoder: 0.9978883862495422
Final output: 0.9978883266448975


In [3]:
from baseline_model import Song2Vec

model = Song2Vec().to(DEVICE)

# Generate two random inputs
input1 = torch.randn(1, *input_shape).to(DEVICE)
input2 = torch.randn(1, *input_shape).to(DEVICE)


In [4]:
# Replicate forward pass manually and check cosine similarity for intermediate outputs

def cosine_similarity(x1, x2):
    return F.cosine_similarity(x1.flatten(), x2.flatten(), dim=0)

# x shape: (B, H, W, C)
x1 = input1
x2 = input2

# Encoder
x1 = x1.permute(0, 3, 1, 2)  # Shape: (B, C, H, W)
x2 = x2.permute(0, 3, 1, 2)  # Shape: (B, C, H, W)
x1 = model.encoder(x1)        # Shape: (B, H_enc, W_enc)
x2 = model.encoder(x2)        # Shape: (B, H_enc, W_enc)
print("After encoder:", cosine_similarity(x1, x2).item())

B, H_enc, W_enc = x1.shape
x1 = x1.permute(0, 2, 1)     # Shape: (B, W_enc, H_enc)
x2 = x2.permute(0, 2, 1)     # Shape: (B, W_enc, H_enc)
x1 = x1.contiguous().view(B, W_enc, -1)  # Shape: (B, W_enc, H_enc)
x2 = x2.contiguous().view(B, W_enc, -1)  # Shape: (B, W_enc, H_enc)
print("After reshape:", cosine_similarity(x1, x2).item())

# GRU Encoder
out1, h_n1 = model.gru_encoder(x1)  # out shape: (B, W_enc, 1024), h_n shape: (num_layers * num_directions, B, 512)
out2, h_n2 = model.gru_encoder(x2)
print("After GRU encoder (out):", cosine_similarity(out1, out2).item())
print("After GRU encoder (h_n):", cosine_similarity(h_n1, h_n2).item())

z1 = h_n1[-1]  # Taking the last layer's hidden state as the latent vector, shape: (B, 512)
z2 = h_n2[-1]
print("Z before normalization:", cosine_similarity(z1, z2).item())

z1 = F.normalize(z1, p=2, dim=1)  # Normalize the latent vector
z2 = F.normalize(z2, p=2, dim=1)
print("After normalization (final embedding):", cosine_similarity(z1, z2).item())

# Decoder (optional, as it's not part of the embedding process)
out1, _ = model.gru_decoder(out1)  # Shape: (B, W_enc, 1024)
out2, _ = model.gru_decoder(out2)
print("After GRU decoder:", cosine_similarity(out1, out2).item())

out1 = model.fc_dec(out1)
out2 = model.fc_dec(out2)
print("After fc_dec:", cosine_similarity(out1, out2).item())

x1 = model.decoder(out1.permute(0, 2, 1))  # Shape: (B, 3, H, W)
x2 = model.decoder(out2.permute(0, 2, 1))
print("After CNN decoder:", cosine_similarity(x1, x2).item())

x1 = x1.permute(0, 2, 3, 1)  # Shape: (B, H, W, C)
x2 = x2.permute(0, 2, 3, 1)
print("Final output:", cosine_similarity(x1, x2).item())


After encoder: 0.5409560203552246
After reshape: 0.5409560203552246
After GRU encoder (out): 0.988620400428772
After GRU encoder (h_n): 0.9798527359962463
Z before normalization: 0.9914640784263611
After normalization (final embedding): 0.9914641380310059
After GRU decoder: 0.9971319437026978
After fc_dec: 0.998042106628418
After CNN decoder: 1.0
Final output: 1.0


In [5]:
from baseline_model import Song2Vec

model = Song2Vec().to(DEVICE)
state_dict = torch.load("epoch_5.pt")["model_state_dict"]
wo_orig_mod = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
model.load_state_dict(wo_orig_mod)

model.eval()

with torch.no_grad():
    output = model(input1)
    print(output)

(tensor([[[[ 0.2393,  0.3988, -0.1561],
          [ 2.0803,  0.3250, -0.5943],
          [-0.0706,  1.0512,  0.1907],
          ...,
          [-4.8030,  0.5056, -0.0531],
          [-4.0531,  0.3901, -0.0793],
          [-1.7387,  0.2568, -0.1113]],

         [[-0.3576,  0.4786, -0.0206],
          [ 2.0067,  0.6090, -0.5994],
          [-0.8384,  1.3359,  0.4703],
          ...,
          [-7.4869,  0.4950, -0.0634],
          [-5.9977,  0.4306, -0.0530],
          [-2.6090,  0.2714, -0.1381]],

         [[-1.4565,  0.4040, -0.0280],
          [ 0.1485,  0.7888, -0.3657],
          [-3.0122,  1.0738,  0.5432],
          ...,
          [-9.2580,  0.3919, -0.0879],
          [-8.3015,  0.3340, -0.1558],
          [-3.1231,  0.2765, -0.1550]],

         ...,

         [[ 9.8281,  0.3481,  0.3886],
          [17.0836,  0.6651,  0.6850],
          [17.1621,  0.7136,  0.9167],
          ...,
          [ 8.4461,  0.8070,  0.6490],
          [ 9.2588,  0.5422,  0.3440],
          [ 4.5849,  

  state_dict = torch.load("epoch_5.pt")["model_state_dict"]


In [6]:
import glob
import numpy as np
from preprocess import process_audio_file
from sklearn.metrics.pairwise import cosine_similarity

# Get first 100 files from fma_small
fma_small_files = sorted(glob.glob('fma_small/**/*.mp3'))[:100]



In [7]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset

class AudioDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        audio_array = process_audio_file(self.file_paths[idx])
        audio_array = np.ascontiguousarray(audio_array)
        return torch.from_numpy(audio_array).float()

# Create dataset and dataloader
dataset = AudioDataset(fma_small_files)
batch_size = 16  # Adjust based on your GPU memory
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Process files and calculate embeddings in batches
embeddings = []
model.eval()
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing audio files"):
        batch = batch.to(DEVICE)
        _, batch_embeddings = model.encode(batch)
        embeddings.append(batch_embeddings.cpu().numpy())

embeddings = np.vstack(embeddings)

Processing audio files: 100%|██████████| 7/7 [00:49<00:00,  7.14s/it]


In [8]:
# Process a single song as the query (using the first song in the list)
query_file = fma_small_files[0]
query_audio = process_audio_file(query_file)
query_audio = np.ascontiguousarray(query_audio)  # Ensure contiguous memory layout
query_tensor = torch.from_numpy(query_audio).float().unsqueeze(0).to(DEVICE)
with torch.no_grad():
    _, query_embedding = model.encode(query_tensor)
query_embedding = query_embedding.cpu().numpy()

# Calculate cosine similarities
similarities = cosine_similarity(query_embedding, embeddings)[0]

# Get top 5 most similar and least similar songs
top_5_indices = np.argsort(similarities)[-5:][::-1]
bottom_5_indices = np.argsort(similarities)[:5]
top_5_similarities = similarities[top_5_indices]
bottom_5_similarities = similarities[bottom_5_indices]

print(f"Top 5 most similar songs to {query_file}:")
for i, (index, similarity) in enumerate(zip(top_5_indices, top_5_similarities), 1):
    print(f"{i}. Song {index+1}: Similarity = {similarity:.4f}")

print(f"\nTop 5 least similar songs to {query_file}:")
for i, (index, similarity) in enumerate(zip(bottom_5_indices, bottom_5_similarities), 1):
    print(f"{i}. Song {index+1}: Similarity = {similarity:.4f}")

Top 5 most similar songs to fma_small/000/000002.mp3:
1. Song 1: Similarity = 1.0000
2. Song 2: Similarity = 0.9177
3. Song 39: Similarity = 0.9126
4. Song 42: Similarity = 0.9045
5. Song 41: Similarity = 0.8976

Top 5 least similar songs to fma_small/000/000002.mp3:
1. Song 30: Similarity = -0.8524
2. Song 47: Similarity = -0.8504
3. Song 74: Similarity = -0.8091
4. Song 76: Similarity = -0.8009
5. Song 69: Similarity = -0.7957


In [9]:
import ipywidgets as widgets
from IPython.display import display, Audio
import librosa

def play_audio(file_path):
    y, sr = librosa.load(file_path, duration=30)  # Load up to 30 seconds
    return Audio(data=y, rate=sr)

def create_audio_player(file_path, title):
    audio = play_audio(file_path)
    return widgets.VBox([
        widgets.HTML(f"<b>{title}</b>"),
        widgets.HTML(audio._repr_html_())
    ])

# Create audio players for query song and top 5 similar songs
query_player = create_audio_player(query_file, "Query Song")

top_5_players = [
    create_audio_player(fma_small_files[index], f"Top {i+1} Similar (Similarity: {similarity:.4f})")
    for i, (index, similarity) in enumerate(zip(top_5_indices, top_5_similarities))
]

# Create audio players for bottom 5 similar songs
bottom_5_players = [
    create_audio_player(fma_small_files[index], f"Bottom {i+1} Similar (Similarity: {similarity:.4f})")
    for i, (index, similarity) in enumerate(zip(bottom_5_indices, bottom_5_similarities))
]

# Display all players
display(widgets.VBox([
    query_player,
    widgets.HBox([
        widgets.VBox([widgets.HTML("<b>Top 5 Most Similar</b>")] + top_5_players),
        widgets.VBox([widgets.HTML("<b>Bottom 5 Least Similar</b>")] + bottom_5_players)
    ])
]))


VBox(children=(VBox(children=(HTML(value='<b>Query Song</b>'), HTML(value='\n                <audio  controls=…