# Fine Tuning Resnet
- Prepare data for training/validation. Create dataloader
- Load in resnet model
- Create architecture for fine tuning including pytorch/tensorflow boilerplate

In [None]:
#!pip install -q --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu118

In [None]:
#!pip install -q spotipy

In [10]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
from dotenv import dotenv_values 
import pickle as pkl

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from scipy.spatial.distance import euclidean
from sklearn.model_selection import train_test_split
from transformers import WhisperProcessor, WhisperForConditionalGeneration

#### Create Dataset Class

In [11]:
class SpectrogramDataset(Dataset):
    def __init__(self, file_paths, transform=False, sr=22050, n_mels=128):
        self.file_paths = file_paths
        self.data_index = self._build_index()
        self.sr = sr
        self.n_mels = n_mels
        self.transform = transform
        
    def _build_index(self):
        index = []
        for file_idx, file_path in enumerate(self.file_paths):
            with open(file_path, 'rb') as f:
                data = pkl.load(f)
                for i in range(len(data)):
                    index.append((file_idx, i))
        return index

    def _get_log_mel_spectrogram(self, y):
        # Convert to mel spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=self.n_mels)
        # Convert to log scale
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        log_mel_spectrogram = torch.tensor(log_mel_spectrogram, dtype=torch.float32).unsqueeze(0)
        return log_mel_spectrogram

    def __len__(self):
        return len(self.data_index)

    def __getitem__(self, idx):
        file_idx, data_idx = self.data_index[idx]
        file_path = self.file_paths[file_idx]

        with open(file_path, 'rb') as f:
            data = pkl.load(f)
        
        row = data.iloc[data_idx]
        anchor = row['processed_audio'][0]  # (y, sr)
        positive = row['augmented_audio'][0]
        negative = row['diff_processed_audio'][0]

        # Convert to log mel spectrograms
        anchor_mel = self._get_log_mel_spectrogram(anchor)
        positive_mel = self._get_log_mel_spectrogram(positive)
        negative_mel = self._get_log_mel_spectrogram(negative)
        
        # Apply any transformations
        if self.transform:
            anchors = self.transform(anchors)
            positives = self.transform(positives)
            negatives = self.transform(negatives)

        return anchor_mel, positive_mel, negative_mel


### Split Data and Instantiate Dataset Class

In [12]:
file_paths = [f'/kaggle/input/augmented-audio-10k/batch_{i}_augmented.pkl' for i in range(1,10,1)]

# Split the files instaed of actual data into training/val
train_files, val_files = train_test_split(file_paths, test_size=0.2, random_state=123)

# Instantiate Dataset Classes
train_dataset = SpectrogramDataset(train_files)
val_dataset = SpectrogramDataset(val_files)

# Declare dataloaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=4, pin_memory=True)

### Delcaring the Model
- Define architecture: default resnet with adjusted first conv layer and final FC layer to set num params
- Choose loss function, optimizer, device, etc.

In [None]:
# Define pretrained resnet from Torch Vision resnet 18
class ResNetEmbedding(nn.Module):
    def __init__(self, embedding_dim=128, dropout_rate=0.8):
        # get resnet super class
        super(ResNetEmbedding, self).__init__()
        self.resnet = models.resnet18(weights='DEFAULT')
        # Change structure of first layer to take non RGB images, rest of params same as default
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)
        # Set the last fully connected to a set dimension "embedding_dim" instead of default 1000
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embedding_dim)

    def forward(self, x):
        x = self.resnet(x)
        return F.normalize(x, p=2, dim=1)

In [13]:
class WhisperEmbedding(nn.Module):
    def __init__(self, embedding_dim=128, pretrained_model_name="openai/whisper-tiny"):
        super(WhisperEmbedding, self).__init__()
        self.whisper = WhisperForConditionalGeneration.from_pretrained(pretrained_model_name)
        # Add a final fully connected layer to whisper
        self.fc = nn.Linear(self.whisper.config.hidden_size, embedding_dim)

    def forward(self, x):
        # Feed into whisper encoder
        outputs = self.whisper.encoder(x)
        # Get the mean of the hidden states to create an embedding
        embedding = outputs.last_hidden_state.mean(dim=1)
        # Pass through a linear layer to get the final embedding
        embedding = self.fc(embedding)
        # Normalize the embedding
        return torch.nn.functional.normalize(embedding, p=2, dim=1)

In [16]:
# Choose model, loss, and optimizer
model = ResNetEmbedding()
#model = WhisperEmbedding()
criterion = nn.TripletMarginLoss(margin=1.0, p=2, eps=1e-7)

# Freeze all the layers
for param in model.resnet.parameters():
    param.requires_grad = False

# Turn back on last residual block
for param in model.resnet.layer4.parameters():
    param.requires_grad = True

# Turn back on fully connected layer
for param in model.resnet.fc.parameters():
    param.requires_grad = True

# Use a smaller learning rate for fine-tuning
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay=1e-4)

#optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Declare losses/accuracies
train_losses = []
val_losses = []
baseline_losses = []

num_epochs = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

WhisperEmbedding(
  (whisper): WhisperForConditionalGeneration(
    (model): WhisperModel(
      (encoder): WhisperEncoder(
        (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
        (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
        (embed_positions): Embedding(1500, 384)
        (layers): ModuleList(
          (0-3): 4 x WhisperEncoderLayer(
            (self_attn): WhisperSdpaAttention(
              (k_proj): Linear(in_features=384, out_features=384, bias=False)
              (v_proj): Linear(in_features=384, out_features=384, bias=True)
              (q_proj): Linear(in_features=384, out_features=384, bias=True)
              (out_proj): Linear(in_features=384, out_features=384, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=384, out_features=1536, bias=True)
            (fc2):

### Training Model

In [17]:
# Loop over epochs
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    running_train_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Training {epoch+1}/{num_epochs}", unit="batch")

    # Loop over batches using dataloaders
    for anchors, positives, negatives in train_loader:
        anchors, positives, negatives = anchors.to(device), positives.to(device), negatives.to(device)
        optimizer.zero_grad()
        
        anchor_embeddings = model(anchors)
        positive_embeddings = model(positives)
        negative_embeddings = model(negatives)
        
        loss = criterion(anchor_embeddings, positive_embeddings, negative_embeddings)
        loss.backward()
        optimizer.step()
        
        running_train_loss += loss.item() * anchors.size(0)
        # Update the progress bar by the current batch size
        pbar.update(1)  # Increment the progress bar
        #pbar.update(anchors.size(0))
        #pbar.set_postfix(loss=loss.item())
          
    train_loss = running_train_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Turn on validation/eval mode
    model.eval()
    running_val_loss = 0.0 
    baseline_loss = 0.0   
    val_pbar = tqdm(val_loader, desc=f"Validation {epoch+1}/{num_epochs}", unit="batch")
    # Turn off gradient updates since we're in validation
    with torch.no_grad():
        # Batch loop 
        for anchors, positives, negatives in tqdm(val_loader):
            anchors, positives, negatives = anchors.to(device), positives.to(device), negatives.to(device)
            
            anchor_embeddings = model(anchors)
            positive_embeddings = model(positives)
            negative_embeddings = model(negatives)
            
            loss = criterion(anchor_embeddings, positive_embeddings, negative_embeddings)
            #val_loss += loss.item()

            running_val_loss += loss.item() * anchors.size(0)
            # baseline loss
            baseline_loss += criterion(anchors, positives, negatives).item()
            
            # Update the validation progress bar
            val_pbar.update(1)  # Increment the progress bar
            #val_pbar.update(anchors.size(0))
            #val_pbar.set_postfix(loss=loss.item())
    
    # Calculate average validation loss over the entire dataset
    val_loss = running_val_loss / len(val_loader.dataset)
    val_losses.append(val_loss)
    # Do the same for the baseline
    baseline_avg_loss = baseline_loss / len(val_loader.dataset)
    baseline_losses.append(baseline_avg_loss)

    
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Baseline Loss: {baseline_avg_loss:.4f}")
    
    with open('training_logs.pkl', 'wb') as f:
        pkl.dump((train_losses, val_losses), f)

Training 1/1:   0%|          | 0/219 [00:00<?, ?batch/s]

AttributeError: 'WhisperForConditionalGeneration' object has no attribute 'encoder'

In [None]:
# Plot loss curves for training
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(10, 5))
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')
plt.plot(epochs, baseline_losses, label='Baseline Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.tight_layout()
plt.savefig('resnet-loss-plot.png')
plt.show()


In [None]:
# Save just the model weights (recommended apparently for portability/compatibility)
torch.save(model.state_dict(), 'resnet18_model_weights.pth')

In [None]:
# Save the entire model so we can use it for deployment
torch.save(model, 'resnet18_model.pth')

### Deploy on any 2 songs
- Use model to calculate embeddings (using eval mode specifically and with no gradient updating)
- NOTE: Right now, we have to have the input data in the correct format: a spectrogram/chromagram/tempogram (generically called "gram"). So for any deployment, we'll have to do preprocessing in the streamlit app for example. OR we can have a set of say 10-15 sample songs you can compare where we've already done all of the calculations.
- **Similarity values key:**
    - 0.5 to 1: Very similar. Perhaps the same song.
    - 0 to 0.5: Somewhat similar. Share some key characteristics
    - -1 to 0: Low to no similarity. Different songs.


In [None]:
# How to load the model later using just the state dictionary
model = ResNetEmbedding()  # Make sure this matches the architecture you used
model.load_state_dict(torch.load('resnet18_model_weights.pth'))

# If using a GPU
model.to(device)

In [None]:
def extract_embedding(model, audio_data_clip, sr=22050, use_model=True):
    y = audio_data_clip
    #y, sr = librosa.load(audio_data_clip, sr=sr)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    
    # Convert to tensor and move to the appropriate device
    mel_tensor = torch.tensor(mel_spectrogram_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
    
    if use_model:
        # Get the embedding from the model
        with torch.no_grad():
            embedding = model(mel_tensor)
        
        # Normalize the embedding
        #embedding = F.normalize(embedding, p=2, dim=1)
        return embedding
    else:
        return mel_tensor


def compute_cosine_similarity(embedding1, embedding2):
    # Compute cosine similarity
    cosine_sim = F.cosine_similarity(embedding1, embedding2)
    return cosine_sim.item()  # Convert to a Python float


In [None]:
#explore_df = pd.read_pickle('/kaggle/input/augmented-audio-10k/batch_1_augmented.pkl')
#explore_df.head(2)

In [None]:
# Test on some random training/validation data as sanity check
#y1, y2 = explore_df['processed_audio'][10000][0], explore_df['diff_processed_audio'][10000][0]
#mel1, mel2 = extract_embedding(model, y1), extract_embedding(model, y2)
#compute_cosine_similarity(mel1, mel2)

In [None]:
# Test on some random training/validation data as sanity check
#y1, y3 = explore_df['processed_audio'][10000][0], explore_df['augmented_audio'][10000][0]
#mel1, mel3 = extract_embedding(model, y1), extract_embedding(model, y3)
#compute_cosine_similarity(mel1, mel3)

In [None]:
#criterion(mel1, mel2, mel3)