# Classifier Model
- Merge the cover-song, random-song, and augmented-song datasets.
- Set up a deep learning classifier to be trained from scratch.

In [1]:
import numpy as np
import pandas as pd
import requests
import io
from IPython.display import Audio, display
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import re
import json
import pickle as pkl
import base64
import time
import urllib.parse
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from requests.exceptions import ReadTimeout
from tqdm import tqdm
import sys
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, AddGaussianSNR, ClippingDistortion, Gain
from pydub import AudioSegment
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from scipy.spatial.distance import euclidean
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Model

In [2]:
# Root folder holding the pickled datasets. Set the SONG_SIMILARITY_DATA_DIR
# environment variable to run this notebook on another machine; the default
# keeps the original author's local layout.
DATA_DIR = os.environ.get(
    'SONG_SIMILARITY_DATA_DIR',
    '/Users/reggiebain/erdos/song-similarity-erdos-old/data',
)

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
augmented_df = pd.read_pickle(os.path.join(DATA_DIR, 'augmented_audio', 'batch_1_augmented.pkl'))
cover_df = pd.read_pickle(os.path.join(DATA_DIR, 'test_set_covers.pkl'))

In [3]:
# Preview the first two rows of the cover-song table.
cover_df.head(2)

Unnamed: 0,song_title,artist,album,song,anchors,positives,negatives
0,Claudette,everly_brothers,The_Fabulous_Style_of,01-Claudette,"[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...","[[-80.0, -80.0, -80.0, -61.775627, -48.010227,...","[[-80.0, -80.0, -80.0, -53.419292, -39.97673, ..."
1,I_Don_t_Want_To_Miss_A_Thing,aerosmith,Armageddon_Original_Soundtrack_,01-I_Don_t_Want_To_Miss_A_Thing,"[[-80.0, -80.0, -80.0, -79.25159, -56.510735, ...","[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...","[[-80.0, -80.0, -80.0, -58.83052, -51.61307, -..."


In [4]:
# Preview the first two rows of the augmented-audio table.
augmented_df.head(2)

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,instrumentalness,liveness,valence,tempo,time_signature,processed_audio,augmented_audio,diff_processed_audio,diff_artist,diff_name
10000,TRFNPIK128F9317759,Limelight,Rush,https://p.scdn.co/mp3-preview/6b5b71fc9762eda2...,0dnz7bSs3txd9nGY9e3Mlf,"classic_rock, hard_rock, 80s, progressive_rock",Rock,2006,260066,0.579,...,0.00401,0.286,0.795,131.097,3,"([-0.1274366, -0.21459332, -0.023305148, -0.08...","([-0.18257691, -0.3222155, -0.12718868, -0.017...","([-0.09010448, -0.13527939, -0.14203383, -0.09...",Mr. Big,To Be With You
10001,TRWPWPE128F92CB675,Friends Will Be Friends,Queen,https://p.scdn.co/mp3-preview/771fbc667792ab31...,0nvIhBnscX9w7P2yrqxB6K,"rock, classic_rock, hard_rock, 80s, british",,1986,247840,0.438,...,6e-06,0.345,0.347,75.054,3,"([0.09164546, 0.12546735, 0.06996889, 0.048958...","([0.080816284, 0.06832799, 0.08953604, 0.08095...","([-0.059521385, -0.09814703, 0.0005738288, 0.0...",Eric Clapton,Alberta


In [None]:
def augment_audio(row, which):
    """Run one audio clip through a randomized chain of mild augmentations.

    Args:
        row: Indexable record (e.g. a DataFrame row). ``row[which]`` must itself
            be indexable, with the waveform samples at position 0 and the
            sample rate at position 1.
        which: Key / column name selecting the clip inside ``row``.

    Returns:
        A 2-tuple ``(augmented_samples, sample_rate)``; the sample rate is
        passed through unchanged.
    """
    clip = row[which]
    samples, sample_rate = clip[0], clip[1]

    # Noise, stretch and pitch are each applied half of the time; gain and
    # SNR-based noise are always applied (p=1.0).
    transforms = [
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Gain(min_gain_in_db=-5, max_gain_in_db=5, p=1.0),
        AddGaussianSNR(min_snr_db=5.0, max_snr_db=10.0, p=1.0),
    ]
    pipeline = Compose(transforms)

    return pipeline(samples=samples, sample_rate=sample_rate), sample_rate

In [None]:
# TODO: process each cover song (this cell is still an empty placeholder)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define the CNN model
class AudioEmbeddingCNN(nn.Module):
    """CNN that maps a single-channel 2-D input to an L2-normalized embedding.

    The network is three ``conv -> batch-norm -> ReLU -> 2x2 max-pool`` stages
    followed by two fully connected layers. Before flattening, the feature map
    is adaptively average-pooled to ``pooled_size x pooled_size`` so that the
    first linear layer always receives the size it was built for. For a
    256x256 input and the default ``pooled_size=32`` this pooling is the
    identity, so behavior for that input size is unchanged, while inputs of
    other heights/widths no longer raise a shape mismatch.

    Args:
        embedding_dim: Length of the output embedding vector.
        pooled_size: Side length of the square feature map fed to ``fc1``.
            The default (32) keeps ``fc1`` at ``128 * 32 * 32`` input features.
    """

    def __init__(self, embedding_dim=128, pooled_size=32):
        super().__init__()
        self.pooled_size = pooled_size

        # Convolutional feature extractor (3x3 kernels, spatial size preserved)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)

        # One batch-norm layer per convolution
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)

        # Fully connected head producing the embedding
        self.fc1 = nn.Linear(128 * pooled_size * pooled_size, 256)
        self.fc2 = nn.Linear(256, embedding_dim)

        # Dropout between the two fully connected layers
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Embed a batch of inputs.

        Args:
            x: Tensor of shape ``(batch, 1, height, width)``. Height and width
                must each be at least 8 so that three 2x2 poolings leave a
                non-empty feature map.

        Returns:
            Tensor of shape ``(batch, embedding_dim)`` whose rows have unit
            L2 norm.
        """
        # Each stage halves the spatial resolution.
        x = F.max_pool2d(F.relu(self.bn1(self.conv1(x))), 2)
        x = F.max_pool2d(F.relu(self.bn2(self.conv2(x))), 2)
        x = F.max_pool2d(F.relu(self.bn3(self.conv3(x))), 2)

        # Bring any spatial size to the fixed grid fc1 expects
        # (a no-op when the map is already pooled_size x pooled_size).
        x = F.adaptive_avg_pool2d(x, self.pooled_size)

        # Flatten everything except the batch dimension
        x = x.flatten(1)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        # Normalize the embedding to ensure unit length
        return F.normalize(x, p=2, dim=1)

# Build the embedding network together with its triplet-loss objective and
# Adam optimizer, then place the network on the GPU when one is available.
embedding_dim = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AudioEmbeddingCNN(embedding_dim=embedding_dim)
criterion = nn.TripletMarginLoss(margin=1.0, p=2)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

model.to(device)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SiameseNetwork(nn.Module):
    """Twin-branch wrapper that embeds two inputs with one shared CNN.

    Both branches use the same ``AudioEmbeddingCNN`` instance, so their
    weights are tied.

    Args:
        embedding_dim: Embedding length forwarded to ``AudioEmbeddingCNN``.
    """

    def __init__(self, embedding_dim=128):
        super().__init__()
        # Single encoder reused for both inputs
        self.embedding_cnn = AudioEmbeddingCNN(embedding_dim=embedding_dim)

    def forward(self, x1, x2):
        """Return ``(embedding_of_x1, embedding_of_x2)`` from the shared encoder."""
        return self.embedding_cnn(x1), self.embedding_cnn(x2)

def contrastive_loss(output1, output2, label, margin=1.0):
    """Contrastive loss over a batch of embedding pairs.

    Label convention used by this formula: ``label == 0`` marks a *similar*
    pair (penalized by squared distance) and ``label == 1`` marks a
    *dissimilar* pair (penalized only while the distance is below ``margin``).

    Args:
        output1: Embeddings of the first items, shape ``(batch, dim)``.
        output2: Embeddings of the second items, same shape as ``output1``.
        label: Per-pair labels (0 = similar, 1 = dissimilar). A tensor with
            one entry per pair in any layout, e.g. ``(batch,)`` or
            ``(batch, 1)``, or a scalar.
        margin: Distance beyond which dissimilar pairs contribute no loss.

    Returns:
        Scalar tensor: the mean loss over the batch.
    """
    # Calculate the Euclidean distance between the two outputs
    euclidean_distance = F.pairwise_distance(output1, output2)

    # Labels shaped (batch, 1) would broadcast against the (batch,) distance
    # vector into a (batch, batch) matrix and silently produce a wrong loss.
    # Align them when they hold exactly one value per pair.
    if (
        torch.is_tensor(label)
        and label.shape != euclidean_distance.shape
        and label.numel() == euclidean_distance.numel()
    ):
        label = label.reshape(euclidean_distance.shape)

    # Similar pairs are pulled together; dissimilar pairs are pushed apart
    # until they are at least `margin` away.
    pull_term = (1 - label) * torch.pow(euclidean_distance, 2)
    push_term = label * torch.pow(torch.clamp(margin - euclidean_distance, min=0.0), 2)
    return (pull_term + push_term).mean()

# Build the Siamese model and its Adam optimizer, then place the model on the
# GPU when one is available.
embedding_dim = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SiameseNetwork(embedding_dim=embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

model.to(device)


In [None]:
# Train the Siamese model with the contrastive loss.
# NOTE(review): `train_loader` is not created in any cell visible here --
# confirm it is defined before this cell is run.
num_epochs = 10  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for song1, song2, labels in train_loader:
        # Move the batch to the same device as the model
        song1 = song1.to(device)
        song2 = song2.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Embed both songs and score the pair
        embed1, embed2 = model(song1, song2)
        loss = contrastive_loss(embed1, embed2, labels)

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

        # `loss` is a batch mean, so weight it by the batch size
        running_loss += song1.size(0) * loss.item()

    # Per-sample average loss over the whole epoch
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
