In [5]:
import os
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.plotting import output_notebook, output_file, reset_output
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
import torch

## FCN Example

In [7]:
# Directory that holds the sample audio files; get_frame_embeddings joins
# each MP3 file name onto this path before loading it.
sample_dir = "./dataset/NIPS_Workshop"

In [8]:
# File names of the demo tracks. Each list holds two entries for one song:
# the "*_original.mp3" file followed by the "*_female.mp3" file.
BohemianRhapsody = ['BohemianRhapsody_original.mp3','BohemianRhapsody_female.mp3']
UPtownFunk = ['UptownFunk_original.mp3','UptownFunk_female.mp3']

# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so the notebook still runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Build the model and restore its weights from the checkpoint on disk.
# NOTE(review): `load_model` is not imported in the visible import cell --
# confirm where it comes from before running on a fresh kernel.
input_length, model, checkpoint_path = load_model(59049, "FCN037")
model = model.to(device)

# NOTE(review): torch.load unpickles the file, so only open trusted checkpoints.
state_dict = torch.load(checkpoint_path, map_location=device)

# Checkpoint parameter names are presumably of the form "model.<name>" (e.g.
# saved from a wrapper that holds the network as `.model`) -- TODO confirm.
# Strip only that *leading* prefix: splitting on "model." and taking element
# [1] returns the wrong name when "model." appears again later in the key,
# and raises IndexError when the key does not contain it at all.
# Keys without the prefix are passed through unchanged so that
# load_state_dict reports them instead of this cell crashing.
CHECKPOINT_KEY_PREFIX = "model."
new_state_map = {
    model_key: (
        model_key[len(CHECKPOINT_KEY_PREFIX):]
        if model_key.startswith(CHECKPOINT_KEY_PREFIX)
        else model_key
    )
    for model_key in state_dict["state_dict"]
}
new_state_dict = {
    new_state_map[key]: value
    for key, value in state_dict["state_dict"].items()
}
model.load_state_dict(new_state_dict)

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

In [None]:
def get_audio(mp3_path):
    """Load an audio file, resample it to 16 kHz and average it over dim 0.

    NOTE(review): `torchaudio` is not imported in the visible import cell --
    confirm it is imported before this function is called.

    Args:
        mp3_path: Path of the audio file handed to ``torchaudio.load``.

    Returns:
        A tuple ``(audio_tensor, length)`` where ``audio_tensor`` is the
        resampled signal averaged over its first dimension (presumably the
        channel axis -- TODO confirm) and ``length`` is ``len(audio_tensor)``.
    """
    signal, native_sr = torchaudio.load(mp3_path)
    to_16k = torchaudio.transforms.Resample(native_sr, 16000)
    # Collapse dim 0 by averaging, leaving one value per remaining position.
    mixed = to_16k(signal).mean(dim=0)
    return mixed, mixed.shape[0]

In [None]:
def make_frames(audio_tensor, audio_length, input_length, sampleing_rate = 16000):
    """Cut a waveform into overlapping, fixed-length frames.

    Frame start offsets are ``0, hop, 2*hop, ...`` with
    ``hop = int(sampleing_rate / 15)``, restricted to offsets strictly below
    ``audio_length - input_length``. The last such offset is dropped, exactly
    as before, so the number of frames is unchanged.

    Args:
        audio_tensor: Tensor sliced along its first dimension (a 1-D waveform
            in this notebook).
        audio_length: Number of samples of ``audio_tensor`` to consider.
        input_length: Number of samples in every frame.
        sampleing_rate: Sampling rate used to derive the hop size
            (parameter name kept as-is for backward compatibility).

    Returns:
        The frames stacked along a new leading dimension; for a 1-D input the
        shape is ``(num_frames, input_length)``.

    Raises:
        ValueError: If the audio is too short to yield at least one frame.
    """
    hop_size = int(sampleing_rate / 15)
    # Every offset in this range leaves room for a full-length frame.
    starts = range(0, audio_length - input_length, hop_size)
    # Keep the historical behaviour of discarding the final window.
    frames = [audio_tensor[start:start + input_length] for start in starts[:-1]]
    if not frames:
        # torch.stack([]) would fail with an opaque RuntimeError; be explicit.
        raise ValueError(
            "audio of length {} is too short to produce any frame of length {} "
            "with hop size {}".format(audio_length, input_length, hop_size)
        )
    return torch.stack(frames)

In [None]:
def get_frame_embeddings(mp3_path, model, input_length=input_length):
    """Compute per-frame embeddings for one audio file under ``sample_dir``.

    The file is loaded with ``get_audio``, cut into frames with
    ``make_frames`` and pushed through ``model`` in batches of up to 16
    frames on the module-level ``device``, with gradients disabled.

    Args:
        mp3_path: File name relative to the module-level ``sample_dir``.
        model: Callable returning a 2-tuple per batch; the second element is
            taken as the embeddings.
        input_length: Frame length in samples. The default is the value of
            the global ``input_length`` at the time this function is defined.

    Returns:
        A list with one entry per batch; each entry is a list of NumPy
        arrays, one per frame in that batch.
    """
    audio_path = os.path.join(sample_dir, mp3_path)
    waveform, n_samples = get_audio(audio_path)
    frames = make_frames(waveform, n_samples, input_length)

    per_batch = []
    for chunk in torch.split(frames, 16):
        with torch.no_grad():
            _, embeddings = model(chunk.to(device))
        # Iterating the array yields one embedding per frame of the batch.
        per_batch.append(list(embeddings.detach().cpu().numpy()))
    return per_batch

In [None]:
# Embed every frame of the original Bohemian Rhapsody track, flatten the
# per-batch lists into one sequence and stack it into a single array with
# one row per frame.
B_origin_embedding = np.stack([
    instance
    for batch in get_frame_embeddings(
        "BohemianRhapsody_original.mp3", model, input_length=input_length
    )
    for instance in batch
])
B_origin_embedding.shape

## CPC Example