In [1]:
import os
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.plotting import output_notebook, output_file, reset_output
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
import torch
import torchaudio
from extractor import load_model



In [2]:
# Directory holding the audio clips; get_frame_embeddings() joins each file
# name onto this path before loading it.
sample_dir = "./dataset/NIPS_Workshop"

In [3]:
# File names of the clips for each song, two per song.
# NOTE(review): judging by the suffixes these are presumably the original
# recording and a female-vocal rendition — confirm against the dataset.
BohemianRhapsody = ['BohemianRhapsody_original.mp3','BohemianRhapsody_female.mp3']
UPtownFunk = ['UptownFunk_original.mp3','UptownFunk_female.mp3']

In [4]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
# so the notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
input_length, model, checkpoint_path = load_model(59049, "CPC037")
model = model.to(device)
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load(checkpoint_path, map_location=device)
# The checkpoint stores the weights under keys with a leading "model." prefix
# that the bare model does not use. Strip only that leading prefix:
# key.split("model.")[1] would truncate any key in which "model." occurs a
# second time and would raise IndexError on a key that lacks it. Keys without
# the prefix are skipped; load_state_dict() reports anything that ends up missing.
prefix = "model."
new_state_dict = {
    key[len(prefix):]: value
    for key, value in state_dict["state_dict"].items()
    if key.startswith(prefix)
}
model.load_state_dict(new_state_dict)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

CPC(
  (encoder): Sequential(
    (0): Conv1d(1, 512, kernel_size=(16,), stride=(8,), padding=(3,), bias=False)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(512, 512, kernel_size=(8,), stride=(4,), padding=(2,), bias=False)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv1d(512, 512, kernel_size=(8,), stride=(4,), padding=(2,), bias=False)
    (7): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): Conv1d(512, 512, kernel_size=(4,), stride=(2,), padding=(1,), bias=False)
    (10): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU(inplace=True)
    (12): Conv1d(512, 512, kernel_size=(4,), stride=(2,), padding=(1,), bias=False)
    (13): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_st

In [6]:
def get_audio(mp3_path, target_sr=16000):
    """Load an audio file, resample it and average it down to one channel.

    Args:
        mp3_path: path handed to ``torchaudio.load``.
        target_sr: sample rate (Hz) the waveform is resampled to. Defaults to
            16000, the rate that was previously hard-coded.

    Returns:
        A tuple ``(audio_tensor, length)``: the resampled waveform averaged
        over dim 0, and ``len(audio_tensor)``.
    """
    waveform, sr = torchaudio.load(mp3_path)
    downsample_resample = torchaudio.transforms.Resample(sr, target_sr)
    audio_tensor = downsample_resample(waveform)
    # Average over dim 0 — presumably the channel axis, giving a mono signal
    # (assumes torchaudio.load returned channels first — TODO confirm).
    audio_tensor = torch.mean(audio_tensor, dim=0)
    return audio_tensor, len(audio_tensor)

In [7]:
def make_frames(audio_tensor, audio_length, input_length, sampleing_rate = 16000,
                frames_per_second = 15):
    """Cut a waveform into overlapping windows of ``input_length`` samples.

    Window starts are ``int(sampleing_rate / frames_per_second)`` samples apart
    and run over ``range(0, audio_length - input_length, hop)``. The last
    window produced by that range is discarded, exactly as before, so the
    number of frames (and any frame indices derived from it) is unchanged.

    Args:
        audio_tensor: tensor sliced along its first dimension (samples first).
        audio_length: number of samples to cover.
        input_length: samples per window.
        sampleing_rate: sample rate of ``audio_tensor`` in Hz.
        frames_per_second: window starts per second of audio. Defaults to 15,
            the value that was previously hard-coded.

    Returns:
        The windows stacked along a new leading dimension.

    Raises:
        ValueError: if the audio is too short to yield any window.
    """
    hop_size = int(sampleing_rate / frames_per_second)
    starts = range(0, audio_length - input_length, hop_size)
    split = [audio_tensor[start:start + input_length] for start in starts][:-1]
    if not split:
        # torch.stack([]) fails with an opaque message; say what went wrong.
        raise ValueError(
            "audio too short: %d samples yield no frame of length %d at hop %d"
            % (audio_length, input_length, hop_size)
        )
    return torch.stack(split)

In [8]:
def get_frame_embeddings(mp3_path, model, input_length=input_length):
    """Compute per-frame embeddings for one clip under ``sample_dir``.

    The clip is loaded with ``get_audio``, cut into windows with
    ``make_frames`` and fed to ``model.predict`` in chunks of up to 16
    windows; the second value returned by ``predict`` is kept.

    Args:
        mp3_path: file name relative to ``sample_dir``.
        model: object exposing ``predict(batch)`` that returns a pair.
        input_length: samples per window.

    Returns:
        A list with one entry per chunk; each entry is a list of per-frame
        numpy arrays.
    """
    clip_path = os.path.join(sample_dir, mp3_path)
    waveform, n_samples = get_audio(clip_path)
    windows = make_frames(waveform, n_samples, input_length)

    per_chunk = []
    with torch.no_grad():
        for chunk in torch.split(windows, 16):
            _, chunk_embeddings = model.predict(chunk.to(device))
            # list() over the array yields one row per frame.
            per_chunk.append(list(chunk_embeddings.detach().cpu().numpy()))
    return per_chunk

In [9]:
# Embed every frame of the original Bohemian Rhapsody clip (still grouped per batch).
B_origin_embedding = get_frame_embeddings(
    mp3_path="BohemianRhapsody_original.mp3",
    model=model,
    input_length=input_length,
)

In [14]:
# get_frame_embeddings() returns one list of frame vectors per batch; flatten
# them into a single array with one row per frame. The isinstance guard keeps
# this cell idempotent: re-running it on the already-stacked array would
# iterate rows and then scalars, collapsing the result to 1-D.
if not isinstance(B_origin_embedding, np.ndarray):
    B_origin_embedding = np.stack(
        [instance for batch in B_origin_embedding for instance in batch]
    )
B_origin_embedding.shape

(5071, 256)

In [25]:
# Cut the frame embeddings into the six song sections; the list holds the
# frame index at which each section after the intro begins.
(intro, ballad, guitar,
 opera, hardrock, outro) = np.split(B_origin_embedding, [819, 2382, 2799, 3723, 4468])

## Visualize Heatmap

In [15]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
def draw_heapmap(embedding_matrix, vmin=0, vmax=6):
    """Plot the transpose of ``embedding_matrix`` as a wide seaborn heatmap.

    Args:
        embedding_matrix: array-like with a ``.T`` attribute; its transpose is
            what gets plotted.
        vmin: lower bound of the colour scale.
        vmax: upper bound of the colour scale.

    Returns:
        None. The heatmap is drawn on a new 25x4 matplotlib figure.
    """
    plt.figure(figsize=(25,4))
    # Label every 10th column and every 100th row to keep the axes readable.
    sns.heatmap(embedding_matrix.T, cmap="Blues", annot=False,
                xticklabels=10, yticklabels=100, vmin=vmin, vmax=vmax)

In [17]:
# Two views of the embedding sequence offset by one frame, so row k of
# frame_0 lines up with its successor in frame_1.
frame_0, frame_1 = B_origin_embedding[:-1], B_origin_embedding[1:]

In [18]:
# Squared Euclidean distance between each frame and the one after it.
distance = np.sum(np.square(frame_0 - frame_1), axis=1)

In [19]:
# Frame index divided by 15, the same divisor make_frames() uses for its hop
# size — presumably the frame's start time in seconds.
time = [frame_idx / 15 for frame_idx in range(len(distance))]

In [20]:
from bokeh.plotting import figure, output_file, show
reset_output()
output_notebook()
# Descriptive title/labels replace the leftover tutorial text so the figure
# can be read on its own.
p = figure(title="Squared distance between consecutive frame embeddings",
           x_axis_label='frame index', y_axis_label='squared Euclidean distance')

# add a line renderer with legend and line thickness
p.line(list(range(len(distance))), distance,
       legend_label="frame-to-frame distance", line_width=2)

In [21]:
# Wide, short canvas for the distance curve, then render it.
p.height = 300
p.width = 900
show(p)

In [23]:
def dim_reduction(x, method='tsne', perplexity=50, n_iter=5000, random_state=None):
    """Project ``x`` down to two components with PCA or t-SNE.

    Args:
        x: samples to project; passed unchanged to ``fit_transform``.
        method: ``'pca'`` selects PCA; any other value selects t-SNE.
        perplexity: t-SNE perplexity (unused for PCA).
        n_iter: t-SNE iteration budget (unused for PCA).
        random_state: optional seed forwarded to the estimator so the
            projection is repeatable. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        Whatever ``fit_transform`` returns for the chosen estimator.
    """
    if method == 'pca':
        trainer = PCA(n_components=2, random_state=random_state)
    else:
        # Newer scikit-learn releases renamed TSNE's ``n_iter`` argument to
        # ``max_iter`` and later dropped the old name; use whichever keyword
        # the installed version accepts.
        import inspect
        tsne_params = inspect.signature(TSNE).parameters
        iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
        trainer = TSNE(n_components=2, perplexity=perplexity,
                       random_state=random_state, **{iter_kwarg: n_iter})
    return trainer.fit_transform(x)

In [24]:
# Reduce the frame embeddings to two components with PCA.
# NOTE(review): this rebinds B_origin_embedding, so from here on the name holds
# the PCA output instead of the original embeddings. Re-running this cell, or
# any earlier cell that reads B_origin_embedding, then operates on the
# projection. Consider binding the result to a separate name.
B_origin_embedding = dim_reduction(B_origin_embedding, method='pca')

In [26]:
# Cut the rows of B_origin_embedding (by now the PCA projection) into the six
# song sections; the list holds the frame index at which each section after
# the intro begins.
(intro_pca, ballad_pca, guitar_pca,
 opera_pca, hardrock_pca, outro_pca) = np.split(B_origin_embedding, [819, 2382, 2799, 3723, 4468])

In [27]:
# prepare visualization
import numpy as np
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.plotting import output_notebook, output_file, reset_output

def initialize_figure(title):
    """Create an empty 600x600 bokeh figure with hover tooltips.

    Args:
        title: figure title; ``None`` is replaced by ``'Untitled'``.

    Returns:
        The new figure, with grid lines hidden and a white background.
    """
    # Hover shows the cursor position and the data source's ``desc`` column.
    hover_fields = [
        ("(x,y)", "($x, $y)"),
        ("entity", "@desc"),
    ]
    fig = figure(title='Untitled' if title is None else title,
                 tooltips=hover_fields)
    fig.background_fill_color = "white"
    fig.grid.grid_line_color = None
    fig.height = 600
    fig.width = 600
    return fig

def mtext(p, x, y, text, text_color, text_font_size, text_alpha):
    """Draw one centre-aligned text label on ``p`` at ``(x, y)``.

    ``text`` is wrapped in a one-element list before being passed to
    ``p.text``; the styling arguments are forwarded unchanged.
    """
    label_style = dict(
        text=[text],
        text_color=text_color,
        text_align="center",
        text_font_size=text_font_size,
        text_alpha=text_alpha,
    )
    p.text(x, y, **label_style)

def annotation(p, coordinates, idx_to_text, text_shift=0.05,
    text_font_size='15pt', text_color="firebrick", text_alpha=1.0):
    """Write a text label next to every point that has a non-empty label.

    Args:
        p: figure to draw on (forwarded to ``mtext``).
        coordinates: indexable as ``coordinates[idx, 0]`` / ``coordinates[idx, 1]``.
        idx_to_text: labels by position; falsy entries are skipped.
        text_shift: offset added to both x and y so the label does not sit
            on top of its point.
        text_font_size, text_color, text_alpha: styling forwarded to ``mtext``.
    """
    for position, entry in enumerate(idx_to_text):
        if entry:
            label_x = coordinates[position, 0] + text_shift
            label_y = coordinates[position, 1] + text_shift
            # The label is read back by position, mirroring the enumerate index.
            label = idx_to_text[position]
            mtext(p, label_x, label_y, label, text_color, text_font_size, text_alpha)

def draw(coordinates, idx_to_text, p=None, title=None, marker="circle",
         marker_color='orange', marker_size=5, marker_alpha=0.5):
    """Add one scatter layer to a figure, creating the figure if needed.

    Args:
        coordinates: array whose columns 0 and 1 hold the x and y values.
        idx_to_text: per-point descriptions, stored as the ``desc`` column.
        p: existing figure to draw on; when ``None`` a new one is built with
            ``initialize_figure(title)``.
        title: title for a newly created figure (unused when ``p`` is given).
        marker, marker_color, marker_size, marker_alpha: glyph styling.

    Returns:
        The figure that was drawn on.
    """
    target = initialize_figure(title) if p is None else p

    # Column names here are the ones referenced by the scatter call below.
    points = ColumnDataSource(data={
        'x': coordinates[:,0].tolist(),
        'y': coordinates[:,1].tolist(),
        'desc': idx_to_text,
    })

    target.scatter('x', 'y', source=points, marker=marker, size=marker_size,
                   line_color='black', fill_color=marker_color,
                   alpha=marker_alpha)
    return target

In [28]:
reset_output()
output_notebook()
# output_file('gender_tsne.html')
color = ['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

# One scatter layer per song section, each in its own palette colour.
# The ballad tooltips now read "ballad frameN" (previously misspelled "balled").
sections = [
    ("intro", intro_pca),
    ("ballad", ballad_pca),
    ("guitar", guitar_pca),
    ("opera", opera_pca),
    ("hardrock", hardrock_pca),
    ("outro", outro_pca),
]

p = None  # draw() builds the figure (using `title`) on the first pass
for section_color, (section_name, section_coords) in zip(color, sections):
    p = draw(coordinates=section_coords,
             idx_to_text=[section_name + " frame" + str(i) for i in range(len(section_coords))],
             p=p, title='Between Song with Frame',
             marker_size=6, marker_alpha=0.7, marker_color=section_color)

In [29]:
# Square canvas for the per-section scatter, then render it.
p.width = 600
p.height = 600
show(p)

In [263]:
from bokeh.plotting import figure, output_file, show
reset_output()
output_notebook()
p = figure(title="simple line example", x_axis_label='x', y_axis_label='y')

# add a line renderer with legend and line thickness
# NOTE(review): F_origin_embedding_pca is not assigned in any cell visible in
# this notebook — confirm where it comes from before relying on this cell.
p.line(F_origin_embedding_pca[1150:1250, 0], F_origin_embedding_pca[1150:1250, 1],
       legend_label="Temp.", line_width=2)

In [264]:
# Square canvas for the line plot, then render it.
p.width = 600
p.height = 600
show(p)

In [334]:
reset_output()
output_notebook()
# output_file('gender_tsne.html')
color = ['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

# NOTE(review): the tnse_* arrays are not assigned in any cell visible in this
# notebook — confirm where they come from.
# Each entry pairs a coordinate array with the draw() styling for its layer.
layers = [
    (tnse_b_original, dict(marker_size=10, marker_alpha=1, marker_color=color[0],
                           title='Between Song with Frame')),
    (tnse_b_female, dict(marker_size=10, marker_alpha=1, marker_color=color[1])),
    (tnse_u_original, dict(marker="square", marker_size=6, marker_alpha=1, marker_color=color[0])),
    (tnse_u_female, dict(marker="square", marker_size=6, marker_alpha=1, marker_color=color[1])),
]

p = None  # the first draw() call creates the figure
for layer_coords, layer_style in layers:
    p = draw(coordinates=layer_coords,
             idx_to_text=["frame"+str(i) for i in range(len(layer_coords))],
             p=p, **layer_style)

In [335]:
# Square canvas for the four-layer scatter, then render it.
p.width = 600
p.height = 600
show(p)

In [356]:
reset_output()
output_notebook()
# output_file('gender_tsne.html')
color = ['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

# NOTE(review): the pca_* arrays are not assigned in any cell visible in this
# notebook — confirm where they come from.
# Each entry: coordinate array, tooltip label prefix, draw() styling.
# The second prefix now reads "frame_female_b" (previously misspelled
# "frame_femal_b"), matching "frame_female_u".
layers = [
    (pca_b_original, "frame_original_b",
     dict(marker_size=15, marker_alpha=0.5, marker_color=color[0],
          title='Between Song with Frame')),
    (pca_b_female, "frame_female_b",
     dict(marker_size=10, marker="asterisk", marker_alpha=0.8, marker_color=color[4])),
    (pca_u_original, "frame_original_u",
     dict(marker="square", marker_size=10, marker_alpha=1, marker_color=color[0])),
    (pca_u_female, "frame_female_u",
     dict(marker="square", marker_size=6, marker_alpha=0.8, marker_color=color[4])),
]

p = None  # the first draw() call creates the figure
for layer_coords, label_prefix, layer_style in layers:
    p = draw(coordinates=layer_coords,
             idx_to_text=[label_prefix + str(i) for i in range(len(layer_coords))],
             p=p, **layer_style)

In [357]:
# Square canvas for the PCA comparison scatter, then render it.
p.width = 600
p.height = 600
show(p)