In [16]:
import os
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.plotting import output_notebook, output_file, reset_output
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
import torch

from extractor import get_audio, load_model, make_frames

In [8]:
# Tag names of interest, one list per category.
# NOTE: a later cell rebinds these three names to lists of mp3 paths.
target_gender = "female male".split()
target_inst = "vocal synth strings quiet".split()
target_genre = "classical electronic rock country metal".split()

In [9]:
# Dataset root, relative to the notebook's working directory.
dataset_path = "../dataset/mtat"
# The split arrays (split/train.npy, split/binary.npy, split/tags.npy) are not
# loaded here; only the tab-separated annotation table is read.
df_annotation = pd.read_csv(
    os.path.join(dataset_path, "annotations_final.csv"),
    sep="\t",
)

In [92]:
# One tooltip string per clip tagged "female" (first 100 matching rows).
female_rows = df_annotation[df_annotation["female"] != 0][:100]
# Tag columns are selected by position on the frame (everything except the
# first and last column), not on the non-zero-filtered row: the latter drops a
# real tag whenever the first column happens to be 0.
# NOTE(review): assumes the first/last columns are non-tag metadata - confirm.
female_tag_columns = female_rows.columns[1:-1]
# Each string is the active tag names, every one preceded by a single space.
female_tags = [
    "".join(" " + tag for tag in female_tag_columns[active])
    for active in (female_rows[female_tag_columns] != 0).to_numpy()
]

In [93]:
# One tooltip string per clip tagged "male" (first 100 matching rows).
male_rows = df_annotation[df_annotation["male"] != 0][:100]
# Tag columns are selected by position on the frame (everything except the
# first and last column), not on the non-zero-filtered row: the latter drops a
# real tag whenever the first column happens to be 0.
# NOTE(review): assumes the first/last columns are non-tag metadata - confirm.
male_tag_columns = male_rows.columns[1:-1]
# Each string is the active tag names, every one preceded by a single space.
male_tags = [
    "".join(" " + tag for tag in male_tag_columns[active])
    for active in (male_rows[male_tag_columns] != 0).to_numpy()
]

In [107]:
def get_tags(types = "male"):
    """Build one tag-summary string per clip carrying the tag `types`.

    Reads the module-level `df_annotation` frame, keeps the first 100 rows
    whose `types` column is non-zero and, for each of them, concatenates the
    names of all non-zero tag columns, each preceded by a single space.

    Args:
        types: name of the column in `df_annotation` used to select rows.

    Returns:
        list[str]: one string per selected row, in frame order (empty list
        when no row matches).
    """
    selected = df_annotation[df_annotation[types] != 0][:100]
    # Tag columns are picked by position on the frame itself. Slicing the
    # non-zero-filtered row instead would drop a real tag whenever the first
    # column is 0 for that row.
    # NOTE(review): assumes the first/last columns are non-tag metadata - confirm.
    tag_columns = selected.columns[1:-1]
    active_flags = (selected[tag_columns] != 0).to_numpy()
    return [
        "".join(" " + tag for tag in tag_columns[row_flags])
        for row_flags in active_flags
    ]

In [108]:
# Tooltip strings for the four instrument groups, built in the same order as
# the individual calls (strings, piano, vocal, synth).
strings_tags, piano_tags, vocal_tags, synth_tags = (
    get_tags(tag_name) for tag_name in ("strings", "piano", "vocal", "synth")
)

In [10]:
# For every tag of interest, keep the mp3 paths of the first 100 clips whose
# tag column is non-zero.
(
    female_mp3path,
    male_mp3path,
    strings_mp3path,
    piano_mp3path,
    vocal_mp3path,
    synth_mp3path,
    classical_mp3path,
    electronic_mp3path,
) = (
    list(df_annotation[df_annotation[tag_name] != 0]["mp3_path"])[:100]
    for tag_name in (
        "female",
        "male",
        "strings",
        "piano",
        "vocal",
        "synth",
        "classical",
        "electronic",
    )
)

In [11]:
# Path lists grouped per category. The order inside each group matters:
# downstream cells slice the embedding coordinates in this same order.
target_gender = [
    female_mp3path,
    male_mp3path,
]
target_inst = [
    strings_mp3path,
    piano_mp3path,
    vocal_mp3path,
    synth_mp3path,
]
target_genre = [
    classical_mp3path,
    electronic_mp3path,
]

In [17]:
# Build the "FCN05" model and locate its checkpoint through the project helper.
input_length, model, checkpoint_path = load_model(464000, "FCN05")
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
# Map every checkpoint key to the text after its first "model." marker.
# Splitting at most once keeps keys that contain "model." again further down
# intact, and keys without the marker are passed through unchanged instead of
# raising IndexError.
new_state_map = {
    model_key: model_key.split("model.", 1)[-1]
    for model_key in state_dict["state_dict"]
}
new_state_dict = {
    new_state_map[key]: value
    for key, value in state_dict["state_dict"].items()
}
model.load_state_dict(new_state_dict)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

FCN05(
  (spec): MelSpectrogram(
    (spectrogram): Spectrogram()
    (mel_scale): MelScale()
  )
  (to_db): AmplitudeToDB()
  (spec_bn): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Conv_2d(
    (conv): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
    (mp): MaxPool2d(kernel_size=(4, 2), stride=(4, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Conv_2d(
    (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU()
    (mp): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (layer3): Conv_2d(
    (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, tr

In [48]:
def get_target_embeddings(target_path, model):
    """Compute one frame-averaged embedding per track.

    Each track is loaded with `get_audio`, cut into model inputs with
    `make_frames` (using the module-level `input_length`), passed through
    `model`, and the second item of the model output is averaged over dim 0.

    Args:
        target_path: iterable of groups, each an iterable of mp3 paths
            relative to `<dataset_path>/mp3`.
        model: callable returning a pair whose second item is the embedding
            tensor for the frames of one track.

    Returns:
        list of numpy arrays, one per track, ordered group by group and
        track by track within a group.
    """
    result = []
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for path_list in target_path:
            for mp3_path in path_list:
                # The reported audio length is not needed here.
                audio, _ = get_audio(os.path.join(dataset_path, "mp3", mp3_path))
                audio_input = make_frames(audio, input_length)
                _, embeddings = model(audio_input)
                result.append(embeddings.mean(dim=0).detach().numpy())
    return result

In [None]:
# Track-level embeddings for the gender groups, then for the instrument groups.
gender_embeddings = get_target_embeddings(target_path=target_gender, model=model)
inst_embeddings = get_target_embeddings(target_path=target_inst, model=model)

In [None]:
def get_frame_embeddings(target_path, model):
    """Compute one frame-averaged embedding per track.

    NOTE(review): despite the name, this averages the embeddings over dim 0
    and therefore returns one vector per track, not one per frame. The
    return shape is kept as-is; confirm the intended behaviour.

    Args:
        target_path: iterable of groups, each an iterable of mp3 paths
            relative to `<dataset_path>/mp3`.
        model: callable returning a pair whose second item is the embedding
            tensor for the frames of one track.

    Returns:
        list of numpy arrays, one per track, ordered group by group and
        track by track within a group.
    """
    result = []
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for path_list in target_path:
            for mp3_path in path_list:
                # The reported audio length is not needed here.
                audio, _ = get_audio(os.path.join(dataset_path, "mp3", mp3_path))
                audio_input = make_frames(audio, input_length)
                _, embeddings = model(audio_input)
                result.append(embeddings.mean(dim=0).detach().numpy())
    return result

In [55]:
def dim_reduction(x, method='tsne', perplexity=50, n_iter=5000, random_state=None):
    """Project `x` to two dimensions with PCA or t-SNE.

    Args:
        x: samples accepted by the estimator's `fit_transform`.
        method: 'pca' selects PCA; any other value selects t-SNE.
        perplexity: t-SNE perplexity (ignored for PCA).
        n_iter: number of t-SNE optimisation iterations (ignored for PCA).
        random_state: optional seed forwarded to the estimator so a run can
            be reproduced; None keeps the estimator's default behaviour.

    Returns:
        Whatever the estimator's `fit_transform` returns for `x`.
    """
    import inspect

    if method == 'pca':
        trainer = PCA(n_components=2, random_state=random_state)
    else:
        # Newer scikit-learn releases renamed TSNE's `n_iter` to `max_iter`;
        # use whichever keyword the installed version accepts.
        accepted = inspect.signature(TSNE.__init__).parameters
        iteration_keyword = 'max_iter' if 'max_iter' in accepted else 'n_iter'
        trainer = TSNE(n_components=2, perplexity=perplexity,
                       random_state=random_state, **{iteration_keyword: n_iter})
    return trainer.fit_transform(x)

In [56]:
# 2-D t-SNE layout of the gender embeddings; the defaults are spelled out here.
gender_tsne = dim_reduction(
    gender_embeddings,
    method='tsne',
    perplexity=50,
    n_iter=5000,
)

In [57]:
# 2-D t-SNE layout of the instrument embeddings; the defaults are spelled out here.
inst_tsne = dim_reduction(
    inst_embeddings,
    method='tsne',
    perplexity=50,
    n_iter=5000,
)

In [59]:
# Rows of gender_tsne follow the embedding order: all female tracks first,
# then all male tracks. The boundary is taken from the path list rather than
# a hard-coded 100, so points and tooltips stay aligned when a group has
# fewer than 100 clips.
n_female = len(female_mp3path)
female_tsne = gender_tsne[:n_female]
male_tsne = gender_tsne[n_female:]

In [112]:
# Rows of inst_tsne follow the embedding order: strings, piano, vocal, synth.
# Group boundaries come from the path lists rather than hard-coded multiples
# of 100, so points and tooltips stay aligned when a group has fewer than
# 100 clips.
inst_boundaries = np.cumsum(
    [len(strings_mp3path), len(piano_mp3path), len(vocal_mp3path)]
)
strings_tsne, piano_tsne, vocal_tsne, synth_tsne = np.split(inst_tsne, inst_boundaries)

In [60]:
# prepare visualization
import numpy as np
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.plotting import output_notebook, output_file, reset_output

def initialize_figure(title):
    """Create a blank 600x600 figure with hover tooltips.

    Args:
        title: figure title; None is replaced by 'Untitled'.

    Returns:
        The new figure, with grid lines hidden and a white background.
    """
    # Hover fields: the cursor position and the "desc" column of the data source.
    hover_fields = [
        ("(x,y)", "($x, $y)"),
        ("entity", "@desc"),
    ]

    canvas = figure(title='Untitled' if title is None else title,
                    tooltips=hover_fields)
    canvas.grid.grid_line_color = None
    canvas.background_fill_color = "white"
    canvas.width = canvas.height = 600
    return canvas

def mtext(p, x, y, text, text_color, text_font_size, text_alpha):
    """Draw one horizontally centered text label at (x, y) on figure `p`."""
    label_style = dict(
        text=[text],
        text_color=text_color,
        text_align="center",
        text_font_size=text_font_size,
        text_alpha=text_alpha,
    )
    p.text(x, y, **label_style)

def annotation(p, coordinates, idx_to_text, text_shift=0.05,
    text_font_size='15pt', text_color="firebrick", text_alpha=1.0):
    """Label points of `coordinates` on figure `p`.

    Args:
        p: figure handed on to `mtext`.
        coordinates: 2-D array indexed as [row, 0] for x and [row, 1] for y.
        idx_to_text: labels by position; entries that are falsy while
            iterating are skipped.
        text_shift: offset added to both x and y of every label.
        text_font_size, text_color, text_alpha: style forwarded to `mtext`.
    """
    for position, entry in enumerate(idx_to_text):
        if entry:
            # The label is looked up again by position, as in the original.
            label = idx_to_text[position]
            shifted_x = coordinates[position, 0] + text_shift
            shifted_y = coordinates[position, 1] + text_shift
            mtext(p, shifted_x, shifted_y, label, text_color, text_font_size, text_alpha)

def draw(coordinates, idx_to_text, p=None, title=None, marker="circle",
         marker_color='orange', marker_size=5, marker_alpha=0.5):
    """Scatter `coordinates` on a figure, attaching `idx_to_text` as hover text.

    Args:
        coordinates: 2-D array; column 0 is used as x and column 1 as y.
        idx_to_text: per-point descriptions stored in the "desc" column.
        p: existing figure to draw on; a new one is created when None.
        title: title for a newly created figure (unused when `p` is given).
        marker, marker_color, marker_size, marker_alpha: glyph styling.

    Returns:
        The figure that was drawn on.
    """
    # Reuse the caller's figure or start a fresh one.
    canvas = initialize_figure(title) if p is None else p

    # Column-oriented data backing both the glyphs and the tooltips.
    columns = {
        'x': coordinates[:, 0].tolist(),
        'y': coordinates[:, 1].tolist(),
        'desc': idx_to_text,
    }
    points = ColumnDataSource(data=columns)

    canvas.scatter('x', 'y', source=points, marker=marker, size=marker_size,
                   fill_color=marker_color, line_color='black',
                   alpha=marker_alpha)
    return canvas

In [105]:
# Reset bokeh's output state, then register both the notebook and an HTML file.
reset_output()
output_notebook()
output_file('gender_tsne.html')
color = ['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

# Draw both groups on one figure: the first pass creates it (p is None), the
# second reuses it. draw() only uses the title when it creates the figure.
p = None
for group_coords, group_tags, fill in (
    (female_tsne, female_tags, color[0]),
    (male_tsne, male_tags, color[1]),
):
    p = draw(coordinates=group_coords, idx_to_text=group_tags, p=p,
             title='gender_tsne', marker_size=6, marker_alpha=1,
             marker_color=fill)

In [106]:
# Pin the figure to 600x600 (height first, then width) and render it.
p.height = p.width = 600
show(p)

In [113]:

# Reset bokeh's output state, then register both the notebook and an HTML file.
reset_output()
output_notebook()
output_file('inst.html')
color = ['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

# Draw the four instrument groups on one figure: the first pass creates it
# (p is None), later passes reuse it. draw() only uses the title when it
# creates the figure.
p = None
for group_coords, group_tags, fill in (
    (strings_tsne, strings_tags, color[2]),
    (piano_tsne, piano_tags, color[3]),
    (vocal_tsne, vocal_tags, color[4]),
    (synth_tsne, synth_tags, color[5]),
):
    p = draw(coordinates=group_coords, idx_to_text=group_tags, p=p,
             title='inst Visualization', marker_size=6, marker_alpha=1,
             marker_color=fill)

In [114]:
# Pin the figure to 600x600 (height first, then width) and render it.
p.height = p.width = 600
show(p)