In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Subset
import torchaudio
from ae_model import CNNAutoencoder
import matplotlib.pyplot as plt
import librosa
import soundfile as sf
import datetime
import numpy as np

projector_embeddings = torch.load("projector_embeddings.pt", map_location="cpu")
projector_embeddings = projector_embeddings.detach().numpy()
projector_embeddings_x = projector_embeddings[:, 0]
projector_embeddings_y = projector_embeddings[:, 1]

backbone_embeddings = torch.load("backbone_embeddings.pt", map_location="cpu")
backbone_embeddings = backbone_embeddings.detach().numpy()
backbone_embeddings_x = backbone_embeddings[:, 0]
backbone_embeddings_y = backbone_embeddings[:, 1]

In [3]:
from sklearn.decomposition import PCA

NUM_COMPONENTS = 2
LOG_LOSSES = False
USE_PROJECTIONS = False

assert(NUM_COMPONENTS in [2, 3])

pca = PCA(n_components=NUM_COMPONENTS)
reduced_embeddings_x = pca.fit_transform(projector_embeddings_x if USE_PROJECTIONS else backbone_embeddings_x)
reduced_embeddings_y = pca.fit_transform(projector_embeddings_y if USE_PROJECTIONS else backbone_embeddings_y)

In [14]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

data_x = {}
data_y = {}
for i in range(NUM_COMPONENTS):
    data_x[f"Dim. {i + 1}"] = reduced_embeddings_x[:, i]
    data_y[f"Dim. {i + 1}"] = reduced_embeddings_y[:, i]
# data["Log-Loss" if LOG_LOSSES else "Loss"] = np.log10(losses) if LOG_LOSSES else losses
data_x["Timestamp"] = np.arange(reduced_embeddings_x.shape[0])
data_y["Timestamp"] = np.arange(reduced_embeddings_y.shape[0])

df_x = pd.DataFrame(data=data_x)
df_y = pd.DataFrame(data=data_y)

if NUM_COMPONENTS == 2:
    fig = px.scatter(df_y, x='Dim. 1', y='Dim. 2',
                    #  color="Log-Loss" if LOG_LOSSES else "Loss",
                     hover_data=["Timestamp"],
                     title="Dimension-Reduced Embeddings of One-Second Audio Samples")

    # fig = go.Figure(data=
    #         [go.Scatter(x=df_x['Dim. 1'], y=df_x['Dim. 2'], mode="markers"),
    #          go.Scatter(x=df_y['Dim. 1'], y=df_y['Dim. 2'], mode="markers")]
    #     )

if NUM_COMPONENTS == 3:
    fig = px.scatter_3d(df_x, x='Dim. 1', y='Dim. 2', z='Dim. 3',
                        # color="Log-Loss" if LOG_LOSSES else "Loss",
                        hover_data=["Timestamp"],
                        title="Dimension-Reduced Embeddings of One-Second Audio Samples")
    # fig = go.Figure(data
    # =
    #         [go.Scatter3d(x=df_x['Dim. 1'], y=df_x['Dim. 2'], z=df_x['Dim. 3'], mode="markers"),
    #          go.Scatter3d(x=df_y['Dim. 1'], y=df_y['Dim. 2'], z=df_y['Dim. 3'], mode="markers")]
    #     )

fig.update_traces(marker=dict(size=3),
                  selector=dict(mode='markers'))
fig.show()

In [4]:
print(reduced_embeddings_x[:10])
print(reduced_embeddings_y[:10])

[[ 0.7417198   0.33677325]
 [ 1.3856605  -0.2670031 ]
 [-0.04758597 -0.9226105 ]
 [ 0.7217013   0.81348693]
 [ 0.2930276  -1.2626593 ]
 [-2.8407438   0.700728  ]
 [ 0.92153233 -1.4100175 ]
 [ 1.2355349  -0.47315946]
 [ 0.80851495  0.7720792 ]
 [ 0.5181759  -1.5025198 ]]
[[ 0.741722    0.33672917]
 [ 1.3856592  -0.26693836]
 [-0.04755362 -0.92247736]
 [ 0.7217004   0.8132014 ]
 [ 0.29305956 -1.2625381 ]
 [-2.8407187   0.7007117 ]
 [ 0.921527   -1.4100996 ]
 [ 1.2355558  -0.4731879 ]
 [ 0.80852544  0.7721521 ]
 [ 0.5181829  -1.5024616 ]]
