In [2]:
import numpy as np
from BoomboxProcessor import BoomboxProcessor
import torch.nn as nn
import torch
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from torchmetrics.functional import accuracy as torch_acc
from sklearn.metrics import classification_report
import torch.optim as optim
from torch.utils.data import DataLoader
from train_encoding_model import BoomboxNet

In [2]:
# # First we need to trajectorize the data
# import librosa
# import glob
# import pickle
# from tqdm import tqdm

# from MusicVectorizer import MusicVectorizer

# SAMPLE_RATE = 16000

In [3]:
# mv = MusicVectorizer()

# trajectories = dict()
# for song in tqdm(glob.glob("transformed_audio/*.mp3")):
#     song_data = librosa.load(song, sr=SAMPLE_RATE)[0]
#     trajectories[song] = mv.trajectorize_song(song_data, SAMPLE_RATE)
    
# np.save("data/salami_trajectories.npy", trajectories)

In [4]:
# Folder names handed to bx.load_trajectories below; only 'salami' is used here.
data_folders = ['salami']
# Project-local processor object; the following cells call its load / encode /
# split methods in sequence.
bx = BoomboxProcessor()

In [5]:
# Load the trajectories for every entry in data_folders.
# Presumably these are the files produced by the commented-out preprocessing
# cells above — TODO confirm against BoomboxProcessor.
bx.load_trajectories(data_folders)

In [6]:
# Load the encoder checkpoint, passing BoomboxNet (imported from
# train_encoding_model) as the model class.
# NOTE(review): the checkpoint path is hard-coded; consider a config constant.
bx.load_encoding_model("models/model_50000.pt", BoomboxNet)

In [7]:
# Encode the loaded trajectories with the model loaded above, explicitly on CPU.
bx.encode_trajectories(device='cpu')

In [8]:
# NOTE(review): magic arguments. 10 is presumably the number of songlets per
# song (each label list below has 10 entries and sm.train_data is printed as
# (494, 10, 768)); the meaning of True is not visible here — TODO confirm and
# replace both with named constants / keyword arguments.
bx.split_encoded_trajectories(10,True)

In [9]:
# Songlet-level results for the 'salami' folder, keyed per song.
songlet_trajectories = bx.get_songlet_trajectories()['salami']
# Timestep-level trajectories for the same folder. Later cells read
# value.shape[0] and look entries up by 'transformed_audio/<id>.mp3'.
timestep_trajectories = bx.get_trajectories()['salami']

In [34]:
# Re-key by bare song id: keep what follows the last '/' and precedes the
# first '.', e.g. 'transformed_audio/307.mp3' -> '307'. Keys that are already
# bare ids pass through unchanged, so re-running this cell is harmless.
songlet_trajectories = {
    path.rsplit('/', 1)[-1].split('.', 1)[0]: trajectory
    for path, trajectory in songlet_trajectories.items()
}

In [35]:
# Display the song ids now used as keys (bare expression so the notebook
# renders it). NOTE(review): this dumps every key; a slice would be easier to read.
songlet_trajectories.keys()

dict_keys(['307', '486', '690', '760', '628', '356', '1624', '596', '765', '480', '819', '378', '948', '846', '671', '408', '702', '213', '572', '476', '114', '790', '463', '611', '510', '685', '624', '158', '348', '866', '363', '415', '154', '575', '1616', '316', '770', '311', '1603', '250', '878', '399', '3', '700', '827', '629', '229', '405', '287', '570', '942', '342', '933', '66', '111', '619', '707', '359', '587', '936', '38', '762', '59', '822', '285', '750', '40', '854', '170', '580', '6', '1619', '323', '557', '226', '844', '711', '847', '767', '64', '534', '771', '551', '783', '714', '338', '660', '686', '13', '931', '478', '53', '600', '320', '347', '583', '300', '679', '303', '613', '911', '292', '5', '447', '642', '484', '701', '19', '859', '236', '455', '367', '639', '590', '310', '116', '909', '677', '654', '1647', '507', '695', '622', '452', '141', '324', '328', '708', '533', '1654', '272', '132', '280', '597', '532', '731', '835', '726', '662', '675', '733', '1648', '4

In [10]:
# Map each song key to a length value derived from its number of trajectory
# rows: (rows * 5) / 10. The result is later passed to sl.create_label as the
# song length.
# NOTE(review): 5 and 10 are unexplained magic numbers — presumably a window
# size and a unit conversion; confirm and name them.
songlet_lengths = {
    song_path: (trajectory.shape[0] * 5) / 10
    for song_path, trajectory in timestep_trajectories.items()
}

In [11]:
# Bare song ids, one per key of songlet_trajectories: the text after the last
# '/' and before the first '.' (a key that is already a bare id is kept as-is).
salami_ids = [
    path.rsplit('/', 1)[-1].split('.', 1)[0]
    for path in songlet_trajectories
]

In [12]:
from SegmentationLabels import SegmentationLabels
import os
import csv

In [58]:
# Annotation helper built from the collected song ids; later cells call
# sl.get_annotation(id) and sl.create_label(length, annotation) on it.
sl = SegmentationLabels(salami_ids)

In [59]:
# Fetch the annotation for every song id (tqdm only adds the progress bar).
annotations = {
    salami_id: sl.get_annotation(salami_id)
    for salami_id in tqdm(salami_ids)
}

100%|██████████| 526/526 [00:00<00:00, 3722.68it/s]


In [60]:
# Ids whose annotation came back empty. Nothing is removed here; the ids are
# only recorded so that later cells can skip them.
NO_ANNO_DATA = [
    song_id
    for song_id, annotation in annotations.items()
    if len(annotation) == 0
]

In [61]:
# Build the per-song label sequence for every song that has an annotation.
#
# songlet_lengths is keyed by file path, while annotations is keyed by the bare
# song id. Instead of rebuilding the path from a hard-coded folder and
# extension, index the lengths by id using the same extraction rule that
# produced salami_ids (text after the last '/', before the first '.').
length_by_id = {
    path.split('/')[-1].split('.')[0]: length
    for path, length in songlet_lengths.items()
}
# Set membership keeps the skip test O(1) per song.
ids_without_annotation = set(NO_ANNO_DATA)

labels = dict()
for salami_id, annotation in annotations.items():
    if salami_id in ids_without_annotation:
        continue
    # A missing id raises KeyError here, same as the original path lookup did.
    labels[salami_id] = sl.create_label(length_by_id[salami_id], annotation)
    

In [21]:
# Spot-check one song. In the recorded output the trailing entries are empty
# lists — presumably time steps not covered by any annotation; songs like this
# are the ones collected into SKIP_DATA further down.
labels['710']

[['Silence'],
 ['Silence'],
 ['Silence'],
 ['Silence'],
 ['Silence'],
 ['Silence'],
 ['Silence'],
 [],
 [],
 []]

In [62]:
# Normalise every label string to lower case (in place) and collect the ids of
# songs that contain at least one empty label list.
#
# Each song id is recorded once, however many empty steps it has, and the
# final list is de-duplicated and printed in sorted order so that the output
# is reproducible across runs.
songs_with_empty_steps = []
for song_id, song_label in labels.items():
    has_empty_step = False
    for step_labels in song_label:
        if step_labels == []:
            has_empty_step = True
        else:
            # Slice assignment mutates the existing list, so labels[song_id]
            # sees the lower-cased strings without a separate write-back.
            step_labels[:] = [name.lower() for name in step_labels]
    if has_empty_step:
        songs_with_empty_steps.append(song_id)

# dict.fromkeys drops duplicates while keeping first-seen order; SKIP_DATA
# stays a list for downstream cells.
SKIP_DATA = list(dict.fromkeys(songs_with_empty_steps + NO_ANNO_DATA))
print(sorted(SKIP_DATA))

['61', '96', '292', '378', '102', '259', '724', '714', '679', '872', '263', '66', '1651', '226', '710', '560', '717', '157', '256', '300', '67', '301', '174', '419', '69', '859', '804', '375', '246', '101', '716', '182']


In [63]:
# Drop every song flagged in SKIP_DATA. Membership is tested against a set so
# the filter is linear in the number of songs rather than songs x skipped ids.
skipped_ids = set(SKIP_DATA)
labels = {
    song_id: song_label
    for song_id, song_label in labels.items()
    if song_id not in skipped_ids
}

In [64]:
# Persist the filtered labels. labels is a plain dict, so np.save stores it as
# a pickled 0-d object array; reading it back requires
# np.load(path, allow_pickle=True).item(). Only load this file from trusted
# sources, since unpickling can execute arbitrary code.
np.save("data/salami_labels.npy", labels)

In [65]:
# Vocabulary of label strings: every string that appears in any step of any
# remaining song, collected into a set.
contained_labels = {
    name
    for song_label in labels.values()
    for step_labels in song_label
    for name in step_labels
}

In [45]:
# Turn the vocabulary into a list of lower-cased strings.
# Lower-casing happens inside a set so that variants differing only in case
# collapse to one entry, and the result is sorted so the list order does not
# depend on set iteration order (which varies between interpreter runs).
contained_labels = sorted({name.lower() for name in contained_labels})
    

In [68]:
# Number of distinct label strings (33 in the recorded run).
len(contained_labels)

33

In [1]:
# The execution counter restarts at In [1] here, so this part of the notebook
# was run in a fresh kernel, independent of the variables built above.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from SegmentationModel import SegmentationModel

sm = SegmentationModel()
# Presumably reads the artefacts saved earlier (e.g. data/salami_labels.npy)
# — TODO confirm against SegmentationModel.load_data.
sm.load_data()
# In the recorded run this cell is followed by a large printed array, so one
# of these calls appears to print as a side effect — confirm which one.
sm.create_train_test_data()

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.

In [2]:
# Recorded shapes: train_data (494, 10, 768) and test_data (494, 10, 33).
# NOTE(review): both arrays have 494 rows and test_data's last dimension equals
# the 33 label classes, so these look like features and targets rather than a
# train/test split — confirm against SegmentationModel and consider renaming.
print(sm.train_data.shape)
print(sm.test_data.shape)

(494, 10, 768)
(494, 10, 33)


In [3]:
# Print the rows stored for sample 10.
# NOTE(review): in the recorded output every row is all zeros, i.e. no class
# is active for this sample — worth checking whether that is expected.
print(sm.test_data[10])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.