# Configuration

In [1]:
# Default command line for interactive runs: one entry per option, in the
# order the tokens are handed to the argument parser.
_config_options = {
    # locations
    "--experimentdir": "/home/schindlera/experiments/ismir2020_reviews/",
    "--modeldir":      "/home/schindlera/experiments/ismir2020_reviews/",
    # input data
    "--relcontent":    "rel_content_emb_tag_lsi",
    "--audio":         "melspec_128_10seconds_2ch",
    # model / hardware
    "--model":         "model.m1_3",
    "--gpu":           "0",
    # loss
    "--loss":          "original",
    "--lossagg":       "max",
    "--margin":        "1.0",
    "--uppersim":      "0.99",
    # training
    "--finaldim":      "128",
    "--epochs":        "500",
    "--learnrate":     "1e-06",
    "--batchsize":     "3600",
}

# flatten {option: value} into [option, value, option, value, ...]
config = [token for option_and_value in _config_options.items() for token in option_and_value]

# Data Loading & Preprocessing

In [4]:
import sys
import argparse
import logging

In [13]:

parser = argparse.ArgumentParser()

parser.add_argument('--relcontent',    type=str)
parser.add_argument('--model',         type=str)
parser.add_argument('--audio',         type=str)
parser.add_argument('--experimentdir', type=str)
parser.add_argument('--modeldir',      type=str)
parser.add_argument('--gpu',           type=int)
parser.add_argument('--finaldim',      type=int)
parser.add_argument('--lossagg',       type=str, default="min")
parser.add_argument('--loss',          type=str, default="original")
parser.add_argument('--batchsize',     type=int, default=1000)
parser.add_argument('--margin',        type=float)
parser.add_argument('--learnrate',     type=float, default=0.0001)
parser.add_argument('--uppersim',      type=float)
parser.add_argument('--epochs',        type=int, default=100)
parser.add_argument("--log-level", default=logging.DEBUG, type=lambda x: getattr(logger, x), help="Configure the logger level.")

if sys.argv[0].find("ipykernel_launcher") != -1:
    args = parser.parse_args(config)
else:
    args = parser.parse_args()

# Imports

In [6]:
import os

# Restrict the process to the requested GPU. This happens before torch is
# imported below so the setting is in place when CUDA gets initialised.
# Without --gpu the variable is left untouched; str(None) would export the
# literal "None", which is not a valid device id.
if args.gpu is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

import json

import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchsummary import summary

In [7]:
# control random processes
RANDOM_SEED = 1

np.random.seed(RANDOM_SEED)      # NumPy-based randomness
torch.manual_seed(RANDOM_SEED)   # torch-based randomness (e.g. weight initialisation, shuffling)

# Functions

In [9]:
def prepare_model_dir(model_path):
    """Make sure the model directory exists and return its path.

    Args:
        model_path: directory to create; missing parent directories are
            created as well. An already existing directory is left untouched.

    Returns:
        The given model_path, so the result can be stored by the caller.
    """
    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(model_path, exist_ok=True)
    return model_path

# Initialize Experiment

## Init Logger

In [10]:
logger = logging.getLogger("experiment.py")
# honour the --log-level option (defaults to logging.DEBUG)
logger.setLevel(args.log_level)

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack additional handlers and duplicate every message.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# console output
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)

logger.info("+------------------------------------------------------------------+")
logger.info("| STARTING EXPERIMENT                                              |")
logger.info("+------------------------------------------------------------------+")
logger.info("Logger initialized")

logger.info("Initializing model experiment directory")
prepare_model_dir(args.modeldir)
# keep the directory itself; the helper's return value is not relied upon
model_storage_path = args.modeldir

# the log file lives inside the model directory, which therefore has to exist first
logger.info("Initializing logger filehandler")
fh = logging.FileHandler("%s/experiment.log" % args.modeldir)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger.addHandler(fh)

2020-04-21 09:44:31 - experiment.py - INFO - +------------------------------------------------------------------+
2020-04-21 09:44:31 - experiment.py - INFO - | STARTING EXPERIMENT                                              |
2020-04-21 09:44:31 - experiment.py - INFO - +------------------------------------------------------------------+
2020-04-21 09:44:31 - experiment.py - INFO - Logger initialized
2020-04-21 09:44:31 - experiment.py - INFO - Initializing model experiment directory
2020-04-21 09:44:31 - experiment.py - INFO - Initializing logger filehandler


Print Experiment Summary

In [11]:
# Experiment summary as (template, argument name) rows; rows without an
# argument name are logged verbatim. Values are formatted one row at a time,
# right before the row is logged.
_summary_rule = "+------------------------------------------------------------------+"

_summary_rows = [
    (_summary_rule, None),
    ("| EXPERIMENT:                                                      |", None),
    (_summary_rule, None),
    ("| Experiment directory              : %s", "experimentdir"),
    ("| Model-Directory                   : %s", "modeldir"),
    ("| Related content filename          : %s", "relcontent"),
    ("| Audio-Features filename           : %s", "audio"),
    ("| GPU                               : %d", "gpu"),
    (_summary_rule, None),
    ("| Model                             : %s", "model"),
    ("| Dimensions Final Music-Embeddings : %d", "finaldim"),
    (_summary_rule, None),
    ("| Loss                              : %s", "loss"),
    ("| Loss-Aggregation                  : %s", "lossagg"),
    ("| Upper Sim                         : %f", "uppersim"),
    ("| Margin                            : %f", "margin"),
    (_summary_rule, None),
    ("| Learn Rate                        : %f", "learnrate"),
    ("| Batch Size                        : %d", "batchsize"),
    ("| Num Epochs                        : %d", "epochs"),
    (_summary_rule, None),
]

for _template, _argument in _summary_rows:
    if _argument is None:
        logger.info(_template)
    else:
        logger.info(_template % getattr(args, _argument))

2020-04-21 09:44:35 - experiment.py - INFO - +------------------------------------------------------------------+
2020-04-21 09:44:35 - experiment.py - INFO - | EXPERIMENT:                                                      |
2020-04-21 09:44:35 - experiment.py - INFO - +------------------------------------------------------------------+
2020-04-21 09:44:35 - experiment.py - INFO - | Experiment directory              : /home/schindlera/experiments/ismir2020_reviews/
2020-04-21 09:44:35 - experiment.py - INFO - | Model-Directory                   : /home/schindlera/experiments/ismir2020_reviews/
2020-04-21 09:44:35 - experiment.py - INFO - | Related content filename          : rel_content_emb_tag_lsi
2020-04-21 09:44:35 - experiment.py - INFO - | Audio-Features filename           : melspec_128_10seconds_2ch
2020-04-21 09:44:35 - experiment.py - INFO - | GPU                               : 0
2020-04-21 09:44:35 - experiment.py - INFO - | Precision                         : double preci

Store configuration for reproducibility

In [14]:
# Write the parsed arguments as JSON into the model directory.
arguments_path = "%s/experiment_arguments.json" % args.modeldir

with open(arguments_path, 'w') as json_file:
    # json.dump writes into the file and returns None, so args_json is always None
    args_json = json.dump(vars(args), json_file)

## Load Audio Data

In [15]:
# Progress marker: start of the audio loading stage.
logger.info("* Load Audio Data")

2020-04-21 10:07:17 - experiment.py - INFO - * Load Audio Data


Load Audio Data - Train Partition

In [21]:
logger.info("* Load Audio Data - Train Partition")

# Track ids of the train partition; the first CSV column becomes the index.
par_file = "%s/eval_partition_trackids_train.csv" % (args.experimentdir)
par_trackids_train = pd.read_csv(par_file, index_col=0, header=None)

# Audio features of the train partition.
par_filename_audio_train = "%s/%s_train.npz" % (args.experimentdir, args.audio)

# NOTE(review): allow_pickle=True unpickles objects stored in the archive -
# only load archives from a trusted source.
with np.load(par_filename_audio_train, allow_pickle=True) as npz:
    data_audio_train = npz["data"]
    track_ids_audio_train = npz["track_ids"].astype(str)

# Lookup table: track id -> position of that id in track_ids_audio_train.
train_row_numbers = np.arange(track_ids_audio_train.shape[0], dtype=int)
lookup_audio_train = pd.DataFrame(
    train_row_numbers,
    index=track_ids_audio_train,
    columns=["feature_line_nr"],
)

2020-04-21 10:15:57 - experiment.py - INFO - * Load Audio Data - Train Partition


KeyboardInterrupt: 

In [16]:
# CHECK: the train features and the train track ids have the same number of rows
assert data_audio_train.shape[0] == track_ids_audio_train.shape[0]

Load Audio Data - Validation Partition

In [None]:
logger.info("* Load Audio Data - Validation Partition")

# Track ids of the validation partition; the first CSV column becomes the index.
par_file = "%s/eval_partition_trackids_val.csv" % (args.experimentdir)
par_trackids_val = pd.read_csv(par_file, index_col=0, header=None)

# Audio features of the validation partition.
par_filename_audio_val = "%s/%s_val.npz" % (args.experimentdir, args.audio)

# NOTE(review): allow_pickle=True unpickles objects stored in the archive -
# only load archives from a trusted source.
with np.load(par_filename_audio_val, allow_pickle=True) as npz:
    data_audio_val = npz["data"]
    track_ids_audio_val = npz["track_ids"].astype(str)

# Lookup table: track id -> position of that id in track_ids_audio_val.
val_row_numbers = np.arange(track_ids_audio_val.shape[0], dtype=int)
lookup_audio_val = pd.DataFrame(
    val_row_numbers,
    index=track_ids_audio_val,
    columns=["feature_line_nr"],
)

In [None]:
# CHECK: the validation features and the validation track ids have the same number of rows
assert data_audio_val.shape[0] == track_ids_audio_val.shape[0]

In [None]:
# Report how many instances each partition holds. %d needs the integer itself;
# wrapping it in str() raises "TypeError: %d format: a number is required".
logger.info("Num instances - audio data train : %d" % data_audio_train.shape[0])
logger.info("Num instances - audio data val   : %d" % data_audio_val.shape[0])

logger.debug("data_audio dimensions     : %s" % str(data_audio_train.shape))
logger.debug("track_ids_audio dimensions: %s" % str(track_ids_audio_train.shape))

## Load Related Content

In [3]:
# Progress marker: start of the related-content loading stage.
logger.info("* Load Related Content Embeddings")

In [3]:
# Load the related-content embeddings and the track ids they belong to.
# NOTE(review): the argument parser at the top of this notebook declares
# --relcontent but no --text option, so args.text looks stale - confirm which
# file is meant to be loaded here.
# NOTE(review): allow_pickle=True unpickles objects stored in the archive -
# only load archives from a trusted source.
with np.load(args.text, allow_pickle=True) as npz:
    data_text      = npz["data"]
    track_ids_text = npz["track_ids"].astype(str)

# Lookup table: track id -> position of that id in track_ids_text.
lookup_text = pd.DataFrame(np.arange(track_ids_text.shape[0], dtype=int), index=track_ids_text, columns=["feature_line_nr"])

# CHECK: ids and data have same length
assert(data_text.shape[0] == track_ids_text.shape[0])

# CHECK: ids of text and audio are aligned
# NOTE(review): no cell above assigns track_ids_audio (only
# track_ids_audio_train / track_ids_audio_val exist) - confirm which ids the
# related content has to be aligned with.
assert((track_ids_text == track_ids_audio).sum() == track_ids_audio.shape[0])

2020-04-10 12:35:57 - experiment.py - INFO - * Load Audio Data
2020-04-10 12:45:03 - experiment.py - DEBUG - data_audio dimensions     : (249681, 128, 880, 2)
2020-04-10 12:45:03 - experiment.py - DEBUG - track_ids_audio dimensions: (249681,)
2020-04-10 12:45:03 - experiment.py - INFO - * Load Text Embeddings


In [None]:
# NOTE(review): data_audio, lookup_audio and track_ids_audio are not assigned by
# any cell above (the loading cells create *_train / *_val variants) - this cell
# presumably predates the per-partition loading; confirm before running.
logger.debug("data_text dimensions      : %s" % str(data_text.shape))
logger.debug("text_ids_audio dimensions : %s" % str(track_ids_audio.shape))
logger.info("TEXT_EMBEDDINGS_DIMENSIONS : %d" % data_text.shape[1])

# width of one related-content embedding
TEXT_EMBEDDINGS_DIMENSIONS = data_text.shape[1]

# Partition table indexed by track id; its train / val columns flag membership with 1.
experiment_partition = pd.read_csv(
    "/home/schindlera/experiments/representation_from_album_review/experiment_partition.csv",
    index_col=0,
)

logger.info("normalize audio data")

# normalisation is currently switched off
#data_audio = data_audio / -80.0
#data_audio = data_audio - data_audio.mean()

logger.info("creating partitions: train/val")

#print(lookup_audio.loc[experiment_partition[experiment_partition.train == 1].index].feature_line_nr.values)

# Select the feature rows of every track flagged for the respective partition.
train_track_ids = experiment_partition.loc[experiment_partition.train == 1].index
audio_train     = data_audio[lookup_audio.loc[train_track_ids, "feature_line_nr"].values]

val_track_ids   = experiment_partition.loc[experiment_partition.val == 1].index
audio_val       = data_audio[lookup_audio.loc[val_track_ids, "feature_line_nr"].values]

text_train      = data_text[lookup_text.loc[train_track_ids, "feature_line_nr"].values]
text_val        = data_text[lookup_text.loc[val_track_ids, "feature_line_nr"].values]

for _label, _partition in (
    ("audio_train dimensions    : %s", audio_train),
    ("text_train  dimensions    : %s", text_train),
    ("audio_val   dimensions    : %s", audio_val),
    ("text_val    dimensions    : %s", text_val),
):
    logger.debug(_label % str(_partition.shape))

# CHECK: train text and audio have same length
assert audio_train.shape[0] == text_train.shape[0]
# CHECK: validation text and audio have same length
assert audio_val.shape[0] == text_val.shape[0]

# drop the intermediates that are no longer needed
del track_ids_audio, track_ids_text, data_text, experiment_partition, lookup_text

2020-04-10 12:45:04 - experiment.py - DEBUG - data_text dimensions      : (249681, 128)
2020-04-10 12:45:04 - experiment.py - DEBUG - text_ids_audio dimensions : (249681,)
2020-04-10 12:45:04 - experiment.py - INFO - TEXT_EMBEDDINGS_DIMENSIONS : 128
2020-04-10 12:45:05 - experiment.py - INFO - normalize audio data
2020-04-10 12:45:05 - experiment.py - INFO - creating partitions: train/val


# Translate Keras / TensorFlow code to PyTorch

In [85]:
class OnlineTripletLoss(nn.Module):
    """Triplet loss on audio embeddings, supervised by text-embedding similarity.

    For every anchor in the batch, items whose text embedding has a cosine
    similarity above ``upper_limit`` to the anchor's text embedding are
    positives; all other items (except the anchor itself) are negatives.
    The loss combines the largest anchor-positive distance and the smallest
    anchor-negative distance between the audio embeddings (Euclidean).

    Args:
        margin: margin added to (positive distance - negative distance).
        upper_limit: cosine-similarity threshold above which two items count
            as a positive pair.
        loss: loss variant; only "original" is implemented. When None, the
            value of the global ``args.loss`` is used at call time.
        lossagg: aggregation over positives; only "max" is implemented. When
            None, the value of the global ``args.lossagg`` is used at call time.

    The variants of the original TensorFlow code (logistic_sum, logistic_mean,
    hinge_sum, hinge_mean, exp_sum, exp_mean and the "min" aggregation) have
    not been ported.
    """

    def __init__(self, margin, upper_limit, loss=None, lossagg=None):
        super(OnlineTripletLoss, self).__init__()

        self.margin      = margin
        self.upper_limit = upper_limit
        self.loss        = loss
        self.lossagg     = lossagg

    def cosine_similarity(self, x1, x2=None, eps=1e-8):
        """Return the matrix of cosine similarities between the rows of x1 and x2.

        x2 defaults to x1. The product of the norms is clamped to ``eps`` to
        avoid a division by zero for all-zero rows.
        """
        x2 = x1 if x2 is None else x2
        w1 = x1.norm(p=2, dim=1, keepdim=True)
        w2 = w1 if x2 is x1 else x2.norm(p=2, dim=1, keepdim=True)
        return torch.mm(x1, x2.t()) / (w1 * w2.t()).clamp(min=eps)

    def _check_configuration(self):
        """Raise NotImplementedError unless the configured variant is supported."""
        loss    = self.loss    if self.loss    is not None else args.loss
        lossagg = self.lossagg if self.lossagg is not None else args.lossagg

        if lossagg != "max":
            raise NotImplementedError("loss aggregation %r is not implemented" % (lossagg,))
        if loss != "original":
            raise NotImplementedError("loss %r is not implemented" % (loss,))

    def forward(self, audio_embeddings, text_embdeeings):
        """Compute the mean triplet loss of one batch.

        Args:
            audio_embeddings: 2-D tensor, one audio embedding per row.
            text_embdeeings: 2-D tensor, one text embedding per row, aligned
                row-wise with audio_embeddings.

        Returns:
            Scalar tensor holding the mean loss over all anchors.

        Raises:
            NotImplementedError: for an unsupported loss / aggregation variant.
        """
        # fail early and explicitly instead of on an undefined local later on
        self._check_configuration()

        # similarities in text space, distances in audio space
        pairwise_sims_text   = self.cosine_similarity(text_embdeeings)
        pairwise_dists_audio = torch.cdist(audio_embeddings, audio_embeddings, p=2)

        # positives: similar text, the anchor itself is excluded via the zeroed diagonal
        mask_positive = (pairwise_sims_text.fill_diagonal_(0) > self.upper_limit).float()

        # negatives: everything that is neither positive nor the anchor itself
        mask_negative = (1 - mask_positive).fill_diagonal_(0)

        # largest distance to a positive (0 for anchors without positives)
        audio_positive_dist      = pairwise_dists_audio * mask_positive
        hardest_positive_dist, _ = audio_positive_dist.max(dim=1, keepdim=True)

        # smallest distance to a negative: the row maximum is added to every
        # non-negative entry so that it cannot win the minimum
        max_audio_dist, _        = pairwise_dists_audio.max(dim=1, keepdim=True)
        audio_negative_dist      = pairwise_dists_audio + max_audio_dist * (1.0 - mask_negative)
        hardest_negative_dist, _ = audio_negative_dist.min(dim=1, keepdim=True)

        # Combine biggest d(a, p) and smallest d(a, n) into final triplet loss
        delta = hardest_positive_dist - hardest_negative_dist

        return (delta + self.margin).clamp(min=0).mean()

In [114]:


class NetKim2019(nn.Module):
    """Convolutional network mapping a 2-channel input to an embedding vector.

    The network has seven convolution layers (the first five each followed by
    2x2 max pooling), global max pooling over the remaining spatial positions
    and two fully connected layers. The output passes through a sigmoid, so
    every embedding component lies between 0 and 1.

    Args:
        final_dim: number of embedding dimensions produced by the output
            layer (default 128).
    """

    def __init__(self, final_dim=128):
        super(NetKim2019, self).__init__()

        # the first convolution additionally strides by 2 along the first spatial axis
        self.conv1      = nn.Conv2d(2,    16, kernel_size=5, stride=(2,1), padding=2)
        self.conv2      = nn.Conv2d(16,   32, kernel_size=3, padding=1)
        self.conv3      = nn.Conv2d(32,   64, kernel_size=3, padding=1)
        self.conv4      = nn.Conv2d(64,   64, kernel_size=3, padding=1)
        self.conv5      = nn.Conv2d(64,  128, kernel_size=3, padding=1)
        self.conv6a     = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.conv6b     = nn.Conv2d(128, 256, kernel_size=3, padding=1)

        self.maxpool1   = nn.MaxPool2d((2,2))
        self.maxpool2   = nn.MaxPool2d((2,2))
        self.maxpool3   = nn.MaxPool2d((2,2))
        self.maxpool4   = nn.MaxPool2d((2,2))
        self.maxpool5   = nn.MaxPool2d((2,2))

        # global pooling to one value per channel; note that this is max
        # pooling although the attribute is called "gap"
        self.gap        = nn.AdaptiveMaxPool2d(1)

        self.dropout    = nn.Dropout()
        self.fc_feature = nn.Linear(256, 256)
        self.fc_output  = nn.Linear(256, final_dim)

    def forward(self, x):
        """Embed a batch x of shape (batch, 2, height, width).

        Returns:
            Tensor of shape (batch, final_dim) with values between 0 and 1.
        """
        # convolution blocks with pooling
        x = self.maxpool1(F.relu(self.conv1(x)))
        x = self.maxpool2(F.relu(self.conv2(x)))
        x = self.maxpool3(F.relu(self.conv3(x)))
        x = self.maxpool4(F.relu(self.conv4(x)))
        x = self.maxpool5(F.relu(self.conv5(x)))

        # convolution block without pooling
        x = F.relu(self.conv6a(x))
        x = F.relu(self.conv6b(x))

        # (batch, 256, 1, 1) -> (batch, 256)
        x = self.gap(x)
        x = x.view(x.shape[:2])

        x = self.dropout(F.relu(self.fc_feature(x)))

        # torch.sigmoid replaces the deprecated F.sigmoid
        return torch.sigmoid(self.fc_output(x))

#summary(model.cuda(), (2, 216, 128))

In [96]:
class MelSpecDataset(Dataset):
    """Dataset yielding (cropped audio features, text embedding) pairs.

    Every audio item has its first and last axis swapped and is then cropped
    to ``crop_length`` entries along the middle axis.

    Args:
        audio_data: array holding one 3-D feature block per item
            (presumably mel bands x frames x channels - TODO confirm).
        text_data: array holding one text embedding per item, aligned with
            audio_data.
        random_cropping: when True the crop position is drawn at random for
            every access, otherwise the crop starts at ``fixed_crop_start``.
        crop_length: number of entries kept along the cropped axis.
        fixed_crop_start: crop offset used when random_cropping is False.
    """

    def __init__(self, audio_data, text_data, random_cropping=False,
                 crop_length=216, fixed_crop_start=100):
        self.audio_data       = audio_data
        self.text_data        = text_data
        self.random_cropping  = random_cropping
        self.crop_length      = crop_length
        self.fixed_crop_start = fixed_crop_start

    def __len__(self):
        """Number of items, i.e. the length of the first axis of audio_data."""
        return self.audio_data.shape[0]

    def __getitem__(self, idx):
        """Return the float tensors (audio crop, text embedding) of item idx.

        Raises:
            ValueError: if random cropping is requested for an item that is
                shorter than crop_length.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        result_text  = self.text_data[idx]

        # swap first and last axis so the last axis of the stored item comes first
        result_audio = np.swapaxes(self.audio_data[idx], 0, 2)

        if self.random_cropping:
            # derive the valid offsets from the actual length instead of a constant
            last_start = result_audio.shape[1] - self.crop_length
            if last_start < 0:
                raise ValueError("item %r has %d frames, fewer than crop_length=%d"
                                 % (idx, result_audio.shape[1], self.crop_length))
            # torch's generator is seeded differently in every DataLoader
            # worker; NumPy's global state is copied to each worker, which
            # would make all workers draw the same offsets
            start = int(torch.randint(0, last_start + 1, (1,)).item())
        else:
            start = self.fixed_crop_start

        result_audio = result_audio[:, start:start + self.crop_length, :]

        result_audio = torch.from_numpy(result_audio).float()
        result_text  = torch.from_numpy(result_text).float()

        return result_audio, result_text

In [103]:
# Number of items per DataLoader mini-batch (passed as batch_size to the
# training DataLoader below).
INTERMEDIATE_BATCH_SIZE = 100

In [115]:
# Pick the compute device: the first CUDA device when available, else the CPU.
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda:0") if use_cuda else torch.device("cpu")
#cudnn.benchmark = True

In [118]:
# Build the network and move its parameters to the selected device.
model = NetKim2019().to(device)

In [119]:
# Training data: randomly cropped audio items paired with their text embeddings,
# served in shuffled mini-batches by four worker processes.
train_dataset = MelSpecDataset(audio_data=audio_train,
                               text_data=text_train,
                               random_cropping=True)

dataloader_train = DataLoader(dataset=train_dataset,
                              batch_size=INTERMEDIATE_BATCH_SIZE,
                              num_workers=4,
                              shuffle=True)

In [122]:


# Loss configured from the command line arguments.
criterion = OnlineTripletLoss(args.margin, args.uppersim).to(device)

# Use the configured learning rate (the value reported in the experiment
# summary) instead of a hard-coded one.
optimizer = optim.Adam(model.parameters(), lr=args.learnrate, weight_decay=0.0001)

In [125]:
# Debugging smoke test of the training step: DataLoader mini-batches are
# embedded one by one and collected until args.batchsize items are available;
# the loss is then computed on the collected embeddings. Both loops stop after
# the first optimisation step.
# NOTE(review): all collected outputs stay attached to their autograd graphs
# until backward() runs - presumably the reason for the CUDA out-of-memory
# error seen with large args.batchsize; confirm.
for epoch in range(1):

    current_batch_num  = 0
    current_batch_size = 0
    current_batch_audio_embeddings = []
    current_batch_text_embeddings  = []

    for local_audio, local_text in dataloader_train:

        local_audio = local_audio.to(device)
        local_text  = local_text.to(device)

        outputs = model(local_audio)

        current_batch_audio_embeddings.append(outputs)
        current_batch_text_embeddings.append(local_text)

        current_batch_size += outputs.shape[0]

        # keep collecting until the effective batch is complete
        if current_batch_size < args.batchsize:
            continue

        audio_embeddings = torch.cat(current_batch_audio_embeddings, dim=0)
        text_embeddings  = torch.cat(current_batch_text_embeddings,  dim=0)

        # zero the parameter gradients right before the backward pass
        optimizer.zero_grad()

        # call the module (not .forward) so that registered hooks are honoured
        loss = criterion(audio_embeddings, text_embeddings)
        loss.backward()
        optimizer.step()

        current_batch_audio_embeddings.clear()
        current_batch_text_embeddings.clear()

        # print statistics
        print('[%d, %5d] loss: %.3f' % (epoch + 1, current_batch_num + 1, loss.item()))

        current_batch_num += 1
        current_batch_size = 0

        # debugging: stop after the first optimisation step
        break

    # debugging: stop after the first epoch
    break

RuntimeError: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 23.65 GiB total capacity; 22.71 GiB already allocated; 17.12 MiB free; 22.79 GiB reserved in total by PyTorch)

In [108]:
# Inspect the value of the most recently computed loss.
loss.item()

0.9510605335235596

In [124]:
# Release cached GPU memory that PyTorch is holding but no longer using.
torch.cuda.empty_cache()

In [75]:
# Inspect the shape of the embeddings produced for the last mini-batch.
outputs.shape

torch.Size([32, 128])

In [None]:
        
# NOTE(review): everything below is the original Keras training script that is
# still to be ported. It relies on names that no cell of this notebook imports
# or defines (model_def, Adam, triplet_loss, ModelCheckpoint, TensorBoard,
# CSVLogger, LoggerCallback, data_audio, lookup_audio) and therefore cannot run
# as it stands - confirm whether it should be ported or removed.

# ===============================================================================
# # Train Model
# ===============================================================================

logger.info("* Prepare Evaluation")

# ===============================================================================
# ### Build and Train Model
# ===============================================================================

# define the model
model = model_def.get_model(args.finaldim)
logger.info("* Model created")

# define the optimizer
opt = Adam(lr=args.learnrate)
logger.info("* Optimizer: %s" % (str(opt)))


#from keras_radam import RAdam

#opt = RAdam(total_steps=10000, warmup_proportion=0.1, learning_rate=1e-4, min_lr=1e-5)

# compile the model
model.compile(loss      = triplet_loss,
              optimizer = opt)
logger.info("* Model compiled")
                    
# ===============================================================================
# Callbacks
# ===============================================================================

# keep only the weights of the epoch with the best validation loss
cb_modelcheckpoint = ModelCheckpoint(args.modeldir + "/model.h5", 
                                    monitor           = 'val_loss', 
                                    verbose           = 1, 
                                    save_best_only    = True, 
                                    save_weights_only = True, 
                                    mode              = 'auto')
    
cb_tensorboard =  TensorBoard(log_dir=args.modeldir, 
                                histogram_freq=0, 
                                write_graph=False, 
                                write_grads=False, 
                                write_images=False, 
                                embeddings_freq=0, 
                                embeddings_layer_names=None, 
                                embeddings_metadata=None, 
                                embeddings_data=None, 
                                update_freq='epoch')

cb_csv_logger = CSVLogger(args.modeldir + "/model_training_log.csv", separator=';', append=False)

cb_logger = LoggerCallback()
    
callbacks = [cb_tensorboard, cb_modelcheckpoint, cb_csv_logger, cb_logger]
logger.info("* Callbacks created")


logger.info("* Model Training: starting")
# first test - only to debug code
history = model.fit(audio_train,
                    text_train, 
                    batch_size       = args.batchsize, 
                    verbose          = 1, 
                    epochs           = args.epochs,
                    validation_data  = (audio_val, text_val),
                    callbacks        = callbacks,
                    shuffle          = True);

logger.info("* Model Training: completed")

# reload the checkpoint written by cb_modelcheckpoint
model_path = args.modeldir + "/model.h5"
logger.info("* Loading best model: %s" % model_path)
model.load_weights(model_path)

logger.info("* Inference: Embedding audio data into learned representation")
embeddings = model.predict(data_audio, batch_size=100, verbose=1)

# store the embeddings together with the track ids taken from the lookup index
logger.info("* storing embeddings")
np.savez(args.modeldir + "/final_embeddings.npz", data=embeddings, track_ids=lookup_audio.index.values)

logger.info("* Experiment finished!")