# Configuration

In [1]:
# Root directory that holds the intermediate and final results of the experiment.
EXPERIMENT_DIR = "/home/schindlera/experiments/ismir2020_reviews/"

# HDF5 file containing the experiment metadata. EXPERIMENT_DIR already ends
# with "/", so the resulting path contains a double slash.
METADATA_PATH = "%s/experiment_partition.h5" % EXPERIMENT_DIR

# Imports

In [114]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import pandas as pd
import numpy as np

import progressbar as pb

from gensim import corpora, models, similarities

# Load and Process Metadata

In [7]:
# Read the experiment metadata stored under the key "data" of the HDF5 file.
metadata_final = pd.read_hdf(METADATA_PATH, key="data")

In [9]:
# Split the metadata into partitions, using the columns "train", "test"
# and "val" as row masks.
metadata_train = metadata_final.loc[metadata_final["train"]]
metadata_test  = metadata_final.loc[metadata_final["test"]]
metadata_val   = metadata_final.loc[metadata_final["val"]]

In [13]:
# number of rows per partition, in the order (train, test, val)
len(metadata_train), len(metadata_test), len(metadata_val)

(247480, 254636, 2500)

# Functions

In [91]:
def merge_and_sort_tags(row):
    """Merge the tag collections of all columns of one row into a sorted list.

    Parameters
    ----------
    row : pandas.Series
        One row of a tag table. Each entry is either a collection of tags,
        a single tag given as a plain string, or a missing value.

    Returns
    -------
    list
        All tags of the row in ascending order. Entries that are missing
        (or whose elements are all missing) contribute nothing.
    """
    merged_tags = []

    for col in row.index:

        tags = row[col]

        # A bare string is a single tag. Passing it to extend() would split
        # it into individual characters, so it is appended as a whole.
        # Empty strings contribute nothing, exactly as before.
        if isinstance(tags, str):
            if tags:
                merged_tags.append(tags)
            continue

        is_nan = pd.isnull(tags)

        # pd.isnull returns a plain bool for scalars but an element-wise
        # array for lists/arrays; reduce the latter to a single flag.
        if not isinstance(is_nan, bool):
            is_nan = is_nan.all()

        if not is_nan:
            merged_tags.extend(tags)

    return sorted(merged_tags)

In [103]:
def sparse_to_dense(vec, num_topics=None):
    """Convert a sparse vector of (index, value) pairs into a dense array.

    Parameters
    ----------
    vec : iterable of (int, float)
        Sparse vector; each pair holds the position and the value to store.
    num_topics : int, optional
        Length of the dense vector. When omitted, the notebook-level
        constant NUM_LSI_TOPICS is used, which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        1-D float array of length `num_topics`; positions not listed in
        `vec` are zero.
    """
    # Resolve the fallback at call time so the constant may be defined in a
    # later cell than this function.
    if num_topics is None:
        num_topics = NUM_LSI_TOPICS

    dense_vec = np.zeros(num_topics)

    for topic_id, weight in vec:
        dense_vec[topic_id] = weight

    return dense_vec

In [119]:
def calc_topic_vectors(joined_tags, dictionary, topic_model):
    """Project every tag list of `joined_tags` into the space of `topic_model`.

    Each entry is looked up by its index label, converted to a bag-of-words
    vector with `dictionary.doc2bow`, transformed by indexing `topic_model`
    with that vector, and densified with `sparse_to_dense`. A progress bar
    is shown while iterating.

    NOTE(review): the bag-of-words vector is handed to `topic_model` without
    any further weighting. If the model was fitted on weighted (e.g. TF-IDF)
    input, confirm that this is intended.

    Returns
    -------
    tuple
        (array with one dense topic vector per entry, stacked along axis 0;
         the index of `joined_tags`)
    """
    track_ids  = joined_tags.index
    dense_rows = []

    progress = pb.ProgressBar()

    for track_id in progress(track_ids):

        bag_of_words = dictionary.doc2bow(joined_tags.loc[track_id])
        sparse_topic = topic_model[bag_of_words]

        dense_rows.append(sparse_to_dense(sparse_topic))

    return np.stack(dense_rows, axis=0), track_ids

# Topic Modelling

## Latent Semantic Indexing

### Derive Topic Model from Tagset

Configuration

In [101]:
# number of topics of the LSI model; also the length of the dense topic vectors
NUM_LSI_TOPICS = 340

# metadata columns whose tags are merged into one tag list per track
TAG_SET = [
    "styles",
    "moods",
    "themes",
    "genres",
]

Join Tags from configured Tag-Sets

In [92]:
# one merged, sorted tag list per training track (row-wise over the TAG_SET columns)
md_joined_tags_train = metadata_train[TAG_SET].apply(merge_and_sort_tags, axis=1)

Derive LSI Topic Model

In [96]:
# vocabulary built from the merged tag lists of the training partition
tag_dictionary    = corpora.Dictionary(md_joined_tags_train)

# bag-of-words vector for each training tag list
tag_corpus        = list(map(tag_dictionary.doc2bow, md_joined_tags_train))

# TF-IDF model fitted on the training corpus and applied to it
tag_tfidf         = models.TfidfModel(corpus=tag_corpus)
tag_corpus_tfidf  = tag_tfidf[tag_corpus]

# LSI model fitted on the TF-IDF weighted training corpus
# NOTE(review): calc_topic_vectors queries this model with raw bag-of-words
# vectors, i.e. without applying tag_tfidf first - confirm this is intended.
tag_lsi           = models.LsiModel(corpus     = tag_corpus_tfidf,
                                    num_topics = NUM_LSI_TOPICS,
                                    id2word    = tag_dictionary)

Embed the tags of all tracks in the topic model

In [115]:
# one merged, sorted tag list per track of the complete metadata (all partitions)
md_joined_tags_all = metadata_final[TAG_SET].apply(merge_and_sort_tags, axis=1)

In [124]:
# dense LSI vectors for all tracks, plus the index labels they belong to
lsi_vectors, lsi_track_ids = calc_topic_vectors(joined_tags = md_joined_tags_all,
                                                dictionary  = tag_dictionary,
                                                topic_model = tag_lsi)

100% (504616 of 504616) |################| Elapsed Time: 0:04:33 Time:  0:04:33


In [125]:
# label each topic vector with its track id so partitions can be sliced by id
pd_lsi_vectors = pd.DataFrame(data=lsi_vectors, index=lsi_track_ids)

### Store Topic Embeddings

In [137]:
# Write one .npz archive with the topic embeddings of each evaluation partition.
for par in ("train", "val", "test"):

    # CSV file listing the track ids of this partition
    # (read without a header row, first column used as index)
    par_file     = "%s/eval_partition_trackids_%s.csv" % (EXPERIMENT_DIR, par)
    par_trackids = pd.read_csv(par_file, header=None, index_col=0)

    # select the topic vectors in the order given by the partition file
    # (should be in sync with the audio features)
    par_lsi_vectors = pd_lsi_vectors.loc[par_trackids.index]

    # the embeddings are stored as float32 together with the ids of their rows
    par_out_file   = "%s/rel_content_emb_tag_lsi_%s.npz" % (EXPERIMENT_DIR, par)
    par_embeddings = par_lsi_vectors.values.astype(np.float32)

    np.savez(file     = par_out_file,
             data     = par_embeddings,
             trackids = par_lsi_vectors.index)