In [1]:
# Glob patterns for the saved embedding checkpoints of the two model variants
# ("DM" and "DBOW"); they are expanded with glob.glob when the embeddings are loaded.
dm_embeddings_path = '../data/embeddings/composer-embeddings-c2v-dm*.h5'
dbow_embeddings_path = '../data/embeddings/composer-embeddings-c2v-dbow*.h5'

# CSV file with the Wikipedia links, read through classical_semantics.data.csv.links.
wikipedia_links_path = '../data/wikipedia-links.csv'

In [2]:
from difflib import SequenceMatcher
import glob
import re

import h5py
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from classical_semantics.data import csv

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Each composer becomes a tuple whose first element is its row position in the
# CSV, followed by the columns of that row.
composer_rows = pd.read_csv('../data/composers.csv', sep='|').values
all_composers = [(row_index, *row) for row_index, row in enumerate(composer_rows)]

In [4]:
def path_to_epoch(path):
    """Extract the epoch number encoded in an embedding checkpoint file name.

    The file name is expected to contain ``-<epoch>-<digits>``; the first such
    occurrence supplies the epoch.

    Args:
        path: Path to a checkpoint file. Only the final path component is
            inspected, so digits in parent directory names are never mistaken
            for the epoch.

    Returns:
        The epoch as an ``int``.

    Raises:
        ValueError: If the file name contains no ``-<epoch>-`` segment.
    """
    import os

    filename = os.path.basename(path)
    # ``\d+`` (not ``\d*``) so an empty capture such as ``--`` cannot reach int('').
    match = re.search(r'\-(\d+)\-\d*', filename)
    if match is None:
        raise ValueError('Cannot parse epoch from embedding path: {!r}'.format(path))
    return int(match.group(1))


def path_to_embedding(path):
    """Read the ``doc_embeddings`` dataset from the HDF5 file at ``path``.

    Args:
        path: Path to an HDF5 file, opened read-only.

    Returns:
        The full contents of the ``doc_embeddings`` dataset, read into memory
        before the file is closed.

    Raises:
        KeyError: If the file has no ``doc_embeddings`` dataset.
    """
    with h5py.File(path, 'r') as f:
        # ``Dataset.value`` was deprecated and later removed from h5py; indexing
        # with an empty tuple reads the whole dataset on old and new releases.
        return f['doc_embeddings'][()]

In [5]:
def _load_embeddings_by_epoch(path_pattern, n_documents, embedding_size=300, seed=0):
    """Load all embedding checkpoints matching ``path_pattern``, ordered by epoch.

    A random-uniform matrix is prepended as an "epoch 0" baseline so the
    evaluation has an untrained reference point.

    Args:
        path_pattern: Glob pattern of the checkpoint files.
        n_documents: Number of rows of the random baseline matrix.
        embedding_size: Number of columns of the random baseline matrix.
        seed: Seed for the baseline, so re-running the notebook reproduces it.

    Returns:
        A tuple ``(sorted_paths, epochs, embeddings)`` where ``epochs`` and
        ``embeddings`` have one more entry than ``sorted_paths`` (the baseline).
    """
    sorted_paths = sorted(glob.glob(path_pattern), key=path_to_epoch)
    # Epoch 0 is an int like the parsed epochs, keeping the x-axis data numeric
    # (the list previously mixed the string '0' with ints).
    epochs = [0] + [path_to_epoch(path) for path in sorted_paths]
    trained = [path_to_embedding(path) for path in sorted_paths]
    # Prepend random uniform embeddings
    baseline = np.random.RandomState(seed).uniform(size=(n_documents, embedding_size))
    return sorted_paths, epochs, [baseline] + trained


dm_sorted_embeddings_paths, dm_epochs, dm_embeddings = _load_embeddings_by_epoch(
    dm_embeddings_path, len(all_composers), seed=0)

# A different seed keeps the DBOW baseline independent of the DM one.
dbow_sorted_embeddings_paths, dbow_epochs, dbow_embeddings = _load_embeddings_by_epoch(
    dbow_embeddings_path, len(all_composers), seed=1)

In [6]:
def self_similarity(embeddings):
    """Return the pairwise cosine-distance matrix of ``embeddings`` with itself.

    Despite the name, the values are cosine *distances* as computed by
    ``scipy.spatial.distance.cdist`` with ``metric='cosine'``: smaller means
    more similar, and each row's distance to itself is zero.

    Args:
        embeddings: 2-D array with one embedding per row.

    Returns:
        Square array whose entry ``[i, j]`` is the cosine distance between
        rows ``i`` and ``j``.
    """
    pairwise_distances = cdist(embeddings, embeddings, 'cosine')
    return pairwise_distances

# One pairwise-distance matrix per checkpoint; argsort of each row then lists
# the other documents from closest to farthest.
all_similarities_dm = list(map(self_similarity, dm_embeddings))
all_orderings_dm = [np.argsort(distances) for distances in all_similarities_dm]

all_similarities_dbow = list(map(self_similarity, dbow_embeddings))
all_orderings_dbow = [np.argsort(distances) for distances in all_similarities_dbow]

In [7]:
wikipedia_links = csv.links(wikipedia_links_path)
# For every composer, take the first linked id; composers without an entry in
# ``wikipedia_links`` get None.
# NOTE(review): presumably ``wikipedia_links`` maps a composer id to a list of
# linked composer ids — confirm against classical_semantics.data.csv.links.
most_similar_id_by_wikipedia = [
    wikipedia_links.get(composer_id, [None])[0]
    for composer_id, *_ in all_composers
]

In [8]:
def rank_or_none(ordered, target):
    """Locate ``target`` inside the array ``ordered``.

    Args:
        ordered: 1-D array of ids.
        target: The id to look for.

    Returns:
        An array with the position(s) at which ``target`` occurs in
        ``ordered``, or ``np.array([None])`` when it does not occur.
    """
    (positions,) = np.where(ordered == target)
    if positions.size:
        return positions
    return np.array([None])


def ranks_from_ordering(ordering):
    """Rank each composer's Wikipedia target within that composer's row of ``ordering``.

    Rows of ``ordering`` are paired positionally with the module-level
    ``most_similar_id_by_wikipedia`` list.

    Args:
        ordering: Iterable of 1-D id arrays, one per composer.

    Returns:
        Array built from the per-row results of ``rank_or_none``.
    """
    per_row_ranks = []
    for row, wikipedia_target in zip(ordering, most_similar_id_by_wikipedia):
        per_row_ranks.append(rank_or_none(row, wikipedia_target))
    return np.array(per_row_ranks)


def median_from_ranks(ranks):
    """Return the median of ``ranks`` after discarding ``None`` entries.

    Args:
        ranks: Array of ranks; singleton dimensions are squeezed away first.

    Returns:
        The median of the remaining entries, as computed by ``np.median``.
    """
    flat_ranks = np.squeeze(ranks)
    # Elementwise comparison is intended here, so ``!=`` rather than ``is not``.
    has_rank = flat_ranks != None
    return np.median(flat_ranks[has_rank])


# Median rank of the Wikipedia target for each checkpoint, in epoch order.
ranks_over_time_dm = [
    median_from_ranks(ranks) for ranks in map(ranks_from_ordering, all_orderings_dm)
]
ranks_over_time_dbow = [
    median_from_ranks(ranks) for ranks in map(ranks_from_ordering, all_orderings_dbow)
]

In [12]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Plasma4

output_notebook()


# Median rank of the Wikipedia target per epoch, one line per model variant.
plot = figure(
    title='Median rank by epoch',
    x_axis_label='Epoch',
    y_axis_label='Median rank',
)
plot.plot_width = 800
plot.plot_height = 300

# NOTE(review): ``legend=`` and ``plot_height``/``plot_width`` are deprecated in
# newer Bokeh releases (``legend_label``, ``height``, ``width``) — update these
# when the environment's Bokeh version is upgraded.
series = (
    (dm_epochs, ranks_over_time_dm, Plasma4[0], 'DM'),
    (dbow_epochs, ranks_over_time_dbow, Plasma4[1], 'DBOW'),
)
for epochs, median_ranks, colour, label in series:
    plot.line(epochs, median_ranks, line_width=2, line_color=colour, legend=label)

plot.legend.location = 'bottom_right'

show(plot)