In [1]:
# Glob pattern for the HDF5 embedding files; it is expanded with glob.glob
# further down and each match is opened with h5py.
embeddings_path = '../data/embeddings/*.h5'
# Path handed to classical_semantics.data.csv.links to load the Wikipedia links.
wikipedia_links_path = '../data/wikipedia-links.csv'

In [2]:
from difflib import SequenceMatcher
import glob
import re

import h5py
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from classical_semantics.data import csv

  return f(*args, **kwds)


In [3]:
# One tuple per row of the pipe-separated composers file: the row's position
# (used as the composer id) followed by the row's column values.
all_composers = [
    (composer_id, *fields)
    for composer_id, fields in enumerate(
        pd.read_csv('../data/composers.csv', sep='|').values
    )
]

In [4]:
def path_to_epoch(path):
    """Extract the epoch number encoded in an embeddings file path.

    The epoch is the first run of one or more digits in ``path`` that is
    enclosed by hyphens, i.e. the ``<epoch>`` in ``...-<epoch>-...``.

    Args:
        path: File path (string) to parse.

    Returns:
        The epoch as an ``int``.

    Raises:
        ValueError: If ``path`` contains no hyphen-delimited run of digits.
    """
    # Require at least one digit: with ``\d*`` the group could match the empty
    # string (e.g. on a double hyphen) and ``int('')`` would fail.
    match = re.search(r'-(\d+)-', path)
    if match is None:
        raise ValueError('no epoch number found in path: {!r}'.format(path))
    return int(match.group(1))


def path_to_embedding(path):
    """Read the ``doc_embeddings`` dataset from the HDF5 file at ``path``.

    Args:
        path: Path of the HDF5 file to open read-only.

    Returns:
        The full contents of the ``doc_embeddings`` dataset, loaded into memory.

    Raises:
        KeyError: If the file has no ``doc_embeddings`` dataset.
    """
    with h5py.File(path, 'r') as f:
        # ``Dataset.value`` was deprecated and later removed from h5py;
        # indexing with an empty tuple reads the whole dataset instead.
        return f['doc_embeddings'][()]

In [5]:
# Load one embedding matrix per saved file, ordered by the epoch in its name.
sorted_embeddings_paths = sorted(glob.glob(embeddings_path), key=path_to_epoch)
# Epoch 0 is an int like the parsed epochs, so the epoch axis stays numeric
# instead of mixing the string '0' with integers.
all_epochs = [0] + [path_to_epoch(path) for path in sorted_embeddings_paths]
all_embeddings = [path_to_embedding(path) for path in sorted_embeddings_paths]

# Prepend random uniform embeddings as the baseline plotted at epoch 0.
# The width follows the loaded embeddings when there are any (300 otherwise),
# and the generator is seeded so re-running the notebook gives the same baseline.
embedding_size = all_embeddings[0].shape[1] if all_embeddings else 300
random_embeddings = np.random.RandomState(0).uniform(size=(len(all_composers), embedding_size))
all_embeddings = [random_embeddings] + all_embeddings

In [6]:
def self_similarity(embeddings):
    """Return the pairwise cosine-distance matrix of ``embeddings`` with itself.

    Despite the function name, the entries are scipy cosine *distances*:
    entry ``[i, j]`` is the distance between rows ``i`` and ``j``, so smaller
    values mean more similar rows.
    """
    pairwise_distances = cdist(embeddings, embeddings, metric='cosine')
    return pairwise_distances

# One pairwise distance matrix per entry of all_embeddings.
all_similarities = list(map(self_similarity, all_embeddings))
# Row i of each ordering lists the column indices of row i of the matrix in
# ascending order of value, i.e. from the closest item to the farthest.
all_orderings = [np.argsort(distances) for distances in all_similarities]

In [7]:
# NOTE(review): csv.links is used here as a mapping from composer id to an
# indexable collection of linked ids — confirm against classical_semantics.data.csv.
wikipedia_links = csv.links(wikipedia_links_path)
# For every composer, keep the first linked id, or None when the composer has
# no entry in wikipedia_links.
most_similar_id_by_wikipedia = [
    wikipedia_links.get(composer_id, [None])[0]
    for composer_id, *_ in all_composers
]

In [8]:
def rank_or_none(ordered, target):
    """Locate ``target`` inside the 1-D array ``ordered``.

    Returns the array of positions at which ``ordered`` equals ``target``.
    When there is no such position, a one-element array holding ``None`` is
    returned instead.
    """
    (positions,) = np.where(ordered == target)
    if positions.size == 0:
        return np.array([None])
    return positions


def ranks_from_ordering(ordering):
    """Rank each composer's Wikipedia target within that composer's ordering.

    Rows of ``ordering`` are paired positionally with the module-level
    ``most_similar_id_by_wikipedia`` list, and each pair is resolved with
    ``rank_or_none``. The per-row results are stacked into one array.
    """
    per_row_ranks = []
    for row, wikipedia_target in zip(ordering, most_similar_id_by_wikipedia):
        per_row_ranks.append(rank_or_none(row, wikipedia_target))
    return np.array(per_row_ranks)


def median_from_ranks(ranks):
    """Return the median of the ranks that are not ``None``.

    Args:
        ranks: Array-like of ranks in which ``None`` marks a missing rank.
            Any shape is accepted; the values are flattened first.

    Returns:
        The median of the non-``None`` ranks as a float, or NaN when every
        rank is ``None`` (or ``ranks`` is empty).
    """
    flat = np.asarray(ranks, dtype=object).ravel()
    # Build the mask with identity checks rather than an elementwise
    # ``!= None`` comparison on the array.
    known = np.array([rank is not None for rank in flat], dtype=bool)
    if not known.any():
        return np.nan
    # Cast to float so the median is computed numerically instead of on an
    # object-dtype array.
    return np.median(flat[known].astype(float))


# Median rank of the Wikipedia targets, one value per entry of all_orderings.
ranks_over_time = [
    median_from_ranks(ranks_from_ordering(epoch_ordering))
    for epoch_ordering in all_orderings
]

In [9]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.palettes import Plasma4

output_notebook()


# The size is passed to figure() as width/height: the plot_width/plot_height
# attributes that used to be assigned after construction were removed in Bokeh 3.
plot = figure(title='Median rank by epoch',
              x_axis_label='Epoch',
              y_axis_label='Median rank',
              width=800,
              height=300)

plot.line(all_epochs, ranks_over_time, line_width=2, line_color=Plasma4[0])

# No legend location is set: the single line has no legend label, so the plot
# has no legend and assigning plot.legend.location had no effect.

show(plot)