In [2]:
import collections
from difflib import SequenceMatcher
import functools
import itertools
import math
import pickle
import random
import re

from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import requests
from scipy.spatial import distance
import tensorflow as tf

from keras.callbacks import EarlyStopping
from keras.layers import Average, Concatenate, Dense, Embedding, Flatten, Input, Lambda
from keras.models import Model
from keras.optimizers import SGD
from keras.utils import to_categorical

# Fetch the 'punkt' NLTK data package (presumably what word_tokenize needs -- confirm).
# Per the log output below this does nothing when the package is already up to date.
nltk.download('punkt')

  from ._conv import register_converters as _register_converters


[nltk_data] Downloading package punkt to /home/sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using TensorFlow backend.


True

In [3]:
#@title Composer data and Wikipedia page helpers
def composers():
  """Yield one tuple per composer row of ``../data/composers.csv``.

  The first line of the file is treated as a header and skipped. Every
  following line is stripped, split on ``'|'`` and prefixed with its
  zero-based position among the data rows, which serves as the composer id.

  Yields:
    tuple: ``(composer_id, field_0, field_1, ...)`` with string fields.
  """
  # Decode explicitly as UTF-8: composer names contain non-ASCII characters
  # and the platform default encoding is not UTF-8 everywhere.
  with open('../data/composers.csv', 'r', encoding='utf-8') as f:
    # Skip headers. The default keeps an empty file from raising
    # StopIteration inside this generator (a RuntimeError under PEP 479).
    next(f, None)
    for composer_id, line in enumerate(f):
      yield (composer_id, *line.strip().split('|'))
      
# Materialise the generator once so the rows can be indexed repeatedly.
all_composers = [row for row in composers()]

# Row lookup keyed by the numeric id stored in the first field of each row.
id_to_composer = dict((row[0], row) for row in all_composers)
# Row lookup keyed by the first three fields of each row.
# NOTE(review): despite the name, the values are whole rows, not ids -- confirm intent.
composer_to_id = dict(((row[0], row[1], row[2]), row) for row in all_composers)

# Note that this loses the fact that there are multiple composers with the same name
name_to_composer = dict((row[1], row) for row in all_composers)

@functools.lru_cache(maxsize=None)
def soup(url):
  """Download ``url`` and parse the response body as HTML.

  Results are memoised per URL. The cache is unbounded because the default
  128-entry LRU is smaller than the set of composer pages that the training
  loop cycles through, which would turn every lookup into a fresh download.

  Args:
    url: Address of the page to fetch.

  Returns:
    The parsed ``BeautifulSoup`` document, or ``None`` when the server
    answers with an HTTP error status (the ``None`` is cached as well).

  Raises:
    requests.RequestException: For failures other than an HTTP error
      status, including the timeout set below.
  """
  try:
    # Bound the wait so one unresponsive server cannot stall the caller forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
  except requests.HTTPError:
    return None
  
@functools.lru_cache()
def wiki_links(soup):
  """Collect the ``title`` attribute of every anchor that points into the wiki.

  An anchor qualifies when its ``href`` is non-empty and starts with
  ``/wiki``. Qualifying anchors without a ``title`` contribute ``None``.
  A ``None`` document produces an empty list. Results are memoised.
  """
  if soup is None:
    return []

  def points_into_wiki(anchor):
    target = anchor.get('href')
    return bool(target) and target.startswith('/wiki')

  return [anchor.get('title')
          for anchor in soup.find_all('a')
          if points_into_wiki(anchor)]

@functools.lru_cache(maxsize=None)
def text(soup):
  """Return the paragraph text of a parsed page with bracketed numeric markers removed.

  The strings of every ``<p>`` inside the element with class
  ``mw-parser-output`` are joined with single spaces, then every ``[`` +
  optional digits + ``]`` sequence (e.g. ``[12]``) is deleted.

  Results are memoised without a size bound: the default 128-entry LRU is
  smaller than the set of composer pages cycled through during training,
  so it would never produce a hit there.

  Args:
    soup: Parsed document, or ``None``.

  Returns:
    The extracted text; ``''`` when ``soup`` is ``None`` or the page has no
    ``mw-parser-output`` element.
  """
  if soup is None:
    return ''

  content = soup.find(attrs={"class": "mw-parser-output"})
  if content is None:
    # No article container on this page: treat it like a missing page
    # instead of failing with AttributeError on None.
    return ''

  paragraphs = content.find_all('p')
  all_text = ' '.join(itertools.chain.from_iterable(para.stripped_strings for para in paragraphs))
  # Raw string: '\[' and '\d' are not valid escapes in a plain string literal.
  return re.sub(r'\[\d*\]', '', all_text)

@functools.lru_cache(maxsize=None)
def tokens(text):
  """Split ``text`` into word tokens with NLTK's ``word_tokenize``.

  Results are memoised without a size bound: the default 128-entry LRU is
  smaller than the set of composer pages cycled through during training,
  so each page would otherwise be re-tokenized on every pass.

  The cached list object itself is handed back on repeated calls, so
  callers must not mutate it.
  """
  return word_tokenize(text)

In [4]:
#@title Choose a composer

# Capture inputs

# Free-text composer name; the closest-matching entry of all_composers is picked further down in this cell.
input_name = 'Antonio Vivaldi' #@param 
# Which recommender to run; the accepted values are the ones listed in the #@param annotation.
model_name = "doc2vec" #@param ["wikipedia-links", "spotify", "doc2vec"]

def similar(a, b):
    """Score how alike two sequences are via ``SequenceMatcher.ratio``.

    No junk filter is supplied (``isjunk`` is ``None``).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# Treat the entry whose name field is most similar to input_name as the intended composer.
composer = max(all_composers, key=lambda candidate: similar(candidate[1], input_name))
composer_id = composer[0]

# Echo the match so a wrong guess is easy to spot.
print('Assuming {}: born {}; died {}; composer_id: {}'.format(
    composer[1], composer[2], composer[3], composer_id))
# The last field of the row (a Wikipedia URL in the output below).
print(composer[-1])


Assuming Antonio Vivaldi: born 1678; died 1741; composer_id: 3379
https://en.wikipedia.org/wiki/Antonio_Vivaldi


In [5]:
# Wikipedia and Spotify baselines

def similar_wikipedia_links(composer_id):
  """Rank composers by how often the given composer's page links to them.

  The page stored in the last field of the composer's row is fetched, the
  titles of its wiki links are counted, and titles that match a known
  composer name are translated to composer ids.

  Args:
    composer_id: Key into ``id_to_composer``.

  Returns:
    A generator of composer ids, most frequently linked first. Link titles
    that are not keys of ``name_to_composer`` are dropped.

  Raises:
    KeyError: If ``composer_id`` is not in ``id_to_composer``.
  """
  print('Using wikipedia-links model')
  composer = id_to_composer[composer_id]

  source_url = composer[-1]
  # Only the links are needed here; the page text is deliberately not
  # tokenized because nothing in this function consumes it.
  source_links = wiki_links(soup(source_url))

  # most_common() orders titles from most to least frequently linked.
  link_counts = collections.Counter(source_links)

  return (name_to_composer[link][0]
          for link, _ in link_counts.most_common()
          if link in name_to_composer)


def similar_spotify(composer_id):
  """Stub recommender: announces the model and recommends nothing.

  ``composer_id`` is accepted but not used.
  """
  print('Using spotify model')
  no_recommendations = []
  return no_recommendations

In [None]:
# Build vocab

MAX_VOCAB_SIZE = 10000

def build_vocab():
    """Build an id -> token mapping from the tokens of every composer page.

    Tokens are counted across all composers; the ``MAX_VOCAB_SIZE`` most
    common ones receive ids 0, 1, 2, ... in descending frequency order, and
    ``'<unk>'`` is appended under the next free id.
    """
    frequencies = collections.Counter()
    for composer in all_composers:
        # The last field of a composer row is the page URL.
        frequencies.update(tokens(text(soup(composer[-1]))))

    ranked_tokens = (token for token, _ in frequencies.most_common(MAX_VOCAB_SIZE))
    id_to_token = dict(enumerate(ranked_tokens))
    id_to_token[len(id_to_token)] = '<unk>'
    return id_to_token

# Build the vocabulary from every composer page (each page goes through soup()).
vocab = build_vocab()

# Serialise the mapping next to the notebook, named after the vocabulary size.
path = '{}.vocab'.format(MAX_VOCAB_SIZE)
with open(path, 'wb') as f:
    # pickle.dump returns None, so there is nothing worth binding here.
    pickle.dump(vocab, f)

# NOTE(review): `files` is not imported anywhere in the visible notebook; this looks like
# Colab's file-download helper -- confirm and import it before running this cell.
files.download(path)

In [7]:
# Load vocab from disk
MAX_VOCAB_SIZE = 10000

# NOTE: unpickling can execute arbitrary code; only load vocab files you produced yourself.
with open('../data/{}.vocab'.format(MAX_VOCAB_SIZE), mode='rb') as f:
    vocab = pickle.load(f)

# One slot beyond MAX_VOCAB_SIZE is tolerated by this check.
assert MAX_VOCAB_SIZE + 1 >= len(vocab), 'Loaded vocab must match max vocab size'

In [10]:
# Lower-cased token -> id. Tokens that differ only by case collapse onto one key,
# and the id that comes last in vocab's iteration order is the one that is kept.
vocab_lower = dict((token.lower(), token_id) for token_id, token in vocab.items())


def token_to_token_id(token):
    """Look up the id of ``token`` case-insensitively.

    Tokens missing from ``vocab_lower`` map to the id stored under ``'<unk>'``.
    """
    lowered = token.lower()
    unknown_id = vocab_lower['<unk>']
    if lowered in vocab_lower:
        return vocab_lower[lowered]
    return unknown_id


def token_ids(tokens):
    """Translate an iterable of tokens into a list of ids via ``token_to_token_id``."""
    return list(map(token_to_token_id, tokens))

 
def training_data(window_size, vocab_size):
    """Endlessly generate ``(composer_id, context_ids, target)`` training samples.

    Composers are visited in order, over and over. For each visit one target
    position is drawn at random such that ``window_size // 2`` tokens exist
    on both sides of it; those neighbours (left half followed by right half)
    form the context. Composers whose page has ``window_size`` tokens or
    fewer are skipped.

    Args:
        window_size: Total number of context tokens; must be even.
        vocab_size: Passed to ``to_categorical`` as ``num_classes`` for the target id.

    Yields:
        tuple: composer id, list of ``window_size`` context token ids, and the
        target id encoded by ``to_categorical``.
    """
    assert window_size % 2 == 0, 'window_size must be even'
    half_window = window_size // 2

    for composer in itertools.cycle(all_composers):
        page_url = composer[-1]
        ids = token_ids(tokens(text(soup(page_url))))
        token_count = len(ids)

        # Not enough tokens to place a full window around any target.
        if not token_count > window_size:
            continue

        # randint is inclusive on both ends, hence the trailing "- 1".
        centre = random.randint(half_window, (token_count - half_window) - 1)
        centre_id = ids[centre]

        before = ids[centre - half_window:centre]
        after = ids[centre + 1:centre + half_window + 1]

        yield composer[0], before + after, to_categorical(centre_id, num_classes=vocab_size)
    
    
def batch(data, batch_size=32):
    """Group samples from ``data`` into batches of NumPy arrays.

    Args:
        data: Iterable of ``(composer_id, context_window, target_ids)`` triples.
            It may be an endless generator or any finite iterable.
        batch_size: Maximum number of samples per batch.

    Yields:
        tuple: ``([composer_ids, context_windows], targets)`` where each
        element is a NumPy array built from up to ``batch_size`` samples.
        When ``data`` runs out, the last batch may be shorter and the
        generator then stops instead of emitting empty batches forever.
    """
    # One shared iterator makes islice resume where the previous batch
    # stopped, even when `data` is a re-iterable such as a list.
    samples = iter(data)

    while True:
        chunk = list(itertools.islice(samples, batch_size))
        if not chunk:
            # Source exhausted: end the generator.
            return

        x_1 = []
        x_2 = []
        y = []

        for composer_id, context_window, target_ids in chunk:
            x_1.append(composer_id)
            x_2.append(context_window)
            y.append(target_ids)

        yield [np.array(x_1), np.array(x_2)], np.array(y)


def build_model(window_size, vocab_size, num_composers):
    """Build and compile the network that predicts a target word from its context and composer.

    Args:
        window_size: Number of context token ids fed in per sample.
        vocab_size: Size of the word embedding table and of the softmax output.
        num_composers: Size of the composer embedding table.

    Returns:
        A compiled Keras ``Model`` taking ``[composer_input, sequence_input]``
        and producing a softmax over ``vocab_size`` classes.
    """
    # Two inputs per sample: window_size context token ids and a single composer id.
    sequence_input = Input(shape=(window_size,))
    composer_input = Input(shape=(1,))
  
    # Both kinds of id are embedded into 300-dimensional vectors, each with its own table.
    embedded_sequence = Embedding(input_dim=vocab_size, output_dim=300, input_length=window_size)(sequence_input)
    embedded_composer = Embedding(input_dim=num_composers, output_dim=300, input_length=1)(composer_input)
  
  
    # Stack the composer vector in front of the context vectors along axis 1, cut the
    # stack back into window_size + 1 pieces, average the pieces and drop axis 1.
    # Net effect (assuming Embedding outputs are (batch, length, 300) -- TODO confirm):
    # the mean of the composer vector and the context word vectors.
    embedded = Concatenate(axis=1)([embedded_composer, embedded_sequence])
    split = Lambda(lambda t: tf.split(t, window_size + 1, axis=1))(embedded)
    averaged = Average()(split)
    squeezed = Lambda(lambda t: tf.squeeze(t, axis=1))(averaged)
  
    # Predict a distribution over the vocabulary from the averaged vector.
    softmax = Dense(vocab_size, activation='softmax')(squeezed)
  
    # Input order is composer first, sequence second -- the same order batch() yields them in.
    model = Model(inputs=[composer_input, sequence_input], outputs=softmax)
  
    # SGD with Nesterov momentum; the categorical loss matches one-hot targets.
    sgd = SGD(lr=0.001, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy'])
    
    return model


def train_model(model, window_size, vocab_size, steps_per_epoch=200, epochs=100,
                patience=10, workers=1, use_multiprocessing=False):
    """Train ``model`` on batches drawn from ``training_data``.

    Args:
        model: Object exposing Keras' ``fit_generator``.
        window_size: Forwarded to ``training_data``.
        vocab_size: Forwarded to ``training_data``.
        steps_per_epoch: Batches per epoch.
        epochs: Upper bound on the number of epochs.
        patience: Epochs without improvement of the training loss before
            early stopping ends the run.
        workers: Number of generator workers handed to ``fit_generator``.
        use_multiprocessing: Whether ``fit_generator`` may use processes.

    Returns:
        Whatever ``model.fit_generator`` returns (the Keras history object).

    The data source is a plain Python generator. With several process-based
    workers each worker would run its own copy of that generator, which can
    duplicate samples and defeats the per-process page caches, so the
    defaults keep generation in a single worker.
    """
    stop_when_loss_stalls = EarlyStopping(monitor='loss', patience=patience)

    history = model.fit_generator(
        batch(training_data(window_size, vocab_size)),
        callbacks=[stop_when_loss_stalls],
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        use_multiprocessing=use_multiprocessing,
        workers=workers)

    return history

    
# 1. Produce a sliding window of context-target pairs from the input tokens.
# 2. Sample a random context-target pair from each composer in turn.
# 3. Feed it into the network.
# 4. Therefore a batch is: input => (batch_size x 1) composer ids and (batch_size x window_size) context token ids;
#    output => (batch_size x vocab_size) one-hot targets.
# 5. Later, extract the learned composer embeddings as our vectors.


# Number of context tokens around each target; training_data asserts that it is even.
window_size = 8
# One output class / word-embedding row per vocab entry.
vocab_size = len(vocab)
# One composer-embedding row per composer row.
num_composers = len(all_composers)

# The model must exist before training starts; training runs until early stopping,
# the epoch limit, or a manual interrupt.
model = build_model(window_size, vocab_size, num_composers)
history = train_model(model, window_size, vocab_size)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100

KeyboardInterrupt: 

In [14]:
# Persist the trained model. NOTE(review): the hyperparameters in the file name are
# typed by hand (epochs-47 presumably reflects the interrupted run) -- keep them in sync.
model.save('../models/d2v-sgd-lr0.001-epochs-47-steps-200-vocab-10000.h5')

In [11]:
def model_to_composer_embeddings(model):
    """Return the first weight array of ``model.layers[2]`` as a NumPy array.

    NOTE(review): index 2 is presumably the composer Embedding layer of the
    model built by ``build_model`` -- confirm against ``model.summary()``.
    """
    embedding_layer = model.layers[2]
    layer_weights = embedding_layer.get_weights()
    return np.array(layer_weights[0])

# Learned vectors taken from the trained model; similar_doc2vec indexes this array by composer id.
composer_embeddings = model_to_composer_embeddings(model)

In [12]:
def similar_doc2vec(composer_id):
    """Rank every other composer by cosine distance to the given composer's embedding.

    Args:
        composer_id: Key into ``id_to_composer``.

    Returns:
        list: Ids of all composers except the given one, closest first.
    """
    print('Using doc2vec model')
    target_id = id_to_composer[composer_id][0]
    target_vector = composer_embeddings[target_id]

    def distance_to_target(candidate_id):
        # Smaller cosine distance means more similar, so ascending order ranks best first.
        return distance.cosine(target_vector, composer_embeddings[candidate_id])

    ranked_ids = [entry[0] for entry in all_composers if entry[0] != target_id]
    ranked_ids.sort(key=distance_to_target)
    return ranked_ids

TODO
---

1. Create batch of data (word ids, composer ids, windows)
2. Save
3. Run for an epoch of training time
4. Early stopping criterion
5. At test time, calculate the similarity between each embedded composer and all other composers?
6. Possibly, have validation criterion based on wikipedia baseline
7. Preload Google News embeddings


In [13]:
# Pick and execute model; print results

def similarity_function(model_name):
    """Return the recommender function selected by ``model_name``.

    Args:
        model_name: One of ``'wikipedia-links'``, ``'spotify'`` or ``'doc2vec'``.

    Returns:
        The matching ``similar_*`` function; it takes a composer id.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'wikipedia-links':
        return similar_wikipedia_links
    elif model_name == 'spotify':
        return similar_spotify
    elif model_name == 'doc2vec':
        return similar_doc2vec
    else:
        # Report the rejected argument, not the unrelated module-level `model` object.
        raise ValueError('Invalid model: {}'.format(model_name))


print('You might like:')
# Resolve the recommender first, then show only its first three suggestions.
recommend = similarity_function(model_name)
for similar_id in itertools.islice(recommend(composer_id), 3):
    similar_composer = id_to_composer[similar_id]
    similar_name, similar_url = similar_composer[1], similar_composer[-1]
    print('{} ({})'.format(similar_name, similar_url))


You might like:
Using doc2vec model
Nicolao Dorati (https://en.wikipedia.org/wiki/Nicolao_Dorati)
Elfrida Andrée (https://en.wikipedia.org/wiki/Elfrida_Andr%C3%A9e)
Emmanuel Fisher (https://en.wikipedia.org/wiki/Emmanuel_Fisher)
