In [None]:
%load_ext wurlitzer

from time import time
from os.path import join, split, splitext
from glob import glob

import numpy as np
import tensorflow as tf

from sklearn.decomposition import PCA
from MulticoreTSNE import MulticoreTSNE as TSNE

import matplotlib.pyplot as plt


# Eager execution has to be switched on explicitly in TensorFlow 1.x.
# TensorFlow 2.x runs eagerly by default and no longer exposes this function
# at the top level, so only call it when it is actually available.
if hasattr(tf, 'enable_eager_execution'):
    tf.enable_eager_execution()
# Check eager execution is enabled
print(tf.executing_eagerly())        # => True

## Edit DATASET_PATH here

In [None]:
# Root directory of the dataset; every other path below is derived from it.
DATASET_PATH = '/home/sean/Storage/Datasets/Birp'

# One sub-directory per file type found in the dataset.
MP3_PATH = join(DATASET_PATH, 'mp3')
WAV_PATH = join(DATASET_PATH, 'wav')
MSPEC_PATH = join(DATASET_PATH, 'melspecs')
ZOOMED_PATH = join(DATASET_PATH, 'zoomed')
FLAT_PATH = join(DATASET_PATH, 'flat')

# glob() returns matches in arbitrary, filesystem-dependent order. Later cells
# identify songs by their position in these lists (plot labels, STARTING_SONG,
# CSV ids), so sort them to keep those indices stable between runs.
mp3_files = sorted(glob(join(MP3_PATH, '*.mp3')))
wav_files = sorted(glob(join(WAV_PATH, '*.wav')))
npy_files = sorted(glob(join(MSPEC_PATH, '*.npy')))
zoomed_files = sorted(glob(join(ZOOMED_PATH, '*.npy')))
flat_files = sorted(glob(join(FLAT_PATH, '*.npy')))

## Load the data

This will create an N x 320000 matrix, with N being the number of songs.

In [None]:
# Read every flattened spectrogram from disk, in the order of flat_files.
data = [np.load(flat_path) for flat_path in flat_files]

# Stack the per-song arrays into a single array with one row per song.
X = np.array(data)

## Running PCA and t-SNE

First we run PCA over the data to reduce the dimensionality to 100. This makes pairwise comparison in t-SNE much faster, while having minimal impact on the final result. 

t-SNE is run using MulticoreTSNE, which generally performs better than the implementation in scikit-learn, even if there's only one core.

In [None]:
# Both PCA (with a randomized solver) and t-SNE are stochastic. Later cells
# refer to songs by their position in the embedding, so fix the seed to make
# the layout repeatable as far as the libraries allow.
RANDOM_SEED = 42

# PCA cannot return more components than min(n_samples, n_features); cap the
# requested 100 so that small song collections do not raise an error.
n_components = min(100, *X.shape)

start_time = time()
# Note: `pca` holds the reduced data matrix, not the fitted PCA estimator.
pca = PCA(n_components=n_components, random_state=RANDOM_SEED).fit_transform(X)
print("PCA finished in {0:.2f} seconds.".format(time() - start_time))

start_time = time()
embedded = TSNE(n_jobs=8, n_iter=3000, verbose=2, random_state=RANDOM_SEED).fit_transform(pca)
# To reuse a previously saved embedding instead of recomputing it:
# embedded = np.load('embedded.npy')
print("t-SNE finished in {0:.2f} seconds.".format(time() - start_time))

In [None]:
# One large white canvas for the whole collection.
fig = plt.figure(figsize=(18, 16), dpi=80, facecolor='w', edgecolor='k')

xs, ys = embedded[:, 0], embedded[:, 1]
plt.scatter(xs, ys)

# Label every point with its row index so a song can be looked up by number.
for index, point in enumerate(embedded):
    plt.annotate(index, point)

plt.show()

## (WIP) Maybe try clustering them?

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the raw feature matrix X.
# NOTE(review): this runs on the unreduced data rather than `pca` or
# `embedded`; the notebook marks this cell as not working yet - confirm
# which input and eps are intended.
dbscan = DBSCAN(eps=1, min_samples=5, n_jobs=7)
clustering = dbscan.fit_predict(X)

print(clustering)

## Grab a cluster that includes a song

The above clustering code is not working yet. In the meantime, I wrote this simplified version of DBSCAN that starts on a song that you specify (`song_id`) and does an outward search based on a hop distance that you provide (`threshold`), until there are no more reachable songs or the desired cluster size (`max_num`) is reached.

In [10]:
def get_cluster(song_id, embedding, threshold=1, done=None, max_num=50):
    """Collect the songs reachable from ``song_id`` through short hops.

    The search starts at ``song_id`` and expands breadth-first: a song joins
    the cluster when its Euclidean distance to a song that is already in the
    cluster is strictly smaller than ``threshold``. The search stops when no
    further song is reachable or when the cluster holds ``max_num`` songs.

    Args:
        song_id: Row index of the starting song in ``embedding``.
        embedding: Array-like with one row of coordinates per song.
        threshold: Exclusive upper bound on the distance of a single hop.
        done: Optional set of row indices that count as already collected.
            They are part of the returned cluster and are not expanded again.
            The set passed in is not modified.
        max_num: Maximum number of songs in the returned cluster.

    Returns:
        A set of integer row indices. It contains ``song_id`` itself unless
        ``done`` already holds ``max_num`` or more entries.

    Raises:
        IndexError: If ``song_id`` is not a valid row index of ``embedding``.
    """
    # Local import keeps the cell self-contained.
    from collections import deque

    points = np.asarray(embedding)
    n_songs = len(points)
    if not -n_songs <= song_id < n_songs:
        raise IndexError("song_id {} is out of range for {} songs".format(song_id, n_songs))
    # Map negative indices onto their positive equivalent so that one song is
    # never tracked under two different ids.
    song_id = int(song_id) % n_songs

    # One flat coordinate vector per song, whatever the per-song shape is.
    points = points.reshape(n_songs, -1)

    cluster = set() if done is None else set(done)
    if len(cluster) >= max_num:
        return cluster

    cluster.add(song_id)
    frontier = deque([song_id])

    # An explicit queue instead of recursion: no recursion-depth limit, and
    # every neighbour of a song is visited, not just the first one found.
    while frontier and len(cluster) < max_num:
        current = frontier.popleft()
        distances = np.linalg.norm(points - points[current], axis=1)
        for neighbour in np.flatnonzero(distances < threshold):
            neighbour = int(neighbour)
            if neighbour in cluster:
                continue
            cluster.add(neighbour)
            frontier.append(neighbour)
            if len(cluster) >= max_num:
                break

    return cluster

## Actually grab the songs in a cluster

Point `WHERE_TO_PUT_SONGS` to your desired path. Set the other variables according to your choice. This code finds songs in a cluster that includes `STARTING_SONG` and *copies* them to `WHERE_TO_PUT_SONGS`.

In [None]:
import subprocess

# Row index (into `embedded` / `flat_files`) of the song the cluster search starts from.
STARTING_SONG = 606
# Passed to get_cluster as `max_num`.
MAX_NUM = 50
# Passed to get_cluster as `threshold`, i.e. the hop distance in the embedded space.
THRESHOLD = 0.7
# Destination handed to `cp` for every song of the cluster.
WHERE_TO_PUT_SONGS = '/home/sean/Music'

def find_song_file(i):
    """Return the WAV path that belongs to the flattened spectrogram ``i``.

    The file name of ``flat_files[i]`` (without directory and extension) is
    looked up in ``wav_files``. A WAV file with exactly the same base name is
    preferred; if there is none, the first WAV path that merely contains the
    name is returned.

    Args:
        i: Index into ``flat_files``.

    Returns:
        The matching entry of ``wav_files``, or ``None`` when nothing matches.
    """
    song_name = splitext(split(flat_files[i])[1])[0]

    # Exact base-name match first: a plain substring test can hit the wrong
    # file when one song name is contained in another path.
    for song_path in wav_files:
        if splitext(split(song_path)[1])[0] == song_name:
            return song_path

    # Fall back to the looser substring match for differently named WAV files.
    for song_path in wav_files:
        if song_name in song_path:
            return song_path

    return None
        
# Sorted so that songs are processed in a repeatable order.
cluster = sorted(get_cluster(STARTING_SONG, embedded, max_num=MAX_NUM, threshold=THRESHOLD))

print("Got", len(cluster), "songs.")

for song_id in cluster:
    song_path = find_song_file(song_id)
    if song_path is None:
        # Without a source path `cp` cannot be invoked at all; report and move on.
        print("No WAV file found for song", song_id, "- skipped.")
        continue

    result = subprocess.run(['cp', song_path, WHERE_TO_PUT_SONGS])
    if result.returncode != 0:
        # Make a failed copy visible instead of silently ignoring it.
        print("Copying", song_path, "failed with exit code", result.returncode)

## Export a CSV of the results

In [None]:
import re

# File names are expected to look like "<3 digits> - <author> - <title>".
name_pattern = re.compile('[0-9][0-9][0-9] - (.+) - (.+)')

# The context manager closes the file even if a row fails half-way through.
with open('embedded.csv', 'w', encoding='utf-8') as f:
    for i, v in enumerate(embedded):
        song_path = find_song_file(i)
        if song_path is None:
            print("No WAV file found for song", i, "- row skipped.")
            continue

        song_name = splitext(split(song_path)[1])[0]
        match = name_pattern.search(song_name)
        if match is None:
            print("Unexpected file name for song", i, "- row skipped:", song_name)
            continue

        author, title = match.groups()
        # Double any embedded quote so the quoted fields stay well-formed.
        title = title.replace('"', '""')
        author = author.replace('"', '""')
        line = "{}, \"{}\", \"{}\", {}, {}\n".format(i, title, author, v[0], v[1])
        f.write(line)