In [472]:
import numpy as np
import inspect
import os
import sklearn
import math
from sklearn.manifold import TSNE
from sklearn import manifold, mixture, cluster
from sqlalchemy import func, select as sqlselect, distinct, text as sqltext
from matplotlib import pylab
import matplotlib.colors as colors
import matplotlib.cm as cmx
import networkx as nx
import pygraphviz
from networkx.drawing.nx_agraph import graphviz_layout
import functools
import random
import operator
from itertools import islice

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
os.sys.path.insert(0,parentdir)

from api import app, db
import importlib
#importlib.reload(models)
#Genre, Artist, Song, ArtistGenres, SongTracks, SongGroup, Group, SimilarGenre
from server.models import *
from server import song_helper
importlib.reload(song_helper)

%matplotlib inline
genres, genres_name = song_helper.db_get_genres()
genres[0] = '-unk-'

In [None]:
feature_names = ['energy', 'liveness', 'tempo', 'speechiness', 'acousticness', 'instrumentalness', 'loudness', 
                'valence', 'danceability', 'key', 'mode', 'time_signature']


def select_top_songs():
    return db.session.query(Song).filter(Song.IsToplistSong == 1)
def count_top_songs():
    return db.session.query(func.count(Song.SongId)).filter(Song.IsToplistSong == 1).scalar()



def select_song_group(group_id):
    def select_song_group_i():
        return db.session.query(Song).join(SongGroup).filter(SongGroup.GroupId == group_id)
    return select_song_group_i


def count_song_group(group_id):
    def count_song_group_i():
        return db.session.query(func.count(Song.SongId)).join(SongGroup).filter(SongGroup.GroupId == group_id).scalar()
    return count_song_group_i


def select_song_group_type(group_type):
    def select_song_group_type_i():
        return db.session.query(Song).join(SongGroup).join(Group).filter(Group.Type == group_type).distinct()
    return select_song_group_type_i


def count_song_group_type(group_type):
    def count_song_group_type_i():
        return db.session.query(func.count(distinct(Song.SongId))).join(SongGroup).join(Group).filter(Group.Type == group_type).scalar()
    return count_song_group_type_i


song_in_genre_q = "EXISTS (SELECT 1 FROM Artists a JOIN ArtistGenres ag ON a.ArtistId "\
                  "= ag.ArtistId WHERE a.ArtistId = Songs.ArtistId AND ag.GenreId = %i)"
def select_song_in_genre(genre_id, limit):
    def select_song_in_genre_i():
        return db.session.query(Song).filter(sqltext(song_in_genre_q % genre_id)).order_by(Song.Hotness).limit(limit)
    return select_song_in_genre_i


def count_song_in_genre(genre_id, limit):
    def count_song_in_genre_i():
        return min(db.session.query(func.count(distinct(Song.SongId))).filter(sqltext(song_in_genre_q % genre_id)).scalar(), limit)
                        
    return count_song_in_genre_i


def load_songs(select, count):
    songs_count = count()
    # print('will load %i songs' % songs_count)    
    song_names = {}
    song_features = np.ndarray((songs_count, 16), dtype=np.float32)    
    row = 0
    for song in select():
        song_features[row, 0] = isnull(song.AS_energy)
        song_features[row, 1] = isnull(song.AS_liveness)
        song_features[row, 2] = isnull(song.AS_tempo)
        song_features[row, 3] = isnull(song.AS_speechiness)
        song_features[row, 4] = isnull(song.AS_acousticness)
        song_features[row, 5] = isnull(song.AS_instrumentalness)
        song_features[row, 6] = isnull(song.AS_loudness)
        song_features[row, 7] = isnull(song.AS_valence)
        song_features[row, 8] = isnull(song.AS_danceability)
        song_features[row, 9] = isnull(song.AS_key)
        song_features[row, 10] = isnull(song.AS_mode)
        song_features[row, 11] = isnull(song.AS_time_signature)
        song_features[row, 12] = song.DurationMs
        song_features[row, 13] = song.SongId
        song_features[row, 14] = 0 if song.Genre is None else song.GenreId
        song_features[row, 15] = song.ArtistId
        
        song_names[song.SongId] = song.Name        
        row += 1
    if row != songs_count:
        raise Exception('got %i songs, expected %i' % (row, songs_count))
        
    return song_features, song_names


def load_artist_genres():
    s = sqlselect([ArtistGenres.GenreId, ArtistGenres.ArtistId]).order_by(ArtistGenres.ArtistId)\
                                                                .order_by(ArtistGenres.Ord)
    rows = db.session.execute(s).fetchall()
    a_genres = {}
    for row in rows:
        if row[1] not in a_genres:
            a_genres[row[1]] = [row[0]]
        else:
            a_genres[row[1]].append(row[0])
    return a_genres


def load_similar_genres():
    s = sqlselect([SimilarGenre])
    rows = db.session.execute(s).fetchall()
    s_genres = {}
    for row in rows:
        if row[0] not in s_genres:
            s_genres[row[0]] = [(row[1], row[2])]
        else:
            s_genres[row[0]].append((row[1], row[2]))
    return s_genres


def load_similar_artists(artist_id):
    txt = 'SELECT SimilarArtistId FROM SimilarArtists WHERE ArtistId = %i ORDER BY Dist' % artist_id    
    s = sqltext(txt)    
    rows = db.session.execute(s).fetchall()    
    return [row[0] for row in rows]
    

def get_artists_name(artist_id):
    s = sqltext('SELECT Name FROM Artists WHERE ArtistId = %i' % artist_id)
    row = db.session.execute(s).fetchone()
    return row[0]
    

def isnull(v, r=0):
    return v if v is not None else 0
    

def plot(projection, labels, annotate=True, color=None, plot_size=15):
    #assert projection.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(plot_size, plot_size))  # in inches
    for l_id in labels:
        x, y = projection[l_id-1,:]
        if color is not None:
            pylab.scatter(x,y, c=color[i], s=500, cmap='gray')
        else:
            pylab.scatter(x, y)
        if annotate:
            pylab.annotate(labels[l_id], xy=(x, y), xytext=(5, 2), textcoords='offset points',
                           ha='right', va='bottom')
    pylab.show()
    
    
def plot2(projection, labels, annotate=True, color=None):
    assert projection.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15,15))  # in inches
    pylab.scatter(projection[:,0],projection[:,1], c=color, s=30)
    #pylab.colorbar()
    pylab.show()
    
def plot3(projection, labels, color):    
    pylab.figure(figsize=(15,10))  # in inches
    max_color = np.max(color)
    jet = cm = pylab.get_cmap('jet')
    cNorm  = colors.Normalize(vmin=0, vmax=max_color)
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
    
    # display color by color
    for c in list(np.unique(color)):
        col = scalarMap.to_rgba(c)
        pylab.scatter(projection[color == c,0], projection[color==c,1],c=col, s=30,
                      label=labels[c])#'color: (%4.2f,%4.2f,%4.2f)'%(col[0],col[1],col[2]))
        
    
    pylab.legend()
    pylab.show()
    

def tsne_projection(song_features):
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    return tsne.fit_transform(song_features)
    

def prepare_songs(song_features, scaler=None):
    permutation = np.random.permutation(song_features.shape[0])
    song_features = song_features[permutation,:]
    song_genres = song_features[:,14]
    # normalize features
    normalized_features = song_features[:,0:9]
    features_mean = np.mean(normalized_features, axis=0)
    features_std = np.std(normalized_features, axis=0)
    scaler = scaler or sklearn.preprocessing.StandardScaler(copy=False).fit(normalized_features)
    scaler.transform(normalized_features)    
    return song_features, song_genres, features_mean, features_std, scaler

In [None]:
song_features, song_names = load_songs(select_top_songs, count_top_songs)

In [None]:
song_features, song_genres, features_mean, features_std, features_scaler = prepare_songs(song_features)

In [None]:
featured_features = [0,1,2,3,4,5,7,8]
feature_minimum = np.min(song_features, axis=0)
feature_maximum = np.max(song_features, axis=0)


def print_features(f_vec):
    for i in range(min(len(feature_names),len(f_vec))):
        print('%s: %i%% (%f)'%(feature_names[i], (f_vec[i] - feature_minimum[i])*100/(feature_maximum[i] - 
                                                                                    feature_minimum[i]), f_vec[i]))
        
def get_core_genres(gr_song_genres, describe=True):
    # print(gr_song_genres[gr_song_genres < 0])
    gr_g_array = np.array(gr_song_genres, dtype=np.int64)
    gr_g_bins = np.bincount(gr_g_array)
    gr_g_bin_sort = np.argsort(gr_g_bins)[::-1]
    # get maximum 20 genres
    min_g_count = 1
    if len(gr_g_bin_sort) > 20:
        min_g_count = max(gr_g_bins[gr_g_bin_sort[20]], 1)
    if describe:
        for sort_id in gr_g_bin_sort:
            c = gr_g_bins[sort_id]
            if c >= min_g_count:
                print ('Genre %s count %i' % (genres[sort_id], c))

    gr_top_g = gr_g_bin_sort[gr_g_bins[gr_g_bin_sort] >= min_g_count]
    # print(gr_top_g)
    # print(len(gr_song_genres))
    # print(len(gr_song_genres[gr_song_genres>=0]))
    # print(gr_song_genres[gr_song_genres>=0])
    # print(str(gr_song_genres))
    gr_top_g_indexer = np.ndarray(len(gr_song_genres), dtype=np.bool)    
    for i, g in enumerate(gr_song_genres):
        gr_top_g_indexer[i] = g in gr_top_g and g != 0
        
    return gr_top_g_indexer
        
        
def outliers_info(features, core_features, gr_song_features, gr_song_names, genres):
    # novelty detection
    clf = sklearn.svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(core_features) # gr_song_features[gr_top_g_indexer][:,featured_features]
    y_pred_train = clf.predict(features)
    print('%% of outliers in dataset: %f%%' % (y_pred_train[y_pred_train == -1].size * 100.0 / features.shape[0]))
    y_decision_train = clf.decision_function(features)[:,0]
    y_decision_train_sort = np.argsort(y_decision_train)
    for sort_id in y_decision_train_sort[:20]:
        if y_pred_train[sort_id] == -1:
            song = gr_song_features[sort_id]
            print('song %s-%s(%i) is outlier with dist %f' % (gr_song_names[song[13]], genres[song[14]],
                                                              song[13], y_decision_train[sort_id]))
    print('most crazy outlier')
    print(print_features(gr_song_features[y_decision_train_sort[0],0:9]))

    print('')
    y_decision_train_sort = y_decision_train_sort[::-1]
    for sort_id in y_decision_train_sort[:20]:
        if y_pred_train[sort_id] == 1:
            song = gr_song_features[sort_id]
            print('song %s-%s(%i) is ingroup with dist %f' % (gr_song_names[song[13]], genres[song[14]],
                                                              song[13], y_decision_train[sort_id]))
    print('most ingrouped')
    print(print_features(gr_song_features[y_decision_train_sort[0],0:9]))
    # y_pred_top = clf.predict(gr_song_features[gr_top_g_indexer][:,0:9])
    # print(y_pred_top[y_pred_top == -1].size * 100.0 / gr_song_features[gr_top_g_indexer].shape[0])


def plot_clusters(projection, y_pred):
    y_pred = y_pred + 1
    y_pred_bc = np.bincount(y_pred)
    n_c = len(y_pred_bc)
    print('count elements in clusters %s' % str(y_pred_bc))
    components = {0: 'noise samples'}
    for c in range(1, n_c+1):
        components[c] = 'component %i' % c    
    plot3(projection, components, color=y_pred)
    
    
def fit_dpgmm(features, n_components=8):        
    # choose best model: commented out as GMM is clearly inferior
    # lowest_bic = np.infty
    # bic = []
    # n_components_range = range(1, 10)
    #cv_types = ['spherical', 'tied', 'diag', 'full']
    #for cv_type in cv_types:
    #    for n_components in n_components_range:
    #        # Fit a mixture of Gaussians with EM
    #        gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type, n_iter=1000)
    #        gmm.fit(features)
    #        bic.append(gmm.bic(features))
    #        if bic[-1] < lowest_bic:
    #            lowest_bic = bic[-1]
    #            best_gmm = gmm
    # y_pred = best_gmm.predict(features)
    
    # Fit a Dirichlet process mixture 
    dpgmm = sklearn.mixture.DPGMM(n_components=n_components, covariance_type='tied', n_iter=1000, verbose=0) #, alpha=5
    dpgmm.fit(features)
    y_pred = dpgmm.predict(features)
    return y_pred
    
    
def fit_DBSCAN(features, eps=0.5, min_samples=20):
    cls = sklearn.cluster.DBSCAN(eps=eps, min_samples=min_samples)
    cls.fit(features)
    y_pred = cls.labels_
    # Number of clusters in labels, ignoring noise if present.
    # n_clusters_ = len(set(y_pred)) - (1 if -1 in y_pred else 0)
    # plot_clusters(features, y_pred, n_clusters_)
    return y_pred
    

def fit_affinity_prop(features):
    cls = sklearn.cluster.AffinityPropagation(preference=-60)
    cls.fit(features)
    y_pred = cls.labels_
    # Number of clusters in labels, ignoring noise if present.
    # cluster_centers_indices = cls.cluster_centers_indices_
    # n_clusters_ = len(cluster_centers_indices)
    # plot_clusters(features, y_pred, n_clusters_)
    return y_pred
    

def kmeans_gap_statistics(df, max_k):
    # https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/
    # https://github.com/Zelazny7/gap-statistic/blob/master/gap.py
    def get_rand_data(col):
        rng = col.max() - col.min()
        return np.random.random_sample(len(col))*rng + col.min()
    
    def iter_kmeans(refs, n_clusters):
        vals = np.zeros(len(refs))
        k = sklearn.cluster.KMeans(n_clusters=n_clusters)
        for i in range(len(refs)):            
            k.fit(refs[i])
            # print('Ref k: %s' % k.get_params()['n_clusters'])
            vals[i] = k.inertia_
        return vals
    
    gaps = np.zeros(max_k)
    sks = np.zeros(max_k)
    B = 10 # number of test data sets
    refs = [np.apply_along_axis(get_rand_data, 0, df) for b in range(B)]
    print(len(refs))
    for k in range(max_k):
        km_act = sklearn.cluster.KMeans(n_clusters=k+1)
        km_act.fit(df)
        logWk = np.log(km_act.inertia_)        
        logWkbs = np.log(iter_kmeans(refs, n_clusters=k+1))
        logWkb = sum(logWkbs)/B
        gaps[k] = logWkb - logWk        
        sks[k] = np.sqrt(sum((logWkbs-logWkb)**2)/B)*np.sqrt(1+1/B)
        print('logWkb: %f   logWk: %f  gap: %f sks: %f delta: %f' %
              ( logWkb, logWk, gaps[k], sks[k], gaps[k]-sks[k]))            
    
    for n_clusters_ in range(max_k-1):
        if gaps[n_clusters_] > gaps[n_clusters_+1] - sks[n_clusters_+1]:
            break
    return n_clusters_+1


def kmeans_f_of_K(features, max_k):
    def f_of_K(X, ck, Skm1=0):    
        # http://www.ee.columbia.edu/~dpwe/papers/PhamDN05-kmeans.pdf
        # https://datasciencelab.wordpress.com/2014/01/21/selection-of-k-in-k-means-clustering-reloaded/
        # http://www.stat.cmu.edu/~ryantibs/datamining/lectures/06-clus3.pdf
        Nd = X.shape[1]
        # print(Nd)
        a = lambda k, Nd: 1 - 3/(4*Nd) if k == 2 else a(k-1, Nd) + (1-a(k-1, Nd))/6
        km = sklearn.cluster.KMeans(n_clusters=ck)
        km.fit(X)
        #dists = km.transform(X)
        #Sk = 0
        #for k in range(ck):
        #    Sk += sum(dists[km.labels_==k, k])
        clusters = km.labels_
        mu = km.cluster_centers_
        # print(mu)
        Sk = sum([np.linalg.norm(mu[i]-c)**2 for i in range(ck) for c in X[clusters==i]])
        if ck == 1:
            fs = 1
        elif Skm1 == 0:
            fs = 1
        else:
            fs = Sk/(a(ck,Nd)*Skm1)
        return fs, Sk 

    fs = np.zeros(max_k)
    fs[0], Sk = f_of_K(features, 1)
    for k in range(2, max_k+1):
        fs[k-1], Sk = f_of_K(features, k, Skm1=Sk)
        print('clusters %i f_of_K %f' % (k, fs[k-1]))
    # print(fs)
    return np.argmin(fs) + 1


def kmeans_silhouette(features, max_k):
    best_n_clusters = None
    best_score = -1 # worst possible

    for n_clusters_ in range(2, max_k+1):
        cls = sklearn.cluster.KMeans(n_clusters=n_clusters_)
        cls.fit(features)
        y_pred = cls.labels_
        score = sklearn.metrics.silhouette_score(features, y_pred)
        print('%i clusters score %f' % (n_clusters_, score))
        if score > best_score:
            best_score = score
            best_n_clusters = n_clusters_    

    return best_n_clusters


def fit_kmeans(features, n_clusters_):
    cls = sklearn.cluster.KMeans(n_clusters=n_clusters_)
    cls.fit(features)
    y_pred = cls.labels_
    # plot_clusters(features, y_pred, n_clusters_)
    return y_pred
    
    
def describe_songs(gr_song_features, gr_song_names, gr_song_genres, distance_mod=[1,1,1,1,1,1,1,1]):    
    features = gr_song_features[:,featured_features]
    weighted_features = features*np.asarray(distance_mod)
    printable_features = gr_song_features[:,:9]
    
    gr_top_g_indexer = get_core_genres(gr_song_genres)
    core_features = features[gr_top_g_indexer]
    projection = tsne_projection(core_features)
    plot3(projection, genres, color=gr_song_genres[gr_top_g_indexer])
    print('')
    print('mean features for all songs in group')
    gr_song_mean = np.mean(printable_features, axis=0)
    # print(gr_song_mean)
    print_features(gr_song_mean)
    print('')
    print('mean features for all songs in top genres')
    gr_song_mean = np.mean(printable_features[gr_top_g_indexer], axis=0)
    print_features(gr_song_mean)
    print('')
    print('outliers information')
    outliers_info(features, core_features, gr_song_features, gr_song_names, genres)
    print('')
    print('computing TSNE projection')
    projection = tsne_projection(weighted_features)
    print('')
    print('clustering with DPGMM')
    y_pred = fit_dpgmm(weighted_features, n_components=10)
    plot_clusters(projection, y_pred)
    print('')
    print('clustering with DBSCAN')
    y_pred = fit_DBSCAN(weighted_features)
    plot_clusters(projection, y_pred)
    print('')
    print('clustering with affinity propagation')
    y_pred = fit_affinity_prop(weighted_features)
    plot_clusters(projection, y_pred)
    max_k = 10
    print('')
    print('detection no cluster with kmeans_gap_statistics')
    nc = kmeans_gap_statistics(weighted_features, max_k)
    print('got %i clusters' % nc)
    print('')
    print('detection no cluster with kmeans_silhouette')
    nc = kmeans_silhouette(weighted_features, max_k)
    print('got %i clusters' % nc)
    print('')
    print('detection no cluster with kmeans_f_of_K')
    nc = kmeans_f_of_K(weighted_features, max_k)
    print('got %i clusters -> used for plotting' % nc)
    y_pred = fit_kmeans(weighted_features, nc)
    plot_clusters(projection, y_pred)
    print('')
    print('')
    
    
def describe_song_group(group_name, group_id, distance_mod):
    print('GROUP %s(%i) INFO' % (group_name, group_id))
    print('--------------------------------------')
    gr_song_features, gr_song_names = load_songs(select_song_group(group_id), count_song_group(group_id))
    gr_song_features, gr_song_genres, _, _, _ = prepare_songs(gr_song_features, features_scaler)
    describe_songs(gr_song_features, gr_song_names, gr_song_genres, distance_mod)
    
    
def describe_song_genre(genre_name, distance_mod):
    genre_id = genres_name[genre_name]
    print('GENRE %s(%i) INFO' % (genre_name, genre_id))
    print('--------------------------------------')
    gr_song_features, gr_song_names = load_songs(select_song_in_genre(genre_id), count_song_in_genre(genre_id))
    gr_song_features, gr_song_genres, _, _, _ = prepare_songs(gr_song_features, features_scaler)
    describe_songs(gr_song_features, gr_song_names, gr_song_genres, distance_mod)    
    
    
def describe_song_group_type(name, group_type, distance_mod):
    print('GROUP TYPE %s(%i) INFO' % (name, group_type))
    print('--------------------------------------')
    gr_song_features, gr_song_names = load_songs(select_song_group_type(group_type), count_song_group_type(group_type))
    gr_song_features, gr_song_genres, _, _, _ = prepare_songs(gr_song_features, features_scaler)
    #for i in range(gr_song_features.shape[1]):
    #    for j in range(gr_song_features.shape[0]):
    #        if np.isnan(gr_song_features[j,i]):
    #            print('nan detected x %i y %i' % (j,i))
            
    #for i in range(gr_song_features.shape[1]):
    #    print('%i: %s' % (i, str(gr_song_features[np.isnan(gr_song_features[:,i]),i])))
    describe_songs(gr_song_features, gr_song_names, gr_song_genres, distance_mod)

In [None]:
# db.session.rollback()
# describe_song_group('sarnecka library', 66, [3,0.3,1,0.5,1,1,1,5])
# describe_song_group_type('sleep playlists', 1, [3,0.3,1,0.5,1,1,1,5])
# describe_song_genre('warm drone', [3,0.3,1,0.5,1,1,1,5])

In [582]:
from api import user_helper
from spotify import spotify_helper
import importlib
importlib.reload(spotify_helper)


spotify_helper.refresh_token_on_expired = True

feature_minimum = np.min(song_features, axis=0)
feature_maximum = np.max(song_features, axis=0)

def f_val_prc(i, f):
    return (f[i] - feature_minimum[i])/(feature_maximum[i] - feature_minimum[i])


def is_sleep_song(features):
    energy_val = f_val_prc(0, features)
    return energy_val < 0.6


def is_wakeup_song(features):
    energy_val = f_val_prc(0, features)
    return energy_val > 0.6
    

def sleepines(features):
    #low energy(0), tempo(2), valence(7), danceability(8), loudness(6)
    #high acousticness(4)    
    
    return 1 - f_val_prc(0, features) + 1 - f_val_prc(2, features)  + 1 - f_val_prc(8, features)#\ #feature_maximum[2] - features[2]
            # feature_maximum[8] - features[8] #+ feature_maximum[6] - features[6] #+ features[4] + feature_maximum[7] - features[7]

    
def wakefulness(features):
    #low energy(0), tempo(2), valence(7), danceability(8), loudness(6)
    #high acousticness(4)
    energy = f_val_prc(0, features)
    danceability = f_val_prc(8, features)
    valence = f_val_prc(7, features)
    return max(energy+valence, danceability+valence) #features[0] + features[8] + features[6] + features[7] #+ features[4] + feature_maximum[7] - features[7]


def song2tracks(song_id):
    #s = sqlselect([SongTracks.SpotifyId]).where(sqltext("SongId=:sid"))
    s = sqltext('SELECT SpotifyId FROM SongTracks WHERE SongId=%i' % song_id)
    #print(s) # {'sid': song_id}
    track_ids = []
    for row in db.session.execute(s).fetchall():
        #print('%i > %s' % (song_id, row[0]))
        track_ids.append((song_id, row[0]))
    return track_ids


def save_playlist(spotify_user_id, playlist_name, song_ids):
    user = user_helper.load_user(spotify_user_id)
    song_mappings = []
    for song_id in song_ids:
        song_mappings.extend(song2tracks(song_id))
    #print(song_mappings)
    spotify_tracks, added_songs = spotify_helper.resolve_tracks_for_user(user, song_mappings)
    tracks = [t['uri'] for t in spotify_tracks]
    sp_user_pl = spotify_helper.get_or_create_playlist_by_name(user, playlist_name)        
    # print(added_songs)
    spotify_helper.set_playlist_content(user, sp_user_pl['id'], tracks)
    return added_songs


def outliers_to_playlists(user, name, features, core_features, gr_song_features, gr_song_names, genres):
    # novelty detection
    clf = sklearn.svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(core_features)
    y_pred_train = clf.predict(features)
    print('%% of outliers in dataset: %f%%' % (y_pred_train[y_pred_train == -1].size * 100.0 / features.shape[0]))
    y_decision_train = clf.decision_function(features)[:,0]
    y_decision_train_sort = np.argsort(y_decision_train)
    outliers = []
    for sort_id in y_decision_train_sort[:20]:
        if y_pred_train[sort_id] == -1:
            song = gr_song_features[sort_id]
            outliers.append(song[13])
            print('song %s-%s(%i) is outlier with dist %f' % (gr_song_names[song[13]], genres[song[14]],
                                                              song[13], y_decision_train[sort_id]))
    #save_playlist('rudolfix-us', name + ' outliers', outliers)

    print('')
    ingroup = []
    y_decision_train_sort = y_decision_train_sort[::-1]
    for sort_id in y_decision_train_sort[:20]:
        if y_pred_train[sort_id] == 1:
            song = gr_song_features[sort_id]
            ingroup.append(song[13])
            print('song %s-%s(%i) is ingroup with dist %f' % (gr_song_names[song[13]], genres[song[14]],
                                                              song[13], y_decision_train[sort_id]))            
    # 1130122659
    # 'rudolfix-us'
    if len(ingroup) > 0:
        save_playlist(user, name + ' ingroup', ingroup)
        
        
def top_songs_with_affinity(gr_song_features, limit, f_affinity):
    printable_features = gr_song_features[:,:9]
    # is_aff_songs = np.apply_along_axis(f_affinity_treshold, 1, printable_features)    
    # songs_with_affinity = printable_features[is_aff_songs]
    # print(songs_with_affinity.shape)
    features_affinity = np.zeros(len(printable_features), dtype=np.float32)
    for s_id, f in enumerate(printable_features):
        features_affinity[s_id] = f_affinity(f)
    features_affinity_sorted_idx = np.argsort(features_affinity)
    features_affinity_sorted_idx_rev = features_affinity_sorted_idx[::-1]
    return features_affinity_sorted_idx_rev[:limit]
        
        

def compute_genres_for_songs(gr_song_features, gr_song_genres, artists_genres, genre_affinity, affinity_threshold=0.45, 
                            genre_prevalence_threshold=0.02, genre_prevalence_count_threshold=10000):
    artists_ids = gr_song_features[:,15] #np.unique(
    genres_accu = []
    for aid in artists_ids:
        if aid in artists_genres:
            genres_accu.extend(artists_genres[aid])
    genres_count = np.bincount(genres_accu)
    
    # lambda f: (sleepines(f)-min_sleepiness)/(max_sleepiness-min_sleepiness)
    # genre_sleepiness = np.apply_along_axis(sleepines, 1, genre_features)
    #print(genres_count.shape)
    #print(genre_sleepiness.shape)
    #genres_count_weighted = np.multiply(genres_count,genre_sleepiness[:len(genres_count)])
    tot_in_sleep = sum(genres_count[genre_affinity[:len(genres_count)]>affinity_threshold])
    # print(tot_in_sleep)
    #print(genres_count_weighted.shape)
    top_sleep_genres = []
    g_bin_sort = np.argsort(genres_count)[::-1]
    for i in g_bin_sort:
        cnt = genres_count[i]
        prevalence = cnt / tot_in_sleep
        # mind the OR below
        if  genre_affinity[i] > affinity_threshold and \
                (cnt > genre_prevalence_count_threshold or prevalence > genre_prevalence_threshold):
            print('%s(%i): %i(%f%%) affinity:%f' % (genres[i], i, cnt, 100.0*prevalence, genre_affinity[i]))
            top_sleep_genres.append((i, prevalence, genre_sleepiness[i]))
            # print('')
            # print_features(genre_features[i])
            #print('---------------------')
    
    return top_sleep_genres

def describe_clusters(name, gr_song_features, y_pred, gr_song_names, gr_song_genres):
    y_pred = y_pred + 1
    print('%s %s' % (name, np.bincount(y_pred)))
    print('--------------------------')
    for i, c in enumerate(np.bincount(y_pred)):
        if c>0:
            print('Cluster %i' % i)
            print('---------------------')
            describe_songs(gr_song_features[y_pred==i], gr_song_names, gr_song_genres[y_pred==i])
            
def clusters_to_playlists(user, name, gr_song_features, y_pred, gr_song_names, gr_song_genres):
    y_pred = y_pred + 1
    print('%s %s' % (name, np.bincount(y_pred)))
    print('--------------------------')
    for i, c in enumerate(np.bincount(y_pred)):
        if c>0:
            print('Cluster %i' % i)
            print('---------------------')
            features = gr_song_features[y_pred==i][:,featured_features]
            gr_top_g_indexer = get_core_genres(gr_song_genres[y_pred==i])
            core_features = features[gr_top_g_indexer]
            outliers_to_playlists(user, '%s cluster %i - ' % (name, i), features, core_features, gr_song_features[y_pred==i],\
                                    gr_song_names, genres)
            
def extract_sleepy_clusters(genre_id, dist_mod, f_affinity, f_has_affinity, show_clusters=False, song_limit=5000,
                            preserve_clusters_size=0.2, min_cluster_affinity_level=0):
    gr_song_features, gr_song_names = load_songs(select_song_in_genre(genre_id, song_limit), count_song_in_genre(genre_id, song_limit))
    gr_song_features, gr_song_genres, _, _, _ = prepare_songs(gr_song_features, features_scaler)
    # remove songs without affinity (sleepy, wakeful etc)
    gr_songs_affinity = np.apply_along_axis(f_has_affinity, 1, gr_song_features)
    gr_song_features = gr_song_features[gr_songs_affinity]
    gr_song_genres = gr_song_genres[gr_songs_affinity]
    features = gr_song_features[:,featured_features]
    printable_features = gr_song_features[:,:9]
    weighted_features = features*np.asarray(dist_mod)
    # get clusters
    y_pred = fit_dpgmm(weighted_features, 12)
    print('cluster sizes: %s' % str(np.bincount(y_pred)))
    if show_clusters:
        print('computing TSNE projection')
        projection = tsne_projection(weighted_features)
        plot_clusters(projection, y_pred)
    #find clusters > 30% of all elements    
    significant_clusters = []
    cluster_sizes = np.bincount(y_pred)
    for i, c in  enumerate(cluster_sizes):
        if c / len(y_pred) > preserve_clusters_size:
            significant_clusters.append([i,c])
    if len(significant_clusters) == 0:
        #add biggest cluster
        max_c_idx = np.argmax(cluster_sizes)
        significant_clusters.append([max_c_idx,cluster_sizes[max_c_idx]])
    print(significant_clusters)
    #remove outliers
    for cluster in significant_clusters:        
        cluster_songs_affinity = []
        cluster_songs = []
        cluster_id = cluster[0]        
        cluster_features = weighted_features[y_pred==cluster_id]
        gr_cluster_features = gr_song_features[y_pred==cluster_id]
        clf = sklearn.svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
        clf.fit(cluster_features)
        novelty_pred = clf.predict(cluster_features)
        print('%% of outliers in dataset: %f%%' % (novelty_pred[novelty_pred == -1].size * 100.0 / cluster_features.shape[0]))
        novelty_decision = clf.decision_function(cluster_features)[:,0]
        novelty_decision_sort = np.argsort(novelty_decision)[::-1]    
        for sort_id in novelty_decision_sort:
            if novelty_pred[sort_id] == 1:
                song = gr_cluster_features[sort_id]                
                cluster_songs_affinity.append(f_affinity(song))
                cluster_songs.append(song)
        cluster.append(np.array(cluster_songs))
        cluster.append(np.mean(cluster_songs_affinity))
    #print(significant_clusters)
    #return cluster list (cluster_id, size, songs, sleepiness)
    return [c for c in significant_clusters if c[3] > min_cluster_affinity_level] #leave only sleepy clusters


def compute_genre_features():
    genre_features = np.zeros((len(genres), 9), dtype=np.float32)
    genre_sleepiness = np.zeros((len(genres)), dtype=np.float32)
    genre_wakefulness = np.zeros((len(genres)), dtype=np.float32)
    for genre_id in genres:
        genre_songs = song_features[song_genres==genre_id][:,:9]        
        if genre_songs.shape[0] > 0:
            genre_songs_sleepiness = np.apply_along_axis(lambda f: 1 if is_sleep_song(f) else 0, 1, genre_songs)
            genre_songs_wakefulness = np.apply_along_axis(lambda f: 1 if is_wakeup_song(f) else 0, 1, genre_songs)
            mean = np.mean(genre_songs, axis=0)
            genre_features[genre_id] = mean
            genre_sleepiness[genre_id] = np.mean(genre_songs_sleepiness)
            genre_wakefulness[genre_id] = np.mean(genre_songs_wakefulness)
            
    return genre_features, genre_sleepiness, genre_wakefulness


def similar_genres_from_similar_artists(artists_genres, connected_metric_f=len, use_foreign_sim=True):
    # init similarity with genre co-occurence
    genre_similarity = {}
    
    def proc_l2l(g_l, s_g_l):
        for gid in g_l:
            if gid not in genre_similarity:
                g_dict = {}
                genre_similarity[gid] = g_dict
            else:
                g_dict = genre_similarity[gid]
            for s_gid in s_g_l:
                if s_gid != gid:
                    if s_gid not in g_dict:
                        g_dict[s_gid] = 1
                    else:
                        g_dict[s_gid] += 1
                        
    for g_l in artists_genres.values():
        if len(g_l) == 1:
            continue
        # all genres on the list co-occur
        proc_l2l(g_l, g_l)
                        
    
    if use_foreign_sim:
        txt = 'SELECT ArtistId, SimilarArtistId, Dist FROM SimilarArtists ORDER BY ArtistId, Dist'
        s = sqltext(txt)    
        rows = db.session.execute(s).fetchall()
        rc = 0
        for row in rows:
            if row[0] in artists_genres and row[1] in artists_genres:
                proc_l2l(artists_genres[row[0]], artists_genres[row[1]])
            if rc % 100000 == 0:
                print(rc)
            rc += 1            
            # rows = db.session.execute(s).fetchmany()
                        
    #find maximally connected genre
    max_g = max([(i[0], connected_metric_f(i[1].values())) for i in genre_similarity.items()], key=itemgetter(1))
    #print(max_g)
    #print(genres[max_g[0]])
    #normalize weights
    for gid in genre_similarity:
        g_dict = genre_similarity[gid]
        for s_gid in list(g_dict.keys()):
            if g_dict[s_gid] > 3:
                g_dict[s_gid] = 1 - connected_metric_f([g_dict[s_gid]]) / max_g[1]
            else:
                g_dict.pop(s_gid)
    
    return genre_similarity    


# finds closest vector to ref_song among songs using feature indexes 'distance_features', euclidean distance
def find_closest_song(ref_song, songs, distance_features, randlimit=5):
    def ec_dist(song):
        # print(zip(list(ref_song[distance_features]), list(song[distance_features])))
        return math.sqrt(functools.reduce(
                lambda y,x: y + (x[0]-x[1])**2, zip(ref_song[distance_features], song[distance_features]), 0))
    distances = np.apply_along_axis(ec_dist,1,songs)
    #print(distances[:100])
    min_dis_idx =  np.argsort(distances)[:randlimit][random.randint(0,min(randlimit-1,len(distances)-1))]
    #print(distances[min_dis_idx])
    return min_dis_idx

def compute_genre_similarity_graph(genre_similarity):
    G_G = nx.Graph()
    # similar_genres = load_similar_genres() # poor and not fully connected graph
    # similar_genres = similar_genres_from_similar_artists(artists_genres, connected_metric_f=lambda x: math.log(sum(x))
    for gid, simgenres in genre_similarity.items():
        for simg in simgenres.items():
            G_G.add_edge(gid, simg[0], weight=simg[1])
    return G_G


def display_graph(G, with_labels=False):
    pylab.figure(1, figsize=(100, 100))
    # layout graphs with positions using graphviz neato
    pos = graphviz_layout(G_G, prog="neato")
    nx.draw(G_G, pos,
                 node_size=40,
                 vmin=0.0,
                 vmax=1.0,
                 with_labels=with_labels)


def k_shortest_paths(G, source, target, k, weight=None):
    return list(islice(nx.shortest_simple_paths(G, source, target, weight=weight), k))


def edges_path_iter(G, path, data=False):
    for i in range(len(path)-1):
        n1 = path[i]
        n2 = path[i+1]
        if data is not False:
            yield (n1, n2, G.edge[n1][n2][data])
        else:
            yield (n1, n2, G.edge[n1][n2])



def find_closest_genre(G, gid, possible_genres):
    shortest_paths = []
    for possible_gid in possible_genres:
        try:
            for path in k_shortest_paths(G, gid, possible_gid, 1, weight='weight'):
                w = functools.reduce(lambda w, edge: w + edge[2], edges_path_iter(G, path, data='weight'), 0 )
                shortest_paths.append((path, w))                
                #print('%s:%f' % ([genres[g]for g in path], w))
        except nx.NetworkXNoPath:
            pass
        
    if not shortest_paths:
        return None
    return sorted(shortest_paths, key=itemgetter(1))[0][0][-1]


def fing_closest_genre_by_acoustics(ref_genre, genre_features, distance_features):
    return find_closest_song(ref_genre, genre_features, distance_features)

In [508]:
# compute genre-wide stats
db.session.rollback()
artists_genres = load_artist_genres()
genre_features, genre_sleepiness, genre_wakefulness = compute_genre_features()
# do not use foreign-similarity. artist self similarity (genre occurence) produces fully connected graph
# except czech pop ;>
genre_similarity = similar_genres_from_similar_artists(artists_genres, connected_metric_f=lambda x: sum(x),
                                                    use_foreign_sim=False)
G_genre_sim = compute_genre_similarity_graph(genre_similarity)


In [None]:
db.session.rollback()
#describe_song_genre('warm drone', [3,0.3,1,0.5,1,1,1,5])
# genre_id = genres_name['warm drone']
# gr_song_features, gr_song_names = load_songs(select_song_in_genre(genre_id, 5000), count_song_in_genre(genre_id, 5000))
#gr_song_features, gr_song_names = load_songs(select_song_group_type(1), count_song_group_type(1))
# gr_song_features = gr_song_features[gr_song_features[:,14]>0]
gr_song_features, gr_song_names = load_songs(select_song_group(66), count_song_group(66))
gr_song_features, gr_song_genres, _, _, _ = prepare_songs(gr_song_features, features_scaler)
# apply genre sleepiness
    #else:
    #    print('genre %s %i not present in dataset' % (genres[genre_id], genre_id))
# genre_sleepiness = np.zeros((len(genres)+1, 1), dtype=np.float32)
#max_sleepiness = sleepines(feature_maximum[:9])
#min_sleepiness = sleepines(feature_minimum[:9])

#gr_g_array = np.array(gr_song_genres, dtype=np.int64)
#gr_g_bins = np.bincount(gr_g_array)
#gr_g_bin_sort = np.argsort(gr_g_bins)[::-1]
#for i, cnt in enumerate(gr_g_bins):
#    if cnt > 0:
#        print('%s: %i' % (genres[i], cnt))
#print(repr(gr_g_bins))
#print(len(gr_g_bins))
#print(gr_g_bin_sort)

In [565]:
db.session.rollback()

    
most_n_indexer = top_songs_with_affinity(gr_song_features, 100, sleepines)
most_song_features = gr_song_features[most_n_indexer]
#print(gr_song_features.shape)
most_song_genres = gr_song_genres[most_n_indexer]
most_features = most_song_features[:,featured_features]
most_printable_features = most_song_features[:,:9]
top_sleep_genres = compute_genres_for_songs(most_song_features, most_song_genres, artists_genres, genre_sleepiness)


print('')
most_n_indexer = top_songs_with_affinity(gr_song_features, 100, wakefulness)
most_song_features = gr_song_features[most_n_indexer]
#print(gr_song_features.shape)
most_song_genres = gr_song_genres[most_n_indexer]
most_features = most_song_features[:,featured_features]
most_printable_features = most_song_features[:,:9]

top_wake_genres = compute_genres_for_songs(most_song_features, most_song_genres, artists_genres, genre_wakefulness,
                                          affinity_threshold=0.65, genre_prevalence_threshold=0.02)
print('')

genre_prevalence_threshold = min(10/gr_song_features.shape[0], 0.01) # 10 or more songs or 1%
top_genres = compute_genres_for_songs(gr_song_features, gr_song_genres, artists_genres, genre_sleepiness, affinity_threshold=0,
                                     genre_prevalence_threshold=0.01, genre_prevalence_count_threshold=10)

BABUBA()

# y_pred = fit_kmeans(genres_one_hot, 3)
# print(gr_song_features.shape)
# print(y_pred.shape)
# print(gr_song_genres.shape)
# y_pred = fit_kmeans(features, 3)
# describe_clusters('combined', gr_song_features, y_pred, gr_song_names, gr_song_genres)
# y_pred = fit_kmeans(features, 3)
#spotify_account = '1130122659'
spotify_account = 'rudolfix-us'
dist_mod_sleep = [3,0.3,1,0.5,1,1,1,5]
dist_mod_library = [3,0.3,1,0.5,2,2,1,3]
for sleep_genre in top_sleep_genres:
    genre_id,_,_ = sleep_genre
    sleep_clusters = extract_sleepy_clusters(genre_id, dist_mod_sleep, show_clusters=False)
    print('genre %s: (%s)' % (genres[genre_id], str(sleep_genre)))
    print([(c1,c2,len(c3),c4) for c1, c2, c3, c4 in sleep_clusters])
    for s_c in sleep_clusters:
        #print(s_c[2][:100][13])
        pl_name = 'sleep %s c %i' % (genres[genre_id], s_c[0])
        print(pl_name)
        save_playlist(spotify_account, pl_name, s_c[2][:100,13])
        print('------')
#weighted_features = features*np.asarray(dist_mod_sleep)




#clusters_to_playlists(spotify_account, 'acoustic features', gr_song_features, y_pred, gr_song_names, gr_song_genres)
#describe_clusters('acoustic features', gr_song_features, y_pred, gr_song_names, gr_song_genres)
#y_pred = fit_kmeans(genres_one_hot, 3)
#describe_clusters('genres_one_hot', gr_song_features, y_pred, gr_song_names, gr_song_genres)

#print(song_features.shape)
#print(features.shape)
#spotify_account = '1130122659'
features_sleepiness = np.zeros(len(printable_features), dtype=np.float32)
for s_id, f in enumerate(printable_features):
    features_sleepiness[s_id] = sleepines(f)
features_sleepiness_sorted_idx = np.argsort(features_sleepiness)
features_sleepiness_sorted_idx_rev = features_sleepiness_sorted_idx[::-1]

ingroup = []
for s in gr_song_features[features_sleepiness_sorted_idx_rev[:40],:]:
    print(gr_song_names[s[13]] + '(%i)' % s[13])
    #print_features(s)
    # print()
    ingroup.append(s[13])
#save_playlist(spotify_account, 'most sleepy', ingroup)


features_wakefullness = np.zeros(len(printable_features), dtype=np.float32)
for s_id, f in enumerate(printable_features):
    features_wakefullness[s_id] = wakefullness(f)
features_wakefullness_sorted_idx = np.argsort(features_wakefullness)
features_wakefullness_sorted_idx_rev = features_wakefullness_sorted_idx[::-1]
print('---------------------')
ingroup = []
for s in gr_song_features[features_wakefullness_sorted_idx_rev[:40],:]:
    print(gr_song_names[s[13]] + '(%i)' % s[13])
    #print_features(s)
    # print()
    ingroup.append(s[13])

#save_playlist(spotify_account, 'least sleepy', ingroup)

ambient idm(42): 16(8.888889%) affinity:0.464358
glitch(620): 14(7.777778%) affinity:0.700143
modern performance(893): 12(6.666667%) affinity:0.977929
minimal(879): 12(6.666667%) affinity:0.921856
dream pop(471): 12(6.666667%) affinity:0.613687
ethereal wave(524): 12(6.666667%) affinity:0.661905
classical christmas(260): 9(5.000000%) affinity:0.990141
choral(218): 9(5.000000%) affinity:1.000000
dub techno(481): 9(5.000000%) affinity:0.611684
warm drone(1358): 8(4.444444%) affinity:0.893130
fourth world(566): 7(3.888889%) affinity:0.902247
ambient(39): 7(3.888889%) affinity:0.923516
dark ambient(329): 6(3.333333%) affinity:0.813411
lowercase(828): 6(3.333333%) affinity:0.866667
abstract idm(5): 6(3.333333%) affinity:0.471891
hauntology(668): 6(3.333333%) affinity:0.643939
throat singing(1271): 5(2.777778%) affinity:0.888889

progressive psytrance(1056): 11(5.583756%) affinity:0.859736
breakcore(139): 10(5.076142%) affinity:0.877680
mathcore(847): 9(4.568528%) affinity:0.924863
metalcore

NameError: name 'BABUBA' is not defined

In [590]:
def generate_wakeup_playlist(wake_gid, wake_song_features, wake_song_genres, gr_song_features, gr_song_genres, top_sleep_genres,
                            top_genres):
    # find #start genre song and then #end genre with speechiness, acousticness and instru as close as possible
    # morph energy, temp, dance, valence linearly    
    # wake_song_features has wakeful songs filtered for current user
    init_song_idx = list(most_song_genres).index(wake_gid) # find the most wakeful song of given gid
    # todo: replace first song with other metrics like general popularity, when user added etc.
    # todo: extend with song from similar artists
    init_song = most_song_features[init_song_idx]
    # find matchin end genre from sleep genres by speechiness, acousticness and instru
    sound_similarity = [3,4,5]
    energy_similarity = [0,2,8]
    sound_energy_similarity = [0,3,4,5]
    possible_sleep_genres = list(set([g[0] for g in top_sleep_genres]).intersection(list(np.unique(gr_song_genres.astype(int)))))
    end_gid = fing_closest_genre_by_acoustics(genre_features[847], genre_features[np.asarray(possible_sleep_genres, dtype=np.int32)],
                                              sound_similarity)
    sleepy_clusters = extract_sleepy_clusters(end_gid, dist_mod_sleep, sleepines, is_sleep_song, show_clusters=False)
    end_songs = np.vstack(c[2] for c in sleepy_clusters)
    end_song_idx = find_closest_song(init_song, end_songs, sound_similarity,1)
    end_song = end_songs[end_song_idx]
    # use genre similarity graph to connect wake_gid to end_gid
    genre_path=[]
    try:
        for path in k_shortest_paths(G_genre_sim, wake_gid, end_gid, 1, weight='weight'):
            genre_path = path
            print(path)
    except nx.NetworkXNoPath:
        raise # todo: handle NetworkXNoPath somehow        
    # morph song into end song in genre_path steps
    genre_path = [g for g in genre_path for _ in (0, 1)]
    song_diff = (end_song - init_song) / (len(genre_path) - 1)
    # var below will be morphed
    song_iter = np.copy(end_song)
    wakeup_playlist = [init_song[13]]
    possible_genres = list(set([g[0] for g in top_genres]).intersection(list(np.unique(gr_song_genres.astype(int)))))
    
    def replace_closest_gid(gid):
        # print(possible_genres)
        if gid not in possible_genres:
            c_gid = find_closest_genre(G_genre_sim, gid, possible_genres)
            if c_gid is not None:
                print('----------replaced with %s(%i)' % (genres[c_gid], c_gid))
                return c_gid

        return gid

    for i in range(1, len(genre_path)-1):
        song_iter += song_diff
        print('processing %s' % genres[genre_path[i]])
        gid = genre_path[i]
        # choose afinity func
        if i <= len(genre_path) // 3:
            f_affinity = wakefulness
            f_has_affinity = is_wakeup_song
            dist_index = energy_similarity
            gid = replace_closest_gid(gid)
            # todo: below cerain number of songs extend by artist or to full genre
            #c_songs = gr_song_features[gr_song_genres==gid]
            clusters = extract_sleepy_clusters(gid, dist_mod_sleep, f_affinity, f_has_affinity, show_clusters=False)
            c_songs = np.vstack(c[2] for c in clusters)
            print(len(c_songs))
        elif i <= 2*len(genre_path) // 3:
            f_affinity = lambda x: 1 #identity
            f_has_affinity = lambda x: True        
            dist_index = energy_similarity
            gid = replace_closest_gid(gid)
            # todo: below cerain number of songs extend by artist or to full genre
            #c_songs = gr_song_features[gr_song_genres==gid]
            clusters = extract_sleepy_clusters(gid, dist_mod_sleep, f_affinity, f_has_affinity, show_clusters=False)
            c_songs = np.vstack(c[2] for c in clusters)
            print(len(c_songs))
        else:
            f_affinity = sleepines
            f_has_affinity = is_sleep_song
            dist_index = sound_energy_similarity
            # find any song from cluster
            clusters = extract_sleepy_clusters(gid, dist_mod_sleep, f_affinity, f_has_affinity, show_clusters=False)
            c_songs = np.vstack(c[2] for c in clusters)

        c_song_idx = find_closest_song(song_iter, c_songs,dist_index)
        print('----by %s' % get_artists_name(c_songs[c_song_idx][15]))
        wakeup_playlist.append(c_songs[c_song_idx][13])
    wakeup_playlist.append(end_song[13])
    
    return wakeup_playlist

wakeup_playlist = generate_wakeup_playlist(1134, most_song_features,
                                           most_song_genres, gr_song_features, gr_song_genres, top_sleep_genres, top_genres)
#print(wakeup_playlist)

cluster sizes: [   0    0  104 1116    0  208    0    0  224    0    0  384]
[[3, 1116]]
% of outliers in dataset: 9.946237%
[1134, 1086, 1308, 1365, 3]
processing screamo
cluster sizes: [1618    0    0 1260    0  266    0    0    0    0 1162]
[[0, 1618], [3, 1260], [10, 1162]]
% of outliers in dataset: 10.259580%
% of outliers in dataset: 9.682540%
% of outliers in dataset: 9.982788%
3636
----by Before Their Eyes
processing rap rock
----------replaced with metalcore(869)
cluster sizes: [2480  341    0  129    0    0    0  844  187    0  526]
[[0, 2480]]
% of outliers in dataset: 10.040323%
2231
----by Converge
processing rap rock
----------replaced with metalcore(869)
cluster sizes: [2566    0    0  259    0  556  176    0    0    0    0  950]
[[0, 2566], [11, 950]]
% of outliers in dataset: 10.054560%
% of outliers in dataset: 9.789474%
3165
----by Zao
processing turntablism
----------replaced with abstract hip hop(4)
cluster sizes: [ 675 2495  807    0    0  101    0  143  474    0 

In [566]:

    

# find_closest_genres(G_genre_sim, genres_name['psychedelic trance'], [g[0] for g in top_genres])
# G_genre_sim.edge[1065][378]
possible_genres = list(set([g[0] for g in top_sleep_genres]).intersection(list(np.unique(gr_song_genres.astype(int)))))
print(np.asarray(possible_genres))
closest_gid = fing_closest_genre_by_acoustics(genre_features[847], genre_features[np.asarray(possible_genres, dtype=np.int32)], [3,4,5])
print(possible_genres[closest_gid])
print(genres[possible_genres[closest_gid]])


[ 481    5   39   42  620  879  471 1271  218  668]
42
ambient idm


In [591]:

# comp
#print_features(most_printable_features[init_song_idx])
#print('')
#print_features(end_songs[end_song_idx][:9])
# save
pl_name = 'wake me up 3'
spotify_account = 'rudolfix-us'
#save_playlist(spotify_account, pl_name, [init_song[13], end_songs[end_song_idx][13]])
added_songs = save_playlist(spotify_account, pl_name, wakeup_playlist[::-1])
print(wakeup_playlist[::-1])
print(set(wakeup_playlist).difference(added_songs.keys()))

[2653311.0, 1831300.0, 1831300.0, 48353.0, 1046594.0, 501622.0, 689883.0, 882508.0, 228294.0, 227559.0]
{1046594.0, 2653311.0}


In [523]:
save_playlist(spotify_account, pl_name, wakeup_playlist[::-1])

['spotify:track:1N81UrowQOnFls2csPFBAz', 'spotify:track:1YC6I5egSCYkCdID1CZMAt', 'spotify:track:3fd8OekZV9WVhrg4fQw0xR', 'spotify:track:65be8oawWem4SpeqABuL62', 'spotify:track:0MItBzDaZzBBzOeiZnuW1a', 'spotify:track:21tWM1trHeoDI2XkTXIbfR', 'spotify:track:1fFts1wzm6WMifjbO7HeIl']


In [398]:
#print(artists_genres.values())
db.session.rollback()
similar_genres = similar_genres_from_similar_artists(artists_genres, connected_metric_f=lambda x: sum(x),
                                                    use_foreign_sim=False)
#gid = top_sleep_genres[0][0]
gid = top_wake_genres[0][0]
simg = similar_genres[gid].items()
simg = sorted(simg, key=itemgetter(1))[::-1]
print('%s(%i):' % (genres[gid], gid))
for sg in simg:
    print('%s(%i) %f'%(genres[sg[0]], sg[0], sg[1]))
similar_genres2 = load_similar_genres()
simg = similar_genres2[top_sleep_genres[0][0]]
print(simg)
for sg in simg:
    print(genres[sg[0]])

progressive psytrance(1056):
deep psytrance(421) 0.013263
psychedelic trance(1065) 0.005637
full on(581) 0.004642
[(621, 0.10084)]
glitch beats


In [None]:
G = nx.DiGraph()

for i, g_id in enumerate(genre_sleepiness_sorted_idx_rev):  # work on reverse
    if g_id in genres:
        max_nb = min(i+5, len(genre_sleepiness_sorted_idx))  # 5 neighbours
        for nb in range(i+1, max_nb):
            n_g_id = genre_sleepiness_sorted_idx_rev[nb]
            if n_g_id in genres: #genre_sleepiness[n_g_id] - genre_sleepiness[g_id]
                G.add_edge(genres[g_id], genres[n_g_id], weight=genre_sleepiness[g_id] - genre_sleepiness[n_g_id])

In [505]:
G_G = nx.Graph()
# similar_genres = load_similar_genres()
# similar_genres = similar_genres_from_similar_artists(artists_genres, connected_metric_f=lambda x: math.log(sum(x))
genre_similarity = similar_genres_from_similar_artists(artists_genres, connected_metric_f=lambda x: sum(x),
                                                    use_foreign_sim=False)
for gid, simgenres in genre_similarity.items():
    for simg in simgenres.items():
        G_G.add_edge(genres[gid], genres[simg[0]], weight=simg[1])

In [None]:
pylab.figure(1, figsize=(100, 100))
# layout graphs with positions using graphviz neato
pos = graphviz_layout(G_G, prog="neato")
nx.draw(G_G, pos,
             node_size=40,
             vmin=0.0,
             vmax=1.0,
             with_labels=True)

In [507]:
# print(G['disco house'])
# print(G['deep alternative r&b'])

#paths = list(nx.shortest_simple_paths(G, 'deep alternative r&b', 'disco house'))
#print(paths)

#'disco house'
from_g_name = genres[top_wake_genres[0][0]]
for gid,_,_ in top_sleep_genres:
    to_g_name = genres[gid]
    print('%s -> %s' % (from_g_name, to_g_name))
    shortest = nx.dijkstra_path(G_G, from_g_name, to_g_name, weight='weight')
    try:
        for path in k_shortest_paths(G_G, from_g_name, to_g_name, 1, weight='weight'):
            simplest = path
            print(path)
    except nx.NetworkXNoPath:
        print ('no path')
        path = []
    diff = set(shortest).symmetric_difference(simplest)
    if len(diff) > 0:
        print(shortest)
        print(simplest)
        print(diff)

progressive psytrance -> ambient idm
['progressive psytrance', 'deep psytrance', 'glitch hop', 'intelligent dance music', 'ambient idm']
progressive psytrance -> glitch
['progressive psytrance', 'deep psytrance', 'glitch hop', 'glitch']
progressive psytrance -> modern performance
['progressive psytrance', 'deep psytrance', 'glitch hop', 'glitch', 'drone', 'modern performance']
progressive psytrance -> minimal
['progressive psytrance', 'deep psytrance', 'glitch hop', 'glitch', 'drone', 'minimal']
progressive psytrance -> dream pop
['progressive psytrance', 'deep psytrance', 'glitch hop', 'wonky', 'indie r&b', 'dream pop']
progressive psytrance -> ethereal wave
['progressive psytrance', 'psychedelic trance', 'psychill', 'world fusion', 'medieval', 'ethereal wave']
progressive psytrance -> classical christmas
['progressive psytrance', 'psychedelic trance', 'psychill', 'ambient', 'new age', 'classical christmas']
progressive psytrance -> choral
['progressive psytrance', 'psychedelic trance

In [None]:
class MGSongGroup():
    def __init__(self, name, group_id):
        self.name = name
        self.group_id = group_id
        self.gr_song_features, self.gr_song_names = load_songs(select_song_group(group_id),
                                                               count_song_group(group_id))
        self.gr_song_features, self.gr_song_genres, _, _, _ = prepare_songs(self.gr_song_features, features_scaler)
        self.features = self.gr_song_features[:,featured_features]
        self.printable_features = self.gr_song_features[:,:9]
    
        self.gr_top_g_indexer = get_core_genres(self.gr_song_genres, describe=False)
        self.core_features = self.features[self.gr_top_g_indexer]
        self.clf = sklearn.svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
        self.clf.fit(self.core_features)
        self.y_pred, self.fit_prc, self.y_decision = self.compute_outliers(self.features)        
        
    def compute_outliers(self, more_features):
        if more_features.shape[0] == 0:
            return None, 0, None
        y_pred = self.clf.predict(more_features)
        fit_prc = 100-(y_pred[y_pred == -1].size * 100.0 / more_features.shape[0])
        y_decision = self.clf.decision_function(more_features)[:,0]
        return y_pred, fit_prc, y_decision
    
    @property
    def ingroup_features(self):
        return self.features[self.y_pred==1]
    
    @property
    def outlier_features(self):
        return self.features[self.y_pred==-1]
        
    def describe_outliers(self, y_pred, fit_prc, y_decision, gr_song_features, gr_song_names):
        print('%% of outliers in dataset: %f%%' % fit_prc)
        y_decision_sort = np.argsort(y_decision)
        for sort_id in y_decision_sort[:20]:
            if y_pred[sort_id] == -1:
                song = gr_song_features[sort_id]
                print('song %s-%s(%i) is outlier with dist %f' % (gr_song_names[song[13]], genres[song[14]],
                                                                  song[13], y_decision[sort_id]))
        print('most crazy outlier')
        print(print_features(gr_song_features[y_decision_sort[0],0:9]))

        print('')
        y_decision_sort = y_decision_sort[::-1]
        for sort_id in y_decision_sort[:20]:
            if y_pred[sort_id] == 1:
                song = gr_song_features[sort_id]
                print('song %s-%s(%i) is ingroup with dist %f' % (gr_song_names[song[13]], genres[song[14]],
                                                                  song[13], y_decision[sort_id]))
        print('most ingrouped')
        print(print_features(gr_song_features[y_decision_sort[0],0:9]))
        
    def describe_self(self):
        self.describe_outliers(self.y_pred, self.fit_prc, self.y_decision, self.gr_song_features, self.gr_song_names)

In [None]:
db.session.rollback()
# describe_song_group("ambient", 31)
group_30 = MGSongGroup("rfix library", 66)
# group_30.describe_self()
# describe_song_group_type('FALL ASLEEP', 1)
for gr in db.session.query(Group).order_by(Group.GroupId):
    group = MGSongGroup(gr.Name, gr.GroupId)
    _, ext_fit, _ = group_30.compute_outliers(group.features)
    _, ext_fit_ing, _ = group_30.compute_outliers(group.ingroup_features)
    print('group %s (%i) fit into group %s = %f vs %f ING, self fit %f' % 
          (group.name, group.group_id, group_30.name, ext_fit, ext_fit_ing, group.fit_prc))

In [None]:
#np.max(song_genres)
#np.bincount(song_genres[1:1000])
# song_genres[1:1000]/1383.0
#song_names[1:1000].shape
#projection[:,2]
#flabels = song_names[song_genres < 10]
#len(flabels)
#print(len(genres))
#print(np.mean(song_features, axis=0))
#print(np.std(song_features, axis=0))
#print(np.max(song_features, axis=0))
#print(np.min(song_features, axis=0))
#projection
#genres
#for i, label in enumerate(genres):
#    print('%i - %s' % (i,label))
#print(np.mean(genre_features, axis=0))
#print(np.std(genre_features, axis=0))
#print(np.max(genre_features, axis=0))
#print(np.min(genre_features, axis=0))
# print(song_names[9943])
# print(song_features[song_features[:, 13] == 9943])
# print(song_features[song_features[:, 14] == 10, 0])
print(np.mean(song_features[song_features[:, 14] == 10, 0]))
print(genre_features[9])

In [None]:
featured_features = [0,1,2,3,4,5,7,8]
features = gr_song_features[:,featured_features]

n_clusters_ = 6
from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(features, n_neighbors=10, include_self=False)

cls = sklearn.cluster.AgglomerativeClustering(n_clusters=n_clusters_, connectivity=connectivity, linkage='ward')
cls.fit(features)
y_pred = cls.labels_
# [{'node_id': next(itertools.count(X.shape[0])), 'left': x[0], 'right':x[1]} for x in clustering.children_]
print('No of clusters %i' % n_clusters_)
# print(y_pred)
y_pred = y_pred + 1
print(np.bincount(y_pred))
components = {0: 'noise samples'}
for c in range(1, n_clusters_+1):
    components[c] = 'component %i' % c
projection = tsne_projection(features) #
plot3(projection, components, color=y_pred)

In [None]:
# make one hot encoded features for genres
genres_one_hot = np.zeros([gr_song_features.shape[0], len(genres)])
print(genres_one_hot.shape)
#print(artists_genres)
for row in range(gr_song_features.shape[0]):
    s = gr_song_features[row]
    if s[15] in artists_genres:
        for g in artists_genres[s[15]]:
            genres_one_hot[row,g-1] = 1
# combine features
combined = np.hstack((features, genres_one_hot))
max_k = 10
nc = 6
#print('')
#print('detection no cluster with kmeans_gap_statistics')
#nc = kmeans_gap_statistics(combined, max_k)
#print('got %i clusters' % nc)
print('')
print('detection no cluster with kmeans_silhouette')
# nc = kmeans_silhouette(combined, max_k)
print('got %i clusters' % nc)
print('')
print('detection no cluster with kmeans_f_of_K')
# nc = kmeans_f_of_K(combined, max_k)
print('got %i clusters -> used for plotting' % nc)
# y_pred = fit_kmeans(features, 6)
for i, n in enumerate(feature_names):
    print('%i: %s' % (i,n))
dist_mod_sleep = [3,0.3,1,0.5,1,1,1,5]
dist_mod_library = [3,0.3,1,0.5,2,2,1,3]
weighted_features = features*np.asarray(dist_mod_sleep)
print('computing TSNE projection')
#projection = tsne_projection(weighted_features)
#y_pred = fit_dpgmm(weighted_features, 12)
plot_clusters(projection, y_pred)

#describe_clusters('combined', gr_song_features, y_pred, gr_song_names, gr_song_genres)
#clusters_to_playlists('clustered library', gr_song_features, y_pred, gr_song_names, gr_song_genres)
clusters_to_playlists('acoustic features', gr_song_features, y_pred, gr_song_names, gr_song_genres)
pca = sklearn.decomposition.PCA(n_components=50) # 
pca_proj = pca.fit(combined).transform(combined)

fit_kmeans(genres_one_hot, 3)
fit_kmeans(pca_proj, 3)
fit_kmeans(combined, 3)
fit_kmeans(features, 3)