In [1]:
"""
    iterates over the million playlist dataset and outputs info
    about what is in there.

    Usage:

        python stats.py path-to-mpd-data
"""
import sys
import json
import re
import collections
import os
import datetime

total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()
last_modified_histogram = collections.Counter()
num_edits_histogram = collections.Counter()
playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()


track_dict = {}

playlist_dict = {}

artist_dict = {}


haha = len(playlist_dict)

def mpd_path(path):
    fullpath = path
    f = open(fullpath)
    js = f.read()
    f.close()
    mpd_slice = json.loads(js)
    process_info(mpd_slice['info'])
    for playlist in mpd_slice['playlists']:
        process_playlist(playlist)

def show_summary():
    print
    print "number of playlists", total_playlists
    print "number of tracks", total_tracks
    print "number of unique tracks", len(tracks)
    print "number of unique albums", len(albums)
    print "number of unique artists", len(artists)
    print "number of unique titles", len(titles)
    print "number of playlists with descriptions", total_descriptions
    print "number of unique normalized titles", len(ntitles)
    print "avg playlist length", float(total_tracks) / total_playlists
    print
    print "top playlist titles"
    for title, count in title_histogram.most_common(20):
        print "%7d %s" % (count, title)

    print
    print "top tracks"
    for track, count in track_histogram.most_common(20):
        print "%7d %s" % (count, track)

    print
    print "top artists"
    for artist, count in artist_histogram.most_common(20):
        print "%7d %s" % (count, artist)

    print
    print "numedits histogram"
    for num_edits, count in num_edits_histogram.most_common(20):
        print "%7d %d" % (count, num_edits)

    print
    print "last modified histogram"
    for ts, count in last_modified_histogram.most_common(20):
        print "%7d %s" % (count, to_date(ts))

    print
    print "playlist length histogram"
    for length, count in playlist_length_histogram.most_common(20):
        print "%7d %d" % (count, length)

    print
    print "num followers histogram"
    for followers, count in num_followers_histogram.most_common(20):
        print "%7d %d" % (count, followers)


def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name


def to_date(epoch):
    return datetime.datetime.fromtimestamp(epoch).strftime("%Y-%m-%d")

def process_playlist(playlist):
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    # print playlist['playlist_id'], playlist['name']

    if 'description' in playlist:
        total_descriptions += 1

    titles.add(playlist['name'])
    nname = normalize_name(playlist['name'])
    #haha = len(playlist_dict) + 1
    playlist_dict[total_playlists - 1] = []
    
    ntitles.add(nname)
    title_histogram[nname] += 1

    playlist_length_histogram[playlist['num_tracks']] += 1
    last_modified_histogram[playlist['modified_at']] += 1
    num_edits_histogram[playlist['num_edits']] += 1
    num_followers_histogram[playlist['num_followers']] += 1

    for track in playlist['tracks']:
        total_tracks += 1
        albums.add(track['album_uri'])
        tracks.add(track['track_uri'])
        artists.add(track['artist_uri'])
        
        if track['artist_uri'] not in artist_dict:
            artist_dict[track['artist_uri']] = 1
        else :
            artist_dict[track['artist_uri']] += 1
            
        
        #if track['track_uri'] not in track_dict:
         #   track_dict[track['track_uri']] = 1
        #else:
         #   track_dict[track['track_uri']] += 1
            
        if track['artist_uri'] not in playlist_dict[total_playlists - 1]:
            playlist_dict[total_playlists - 1].append(track['artist_uri'])

        full_name = track['track_name'] + " by " + track['artist_name']
        artist_histogram[track['artist_name']] += 1
        track_histogram[full_name] += 1


def process_info(_):
    pass


In [2]:
print haha

0


In [3]:
file_path_1 = 'mpd.slice.4000-4999.json'



In [4]:
fullpath = file_path_1
f = open(fullpath)
js = f.read()
f.close()
mpd_slice = json.loads(js)
process_info(mpd_slice['info'])
for playlist in mpd_slice['playlists']:
    process_playlist(playlist)



In [5]:
show_summary()


number of playlists 1000
number of tracks 68101
number of unique tracks 35654
number of unique albums 20322
number of unique artists 10217
number of unique titles 834
number of playlists with descriptions 24
number of unique normalized titles 732
avg playlist length 68.101

top playlist titles
     28 country
     19 chill
     11 oldies
      8 rock
      8 workout
      8 throwback
      7 lit
      6 edm
      6 vibes
      6 sad
      6 pregame
      6 party
      6 old school
      5 rap
      5 hype
      5 jams
      5 good vibes
      4 sleep
      4 classic rock
      4 christian

top tracks
     48 Ignition - Remix by R. Kelly
     47 HUMBLE. by Kendrick Lamar
     44 Broccoli (feat. Lil Yachty) by DRAM
     39 Caroline by Aminé
     39 Closer by The Chainsmokers
     38 Congratulations by Post Malone
     35 Mask Off by Future
     34 I'm the One by DJ Khaled
     34 Shape of You by Ed Sheeran
     34 Ni**as In Paris by JAY Z
     33 Body Like A Back Road by Sam Hunt
     3

In [6]:
print len(artist_dict)
print len(playlist_dict)

10217
1000


In [7]:
new_array = sorted(artist_dict, key=artist_dict.__getitem__, reverse=True)


In [9]:
vocabulary = new_array[0:9000]
context = new_array[0:3000]

In [10]:
for i in range(20):
    print (i+1, context[i], artist_dict[context[i]] )
    print ()


(1, u'spotify:artist:3TVXtAsR1Inumwj472S9r4', 853)
()
(2, u'spotify:artist:5K4W6rqBFWDnAN6FQUkS6x', 443)
()
(3, u'spotify:artist:2YZyLoL8N0Wb9xBt1NhZWg', 340)
()
(4, u'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ', 332)
()
(5, u'spotify:artist:5pKCCKE2ajJHZ9KAiaK11H', 297)
()
(6, u'spotify:artist:1RyvyyTE3xzB2ZywiAwp0i', 277)
()
(7, u'spotify:artist:6l3HvQ5sa6mXTsMTB19rO5', 271)
()
(8, u'spotify:artist:0BvkDsjIUla7X0k6CSWh1I', 257)
()
(9, u'spotify:artist:69GGBxA162lTqCwzJG5jLp', 256)
()
(10, u'spotify:artist:7dGJo4pcD2V6oG8kP0tJRR', 251)
()
(11, u'spotify:artist:6eUKZXaKkcviH0Ku9w2n3V', 230)
()
(12, u'spotify:artist:0c173mlxpT3dSFRgMO8XPh', 218)
()
(13, u'spotify:artist:55Aa2cqylxrFIXC767Z865', 212)
()
(14, u'spotify:artist:7CajNmpbOovFoOoasH2HaY', 212)
()
(15, u'spotify:artist:3nFkdlSjzX9mRTtwJOzDYB', 211)
()
(16, u'spotify:artist:3YQKmKGau1PzlVlkL1iodx', 205)
()
(17, u'spotify:artist:6vWDO969PvNqNYHIOW5v0m', 203)
()
(18, u'spotify:artist:3b8QkneNDz4JHKKKlLgYZg', 202)
()
(19, u'spotify:arti

In [11]:
import numpy as np
cooccurence_matrix = np.zeros((len(vocabulary), len(context)))
for k in range(len(vocabulary)):
    for l in range(len(playlist_dict)):
        indices = [i for i, x in enumerate(playlist_dict[l]) if x == vocabulary[k]]
        for i in indices:  
            for j in range(20): # Window size of 10
                if i-j >= 0:
                    if playlist_dict[l][i-j] in context:
                        index = context.index(playlist_dict[l][i-j])
                        cooccurence_matrix[k][index] += 1                
                if i+j <= len(playlist_dict[l])-1:
                    if playlist_dict[l][i+j] in context:
                        index = context.index(playlist_dict[l][i+j])
                        cooccurence_matrix[k][index] += 1
                                                

In [12]:
np.seterr(invalid='ignore')

from scipy.sparse import csr_matrix

Probability_c_given_w = csr_matrix((len(vocabulary), len(context)))
sum_of_rows = np.sum(cooccurence_matrix, axis=1)
sum_of_rows = sum_of_rows.reshape((len(sum_of_rows),1))

Probability_c_given_w = cooccurence_matrix/sum_of_rows
Probability_c_given_w[np.isnan(Probability_c_given_w)] = 0

print (Probability_c_given_w)     

[[0.08030593 0.01253452 0.01189717 ... 0.         0.         0.00021245]
 [0.01549777 0.07407407 0.00971894 ... 0.         0.         0.        ]
 [0.01808201 0.01194705 0.07620278 ... 0.         0.         0.00032289]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [13]:
Overall_probability_distribution = csr_matrix(len(context))

total_sum = cooccurence_matrix.sum()
#for k in range(len(context)):
 #   sum_of_column = cooccurence_matrix[:, k].sum()
  #  Overall_probability_distribution = sum_of_column/float(total_sum)
sum_of_columns = np.sum(cooccurence_matrix, axis=0)
Overall_probability_distribution = sum_of_columns / float(total_sum)

print Overall_probability_distribution

[5.75627652e-03 4.62109651e-03 3.83825171e-03 ... 7.92754226e-05
 3.96377113e-05 1.29923609e-04]


In [14]:
# Representation of each word in the vocabulary as a vector 
np.seterr(divide='ignore')
from math import log
vector_matrix = csr_matrix((len(vocabulary), len(context)))
#vector_matrix = np.divide(Probability_c_given_w, Overall_probability_distribution[:, None], out=np.zeros_like(Probability_c_given_w), where=(Overall_probability_distribution)!=0)

Overall_probability_distribution = Overall_probability_distribution.reshape(len(Overall_probability_distribution), 1)
vector_matrix = np.divide(Probability_c_given_w, np.transpose(Overall_probability_distribution))
vector_matrix = np.log10(vector_matrix)
#vector_matrix = np.log10(vocabulary_vector_matrix)    
zero_matrix = np.zeros((len(vocabulary), len(context)))
vocabulary_vector_matrix = np.maximum(vector_matrix, zero_matrix)

vocabulary_vector_matrix[np.isnan(vocabulary_vector_matrix)] = 0


print(vocabulary_vector_matrix)

[[1.14460595 0.43336277 0.49131038 ... 0.         0.         0.21356772]
 [0.43012749 1.20492119 0.40348539 ... 0.         0.         0.        ]
 [0.49710517 0.41251548 1.29783735 ... 0.         0.         0.39537072]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [15]:
# Clustering the words using KMeans algorithm

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=100).fit(vocabulary_vector_matrix)

clusters = {}
s = 0
for label in kmeans.labels_:
    if label not in clusters:
        clusters[label] = []
    clusters[label].append(vocabulary[s])
    s = s+1

In [16]:
# Finding the nearest neighbors of the top 20 most frequent words in the dictionary
from sklearn.metrics.pairwise import cosine_distances

distance_matrix = cosine_distances(vocabulary_vector_matrix[:1000], vocabulary_vector_matrix)
print distance_matrix.shape



(1000, 9000)


In [18]:
#print (distance_matrix.shape)
for i in range(len(distance_matrix)):
    sorted_indices = np.argsort(distance_matrix[i])
    print (vocabulary[i] + str(' : ') + vocabulary[sorted_indices[1]] + str(' , ') + vocabulary[sorted_indices[2]] + str(' , ') + vocabulary[sorted_indices[3]] + str(' , ') + vocabulary[sorted_indices[4]] + str(' , ') + vocabulary[sorted_indices[5]])
    print 
    print 
    

spotify:artist:3TVXtAsR1Inumwj472S9r4 : spotify:artist:0Y5tJX1MQlPlqiwlOH1tJY , spotify:artist:2hWs22BmQkK4czFtDLnar2 , spotify:artist:6oMuImdp5ZcFhWP0ESe6mG , spotify:artist:1RyvyyTE3xzB2ZywiAwp0i , spotify:artist:0c173mlxpT3dSFRgMO8XPh


spotify:artist:5K4W6rqBFWDnAN6FQUkS6x : spotify:artist:5IcR3N7QB1j6KBL8eImZ8m , spotify:artist:3nFkdlSjzX9mRTtwJOzDYB , spotify:artist:55Aa2cqylxrFIXC767Z865 , spotify:artist:2YZyLoL8N0Wb9xBt1NhZWg , spotify:artist:0QHgL1lAIqAw0HtD7YldmP


spotify:artist:2YZyLoL8N0Wb9xBt1NhZWg : spotify:artist:17lzZA2AlOHwCwFALHttmp , spotify:artist:1RyvyyTE3xzB2ZywiAwp0i , spotify:artist:6oMuImdp5ZcFhWP0ESe6mG , spotify:artist:5dHt1vcEm9qb8fCyLcB3HL , spotify:artist:3TVXtAsR1Inumwj472S9r4


spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ : spotify:artist:7iZtZyCzp3LItcw1wtPI3D , spotify:artist:50co4Is1HCEo8bhOyUWKpn , spotify:artist:3TVXtAsR1Inumwj472S9r4 , spotify:artist:246dkjvS1zLTtiykXe5h60 , spotify:artist:0c173mlxpT3dSFRgMO8XPh


spotify:artist:5pKCCKE2ajJHZ9KAiaK11H : 

spotify:artist:6d47Z08T4snK50HgTEHo5Z : spotify:artist:6KZDXtSj0SzGOV705nNeh3 , spotify:artist:0A0FS04o6zMoto8OKPsDwY , spotify:artist:6oMuImdp5ZcFhWP0ESe6mG , spotify:artist:7c0XG5cIJTrrAgEC3ULPiq , spotify:artist:6Ha4aES39QiVjR0L2lwuwq


spotify:artist:2KsP6tYLJlTBvSUxnwlVWa : spotify:artist:1uNFoZAHBGtllmzznpCI3s , spotify:artist:69GGBxA162lTqCwzJG5jLp , spotify:artist:6T5tfhQCknKG4UnH90qGnz , spotify:artist:738wLrAtLtCtFOLvQBXOXp , spotify:artist:540vIaP2JwjQb9dm3aArA4


spotify:artist:2xe8IXgCTpwHE3eA9hTs4n : spotify:artist:053q0ukIDRgzwTr4vNSwab , spotify:artist:6qqNVTkY8uBg9cP3Jd7DAH , spotify:artist:1TtJ8j22Roc24e2Jx3OcU4 , spotify:artist:4kubsO16bEfCADaVUyoYb6 , spotify:artist:6nxWCVXbOlEVRexSbLsTer


spotify:artist:1xU878Z1QtBldR7ru9owdU : spotify:artist:536BYVgOnRky0xjsPT96zl , spotify:artist:0SwO7SWeDHJijQ3XNS7xEE , spotify:artist:4j56EQDQu5XnL7R3E9iFJT , spotify:artist:5BvJzeQpmsdsFp4HGUYUEx , spotify:artist:6liAMWkVf5LH7YR9yfFy1Y


spotify:artist:75dQReiBOHN37fQgWQrIAJ : 

spotify:artist:3I4IAI5sdU0vakMOD6SLZL : spotify:artist:4AX0I32V6XRGh9aRv7wj2h , spotify:artist:61jXuN59QewmUA4GPRKBj4 , spotify:artist:1mYsTxnqsietFxj1OgoGbG , spotify:artist:7gXLT7X0z4mE87tvo4V4l9 , spotify:artist:4L3GTE04bW5N7azA9QPhjA


spotify:artist:329e4yvIujISKGKz1BZZbO : spotify:artist:1vyhD5VmyZ7KMfW5gqLgo5 , spotify:artist:5fJsY7afrbsyzJj9wdzJMh , spotify:artist:2jSGzJw0ebJLu7OLVSOcBP , spotify:artist:3IEvQoAohcGX7CdrbtIle7 , spotify:artist:4DYFVNKZ1uixa6SQTvzQwJ


spotify:artist:0rG0AZBscc8S8q1ahIsasI : spotify:artist:44PA0rCQXikgOWbfY7Fq7m , spotify:artist:1Zz5UxfKSSqc6hpa3xJPCw , spotify:artist:50NoVNy9GU1lCrDV8iGpyu , spotify:artist:2BvzbqWWwLN11XGBYgDZzx , spotify:artist:0IF46mUS8NXjgHabxk2MCM


spotify:artist:165ZgPlLkK7bf5bDoFc6Sb : spotify:artist:0KDuKk6YdEu3hR56HtXmxt , spotify:artist:6gZq1Q6bdOxsUPUG1TaFbF , spotify:artist:1UdQqCUR7RwB9YYJONwbdM , spotify:artist:4OMgOSIBM1FmY0dTY3O14J , spotify:artist:3RNrq3jvMZxD9ZyoOZbQOD


spotify:artist:0FWzNDaEu9jdgcYTbcOa4F : 

In [None]:
import numpy as np
import tensorflow as tf
import helpers

tf.reset_default_graph()
sess = tf.InteractiveSession()


import sys
import json
import re
import collections
import os
import datetime
import helpers

In [None]:
#fullpath = 'mpd.slice.1000-1999_10_set.json'
fullpath = ['mpd.slice.1000-1999_100_set.json']
#fullpath = '../train_set_untouched/data/mpd.slice.1000-1999.json'

In [None]:
# Input to seq to seq model
X_train = []
count = 2
play_list_names = []

# Dictionary to maintain the song index and track name, other features can be added later
track_dict = {}

# Dicionray to maintain count to uri mapping
uri_dict = {}
  
for item in fullpath:
    #print item
    f = open(item)
    file_content = f.read()
    f.close()
    mpd_slice = json.loads(file_content)


    # Playlist name list
    play_list_names += [re.sub('[^a-zA-Z0-9]+', '', mpd_slice['playlists'][idx]['name'].lower()) for idx in range(0, len(mpd_slice['playlists']))]
    #print play_list_names
    #print item, len(play_list_names), count

count += len(set(play_list_names))
#print count 
    
for item in fullpath:
    #print item
    f = open(item)
    file_content = f.read()
    f.close()
    mpd_slice = json.loads(file_content)


    # Looping to create the X_train
    for idx in range(0, len(mpd_slice['playlists'])):
        lst_current = [] 

        lst_current.append(play_list_names.index(re.sub('[^a-zA-Z0-9]+', '', mpd_slice['playlists'][idx]['name'].lower())) + 2)    

        for idy in range(0, len(mpd_slice['playlists'][idx]['tracks'])):
            track_uri = re.sub('spotify:track:', '', mpd_slice['playlists'][idx]['tracks'][idy]['track_uri'])
            if(track_uri not in track_dict):
                track_dict[track_uri] = count
                uri_dict[count] = [track_uri, mpd_slice['playlists'][idx]['tracks'][idy]['track_name']]
                count += 1
            lst_current.append(track_dict[track_uri])
            #print lst_current
            #print X_train, "sac"
        X_train.append(lst_current)
        #print "done", idx
    print "done"

    #print X_train
    

#print (X_train)
print (count)
print (len(play_list_names))
print (len(set(play_list_names)))

In [None]:
print track_dict


In [None]:
print track_dict['5hqfUSjOet968lBbIQXIJR']


In [None]:
print uri_dict

In [None]:
vocabulary_dict = {}
