In [10]:
import json
import os
import pickle

In [13]:
# Directory holding the raw dataset files that the notebook reads.
data_path = "/recsys/data/spotify/spotify_million_playlist_dataset/data/"
# Directory that receives the notebook's output data.
data_out_path = "/recsys/data/spotify/s2v_data/"

## Data preparation
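We parse the raw JSON files and create tokenized playlists: each playlist becomes a space-separated string of integer track tokens.
We don't save any playlist-level metadata since this project does not use it; only basic per-track metadata (name, duration, artist, album) is kept alongside each token.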

The output is:
- train: 800K playlists
- test: 200K playlists
- track_data_map: mapping from track URI to a record holding the track's token (`id`) and its basic metadata
- reverse_track_lookup: mapping from token to track URI

In [3]:
# Lookup tables, both created empty here.
track_data_map = dict()
reverse_track_lookup = dict()

In [4]:
# Accumulator for the playlists, created empty here.
playlists = list()

In [5]:
# Tokenize every playlist found in the dataset directory.
#
# Each distinct track URI is assigned the next free integer id; a playlist is
# stored as one string of those ids separated by single spaces.

# Start from empty state inside this cell so that re-running it cannot append
# every playlist a second time or leave the id counter out of step with the
# lookup tables.
track_data_map = {}        # track URI -> record with token id and track metadata
reverse_track_lookup = {}  # token id -> track URI
playlists = []             # one space-separated token string per playlist
s_id = 0                   # next unused token id

# sorted() compares names as strings, so token ids and the order of
# `playlists` follow the lexicographic order of the file names.
filenames = os.listdir(data_path)
for filename in sorted(filenames):
    if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
        continue

    # os.path.join does not double the separator when data_path already
    # ends with one.
    fullpath = os.path.join(data_path, filename)

    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # default, and parse straight from the file object so the raw text is not
    # held in memory as a second copy.
    with open(fullpath, encoding="utf-8") as f:
        mpd_slice = json.load(f)

    for playlist in mpd_slice["playlists"]:
        pl = []
        for track in playlist["tracks"]:
            uri = track["track_uri"]
            if uri not in track_data_map:
                # First time this track is seen: register it under a new id.
                track_data_map[uri] = {
                    'id': s_id,
                    'name': track['track_name'],
                    # duration_ms / 1000, i.e. seconds going by the field name
                    'duration': float(track['duration_ms']) / 1000,
                    'artist_name': track['artist_name'],
                    'artist_uri': track['artist_uri'],
                    'album_name': track['album_name'],
                    'album_uri': track['album_uri'],
                }
                reverse_track_lookup[s_id] = uri
                s_id += 1

            pl.append(track_data_map[uri]['id'])

        playlists.append(' '.join(str(token) for token in pl))

In [7]:
# The first 800000 playlists form the training set; everything after is test.
train = playlists[:800000]
test = playlists[800000:]

In [11]:
# Bundle the objects into a single list; positions are
# 0 = train, 1 = test, 2 = track_data_map, 3 = reverse_track_lookup.
data = [
    train,
    test,
    track_data_map,
    reverse_track_lookup,
]

In [15]:
# Persist the bundle as a single pickle file in the output directory.
# Create the directory first so a missing folder does not abort the write,
# and build the path with os.path.join so it no longer depends on
# data_out_path ending with a separator.
os.makedirs(data_out_path, exist_ok=True)
with open(os.path.join(data_out_path, 'sentences_and_maps.pickle'), 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)