## CSV file maker track uri - numeric ID
Makes a csv that assigns each track_uri a numeric ID <br>
The lower the ID, the more popular the track (ID 0 is the most popular track) <br>
It also writes the following dictionaries as CSV files:
* Trackuri - ID
* Trackuri - title
* ID - title
* Playlist pid - title

In [1]:
import sys
import json
import re
import os
import pandas as pd
import numpy as np
import csv
import scipy.sparse as sp
import scipy.sparse.linalg  as la
import itertools
from tqdm import tqdm_notebook
import collections 
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map
from utils import normalize_name

# Directory holding the MPD slice files (mpd.slice.*.json) -- set this to your path.
mpd_path = "../../MPD/data/"

# With `quick` enabled, process_mpd stops early after a few slice files
# (bounded by `max_files_for_quick_processing`) instead of scanning them all.
quick = False
max_files_for_quick_processing = 1

In [2]:
# Module-level state filled in while the playlists are scanned.
total_tracks = 0  # track entries seen so far, duplicates included
track_histogram = collections.Counter()  # track_uri -> number of occurrences
num = 4  # occurrence threshold reported in the summary printed by reorder()
dict_sorted_trackuri_id = {}  # track_uri -> popularity rank (0 = most common)
dict_trackuri_title = {}  # track_uri -> track name
dict_playlistpid_title = {}  # playlist pid -> playlist name

def process_mpd(path):
    """Scan the MPD slice files under *path* and build the track dictionaries.

    Every file named ``mpd.slice.*.json`` is read in sorted filename order and
    each playlist it contains is handed to ``process_playlist``.  Once all
    files are consumed, ``reorder`` assigns the popularity IDs and
    ``write_files`` dumps the resulting dictionaries to CSV.

    When the module-level ``quick`` flag is set, at most
    ``max_files_for_quick_processing`` slice files are read.

    Args:
        path: Directory containing the slice files.
    """
    processed = 0
    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue

        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding; the context manager closes the file
        # even if parsing fails.
        with open(os.path.join(path, filename), encoding="utf-8") as slice_file:
            mpd_slice = json.load(slice_file)

        for playlist in mpd_slice["playlists"]:
            process_playlist(playlist)
        processed += 1

        # ">=" so that the limit really is the maximum number of files read.
        if quick and processed >= max_files_for_quick_processing:
            break

    reorder()
    write_files()

    
def write_files():
    """Dump the track and playlist dictionaries as CSV files in ``dictionaries/``.

    The directory is created if needed.  Each file starts with a header row
    whose column order matches the data rows:

      * ``dict_sorted_trackuri_id.csv``  -- track_uri, id
      * ``dict_trackuri_title.csv``      -- track_uri, title
      * ``dict_id_title.csv``            -- id, title
      * ``dict_playlistpid_title.csv``   -- playlist_pid, title
    """
    os.makedirs("dictionaries", exist_ok=True)

    def _dump_csv(filename, header, rows):
        # newline='' leaves line-ending handling to the csv module.
        target = os.path.join("dictionaries", filename)
        with open(target, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
            writer.writerow(header)
            writer.writerows(rows)

    _dump_csv(
        "dict_sorted_trackuri_id.csv",
        ["track_uri", "id"],
        dict_sorted_trackuri_id.items(),
    )
    _dump_csv(
        "dict_trackuri_title.csv",
        ["track_uri", "title"],
        dict_trackuri_title.items(),
    )
    # Same titles as above, keyed by the numeric track ID instead of the URI.
    _dump_csv(
        "dict_id_title.csv",
        ["id", "title"],
        (
            (dict_sorted_trackuri_id[track_uri], title)
            for track_uri, title in dict_trackuri_title.items()
        ),
    )
    _dump_csv(
        "dict_playlistpid_title.csv",
        ["playlist_pid", "title"],
        dict_playlistpid_title.items(),
    )
            
        
def reorder():
    """Assign popularity-ranked IDs to all tracks and print summary statistics.

    Tracks are ranked by their count in ``track_histogram``: the most common
    track gets ID 0, the next one ID 1, and so on.  The mapping is stored in
    the module-level ``dict_sorted_trackuri_id``.  No track is filtered out;
    the ``num`` threshold is only used for the printed statistics.
    """
    # most_common() yields (track_uri, count) pairs, most frequent first.
    # enumerate() keeps the IDs correct even if the dict is already populated
    # from an earlier call.
    for track_id, (track_uri, _count) in enumerate(track_histogram.most_common()):
        dict_sorted_trackuri_id[track_uri] = track_id

    unique_tracks = len(track_histogram)
    frequent_tracks = sum(1 for count in track_histogram.values() if count > num)
    # An empty histogram would otherwise divide by zero.
    share = round(frequent_tracks / unique_tracks * 100, 2) if unique_tracks else 0.0

    print("number of tracks", total_tracks)
    print("number of unique tracks", unique_tracks)
    print(f"number tracks appearing > {num} times: {frequent_tracks} that is {share}%")
    print("number of playlists", len(dict_playlistpid_title))

def process_playlist(playlist):
    """Fold one playlist into the module-level counters and lookup tables.

    The playlist name is stored under its pid (first occurrence wins).  For
    every track entry, ``total_tracks`` and the track's ``track_histogram``
    count are incremented, and the track name is remembered the first time
    its track_uri is seen.
    """
    global total_tracks

    pid = playlist['pid']
    if pid not in dict_playlistpid_title:
        dict_playlistpid_title[pid] = playlist['name']

    for entry in playlist['tracks']:
        total_tracks = total_tracks + 1
        uri = entry['track_uri']
        track_histogram[uri] += 1
        if uri in dict_trackuri_title:
            continue
        dict_trackuri_title[uri] = entry["track_name"]
            
        
        
# Run the track/playlist pass: scan the slices under mpd_path, rank the
# tracks, print the statistics and write the CSV dictionaries.
process_mpd(mpd_path)


number of tracks 66346428
number of unique tracks 2262292
number tracks appearing > 4 times: 599341 that is 26.49%
number of playlists 1000000


In [3]:
# Artist state.
total_artists = 0  # NOTE(review): never updated by the code in this cell
artist_histogram = collections.Counter()  # artist_uri -> number of track entries
dict_artisturi_id = {}  # artist_uri -> popularity rank (0 = most common)
dict_artisturi_artist = {}  # artist_uri -> artist name

# Normalized playlist-title state.
total_normalized_titles = 0  # NOTE(review): never updated by the code in this cell
ntitles_histogram = collections.Counter()  # normalized playlist name -> count
dict_ntitle_id = {}  # normalized playlist name -> popularity rank

def process_mpd(path):
    """Scan the MPD slice files under *path* and build the artist/title dictionaries.

    Every file named ``mpd.slice.*.json`` is read in sorted filename order and
    each playlist it contains is handed to ``process_playlist``.  Once all
    files are consumed, ``reorder`` assigns the popularity IDs and
    ``write_files`` dumps the resulting dictionaries to CSV.

    When the module-level ``quick`` flag is set, at most
    ``max_files_for_quick_processing`` slice files are read.

    Args:
        path: Directory containing the slice files.
    """
    processed = 0
    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue

        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding; the context manager closes the file
        # even if parsing fails.
        with open(os.path.join(path, filename), encoding="utf-8") as slice_file:
            mpd_slice = json.load(slice_file)

        for playlist in mpd_slice["playlists"]:
            process_playlist(playlist)
        processed += 1

        # ">=" so that the limit really is the maximum number of files read.
        if quick and processed >= max_files_for_quick_processing:
            break

    reorder()
    write_files()

def write_files():
    """Write the artist and normalized-title dictionaries to ``dictionaries/``.

    Produces ``dict_artisturi_id.csv`` (artist_uri, id),
    ``dict_artisturi_artist.csv`` (artist_uri, artist) and
    ``dict_id_ntitle_final.csv`` (id, normalized_title), each with a header
    row.  The directory is created when it does not exist yet.
    """
    out_dir = "dictionaries"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    with open('dictionaries/dict_artisturi_id.csv', 'w', newline='') as out:
        writer = csv.writer(out, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["artist_uri", "id"])
        writer.writerows(
            [artist_uri, artist_id]
            for artist_uri, artist_id in dict_artisturi_id.items()
        )

    with open('dictionaries/dict_artisturi_artist.csv', 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["artist_uri", "artist"])
        writer.writerows(
            [artist_uri, artist_name]
            for artist_uri, artist_name in dict_artisturi_artist.items()
        )

    with open('dictionaries/dict_id_ntitle_final.csv', 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["id", "normalized_title"])
        # The dict maps title -> id; the file lists the id first.
        writer.writerows(
            [title_id, ntitle] for ntitle, title_id in dict_ntitle_id.items()
        )
            

            
        
def reorder():
    """Assign popularity-ranked IDs to artists and normalized playlist titles.

    Artists are ranked by their count in ``artist_histogram`` and titles by
    their count in ``ntitles_histogram``; in both cases the most common entry
    gets ID 0.  The mappings are stored in the module-level
    ``dict_artisturi_id`` and ``dict_ntitle_id``.  Nothing is filtered out.
    Prints the number of unique artists and unique normalized titles.
    """
    # most_common() yields (key, count) pairs, most frequent first.
    # enumerate() keeps the IDs correct even if the dicts are already
    # populated from an earlier call.
    for artist_id, (artist_uri, _count) in enumerate(artist_histogram.most_common()):
        dict_artisturi_id[artist_uri] = artist_id
    for title_id, (ntitle, _count) in enumerate(ntitles_histogram.most_common()):
        dict_ntitle_id[ntitle] = title_id

    print("number of unique artists", len(artist_histogram))
    print("number of unique normalized titles: " + str(len(dict_ntitle_id)))
    
    
def process_playlist(playlist):
    """Fold one playlist into the artist and normalized-title statistics.

    Increments the ``ntitles_histogram`` count of the playlist's normalized
    name, and for every track entry increments the ``artist_histogram`` count
    of its artist and records the artist's original name in
    ``dict_artisturi_artist`` the first time that artist_uri is seen.
    """
    ntitles_histogram[normalize_name(playlist["name"])] += 1

    for track in playlist['tracks']:
        artist_uri = track['artist_uri']
        artist_histogram[artist_uri] += 1

        # This lookup must run for every track: outside the loop it would
        # only see the playlist's last track (and fail on an empty playlist).
        if artist_uri not in dict_artisturi_artist:
            dict_artisturi_artist[artist_uri] = track["artist_name"]
    
        
# Run the artist/title pass: scan the slices under mpd_path, rank artists and
# normalized playlist titles, print the statistics and write the CSVs.
process_mpd(mpd_path)


number of unique artists 295860
number of unique normalized titles: 15875
