# Clean data slices with playlists by ID
* [tracks...]: creates a file with less information than the original one; specifically, a csv file containing all the playlists, where each line holds the track IDs of one playlist.
* normalized_title:[tracks..]: a dictionary of track ids, where the key is the id of the normalized title.
* [normalized_playlists...]: creates a new csv file containing, for each track, a list of the ids (normalized-title ids) of all the playlists it appears in.


In [1]:
import csv
import pandas as pd
import gensim
import os
import numpy as np
import sys
from tqdm import tqdm_notebook
import json
from gensim.models import Word2Vec
import time
import re
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map
import pickle
from string import ascii_letters
from utils import normalize_name

# Directory that is scanned for the MPD slice files (mpd.slice.*.json).
# Set this to your local path.
mpd_path = '../../MPD/data/'
# Directory that receives the generated playlist/track id sequence files.
# Set this to your local path.
mpd_sequence_path = 'mpd_playlist_track_seq/'

# When True, every process_mpd() loop stops early instead of reading all slices.
quick = False
# Early-stop threshold for quick mode: a loop breaks once the number of
# processed slice files exceeds this value.
max_files_for_quick_processing = 2

## tracks

In [2]:
# Number of slice files buffered in memory between two write_file() calls.
size_slices = 1000

# Load the two-column CSV into a dict keyed by its first column (the track
# URI used for lookups below); the value is the second column.
# The context manager closes the file as soon as the dict is built, and
# newline='' is the csv-module recommended way to open CSV files.
with open('dictionaries/dict_sorted_trackuri_id.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    dict_trackuri_ids = {k: v for (k, v) in reader}

# Buffer of playlists (one list of mapped track ids each) awaiting a write.
slices_playlists = []

def process_mpd(path):
    """Read every MPD slice file in ``path`` and hand its playlists on in batches.

    Files named ``mpd.slice.*.json`` are processed in sorted filename order.
    Each one is parsed and passed to ``process_slice``; after every
    ``size_slices`` files the buffered playlists are passed to ``write_file``
    together with the 1-based batch number, and the buffer is cleared.

    Playlists still buffered when the loop ends (the file count is not a
    multiple of ``size_slices``, or quick mode stopped early) are written as
    one last batch instead of being silently dropped.

    Args:
        path: directory containing the slice files.
    """
    count = 0  # slice files processed so far

    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            # JSON is UTF-8 by specification; the context manager also
            # guarantees the handle is closed if reading or parsing fails.
            with open(fullpath, encoding="utf-8") as f:
                mpd_slice = json.load(f)

            process_slice(mpd_slice)
            count += 1
            if count % size_slices == 0:
                write_file(count // size_slices)
                slices_playlists.clear()

        # Quick mode: stop once more than the configured number of files
        # has been processed.
        if quick and count > max_files_for_quick_processing:
            break

    # Flush the trailing partial batch, numbered as the next batch.
    if slices_playlists:
        write_file(count // size_slices + 1)
        slices_playlists.clear()
                
            
def process_slice(mpd_slice):
    """Buffer one list of mapped track ids per playlist of ``mpd_slice``.

    Every track's ``track_uri`` is translated through ``dict_trackuri_ids``
    and the resulting list is appended to ``slices_playlists``. A URI that
    is missing from the mapping raises ``KeyError``.
    """
    for entry in mpd_slice['playlists']:
        track_ids = []
        for item in entry["tracks"]:
            track_ids.append(dict_trackuri_ids[item["track_uri"]])
        slices_playlists.append(track_ids)
       
    
def write_file(count):
    """Write the buffered playlists to ``tracks_sentences_id.txt``.

    Each playlist in ``slices_playlists`` becomes one space-delimited line.

    Args:
        count: 1-based batch number. The first batch creates/truncates the
            file; later batches are appended, so that a second batch no
            longer overwrites the playlists written by the first one.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(mpd_sequence_path, exist_ok=True)

    mode = 'w' if count <= 1 else 'a'
    with open(mpd_sequence_path + "/tracks_sentences_id.txt",
              mode, newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(slices_playlists)



# Run this cell's pipeline over the slice files found in mpd_path.
process_mpd(mpd_path)





## tracks and title

In [3]:
size_slices = 1000

# Both lookup tables are two-column CSV files. The context managers close
# each file right after its dict is built, and newline='' is the csv-module
# recommended way to open CSV files.

# First column (track URI, used for lookups below) -> second column.
with open('dictionaries/dict_sorted_trackuri_id.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    dict_trackuri_ids = {k: v for (k, v) in reader}

# Inverted on purpose: second column (looked up below with the normalized
# playlist name) -> first column.
with open('dictionaries/dict_id_ntitle_final.csv', 'r',
          newline='', encoding="utf-8") as csv_file:
    reader = csv.reader(csv_file)
    dict_ntitle_id = {v: k for (k, v) in reader}

# Buffer of (normalized-title id, [mapped track ids]) tuples, one per playlist.
slices_playlists = []

def process_mpd(path):
    """Read every MPD slice file in ``path``, then write all playlists at once.

    Files named ``mpd.slice.*.json`` are processed in sorted filename order
    and passed to ``process_slice``. After the loop, ``write_file`` is called
    once and ``slices_playlists`` is cleared.

    Args:
        path: directory containing the slice files.
    """
    count = 0  # slice files processed so far (only used for quick mode)

    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            # JSON is UTF-8 by specification; the context manager also
            # guarantees the handle is closed if reading or parsing fails.
            with open(fullpath, encoding="utf-8") as f:
                mpd_slice = json.load(f)

            process_slice(mpd_slice)
            count += 1

        # Quick mode: stop once more than the configured number of files
        # has been processed.
        if quick and count > max_files_for_quick_processing:
            break

    write_file()
    slices_playlists.clear()
                
            
def process_slice(mpd_slice):
    """Buffer a ``(title id, track ids)`` tuple for each playlist of ``mpd_slice``.

    The title id is ``dict_ntitle_id`` looked up with the normalized playlist
    name; the track ids are the playlist's ``track_uri`` values translated
    through ``dict_trackuri_ids``. A missing key in either mapping raises
    ``KeyError``.
    """
    for entry in mpd_slice['playlists']:
        # The title lookup happens first, before any track is translated.
        title_id = dict_ntitle_id[normalize_name(entry["name"])]
        track_ids = []
        for item in entry["tracks"]:
            track_ids.append(dict_trackuri_ids[item["track_uri"]])
        slices_playlists.append((title_id, track_ids))
       
    
def write_file():
    """Pickle ``slices_playlists`` into the output directory.

    The directory ``mpd_sequence_path`` is created first when it does not
    exist yet; the target file is overwritten.
    """
    directory_missing = not os.path.exists(mpd_sequence_path)
    if directory_missing:
        os.makedirs(mpd_sequence_path)

    target = mpd_sequence_path + '/playlists_ntitle_tracks_sentences_id_final'
    with open(target, 'wb') as fp:
        pickle.dump(slices_playlists, fp)


# Run this cell's pipeline over the slice files found in mpd_path.
process_mpd(mpd_path)




## Normalized playlists

In [4]:
# Not referenced by the functions of this cell.
size_slices = 50

# Each CSV is now opened exactly once (inside a context manager, so the
# handle is closed) and every table derived from it is built from that
# single read. newline='' is the csv-module recommended way to open CSVs.

# First column (track URI) -> second column.
with open('dictionaries/dict_sorted_trackuri_id.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    dict_trackuri_ids = {k: v for (k, v) in reader}

with open('dictionaries/dict_id_ntitle_final.csv', 'r',
          newline='', encoding="utf-8") as csv_file:
    reader = csv.reader(csv_file)
    # First column -> second column, in file order.
    dict_id_ntitle = {k: v for (k, v) in reader}
# Inverse table: second column (looked up below with the normalized playlist
# name) -> first column. Built in the same row order as before, so the last
# row still wins when a title occurs more than once.
dict_ntitle_id = {v: k for (k, v) in dict_id_ntitle.items()} \
    if len(set(dict_id_ntitle.values())) == len(dict_id_ntitle) \
    else None
if dict_ntitle_id is None:
    # Duplicate ids or titles in the file: re-read it so the inverse table
    # is built from every raw row, exactly like a direct pass over the CSV.
    with open('dictionaries/dict_id_ntitle_final.csv', 'r',
              newline='', encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file)
        dict_ntitle_id = {v: k for (k, v) in reader}

# One empty list per known track URI, in the same key order as the CSV;
# process_slice() fills each list with normalized-title ids.
MPD_ntitle_id = {k: [] for k in dict_trackuri_ids}



def process_mpd(path):
    """Read every MPD slice file in ``path``, then write the result once.

    Files named ``mpd.slice.*.json`` are processed in sorted filename order
    and passed to ``process_slice``. Nothing is written per slice;
    ``write_file`` is called a single time after the loop.

    Args:
        path: directory containing the slice files.
    """
    count = 0  # slice files processed so far (only used for quick mode)

    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            # JSON is UTF-8 by specification; the context manager also
            # guarantees the handle is closed if reading or parsing fails.
            with open(fullpath, encoding="utf-8") as f:
                mpd_slice = json.load(f)

            process_slice(mpd_slice)
            count += 1

        # Quick mode: stop once more than the configured number of files
        # has been processed.
        if quick and count > max_files_for_quick_processing:
            break

    write_file()
                
            
def process_slice(mpd_slice):
    """Record, for each track, the title id of every playlist containing it.

    For each playlist of ``mpd_slice`` the normalized name is looked up in
    ``dict_ntitle_id``; that id is appended to ``MPD_ntitle_id[track_uri]``
    once per track occurrence. A missing title or track URI raises
    ``KeyError``.
    """
    for entry in mpd_slice['playlists']:
        title_key = normalize_name(entry['name'])
        ntitle_id = dict_ntitle_id[title_key]
        for item in entry["tracks"]:
            bucket = MPD_ntitle_id[item["track_uri"]]
            bucket.append(ntitle_id)
    
    
def write_file():
    """Write one space-delimited line per entry of ``MPD_ntitle_id``.

    Lines follow the dictionary's iteration order and contain only the
    collected values (the track URI key itself is not written). The output
    directory is created first when it does not exist yet.
    """
    directory_missing = not os.path.exists(mpd_sequence_path)
    if directory_missing:
        os.makedirs(mpd_sequence_path)

    target = mpd_sequence_path + "/ntitles_sentences_pid_final.txt"
    with open(target, 'w', newline='', encoding="utf-8") as csvfile:
        row_writer = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
        for title_ids in MPD_ntitle_id.values():
            row_writer.writerow(title_ids)



# Run this cell's pipeline over the slice files found in mpd_path.
process_mpd(mpd_path)




## Artists

In [2]:
# Not referenced by the active code of this cell.
size_slices = 50

# Load the two-column CSV into a dict keyed by its first column (the artist
# URI used for lookups below); the value is the second column.
# The context manager closes the file as soon as the dict is built, and
# newline='' is the csv-module recommended way to open CSV files.
with open('dictionaries/dict_artisturi_id.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    dict_artisturi_id = {k: v for (k, v) in reader}

# pid -> mapped artist ids of the playlist, one entry per track.
MPD_artist_id = dict()
# pid -> the same artist ids with duplicates removed.
MPD_artist_id_unique = dict()


def process_mpd(path):
    """Read every MPD slice file in ``path``, then write the result once.

    Files named ``mpd.slice.*.json`` are processed in sorted filename order
    and passed to ``process_slice``. Nothing is written per slice;
    ``write_file`` is called a single time after the loop.

    Args:
        path: directory containing the slice files.
    """
    count = 0  # slice files processed so far (only used for quick mode)

    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            # JSON is UTF-8 by specification; the context manager also
            # guarantees the handle is closed if reading or parsing fails.
            with open(fullpath, encoding="utf-8") as f:
                mpd_slice = json.load(f)

            process_slice(mpd_slice)
            count += 1

        # Quick mode: stop once more than the configured number of files
        # has been processed.
        if quick and count > max_files_for_quick_processing:
            break

    write_file()
                
        
        
def process_slice(mpd_slice):
    """Store the mapped artist ids of every playlist of ``mpd_slice`` by pid.

    ``MPD_artist_id[pid]`` receives one artist id per track, in track order;
    ``MPD_artist_id_unique[pid]`` receives the same ids de-duplicated through
    a set. An ``artist_uri`` missing from ``dict_artisturi_id`` raises
    ``KeyError``.
    """
    for entry in mpd_slice['playlists']:
        pid = entry["pid"]
        artist_ids = []
        for item in entry["tracks"]:
            artist_ids.append(dict_artisturi_id[item["artist_uri"]])
        MPD_artist_id[pid] = artist_ids
        # Round-trip through a set to drop repeated artists.
        MPD_artist_id_unique[pid] = list(set(artist_ids))
        
    
def write_file():
    """Write the per-playlist artist id lists to two text files.

    ``artists_sentences_pid.txt`` gets one space-delimited line per entry of
    ``MPD_artist_id``; ``artists_sentences_pid_unique.txt`` gets one per entry
    of ``MPD_artist_id_unique``. Lines follow each dictionary's iteration
    order and contain only the artist ids (the pid key is not written).
    """
    # Create the output directory like the other write_file() variants do;
    # without it this cell fails with FileNotFoundError when run first.
    os.makedirs(mpd_sequence_path, exist_ok=True)

    outputs = (
        ("/artists_sentences_pid.txt", MPD_artist_id),
        ("/artists_sentences_pid_unique.txt", MPD_artist_id_unique),
    )
    for suffix, artists_by_pid in outputs:
        with open(mpd_sequence_path + suffix,
                  'w', newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(artists_by_pid.values())



# Run this cell's pipeline over the slice files found in mpd_path.
process_mpd(mpd_path)


