# Clean data slices with playlists by ID
* [tracks...]: creates a file with less information than the original one; specifically, a csv file containing all the playlists, where each line holds the track IDs of one playlist.
* normalized_title:[tracks..] dictionary with track ids, where the key is the id of the normalized title
* [normalized_playlists...]: creates a new csv file containing, for each track, a list of the normalized-title IDs of all the playlists it appears in.


In [7]:
import csv
import pandas as pd
import gensim
import os
import numpy as np
import sys
from tqdm import tqdm_notebook
import json
from gensim.models import Word2Vec
import time
import re
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map
import pickle
from string import ascii_letters


def normalize_name(name):
    """Normalize a playlist title so that equivalent titles compare equal.

    Steps, in order:

    1. Titles spelled with a separator between every character
       ("w o r k o u t", "w.o.r.k.o.u.t", "w*o*r*k*o*u*t") are joined.
    2. A title whose first "&" is glued to its neighbours (e.g. "r&b") is
       not stemmed; it only goes through a lighter clean-up.
    3. A "k" that directly follows a "2" is turned into "0".
    4. The title is either run through the whoosh stemming analyzer with
       accent folding, or lower-cased and stripped of punctuation.
    5. If the result ends in a letter, a trailing run of that same letter
       is collapsed to a single occurrence.

    Args:
        name: raw playlist title.

    Returns:
        The normalized title. An empty string is returned for empty input
        or when the clean-up leaves nothing behind (previously these cases
        raised IndexError).
    """
    if not name:
        return ''

    stem = True
    letters = list(name)

    # Format "w o r k o u t" / "w.o.r.k.o.u.t" / "w*o*r*k*o*u*t": when every
    # other character is identical, drop those and keep the rest.
    # NOTE(review): only `name` is updated here, `letters` still holds the
    # original characters for the two checks below. Kept as is because
    # changing it would alter the normalized titles - confirm before fixing.
    if len(letters) > 4:
        if len(set(letters[0::2])) == 1:
            name = "".join(letters[1::2])
        elif len(set(letters[1::2])) == 1:
            name = "".join(letters[0::2])

    # If the first "&" is not surrounded by spaces, leave the words alone
    # (example: 'r&b') and skip stemming.
    if "&" in letters:
        position = letters.index("&")
        if 0 < position < len(letters) - 1:
            if letters[position - 1] != ' ' and letters[position + 1] != ' ':
                stem = False

    # A "k" right after a "2" (and not the last character) becomes "0".
    if "k" in letters and '2' in letters:
        for pos in [i for i, char in enumerate(letters) if char == 'k']:
            if 0 < pos < len(letters) - 1 and letters[pos - 1] == '2':
                letters[pos] = '0'
                name = "".join(letters)

    if stem:
        # Full normalization: whoosh stemming analyzer + accent folding.
        my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
        words = [token.text for token in my_analyzer(name)]

        # If the analyzer yields nothing, keep the characters unchanged.
        if words:
            letters = list(" ".join(words))
    else:
        # Softer normalization: lower-case, punctuation to spaces, then
        # collapse whitespace.
        name = name.lower()
        name = re.sub(r"[.,'\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        letters = list(name)

    # The soft clean-up can strip everything (e.g. ".&,&."); bail out
    # instead of indexing into an empty list.
    if not letters:
        return ''

    # If the last n characters are the same letter, leave only one.
    last = letters[-1]
    if last in ascii_letters:
        while len(letters) > 1 and letters[-2] == last:
            letters.pop(-2)

    return ''.join(letters)

## tracks

In [None]:
# Run configuration for this cell.
quick = False  # when True, process_mpd stops after a few slice files
max_files_for_quick_processing = 2
size_slices = 1000  # number of slice files collected before writing

# csv to dict: each row is a (key, value) pair; the keys are looked up by
# track URI in process_slice.
# The file is opened in a `with` block so the handle is closed after loading.
with open('dictionaries/dict_sorted_trackuri_id.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    dict_trackuri_ids = {k: v for (k, v) in reader}

# One list of track ids per playlist read so far.
slices_playlists = []

def process_mpd(path):
    """Read the MPD slice files found in *path* and write their playlists.

    Files named ``mpd.slice.*.json`` are processed in sorted order. Each one
    is handed to ``process_slice``; after every ``size_slices`` files the
    collected playlists are passed to ``write_file`` (with a 1-based batch
    index) and the buffer is cleared. Playlists left in the buffer when the
    loop ends are written as well, so a partial last batch or a quick run
    is no longer silently dropped.

    Args:
        path: directory containing the slice files.
    """
    count = 0  # slices counter
    filenames = os.listdir(path)

    # for each slice
    for filename in tqdm_notebook(sorted(filenames)):
        # read slice
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(fullpath, encoding="utf-8") as f:
                mpd_slice = json.load(f)

            # collect the playlists of this slice
            process_slice(mpd_slice)
            count += 1
            if count % size_slices == 0:
                write_file(count // size_slices)
                slices_playlists.clear()

        if quick and count > max_files_for_quick_processing:
            break

    # Flush whatever is left from an incomplete last batch.
    if slices_playlists:
        write_file(count // size_slices + 1)
        slices_playlists.clear()
                
            
def process_slice(mpd_slice):
    """Append one list of track ids per playlist of *mpd_slice*.

    Every track URI of a playlist is translated through
    ``dict_trackuri_ids``; the resulting list is added to the module-level
    ``slices_playlists`` buffer. An unknown URI raises KeyError.
    """
    for playlist in mpd_slice['playlists']:
        track_ids = []
        for track in playlist["tracks"]:
            track_ids.append(dict_trackuri_ids[track["track_uri"]])
        slices_playlists.append(track_ids)
       
    
def write_file(count):
    """Write the buffered playlists to the sentences file, one per line.

    Each playlist in ``slices_playlists`` becomes one space-delimited row
    of ``MPD_line_sentence/playlists_sentences_id.txt``.

    Args:
        count: 1-based index of the batch being written. The first batch
            (re)creates the file; later batches are appended, so earlier
            batches are no longer overwritten.
    """
    os.makedirs("MPD_line_sentence", exist_ok=True)
    mode = 'a' if count > 1 else 'w'
    with open("MPD_line_sentence/playlists_sentences_id.txt",
              mode, newline='', encoding="utf-8") as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerows(slices_playlists)


if __name__ == '__main__':
    # Usage: <script> <mpd-data-dir> [--quick]
    path = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        # The flag used to assign False here, which made it a no-op.
        quick = True
    process_mpd(path)


## tracks and title

In [10]:
# Run configuration for this cell.
quick = False  # when True, process_mpd stops after a few slice files
max_files_for_quick_processing = 2
size_slices = 1000

# csv to dict: each row is a (key, value) pair; the keys are looked up by
# track URI in process_slice.
# Files are opened in `with` blocks so the handles are closed after loading.
with open('dictionaries/dict_sorted_trackuri_id.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    dict_trackuri_ids = {k: v for (k, v) in reader}

# Inverted on load: second column -> first column, so the dict can be
# looked up by normalized title.
with open('dictionaries/dict_id_ntitle_final.csv', 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    dict_ntitle_id = {v: k for (k, v) in reader}

# One (title id, [track ids]) tuple per playlist read so far.
slices_playlists = []

def process_mpd(path):
    """Process the MPD slice files in *path*, then write a single output.

    Files named ``mpd.slice.*.json`` are handled in sorted order via
    ``process_slice``. When the loop ends (or stops early in quick mode),
    ``write_file`` is called once and the playlist buffer is cleared.
    """
    processed = 0  # slice files handled so far
    entries = os.listdir(path)

    for entry in tqdm_notebook(sorted(entries)):
        is_slice = entry.startswith("mpd.slice.") and entry.endswith(".json")
        if is_slice:
            with open(os.sep.join((path, entry))) as handle:
                raw = handle.read()
            process_slice(json.loads(raw))
            processed += 1

        # Quick mode: stop once enough slice files have been handled.
        if quick and processed > max_files_for_quick_processing:
            break

    write_file()
    slices_playlists.clear()
                
            
def process_slice(mpd_slice):
    """Buffer a (title id, track ids) tuple for each playlist of *mpd_slice*.

    The playlist name is normalized with ``normalize_name`` and looked up in
    ``dict_ntitle_id``; the track URIs are translated through
    ``dict_trackuri_ids``. Unknown titles or URIs raise KeyError.
    """
    for playlist in mpd_slice['playlists']:
        title_id = dict_ntitle_id[normalize_name(playlist["name"])]
        track_ids = [dict_trackuri_ids[track["track_uri"]]
                     for track in playlist["tracks"]]
        slices_playlists.append((title_id, track_ids))
       
    
def write_file():
    """Pickle the buffered playlists into the MPD_line_sentence directory.

    Dumps the module-level ``slices_playlists`` list to
    ``MPD_line_sentence/playlists_ntitle_tracks_sentences_id_final``,
    creating the directory first if it does not exist.
    """
    # The body previously mixed 5-, 8- and 4-space indentation, which is an
    # IndentationError; it now uses a consistent 4-space indent.
    if not os.path.exists("MPD_line_sentence"):
        os.makedirs("MPD_line_sentence")
    with open('MPD_line_sentence/playlists_ntitle_tracks_sentences_id_final', 'wb') as fp:
        pickle.dump(slices_playlists, fp)



if __name__ == '__main__':
    path = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        # The flag used to assign False here, which made it a no-op.
        quick = True
    # NOTE(review): the data directory is hard-coded; the path read from
    # argv above is not used by this cell.
    process_mpd("../MPD/data")




## Normalized playlists

In [None]:
# Run configuration for this cell.
quick = False  # when True, process_mpd stops after a few slice files
max_files_for_quick_processing = 2
size_slices = 50

# csv to dict. Each CSV is read once inside a `with` block (the handles
# used to be left open and every file was parsed twice).
with open('dictionaries/dict_sorted_trackuri_id.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    dict_trackuri_ids = {k: v for (k, v) in reader}

with open('dictionaries/dict_id_ntitle_final.csv', 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    ntitle_rows = [(k, v) for (k, v) in reader]

# Same rows indexed in both directions.
dict_ntitle_id = {v: k for (k, v) in ntitle_rows}
dict_id_ntitle = {k: v for (k, v) in ntitle_rows}

# One (initially empty) list per key of the track file; process_slice
# appends title ids to these lists, indexing by track URI.
MPD_ntitle_id = {k: [] for k in dict_trackuri_ids}



def process_mpd(path):
    """Process the MPD slice files in *path*, then write the result once.

    Files named ``mpd.slice.*.json`` are handled in sorted order via
    ``process_slice``. ``write_file`` is called a single time after the
    loop ends (or stops early in quick mode).
    """
    processed = 0  # slice files handled so far
    entries = os.listdir(path)

    for entry in tqdm_notebook(sorted(entries)):
        is_slice = entry.startswith("mpd.slice.") and entry.endswith(".json")
        if is_slice:
            with open(os.sep.join((path, entry))) as handle:
                raw = handle.read()
            process_slice(json.loads(raw))
            processed += 1

        # Quick mode: stop once enough slice files have been handled.
        if quick and processed > max_files_for_quick_processing:
            break

    write_file()
                
            
def process_slice(mpd_slice):
    """Record, for every track, the title id of each playlist containing it.

    The playlist name is normalized with ``normalize_name`` and mapped to
    its id through ``dict_ntitle_id``; that id is appended to the
    ``MPD_ntitle_id`` list of every track URI in the playlist. Unknown
    titles or URIs raise KeyError.
    """
    for playlist in mpd_slice['playlists']:
        title_id = dict_ntitle_id[normalize_name(playlist['name'])]
        for uri in (track["track_uri"] for track in playlist["tracks"]):
            MPD_ntitle_id[uri].append(title_id)
    
    
def write_file():
    """Write one space-delimited row of title ids per track.

    Rows follow the iteration order of ``MPD_ntitle_id`` and go to
    ``MPD_line_sentence/ntitles_sentences_pid_final.txt``; the directory is
    created first if it does not exist.
    """
    out_dir = "MPD_line_sentence"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    with open("MPD_line_sentence/ntitles_sentences_pid_final.txt",
              'w', newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(MPD_ntitle_id.values())


if __name__ == '__main__':
    # Usage: <script> <mpd-data-dir> [--quick]
    path = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        # The flag used to assign False here, which made it a no-op.
        quick = True
    process_mpd(path)