## CSV file maker track uri - numeric ID
Makes a csv that assigns each track_uri a numeric ID <br>
The lower the ID, the more popular the track (ID 0 is the most frequent track) <br>
Also makes the following dictionaries (each one is written as a CSV file under `dictionaries/`):
* Trackuri - ID
* Trackuri - title
* ID - title
* Playlist pid - title

In [None]:
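"""
   usage:

        python 1_MPD_id_trackuri_maker.py path-to-mpd-data/

   NOTE: the command-line argument is not read by the code below; the MPD
   data directory is the literal path passed to process_mpd(...) at the
   bottom of this file.
"""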


import sys
import json
import re
import os
import pandas as pd
import numpy as np
import csv
import scipy.sparse as sp
import scipy.sparse.linalg  as la
import itertools
from tqdm import tqdm_notebook
from collections import Counter, OrderedDict
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map

def normalize_name(name):
    """Normalize a free-text name so that spelling variants collapse together.

    Steps, in order:

    1. Spaced-out spellings such as ``w o r k o u t`` / ``w.o.r.k.o.u.t``
       (every other character identical, more than 4 characters) are joined.
    2. A ``&`` with a non-space character on both sides (e.g. ``r&b``)
       disables stemming; the name is then only lower-cased and stripped of
       punctuation.
    3. A ``k`` that directly follows a ``2`` and is not the last character is
       replaced by ``0`` (``2k17`` -> ``2017``).
    4. Otherwise the name is run through whoosh's ``StemmingAnalyzer`` with
       accent folding; if that yields no tokens the characters are kept.
    5. A run of identical ASCII letters at the very end is reduced to one.

    Args:
        name: the raw name (a string). An empty/falsy value yields ``''``.

    Returns:
        The normalized name as a string.
    """
    # Imported locally because the file-level imports do not provide it.
    from string import ascii_letters

    # Nothing to normalize; also keeps the trailing-letter step from
    # indexing into an empty list.
    if not name:
        return ''

    stem = True
    letters = list(name)

    # if format w o r k o u t / w.o.r.k.o.u.t / w*o*r*k*o*u*t join together
    if len(letters) > 4:
        if len(set(letters[0::2])) == 1:
            name = "".join(letters[1::2])
        elif len(set(letters[1::2])) == 1:
            name = "".join(letters[0::2])

    # if the first & is not surrounded by spaces, leave alone (example 'r&b')
    if "&" in letters:
        position = letters.index("&")
        if 0 < position < len(letters) - 1:
            if letters[position - 1] != ' ' and letters[position + 1] != ' ':
                stem = False

    # a k directly after a 2 (and not in last position) becomes 0
    # NOTE(review): this works on the original characters, so it overwrites a
    # name that was joined in the first step -- confirm that is intended.
    if "k" in letters and '2' in letters:
        positions = [i for i, ch in enumerate(letters) if ch == 'k']
        for pos in positions:
            if 0 < pos < len(letters) - 1 and letters[pos - 1] == '2':
                letters[pos] = '0'
                name = "".join(letters)

    if stem:
        # full stemming + accent folding
        my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
        words = [token.text for token in my_analyzer(name)]

        # if the result is empty keep the characters gathered so far
        if words:
            letters = list(" ".join(words))
    else:
        # softer normalization: lower-case, punctuation -> space, squeeze spaces
        name = name.lower()
        name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        letters = list(name)

    # if the last n characters are the same ASCII letter keep only one
    last = letters[-1]
    if last in ascii_letters:
        while len(letters) > 1 and letters[-2] == last:
            letters.pop(-2)

    return ''.join(letters)

# ---- module-level state shared by the functions below ----

# number of track entries seen so far (incremented once per playlist track)
total_tracks = 0
# track_uri -> number of times the track was seen
# (Counter is imported by name at the top of the file; the module
# `collections` itself is not imported.)
track_histogram = Counter()
# occurrence threshold shown in the summary printed by reorder()
num = 4
# track_uri -> numeric id, filled by reorder()
dict_sorted_trackuri_id = dict()
# track_uri -> track name, filled by process_playlist()
dict_trackuri_title = dict()
# playlist pid -> playlist name, filled by process_playlist()
dict_playlistpid_title = dict()

# when True, process_mpd() stops early instead of reading every slice file
quick = False
# file-count limit that process_mpd() checks when `quick` is True
max_files_for_quick_processing = 1

def process_mpd(path):
    """Read every MPD slice file in *path*, then build and write the dictionaries.

    Files named ``mpd.slice.*.json`` are processed in sorted filename order;
    each playlist they contain is passed to ``process_playlist``. Afterwards
    ``reorder()`` and ``write_files()`` are called.

    When the module-level ``quick`` flag is set, reading stops once
    ``max_files_for_quick_processing`` slice files have been processed.

    Args:
        path: directory containing the slice files.
    """
    count = 0
    for filename in tqdm_notebook(sorted(os.listdir(path))):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue

        fullpath = os.path.join(path, filename)
        # Read as UTF-8 explicitly (the output CSVs are written as UTF-8 too)
        # and let the context manager close the file even if parsing fails.
        with open(fullpath, encoding='utf-8') as f:
            mpd_slice = json.load(f)

        for playlist in mpd_slice['playlists']:
            process_playlist(playlist)
        count += 1

        # ">=" so that exactly max_files_for_quick_processing files are read
        if quick and count >= max_files_for_quick_processing:
            break

    reorder()
    write_files()

    
def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as a UTF-8, comma-separated CSV."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)
        writer.writerows(rows)


def write_files():
    """Dump the module-level dictionaries as CSV files under ``dictionaries/``.

    Files written (header row first, columns in the order listed):

    * ``dict_sorted_trackuri_id.csv`` -- track_uri, id
    * ``dict_trackuri_title.csv``     -- track_uri, title
    * ``dict_id_title.csv``           -- id, title
    * ``dict_playlistpid_title.csv``  -- playlist_pid, title

    The header of each file names its columns in the same order as the data
    rows. The output directory is created when it does not exist yet.
    """
    os.makedirs("dictionaries", exist_ok=True)

    _write_csv('dictionaries/dict_sorted_trackuri_id.csv',
               ["track_uri", "id"],
               dict_sorted_trackuri_id.items())

    _write_csv('dictionaries/dict_trackuri_title.csv',
               ["track_uri", "title"],
               dict_trackuri_title.items())

    # same titles, keyed by the numeric id instead of the uri
    _write_csv('dictionaries/dict_id_title.csv',
               ["id", "title"],
               ((dict_sorted_trackuri_id[uri], title)
                for uri, title in dict_trackuri_title.items()))

    _write_csv('dictionaries/dict_playlistpid_title.csv',
               ["playlist_pid", "title"],
               dict_playlistpid_title.items())
            
        
# ranks tracks by popularity, fills dict_sorted_trackuri_id and prints a summary
def reorder():
    """Assign numeric ids by popularity and print summary statistics.

    Tracks are taken from ``track_histogram`` in ``most_common()`` order, so
    the most frequent track gets id 0 and ids grow as tracks get rarer. The
    mapping is stored in ``dict_sorted_trackuri_id``.

    The printed summary reports the total and unique track counts, how many
    tracks appear more than ``num`` times (with their share of the unique
    tracks) and the number of playlists.
    """
    # enumerate gives the same ids as counting the dict's size on a fresh
    # dict, and stays correct if this function is run a second time
    for new_id, (track_uri, _occurrences) in enumerate(track_histogram.most_common()):
        dict_sorted_trackuri_id[track_uri] = new_id

    unique_tracks = len(track_histogram)
    # use the same threshold that is shown in the message below
    frequent = sum(1 for occurrences in track_histogram.values() if occurrences > num)
    # avoid dividing by zero when nothing has been processed
    share = round(frequent / unique_tracks * 100, 2) if unique_tracks else 0.0

    print("number of tracks", total_tracks)
    print("number of unique tracks", unique_tracks)
    print("number tracks appearing > "+str(num)+" times: "+ str(frequent) +
          " that is "+ str(share)+"%" )
    print("number of playlists", len(dict_playlistpid_title))

# updates the running tallies with one playlist
def process_playlist(playlist):
    """Record one playlist in the module-level counters and lookup tables.

    Reads ``playlist['pid']``, ``playlist['name']`` and ``playlist['tracks']``;
    every track entry must provide ``'track_uri'`` and ``'track_name'``.

    Effects:
        * ``dict_playlistpid_title`` keeps the first name seen for each pid.
        * ``total_tracks`` grows by one per track entry.
        * ``track_histogram`` counts each occurrence of a track uri.
        * ``dict_trackuri_title`` keeps the first name seen for each uri.
    """
    global total_tracks

    pid = playlist['pid']
    if pid not in dict_playlistpid_title:
        dict_playlistpid_title[pid] = playlist['name']

    for entry in playlist['tracks']:
        total_tracks += 1
        uri = entry['track_uri']
        track_histogram[uri] += 1
        if uri in dict_trackuri_title:
            # title already known for this uri
            continue
        dict_trackuri_title[uri] = entry["track_name"]
            
        
        
# Run the whole pipeline on the MPD slice files found in this directory
# (the path is relative to the current working directory).
process_mpd('../../MPD/data/')

