### Dependencies

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import json
import os.path as osp
from tqdm.auto import tqdm
from collections import defaultdict

# Minhash and jaccard
from sklearn.metrics import jaccard_score
from datasketch import MinHash, MinHashLSH

### Load the dataset

In [4]:
data_dir = 'spotify_million_playlist_dataset/data'
# NOTE: os.listdir returns entries in arbitrary order, so file_names[0] is
# simply whichever slice the filesystem lists first.
file_names = os.listdir(data_dir)

file_dir = osp.join(data_dir, file_names[0])

# Load json file. The context manager closes the handle as soon as parsing is
# done, and the explicit encoding avoids relying on the platform default
# (track names contain non-ASCII characters).
with open(file_dir, encoding='utf-8') as f:
    data = json.load(f)

# check the number of playlists
print(len(data['playlists']))

1000


### Preprocessing the dataset

In [5]:
### helper function: convert dict to panda dataframe
def convert(data):
    """
    Convert the loaded json data to a pandas dataframe and a dict.

    Tracks are identified by their name only, so different tracks that share
    a name end up as the same item.

    Args:
        data: dict with a 'playlists' list; every playlist is a dict with a
            'pid' and a 'tracks' list whose items have a 'track_name'.

    Returns:
        df: dataframe with columns (p_id, track_name, rating), one row per
            track occurrence; rating is always 1.
        df_dict: defaultdict(list), key is the playlist id, values are the
            list of track names in playlist order. Playlists without tracks
            get no key.
    """
    rows = []
    df_dict = defaultdict(list)

    # Wrap the list itself (not enumerate(...)) so tqdm can read its length
    # and show real progress instead of an open-ended bar.
    for playlist in tqdm(data['playlists']):
        p_id = playlist['pid']

        for track in playlist['tracks']:
            name = track['track_name']
            rows.append([p_id, name, 1])
            df_dict[p_id].append(name)

    # Name the columns at construction time: assigning `.columns` afterwards
    # raises ValueError when there are no rows (the empty frame has 0 columns).
    df = pd.DataFrame(rows, columns=['p_id', 'track_name', 'rating'])

    return df, df_dict

In [14]:
# Build the long-format dataframe and the playlist -> track-names mapping.
df, df_dict = convert(data)

#preview df_dict
# df_dict is a defaultdict: indexing a pid that is not in the loaded slice
# would silently insert an empty playlist, so read it with .get() instead.
print('The tracks of playlist id 549000: \n',df_dict.get(549000, []))

# preview df
df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


The tracks of playlist id 549000: 
 ['Boots of Spanish Leather', 'Mr. Tambourine Man', "Danny's Song", "A Hard Rain's A-Gonna Fall", "Blowin' In the Wind", 'John Wesley Harding', 'If You See Her, Say Hello', "Bob Dylan's 115th Dream", 'Love Minus Zero', 'Classy Girls', 'Girl from the North Country', "03' Bonnie & Clyde", 'Que Quieres De Mi', 'Wicked Games', 'Slow It Down', 'How Deep Is Your Love', 'Mi verdad - feat. Shakira', 'Moon River(Vocal Audrey Hepburn)', "I've Got The World On A String", 'A Little Less Conversation', "That's All Right", 'Piano Man', 'New York State of Mind', "(I Can't Help) Falling in Love With You", 'Happy Together', 'In My Dreams', 'Keep on Loving You - Remastered', "Somethin' Stupid", 'The Way You Look Tonight', 'Blue Moon - 1998 Digital Remaster', 'Hold You in My Arms', "Can't Fight This Feeling", 'Shooting Star', 'I Don\'t Want to Miss a Thing - From the Touchstone film, "Armageddon"', 'Then', 'Us Against the World', 'Love Your Love the Most', 'The Real Th

Unnamed: 0,p_id,track_name,rating
0,549000,Boots of Spanish Leather,1
1,549000,Mr. Tambourine Man,1
2,549000,Danny's Song,1
3,549000,A Hard Rain's A-Gonna Fall,1
4,549000,Blowin' In the Wind,1
...,...,...,...
68684,549999,"Fünf heitere Gesänge, Op.125: 1. ""Die Meerfee""",1
68685,549999,"L'incoronazione di Poppea / Act 3: ""Pur ti miro""",1
68686,549999,"Pergolesi: Stabat mater: VII. Eja Mater, fons ...",1
68687,549999,"Sound the trumpet, beat the drum, Z. 335",1


In [16]:
### Convert the dataframe to utility matrix
# One row per playlist id and one column per track name; a cell holds the
# rating for (playlist, track) pairs present in df and NaN otherwise.
U = df.pivot_table(index='p_id', columns='track_name', values='rating')
U

track_name,"""45""","""C"" Is For Cookie - Introduction","""If You Want a Burger..."" (feat. Kiesha Red)","""O Mio Babbino Caro"" (From Gianni Schicchi)","""The Take Over, The Breaks Over""",#9 Dream,"#BDAY (feat. Chris Brown, Siya and Sage The Gemini)",#Beastmode (feat. Gametime Henderson-Vine),#Beautiful,#Dubigal,...,아.미.고 Amigo (SHINee WORLD 1 Version),아직도 난 (Still you),양화대교 (Yanghwa Brdg),으르렁 Growl,잔소리 (feat. Simon D),좋아 She is,중독 Overdose,쿠데타 COUP D'ETAT,하이엔드걸 [High End Girl] (feat. Deez),헤픈엔딩 Happen Ending
p_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
549000,,,,,,,,,,,...,,,,,,,,,,
549001,,,,,,,,,,,...,,,,,,,,,,
549002,,,,,,,,,,,...,,,,,,,,,,
549003,,,,,,,,,,,...,,,,,,,,,,
549004,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549995,,,,,,,,,,,...,,,,,,,,,,
549996,,,,,,,,,,,...,,,,,,,,,,
549997,,,,,,,,,,,...,,,,,,,,,,
549998,,,,,,,,,,,...,,,,,,,1.0,,,


### MinHash and LSH

In [17]:
# Convert null values to 0
# fillna returns a new frame, which avoids the boolean-mask assignment over
# the whole matrix and keeps the cell safe to re-run.
U = U.fillna(0)

In [29]:
random_seed = 100
threshold = 0.06 # jaccard sim threshold
num_perm = 128 # number of permutations, shared by the LSH index and every MinHash
# Initialize the LSH
lsh = MinHashLSH(threshold = threshold, num_perm=num_perm)
MH = {}
# Build one MinHash signature per playlist from its track names and index it.
# tqdm wraps the sorted key list directly so the progress bar knows the total.
for key in tqdm(sorted(df_dict)):
    
    # Same seed and num_perm for every signature so they are comparable.
    MH[key] = MinHash(num_perm=num_perm, seed = random_seed)
    for d in df_dict[key]:
        MH[key].update(d.encode('utf8'))
  
    lsh.insert(key, MH[key])
    
# find the neighbor of query playlist q
q = 549121
result = lsh.query(MH[q])
print('playlist {} neighbors: {}'.format(q, result))

print('----')
for i in result:
    # The query playlist is returned as its own neighbor; skip it.
    if i != q:
    
        # Exact Jaccard on the binary utility-matrix rows.
        print('Actually jaccard similarity: ', jaccard_score(U.loc[q, :], U.loc[i, :]))
        # MinHash estimate; use MH[q] (not a hard-coded id) so the comparison
        # follows whatever query playlist is selected above.
        print('Predicted Similarity:', MH[q].jaccard(MH[i]) , '\n')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


playlist 549121 neighbors: [549121, 549411, 549194, 549775, 549489]
----
Actually jaccard similarity:  0.017241379310344827
Predicted Similarity: 0.03125 

Actually jaccard similarity:  0.0625
Predicted Similarity: 0.0859375 

Actually jaccard similarity:  0.022222222222222223
Predicted Similarity: 0.03125 

Actually jaccard similarity:  0.015748031496062992
Predicted Similarity: 0.015625 

