In [1]:
import pandas as pd
from urllib import request

### Getting the dataset

In [2]:
data = request.urlopen("https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt")

### Parsing the HTTP response 

In [3]:
lines = data.read().decode("utf-8").split('\n')

### Splitting the songs id (remove playlists with one song)

In [19]:
playlists = [line.rstrip().split() for line in lines if len(line.split()) > 1]

### Load song metadata

In [20]:
metadata = request.urlopen("https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt")

### Parsing the HTTP response

In [21]:
metadata_lines = metadata.read().decode("utf-8").split('\n')

### Splitting the songs metadata

In [35]:
songs = [s.rstrip().split('\t') for s in metadata_lines]

In [45]:
songs

[['0 ', 'Gucci Time (w\\/ Swizz Beatz)', 'Gucci Mane'],
 ['1 ', 'Aston Martin Music (w\\/ Drake & Chrisette Michelle)', 'Rick Ross'],
 ['2 ', 'Get Back Up (w\\/ Chris Brown)', 'T.I.'],
 ['3 ', 'Hot Toddy (w\\/ Jay-Z & Ester Dean)', 'Usher'],
 ['4 ', 'Whip My Hair', 'Willow'],
 ['5 ', 'Down On Me (w\\/ 50 Cent)', 'Jeremih'],
 ['6 ', 'Black And Yellow', 'Wiz Khalifa'],
 ['7 ', 'Blowing Me Kisses', 'Soulja Boy'],
 ['8 ', 'Lay It Down', 'Lloyd'],
 ['9 ', 'Good For My Money (w\\/ Lloyd)', 'Baby Bash'],
 ['10 ', 'Shake It', 'Elephant Man'],
 ['11 ', 'My Cupp', 'Richie Loop'],
 ['12 ', "Who's That Chick", 'Rihanna'],
 ['13 ', 'Like A G6 (w\\/ The Cataracs & Dec)', 'Far East Movement'],
 ['14 ', "DJ Got Us Fallin' In Love (w\\/ Pitbull)", 'Usher'],
 ['15 ', 'Hurt Me Soul', 'Lupe Fiasco'],
 ['16 ', 'Put It In A Love Song (w\\/ Beyonce)', 'Alicia Keys'],
 ['17 ', 'All I Wants Is You (w\\/ J Cole)', 'Miguel'],
 ['18 ', 'Champagne Life', 'Ne-Yo'],
 ['19 ', 'Find Your Love', 'Drake'],
 ['20 ', 'You

### Preparing the DataFrame

In [41]:
songs_df = pd.DataFrame(data=songs, columns=['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

### Trainig the model
- Each song is represented by a vector of dimension 32
- We consider a window (context) of 20 songs before and after a song
- Define 50 negative samples


In [43]:
from gensim.models import Word2Vec

In [44]:
model = Word2Vec(
    playlists,
    vector_size=32,
    window=20,
    negative=50,
    min_count=1,
    workers=4
)

### Testing the model

In [47]:
songs_df.iloc[2172]

title     Fade To Black
artist        Metallica
Name: 2172 , dtype: object

In [None]:
model.wv.most_similar(positive=str(2172))

TypeError: list indices must be integers or slices, not tuple

### Checking the model response

In [48]:
import numpy as np

In [58]:
def print_recommendations(song_id):
    similar_songs = np.array(
        model.wv.most_similar(positive=str(song_id),topn=5)
    )[:,0]
    return songs_df.iloc[similar_songs]

In [60]:
print_recommendations(6)

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
525,Rude Boy,Rihanna
420,Mo Money Mo Problems (w\/ Mase & Puff Daddy),The Notorious B.I.G.
74,Letting Go (Dutty Love) (w\/ Nicki Minaj),Sean Kingston
12205,Give It Up To Me,Sean Paul
11961,Floor On Fire (w\/ Pitbull & Lil Jon),Machel Montano
