## Loading and Preparation of Data

In [2]:
from collections import Counter
import pandas as pd
import numpy as np
import os
import re

#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
# One-time setup: uncomment the next line to fetch the NLTK stopword corpus.
# nltk.download('stopwords')
# English stopwords, held in a set for the membership test applied to each
# lyric word when REMOVE_STOPWORDS is enabled.
remove_these = set(stopwords.words('english'))

In [3]:
# Toggle: when True, words present in `remove_these` are dropped from each
# song's text while the lyrics file is parsed.
REMOVE_STOPWORDS = True

In [7]:
%%time

# Parse the bag-of-words training file into `all_songs_dict`, which maps each
# track id (first comma-separated field) to a space-joined string where every
# vocabulary word is repeated `count` times.
#
# File layout relied on here: lines starting with '#' are comments, the line
# starting with '%' lists the vocabulary, and every other non-blank line is
# `track_id,<second id>,word_index:count,...` with 1-based word indices.
# Detecting these markers replaces the previous hard-coded line offsets, and
# the file is streamed instead of being read into a list first.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm that it matches the file.
all_songs_dict = dict()
words = None
with open('mxm_dataset_train.txt', 'r') as f:
    for raw_line in f:
        # Strip the newline up front so the last vocabulary word and the last
        # count on each line are clean tokens.
        record = raw_line.rstrip('\n')
        if not record or record.startswith('#'):
            continue
        if record.startswith('%'):
            words = record.replace('%', '').split(',')
            continue
        if words is None:
            raise ValueError("data line encountered before the '%' vocabulary line")

        song_info = record.split(',')
        MSDID = song_info[0]
        # song_info[1] is skipped; the remaining fields are index:count pairs.
        song_dict = {}
        for entry in song_info[2:]:
            word_idx, word_count = entry.split(':')
            song_dict[int(word_idx)] = int(word_count)

        # Expand the counts back into a flat list of repeated words.
        song = [
            words[word_idx - 1]
            for word_idx, word_count in song_dict.items()
            for _ in range(word_count)
        ]
        if REMOVE_STOPWORDS:
            song = [w for w in song if w not in remove_these]
        all_songs_dict[str(MSDID)] = ' '.join(song)

Wall time: 1min 6s


In [8]:
# Report how many songs were parsed, then keep their ids as a list for later lookups.
song_msd_ids = list(all_songs_dict)
print(len(all_songs_dict))

210519


In [9]:
# Preview only the first few songs: echoing the whole dict would try to render
# every parsed song in the cell output.
{msd_id: all_songs_dict[msd_id] for msd_id in song_msd_ids[:3]}

{'TRAAAAV128F421A322': 'like like de got would seem someon understand pass river met piec damn worth flesh grace poor poor somehow ignor passion tide season seed resist order order piti fashion grant captur captur ici soil patienc social social highest highest slice leaf lifeless arrang wilder shark devast element',
 'TRAAABD128F429CF47': 'know know know know know time time time la la la la la la la get get got got got never never feel feel want want would would day day day day away away yeah yeah yeah yeah heart heart heart heart heart could could need need need look look thing thing think live onli onli tri keep keep keep dream dream wanna wanna wanna find find find hear believ everyth everyth someth someth someth someth insid insid chang chang soul soul soul soul soul soul soul new start start pleas pleas true deep deep deep deep beat fade fade wast wast wast trust alreadi alreadi style style asleep wors wors goal goal',
 'TRAAAED128E0783FAB': 'love love love love love love love lov

In [10]:
# Spot check: draw one track id at random (unseeded, so it differs between runs).
random_msd_id = np.random.choice(song_msd_ids)
print(random_msd_id)

TRDKOGG128F422702F


In [11]:
# Two-column frame: one row per song with its track id and its lyric text.
d = dict(
    MSDID=list(all_songs_dict),
    cleaned_text=list(all_songs_dict.values()),
)
msdid_df = pd.DataFrame(d)
print(msdid_df.shape)
msdid_df.head()

(210519, 2)


Unnamed: 0,MSDID,cleaned_text
0,TRAAAAV128F421A322,like like de got would seem someon understand ...
1,TRAAABD128F429CF47,know know know know know time time time la la ...
2,TRAAAED128E0783FAB,love love love love love love love love love l...
3,TRAAAEF128F4273421,know got got got feel let would would would ey...
4,TRAAAEW128F42930C0,like take would wo someth stay burn burn burn ...


In [13]:
%%time

# Build `all_song_meta_dict`: track id -> {'artist': ..., 'title': ...}, taken
# from the first three '<SEP>'-delimited fields of each record in the matches
# file.
#
# Header lines are recognised by their leading '#' rather than by a fixed line
# count, and the file is streamed instead of being loaded with readlines().
all_song_meta_dict = dict()
with open('mxm_779k_matches.txt', 'r', encoding="utf-8") as f:
    for raw_line in f:
        if raw_line.startswith('#') or not raw_line.strip():
            continue
        fields = raw_line.split('<SEP>')
        MSDID, artist, title = fields[0], fields[1], fields[2]
        all_song_meta_dict[str(MSDID)] = {'artist': artist, 'title': title}

Wall time: 3.01 s


In [14]:
# Number of tracks for which artist/title metadata was read.
len(all_song_meta_dict)

779056

In [15]:

# Look up each song's metadata record once, in the row order of msdid_df,
# then spread the records into artist/title columns.
meta_records = [all_song_meta_dict[msd_id] for msd_id in msdid_df['MSDID']]
d = {
    'MSDID': msdid_df['MSDID'],
    'artist': [record['artist'] for record in meta_records],
    'title': [record['title'] for record in meta_records],
}
meta_df = pd.DataFrame(d)
print(meta_df.shape)
meta_df.head()

(210519, 3)


Unnamed: 0,MSDID,artist,title
0,TRAAAAV128F421A322,Western Addiction,A Poor Recipe For Civic Cohesion
1,TRAAABD128F429CF47,The Box Tops,Soul Deep
2,TRAAAED128E0783FAB,Jamie Cullum,It's About Time
3,TRAAAEF128F4273421,Adam Ant,Something Girls
4,TRAAAEW128F42930C0,Broken Spindles,Burn My Body (Album Version)


In [16]:
# Attach artist/title to the lyric rows; a left join keeps every lyric row.
full_df_ = msdid_df.merge(meta_df, how='left', on='MSDID')
print(full_df_.shape)
full_df_.head()

(210519, 4)


Unnamed: 0,MSDID,cleaned_text,artist,title
0,TRAAAAV128F421A322,like like de got would seem someon understand ...,Western Addiction,A Poor Recipe For Civic Cohesion
1,TRAAABD128F429CF47,know know know know know time time time la la ...,The Box Tops,Soul Deep
2,TRAAAED128E0783FAB,love love love love love love love love love l...,Jamie Cullum,It's About Time
3,TRAAAEF128F4273421,know got got got feel let would would would ey...,Adam Ant,Something Girls
4,TRAAAEW128F42930C0,like take would wo someth stay burn burn burn ...,Broken Spindles,Burn My Body (Album Version)


In [17]:
# Work on an independent (deep) copy so the merged frame itself stays untouched.
full_df = full_df_.copy(deep=True)
print(full_df.shape)
full_df.head(5)

(210519, 4)


Unnamed: 0,MSDID,cleaned_text,artist,title
0,TRAAAAV128F421A322,like like de got would seem someon understand ...,Western Addiction,A Poor Recipe For Civic Cohesion
1,TRAAABD128F429CF47,know know know know know time time time la la ...,The Box Tops,Soul Deep
2,TRAAAED128E0783FAB,love love love love love love love love love l...,Jamie Cullum,It's About Time
3,TRAAAEF128F4273421,know got got got feel let would would would ey...,Adam Ant,Something Girls
4,TRAAAEW128F42930C0,like take would wo someth stay burn burn burn ...,Broken Spindles,Burn My Body (Album Version)


## Training the Model and Getting Embeddings

In [18]:
%%time

# get training data
data = full_df['cleaned_text'].tolist()
train_corpus = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in list(enumerate(data))]
# build model
model = Doc2Vec(vector_size=50, min_count=1, epochs=10, dm=0)
model.build_vocab(train_corpus)
# train model
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 9min 11s


In [19]:
# Persist the trained Doc2Vec model to disk under this file name.
model.save('trained_model_example')

In [20]:
# Restore the model from the file written by the save cell.
# NOTE(review): gensim's loader is reportedly pickle-based -- only load model
# files from a trusted source; confirm against the gensim documentation.
model = Doc2Vec.load('trained_model_example')

In [21]:
# One document vector per song, fetched by the stringified row position that
# was used as the tag when the training corpus was built.
doc_vectors = [model.docvecs[str(row_idx)] for row_idx in range(len(full_df))]
emb_df = pd.DataFrame(doc_vectors)

# Feature frame: the song columns followed by the embedding columns, matched
# on the row index (left join, so every row of full_df is kept).
fe_df = full_df.join(emb_df)

In [22]:
# Preview the first rows of the combined frame (song columns plus embedding columns).
fe_df.head()

Unnamed: 0,MSDID,cleaned_text,artist,title,0,1,2,3,4,5,...,40,41,42,43,44,45,46,47,48,49
0,TRAAAAV128F421A322,like like de got would seem someon understand ...,Western Addiction,A Poor Recipe For Civic Cohesion,0.186725,0.401601,0.128605,-0.084432,0.131973,0.062342,...,-0.38281,0.415129,-0.129113,0.17724,-0.36,0.207868,-0.355893,-0.339105,0.023608,-0.285683
1,TRAAABD128F429CF47,know know know know know time time time la la ...,The Box Tops,Soul Deep,0.353483,-0.092718,0.214373,-0.461915,-0.006981,0.44381,...,0.017024,-0.066364,0.280533,0.118458,-0.140707,-0.212626,-0.143725,-0.000992,-0.073097,0.025022
2,TRAAAED128E0783FAB,love love love love love love love love love l...,Jamie Cullum,It's About Time,0.588866,0.387003,-0.141386,0.033603,0.021757,0.234067,...,0.34397,0.117033,0.095,-0.142799,-0.067972,-0.266166,-0.010073,0.145012,-0.117268,-0.111525
3,TRAAAEF128F4273421,know got got got feel let would would would ey...,Adam Ant,Something Girls,0.580624,0.299594,0.332975,0.173341,-0.383277,-0.042725,...,0.625317,0.497023,0.299745,0.093928,0.306255,0.276892,-0.055125,0.399799,0.229794,-0.271073
4,TRAAAEW128F42930C0,like take would wo someth stay burn burn burn ...,Broken Spindles,Burn My Body (Album Version),0.252244,0.281475,0.122138,-0.271872,-0.46522,0.122065,...,-0.4206,0.132696,0.469236,-0.369253,0.051139,0.009011,0.067323,-0.163176,-0.073449,-0.46025


## Visualizing with t-SNE

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline

#### Set up a pipeline

In [None]:
# Standardise the embedding columns, then project them to `n` t-SNE dimensions.
n = 3
scaling_step = ('scaling', StandardScaler())
tsne_step = ('tsne', TSNE(n_components=n, perplexity=30, random_state=42))
pipeline = Pipeline(steps=[scaling_step, tsne_step])

song_embedding = pipeline.fit_transform(emb_df)

# Wrap the projection in a frame and attach title/artist for labelling.
tsne_df = pd.DataFrame(song_embedding, columns=list(range(n))).assign(
    title=full_df['title'],
    artist=full_df['artist'],
)

In [None]:
# Check the size of the projected frame and preview its first rows.
print(tsne_df.shape)
tsne_df.head()

In [None]:
# Artists whose songs are kept for the plots below.
my_artists = [
    'Blink-182',
    'Tame Impala',
    'Snoop Dogg',
    'The Strokes',
    'Eminem',
    'Red Hot Chili Peppers',
    'The Who',
    'Linkin Park',
    'Avenged Sevenfold',
    'Eric Clapton',
]

my_artists

In [None]:
# The 30 artists with the most songs in the data.
tsne_df['artist'].value_counts().head(30)

In [None]:
# Rows belonging to the selected artists, renumbered from 0.
selected_artist_rows = fe_df['artist'].isin(my_artists)
plot_df = fe_df.loc[selected_artist_rows].reset_index(drop=True)

In [None]:
# Embedding rows for the selected artists only (original row labels are kept).
selected_artist_mask = fe_df['artist'].isin(my_artists)
emb_df_ = emb_df[selected_artist_mask]

In [None]:
# Project the selected artists' embeddings to three t-SNE dimensions.
# `tsne_df_` was previously used here without ever being created, so this cell
# raised NameError on a fresh kernel; it is now built from `emb_df_`.
# A fresh scaler + t-SNE pipeline (same settings as the full-data pipeline) is
# fitted on the subset only, so this cell does not depend on the expensive
# full-data projection having been run.
subset_pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('tsne', TSNE(n_components=3, perplexity=30, random_state=42)),
])
subset_embedding = subset_pipeline.fit_transform(emb_df_)

# The new frame gets a fresh 0..k-1 index, which lines up with `plot_df`
# (its index was reset after filtering), so title/artist attach row by row.
tsne_df_ = pd.DataFrame(data=subset_embedding, columns=list(range(3)))
tsne_df_['title'] = plot_df['title']
tsne_df_['artist'] = plot_df['artist']
tsne_df_.columns = ['t-SNE dimension 0','t-SNE dimension 1','t-SNE dimension 2','title','artist']
print(tsne_df_.shape)
tsne_df_.head()

In [None]:
import plotly.express as px

# Draw one 2-D scatter per pair of t-SNE axes, coloured by artist.
axis_pairs = [
    ('t-SNE dimension 0', 't-SNE dimension 1'),
    ('t-SNE dimension 0', 't-SNE dimension 2'),
    ('t-SNE dimension 1', 't-SNE dimension 2'),
]
title_layout = {
    'text': "Song Lyric Embeddings",
    'y': 0.95,
    'x': 0.19,
    'xanchor': 'center',
    'yanchor': 'top',
}

for x_axis, y_axis in axis_pairs:
    fig = px.scatter(
        tsne_df_,
        x=x_axis,
        y=y_axis,
        hover_data=['artist', 'title'],
        color='artist',
        opacity=.8,
        template='simple_white',
    )
    # Both update calls modify the figure in place.
    fig.update_traces(marker=dict(size=15))
    fig.update_layout(title=title_layout)
    fig.show()

In [None]:
# Single 3-D scatter over all three t-SNE axes, coloured by artist.
x_axis, y_axis, z_axis = (
    't-SNE dimension 0',
    't-SNE dimension 1',
    't-SNE dimension 2',
)
fig = px.scatter_3d(
    tsne_df_,
    x=x_axis,
    y=y_axis,
    z=z_axis,
    hover_data=['artist', 'title'],
    color='artist',
    template='simple_white',
)
fig.show()