## Summary

Language models such as BERT are trained on Wikipedia, along with other sources, so the word embeddings they generate in their high-dimensional space can bring out relationships that we are not able to capture using conventional techniques.

We follow these basic steps:

1. Load the Parquet file into a Dask DataFrame. Currently only the first partition (67503 songs) is processed.
2. Use a concatenation of "Artist Name, Track Name, Album Name" as the input to the language model (BERT).
3. Use BERT base uncased as the language model, via the Transformers API from Hugging Face.
4. Generate a 768-dimensional embedding for each song.
5. Use Facebook AI's Faiss (a library for efficient similarity search and clustering of dense vectors) to find similar vectors.



## Install required packages

In [3]:
# install transformers to get a convenient pytorch API interface to various 
# language models such as BERT, GPT
!pip install transformers

# install fastparquet to access the parquet files generated earlier
!pip install fastparquet



## Load parquet files, and do basic checks

In [2]:
# Mount Google Drive at '/content/drive' so that the parquet files generated
# from the spotify CSV dataset can be read from it.
# Mounting is interactive: it prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# copy data from google drive to local folder
# this is a time consuming step
# !cp -a '/content/drive/My Drive/109A/project/data/' '/content/Spotify.parquet'
# !cp -a '/content/drive/My Drive/109A/project/Spotify_Unique_Songs/' '/content/Spotify_Unique_Songs.parquet'

In [1]:
# copy data to a dask dataframe
from dask import dataframe as dd
from dask.dataframe import read_csv, read_parquet, to_csv, to_parquet

# Input: the parquet directory of unique songs on the mounted Google Drive.
# Alternative inputs used during development:
#   '/content/drive/My Drive/109A/project/data/'  (full dataset on Drive)
#   '/content/Spotify.parquet/'                   (local copy of the full dataset)
#   '/content/Spotify_Unique_Songs.parquet/'      (local copy of the unique songs)
path = '/content/drive/My Drive/109A/project/Spotify_Unique_Songs/'

# Open the directory as a Dask DataFrame using the fastparquet engine.
ddf = dd.read_parquet(path, engine='fastparquet')

# Display the column names to verify the schema.
ddf.columns

Index(['artist_name', 'track_name', 'album_name', 'track_uri', 'artist_uri',
       'album_uri'],
      dtype='object')

In [0]:
# Count the rows per artist over the whole dataset, then keep the 100 artists
# with the highest counts (sorted in descending order).
res = ddf.groupby('artist_name')['artist_name'].count().compute()
top100 = res.sort_values(ascending=False).iloc[:100]

In [0]:
# top100 is sorted in descending order, so its first index label is the
# artist with the most rows.
top_artist_name = top100.index[0]
print("top artist in the entire dataset is {0}".format(top_artist_name))

top artist in the entire dataset is Drake


In [0]:
# Pull every row of the most frequent artist into an in-memory DataFrame.
top = top100.index[0]
is_top_artist = ddf['artist_name'] == top
ddf_top = ddf[is_top_artist].compute()

# Report how many distinct album URIs that artist has.
album_count = len(ddf_top['album_uri'].unique())
print("No. 1 artist {0} has {1} albums".format(top, album_count))

No. 1 artist Drake has 46 albums


In [0]:
# no of unique artists
# res_pCount = ddf.groupby('artist_uri').pid.nunique().compute()
# print("no of unique artists in the dataset are {0}.".format(res_pCount))

In [0]:
# we will work only on the first partition for now
# the first partition contains 67603 entries
# ddf0 = ddf_u.get_partition(0)
# ddf0 = ddf.get_partition(0)
# ddf0.describe().compute()

In [0]:
# !cp -a ./Spotify_Unique_Songs.parquet/. './drive/My Drive/109A/project/Spotify_Unique_Songs/'

## Create word embeddings using BERT Language Model

In [1]:
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook # for progress bar
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import numpy as np
from dask import dataframe as dd
from dask.dataframe import read_csv, read_parquet, to_csv, to_parquet

# Run on the GPU when one is available; fall back to the CPU so that this
# cell does not fail on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Models are now set in evaluation mode by default when instantiated
# with the from_pretrained() method
model = BertModel.from_pretrained('bert-base-uncased')
# send the model to the selected device (GPU when available)
model.to(device);

# Parquet directory of unique songs on the mounted Google Drive.
path = '/content/drive/My Drive/109A/project/Spotify_Unique_Songs/'
# path = '/content/Spotify_Unique_Songs.parquet/'
ddf_u = dd.read_parquet(path=path, engine='fastparquet')

# Accumulator for the per-song embeddings.
track_embeddings = []

Using TensorFlow backend.


In [0]:
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook # for progress bar
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import numpy as np
from dask import dataframe as dd
from dask.dataframe import read_csv, read_parquet, to_csv, to_parquet
import datetime

# Run on the GPU when one is available; fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Models are now set in evaluation mode by default when instantiated
# with the from_pretrained() method
model = BertModel.from_pretrained('bert-base-uncased')
# send the model to the selected device (GPU when available)
model.to(device);

path = '/content/drive/My Drive/109A/project/Spotify_Unique_Songs/'
# path = '/content/Spotify_Unique_Songs.parquet/'
ddf_u = dd.read_parquet(path=path, engine='fastparquet')

# One mean-pooled embedding (a list of floats) per song, in song_data row order.
track_embeddings = []

# same code can be used for all partitions
# however, memory constraints prevent us from doing so
# NOTE: song_data only holds the partition processed last, while
# track_embeddings accumulates over all of them; keep that in mind before
# widening the range below.

# for p in range(ddf_u.npartitions):
for p in range(1):
  print("processing parition[{0}/{1}] ".format(p,ddf_u.npartitions), end="")
  ddf = ddf_u.get_partition(p)
  # materialise the partition in memory and drop rows with missing values
  song_data = ddf.compute().dropna()

  ######
  # step 1: tokenize
  ######
  print("[1 ", end="")
  # step 1.1: create string of "artist name - track name - album name"
  # step 1.2: enclose with [CLS] & [SEP] token, as required by BERT
  #           (done by add_special_tokens=True)
  # step 1.3: encode the text into the encoding used by BERT

  # do this for all songs; iterating over the three columns at once avoids a
  # positional .iloc lookup per cell
  song_parts = zip(song_data['artist_name'],
                   song_data['track_name'],
                   song_data['album_name'])
  indexed_tokens = [tokenizer.encode(" - ".join(parts),
                                     add_special_tokens=True)
                    for parts in song_parts]

  ######
  # step 2: remove false words
  ######
  print(" 2 ", end="")
  # false words are words in the album names that do not have significance
  # from perspective of song suggestions, but tends to throw BERT off

  falseWords = 'Deluxe Edition Remastered'
  # Encode WITHOUT special tokens: otherwise the ids of [CLS] and [SEP] are
  # part of the removal list and get stripped from every song.
  falseWords_tokens = set(tokenizer.encode(falseWords,
                                           add_special_tokens=False))
  # NOTE(review): removal works on token ids, so a sub-word piece of a false
  # word that also occurs inside other words is removed there too - confirm
  # that this is acceptable.
  indexed_tokens = [[token for token in song
                     if token not in falseWords_tokens]
                    for song in indexed_tokens]

  ######
  # step 3: padding
  ######
  print(" 3 ", end="")
  # we need to pad token strings generated earlier so that each has the same length
  MAX_LEN = max((len(text) for text in indexed_tokens), default=0)
  # one extra position keeps the width non-zero even if every sequence is empty
  MAX_LEN += 1

  indexed_tokens_p = pad_sequences(indexed_tokens, maxlen=MAX_LEN,
                            dtype ="long", truncating="post",padding ="post")

  # attention mask: 1 for real tokens, 0 for padding. It is built from the
  # sequence lengths and padded exactly like the token ids.
  attention_masks_p = pad_sequences([[1] * len(text) for text in indexed_tokens],
                            maxlen=MAX_LEN,
                            dtype ="long", truncating="post",padding ="post")

  # Convert inputs to PyTorch tensors
  tokens_tensors = torch.tensor(indexed_tokens_p)
  masks_tensors = torch.tensor(attention_masks_p)

  ######
  # step 4: prepare dataset and dataloader
  ######
  print(" 4 ", end="")
  batch_size = 1024
  tds = TensorDataset(tokens_tensors, masks_tensors)
  ss = SequentialSampler(tds)
  dataloader = DataLoader(tds, sampler=ss, batch_size=batch_size)

  ######
  # step 5: generate word embeddings
  ######
  print(" 5 ]")
  for batch_tokens, batch_mask in tqdm_notebook(dataloader):
    batch_tokens = batch_tokens.to(device)
    batch_mask = batch_mask.to(device)
    with torch.no_grad():
      # the mask keeps the padding positions out of the attention
      outputs = model(batch_tokens, attention_mask=batch_mask)

      # The last hidden-state is the first element of the output tuple
      hidden = outputs[0]

      # mean over the real tokens only, so that the amount of padding in a
      # batch does not change a song's embedding
      weights = batch_mask.unsqueeze(-1).type_as(hidden)
      token_counts = weights.sum(dim=1).clamp(min=1)  # avoid division by zero
      track_embedding = ((hidden * weights).sum(dim=1) / token_counts).to('cpu')

    # release device memory before the next batch
    del batch_tokens, batch_mask, outputs, hidden, weights, token_counts
    track_embeddings.extend(track_embedding.tolist())

In [0]:
# save results in a numpy array on a google drive location

######
# step 6: create numpy array
######

# Release the array of a previous run (if any) before building the new one.
# Unlike a bare `del`, this does not raise NameError on a fresh kernel.
globals().pop('track_vectors', None)

# One row per song: the song_data columns followed by the embedding values.
track_vectors = np.column_stack((song_data, track_embeddings))
# np.save appends the '.npy' extension to the given file name.
np.save('/content/drive/My Drive/109A/project/data/track_vectors-part00', \
        track_vectors)

In [2]:
!!nvidia-smi

['Tue Dec 10 10:38:30 2019       ',
 '+-----------------------------------------------------------------------------+',
 '| NVIDIA-SMI 440.36       Driver Version: 418.67       CUDA Version: 10.1     |',
 '|-------------------------------+----------------------+----------------------+',
 '| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |',
 '| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |',
 '|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |',
 '| N/A   74C    P0    86W / 149W |      0MiB / 11441MiB |      0%      Default |',
 '+-------------------------------+----------------------+----------------------+',
 '                                                                               ',
 '+-----------------------------------------------------------------------------+',
 '| Processes:                                                       GPU Memory |',
 '|  GPU       PID   Type   Process name

## Create word embeddings using GPT-2 Language Model

In [4]:
import torch
# from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook # for progress bar
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import numpy as np
from dask import dataframe as dd
from dask.dataframe import read_csv, read_parquet, to_csv, to_parquet
import datetime

from transformers import GPT2Tokenizer, GPT2Model

# Run on the GPU when one is available; fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load tokenizer, model from pretrained model/vocabulary
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# send the model to the selected device (GPU when available)
model.to(device);

path = '/content/drive/My Drive/109A/project/Spotify_Unique_Songs/'
# path = '/content/Spotify_Unique_Songs.parquet/'
ddf_u = dd.read_parquet(path=path, engine='fastparquet')

# One mean-pooled embedding (a list of floats) per song, in song_data row order.
track_embeddings = []

# same code can be used for all partitions
# however, memory constraints prevent us from doing so
# NOTE: song_data only holds the partition processed last, while
# track_embeddings accumulates over all of them; keep that in mind before
# widening the range below.

# for p in range(ddf_u.npartitions):
for p in range(1):
  print("processing parition[{0}/{1}] ".format(p,ddf_u.npartitions), end="")
  ddf = ddf_u.get_partition(p)
  # materialise the partition in memory and drop rows with missing values
  song_data = ddf.compute().dropna()

  ######
  # step 1: tokenize
  ######
  print("[1 ", end="")
  # step 1.1: create string of "artist name - track name - album name"
  # step 1.2: encode the text into GPT-2 token ids; no special tokens are
  #           requested (add_special_tokens=False)

  # do this for all songs; iterating over the three columns at once avoids a
  # positional .iloc lookup per cell
  song_parts = zip(song_data['artist_name'],
                   song_data['track_name'],
                   song_data['album_name'])
  indexed_tokens = [tokenizer.encode(" - ".join(parts),
                                     add_special_tokens=False)
                    for parts in song_parts]

  ######
  # step 2: remove false words
  ######
  print(" 2 ", end="")
  # false words are words in the album names that do not have significance
  # from perspective of song suggestions, but tend to throw the model off

  falseWords = 'Deluxe Edition Remastered'
  # encoded with the same add_special_tokens setting as the songs
  falseWords_tokens = set(tokenizer.encode(falseWords,
                                           add_special_tokens=False))
  # NOTE(review): removal works on token ids. The ids of a false word may
  # depend on its surrounding whitespace, and a sub-word piece may be shared
  # with other words - confirm this removes what is intended.
  indexed_tokens = [[token for token in song
                     if token not in falseWords_tokens]
                    for song in indexed_tokens]

  ######
  # step 3: padding
  ######
  print(" 3 ", end="")
  # we need to pad token strings generated earlier so that each has the same length
  MAX_LEN = max((len(text) for text in indexed_tokens), default=0)
  # one extra position keeps the width non-zero even if every sequence is empty
  MAX_LEN += 1

  indexed_tokens_p = pad_sequences(indexed_tokens, maxlen=MAX_LEN,
                            dtype ="long", truncating="post",padding ="post")

  # attention mask: 1 for real tokens, 0 for padding. It is built from the
  # sequence lengths rather than from the ids, because no padding token is
  # configured here and the fill value 0 may coincide with a real token id.
  attention_masks_p = pad_sequences([[1] * len(text) for text in indexed_tokens],
                            maxlen=MAX_LEN,
                            dtype ="long", truncating="post",padding ="post")

  # Convert inputs to PyTorch tensors
  tokens_tensors = torch.tensor(indexed_tokens_p)
  masks_tensors = torch.tensor(attention_masks_p)

  ######
  # step 4: prepare dataset and dataloader
  ######
  print(" 4 ", end="")
  batch_size = 512
  tds = TensorDataset(tokens_tensors, masks_tensors)
  ss = SequentialSampler(tds)
  dataloader = DataLoader(tds, sampler=ss, batch_size=batch_size)

  ######
  # step 5: generate word embeddings
  ######
  print(" 5 ]")
  for batch_tokens, batch_mask in tqdm_notebook(dataloader):
    batch_tokens = batch_tokens.to(device)
    batch_mask = batch_mask.to(device)
    with torch.no_grad():
      outputs = model(batch_tokens, attention_mask=batch_mask)

      # The last hidden-state is the first element of the output tuple
      hidden = outputs[0]

      # mean over the real tokens only, so that the amount of padding in a
      # batch does not change a song's embedding
      weights = batch_mask.unsqueeze(-1).type_as(hidden)
      token_counts = weights.sum(dim=1).clamp(min=1)  # avoid division by zero
      track_embedding = ((hidden * weights).sum(dim=1) / token_counts).to('cpu')

    # release device memory before the next batch
    del batch_tokens, batch_mask, outputs, hidden, weights, token_counts
    track_embeddings.extend(track_embedding.tolist())

Using TensorFlow backend.


processing parition[0/20] [1  2  3  4  5 ]


HBox(children=(IntProgress(value=0, max=221), HTML(value='')))




In [0]:
# save results in a numpy array on a google drive location

######
# step 6: create numpy array
######

# Release the array of a previous run (if any) before building the new one.
# Unlike a bare `del`, this does not raise NameError on a fresh kernel.
globals().pop('track_vectors', None)

# One row per song: the song_data columns followed by the embedding values.
track_vectors = np.column_stack((song_data, track_embeddings))
# np.save appends the '.npy' extension to the given file name.
np.save('/content/drive/My Drive/109A/project/data/track_vectors-part00-GPT2', \
        track_vectors)