In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
from sentence_transformers import SentenceTransformer

In [2]:
model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')


In [3]:
def get_embeddings():
    df = pd.read_csv('lyrics.csv')
    df = df.assign(embeddings=df['Lyric'].apply(lambda x: model.encode(str(x))))
#     print(df)
    return df

In [4]:
def closest_lyrics(inp):
    data = get_embeddings()
    inp_vector = model.encode(inp)
    s = data['embeddings'].apply(lambda x: 1 - spatial.distance.cosine(x, inp_vector))
    data = data.assign(similarity=s)
    return (data.sort_values('similarity', ascending=False))



if __name__ == '__main__':

    print(closest_lyrics('thinking about you'))

       Artist                         Title  \
137  Dua Lipa                   Good Things   
155  Dua Lipa            Shine On Sad World   
228  Dua Lipa            Say My Name (Demo)   
91   Dua Lipa                  Roses & Fire   
226  Dua Lipa     Throw Away The Key (Lion)   
..        ...                           ...   
41   Dua Lipa           Cocoa Butter Kisses   
225  Dua Lipa  New Rules (Jake Minor Remix)   
67   Dua Lipa   Rollin/Did You See (Mashup)   
73   Dua Lipa    New Rules for COVID Dating   
27   Dua Lipa                     Bang Bang   

                                                 Lyric  \
137  lyrics to snippet    you were for me right now...   
155                                                NaN   
228                            unleakedunreleased song   
91   snippet oh in my heart i know we might be shar...   
226  lyrics from snippet  throw away the key ooh i ...   
..                                                 ...   
41   cigarettes on cigarettes

In [5]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

# initialize dictionary that will contain tokenized sentences
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    # tokenize sentence and append to dictionary lists
    new_tokens = tokenizer.encode_plus(sentence,
                                       max_length=128,
                                       truncation=True,
                                       padding='max_length',
                                       return_tensors='pt'
                                      )
    
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# reformat list of tensors into single tensor
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

In [7]:
tokens['input_ids'].shape

torch.Size([6, 128])

In [8]:
outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [9]:
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-6.9229e-02,  6.2300e-01,  3.5371e-02,  ...,  8.0334e-01,
           1.6314e+00,  3.2812e-01],
         [ 3.6730e-02,  6.8419e-01,  1.9460e-01,  ...,  8.4759e-02,
           1.4747e+00, -3.0080e-01],
         [-1.2142e-02,  6.5431e-01, -7.2717e-02,  ..., -3.2600e-02,
           1.7717e+00, -6.8121e-01],
         ...,
         [ 1.9532e-01,  1.1085e+00,  3.3905e-01,  ...,  1.2826e+00,
           1.0114e+00, -7.2754e-02],
         [ 9.0217e-02,  1.0288e+00,  3.2973e-01,  ...,  1.2940e+00,
           9.8650e-01, -1.1125e-01],
         [ 1.2404e-01,  9.7365e-01,  3.9329e-01,  ...,  1.1359e+00,
           8.7685e-01, -1.0435e-01]],

        [[-3.2124e-01,  8.2512e-01,  1.0554e+00,  ..., -1.8555e-01,
           1.5169e-01,  3.9366e-01],
         [-7.1457e-01,  1.0297e+00,  1.1217e+00,  ...,  3.3118e-02,
           2.3820e-01, -1.5632e-01],
         [-2.3522e-01,  1.1353e+00,  8.5941e-01,  ..., -4.3096e-01,
          -2.7242e-02, -2.9677e-01],
         ...,
         [-5.4000e-01,  3

In [10]:
embeddings.shape

torch.Size([6, 128, 768])

In [11]:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([6, 128])

In [12]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([6, 128, 768])

In [13]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([6, 128, 768])

In [14]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([6, 768])

In [15]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([6, 768])

In [16]:
mean_pooled = summed / summed_mask
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.0132,  0.9773,  1.4516,  ..., -0.8462, -1.4004, -0.4118],
        [-0.2019,  0.0597,  0.8603,  ..., -0.0100,  0.8431, -0.0841],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# convert from PyTorch tensor to numpy array
mean_pooled = mean_pooled.detach().numpy()

# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

array([[0.3308891 , 0.72192585, 0.17475507, 0.4470966 , 0.5548363 ]],
      dtype=float32)