## Semantic Song Search: Fine-Tune Base Model

In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

### Data Prep

In [2]:
sds = pd.read_csv("data/small_dataset.csv")

In [3]:
sds.dtypes

Unnamed: 0     int64
index          int64
title         object
tag           object
artist        object
year           int64
views          int64
features      object
lyrics        object
id             int64
dtype: object

In [4]:
sds['tag'].unique()

array(['rap', 'pop', 'rock', 'rb', 'country', 'misc'], dtype=object)

#### Remove rap because we don't like the lyrics
- Too explicit / NSFW / NSFS
- Many similar lines, less cohesive narratives/plots

##### Concern: Some rap songs are tagged as pop

In [5]:
# keep only the rows whose tag is something other than 'rap'
not_rap = sds['tag'] != 'rap'
sds = sds[not_rap]
sds.shape

(614, 10)

In [6]:
# rap song tagged as pop
sds.iloc[0]

Unnamed: 0                                                  424
index                                                       516
title                                                 Heartless
tag                                                         pop
artist                                               Kanye West
year                                                       2008
views                                                   1175109
features                                                     {}
lyrics        \nIn the night, I hear 'em talk\nThe coldest s...
id                                                          526
Name: 36, dtype: object

In [7]:
# lyrics column of every row still in `sds` (one entry per song)
lyrics = sds['lyrics']

### Remove duplicate lyrics lines
- Remove duplicate lines; otherwise the model may learn to match song halves simply because the same lyrics (e.g., a chorus) appear in both halves of a song
- Define function below to dedupe and preserve order

#### Concern: There can still be quite similar lines scattered through songs

In [8]:
# test case
test = sds.iloc[0]['lyrics']
sds.iloc[0]

Unnamed: 0                                                  424
index                                                       516
title                                                 Heartless
tag                                                         pop
artist                                               Kanye West
year                                                       2008
views                                                   1175109
features                                                     {}
lyrics        \nIn the night, I hear 'em talk\nThe coldest s...
id                                                          526
Name: 36, dtype: object

In [9]:
# split once, then compare the total line count with the distinct line count
test_lines = test.split('\n')
print('line', len(test_lines))
print('unique lines', len(set(test_lines)))

line 74
unique lines 46


In [10]:
set(test.split('\n'))

{'',
 "'Cause I already know how this thing go",
 "And now you wanna get me back and you gon' show me",
 'And on and, and on and on and on',
 "And we just gon' be enemies",
 "And you can't make it right",
 "And you just gon' keep hatin' me",
 "Ayo, I did some things but that's the old me",
 "Ayo, I know of some things that you ain't told me",
 "Baby, let's just knock it off",
 "But in the end, it's still so lonely",
 'Cold as the winter wind when it breeze, yo?',
 'He lost his soul to a woman so heartless',
 "Homie, I don't know, she's hot and cold",
 'How could you be so',
 'How could you be so Dr. Evil?',
 'How could you be so heartless?',
 'How could you be so heartless? (How, how?)',
 'I could just leave it wrong',
 "I decided we wasn't gon' speak so",
 "I know you can't believe",
 'I mean, after all the things that we been through',
 'I mean, after all the things we got into',
 "I won't stop, won't mess my groove up",
 "I'm gon' take off tonight into the night",
 "In the night, I 

In [11]:
def dedupe(seq):
    """Return the items of ``seq`` as a list with repeats dropped.

    The first occurrence of each item is kept and the original order of
    those first occurrences is preserved. Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

In [12]:
# deduped lines example
dedupe(test.split('\n'))

['',
 "In the night, I hear 'em talk",
 'The coldest story ever told',
 'Somewhere far along this road',
 'He lost his soul to a woman so heartless',
 'How could you be so heartless?',
 'Oh, how could you be so heartless?',
 'How could you be so',
 'Cold as the winter wind when it breeze, yo?',
 "Just remember that you talkin' to me though",
 "You need to watch the way you talkin' to me, yo",
 'I mean, after all the things that we been through',
 'I mean, after all the things we got into',
 "Ayo, I know of some things that you ain't told me",
 "Ayo, I did some things but that's the old me",
 "And now you wanna get me back and you gon' show me",
 "So you walk around like you don't know me",
 'You got a new friend, well, I got homies',
 "But in the end, it's still so lonely",
 'How could you be so Dr. Evil?',
 "You're bringin' out a side of me that I don't know",
 "I decided we wasn't gon' speak so",
 'Why we up 3 AM on the phone?',
 "Why do she be so mad at me fo'?",
 "Homie, I don't kn

In [13]:
# confirm dedupe line count
len(dedupe(test.split('\n'))) == len(set(test.split('\n')))

True

In [14]:
# rebuild the example song's text from its de-duplicated lines
test = '\n'.join(dedupe(test.split('\n')))
print(test)


In the night, I hear 'em talk
The coldest story ever told
Somewhere far along this road
He lost his soul to a woman so heartless
How could you be so heartless?
Oh, how could you be so heartless?
How could you be so
Cold as the winter wind when it breeze, yo?
Just remember that you talkin' to me though
You need to watch the way you talkin' to me, yo
I mean, after all the things that we been through
I mean, after all the things we got into
Ayo, I know of some things that you ain't told me
Ayo, I did some things but that's the old me
And now you wanna get me back and you gon' show me
So you walk around like you don't know me
You got a new friend, well, I got homies
But in the end, it's still so lonely
How could you be so Dr. Evil?
You're bringin' out a side of me that I don't know
I decided we wasn't gon' speak so
Why we up 3 AM on the phone?
Why do she be so mad at me fo'?
Homie, I don't know, she's hot and cold
I won't stop, won't mess my groove up
'Cause I already know how this thing 

In [15]:
# process dataset: drop repeated lines within each song, keeping song order
lyrics_deduped = [
    '\n'.join(dedupe(song_text.split('\n')))
    for song_text in lyrics
]

### Training

Reference: https://www.pinecone.io/learn/fine-tune-sentence-transformers-mnr/#fast-fine-tuning

In [16]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # progress bar

def split_song_halves(song):
    """Split a song's text into two halves on a line boundary.

    Slicing at ``len(song) // 2`` cuts at the character midpoint, which
    usually lands in the middle of a line (often mid-word), so both halves
    carry a broken fragment. Here the cut is placed between lines instead.

    Parameters
    ----------
    song : str
        Song lyrics with lines separated by '\n'.

    Returns
    -------
    tuple of (str, str)
        The first and second half of the song. If the song has too few
        lines for both halves to contain text, falls back to the
        character-midpoint split so a pair is still produced.
    """
    lines = song.split('\n')
    mid = len(lines) // 2
    first = '\n'.join(lines[:mid])
    second = '\n'.join(lines[mid:])
    if not first.strip() or not second.strip():
        # too few lines to split cleanly: keep the old character-level cut
        cut = len(song) // 2
        first, second = song[:cut], song[cut:]
    return first, second


train_samples = []
for song in tqdm(lyrics_deduped):
    # the two halves of one song form a positive pair for training
    half_1, half_2 = split_song_halves(song)
    train_samples.append(InputExample(
        texts=[half_1, half_2]
    ))

  0%|          | 0/614 [00:00<?, ?it/s]

In [17]:
len(train_samples)

614

In [18]:
from sentence_transformers import datasets

batch_size = 32

# NoDuplicatesDataLoader does not drop samples from `train_samples`; per the
# sentence-transformers docs it batches so that no text repeats within a
# single batch, which matters for losses that use in-batch negatives.
loader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=batch_size)
     

In [19]:
len(train_samples)

614

In [20]:
from sentence_transformers import models, SentenceTransformer

# Alternative (disabled): assemble the encoder from a plain
# 'bert-base-uncased' transformer plus a mean-pooling layer instead of
# starting from an already-trained sentence-transformer.
# bert = models.Transformer('bert-base-uncased')
# pooler = models.Pooling(
#    bert.get_word_embedding_dimension(),
#    pooling_mode_mean_tokens=True
#)
# model = SentenceTransformer(modules=[bert, pooler])

# Base model to fine-tune: the 'all-MiniLM-L12-v2' checkpoint.
# NOTE(review): the printed config reports max_seq_length=128, so song halves
# longer than that are presumably truncated when encoded — confirm.
model = SentenceTransformer('all-MiniLM-L12-v2')

# show the module stack of the loaded model
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [21]:
from sentence_transformers import losses

# Multiple Negatives Ranking (MNR) loss attached to `model`; per the
# sentence-transformers docs it takes positive pairs and uses the other
# pairs in the same batch as negatives.
loss = losses.MultipleNegativesRankingLoss(model)

In [22]:

# one pass over the loader; warm-up covers the first 10% of all training steps
epochs = 1
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# fine-tune with the (loader, loss) objective and save to output_path
model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./finetune_test_mnr',
    show_progress_bar=False,
)

