In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive so its files are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# check system specs

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print('Connected to a GPU')

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9

if ram_gb < 20:
  print('Not using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))
else:
  print('Using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))

Connected to a GPU
Using a high-RAM runtime: 89.6 gigabytes of available RAM


In [None]:
# NOTE: a shortcut to the shared `ml2_project` folder must exist in the root of
# your Google Drive (MyDrive). The relative paths used in this notebook
# (data/..., ./models/...) resolve against this working directory.
%cd /content/gdrive/MyDrive/ml2_project

/content/gdrive/.shortcut-targets-by-id/1WHLBzPq6pt_F7mh3d3goIQl4MwYlTfIh/ml2_project


In [None]:
# load the cleaned dataset and keep every row whose tag is not 'rap'
df = (
    pd.read_csv('data/clean_dataset.csv')
    .loc[lambda frame: frame['tag'] != 'rap']
)
# the lyrics column is the only input used for training below
lyrics = df['lyrics']

In [None]:
lyrics.shape

(95072,)

In [None]:
# remove duplicates from list and preserve order
def dedupe(seq):
    """Return the items of ``seq`` as a list with repeated items dropped.

    Only the first occurrence of each item is kept, so the surviving items
    appear in the same relative order as in ``seq``. Items must be hashable.
    """
    # dict keys are unique and remember insertion order, so building a dict
    # keyed by the items keeps exactly the first occurrence of each one
    return list(dict.fromkeys(seq))

In [None]:
# process dataset

# for each song: split into lines, drop repeated lines (first occurrence is
# kept), then join the remaining lines back into a single string
lyrics_deduped = [
    '\n'.join(dedupe(str(song).split('\n')))
    for song in lyrics
]

### Final Training
Train on the entire dataset.

In [None]:
# Install sentence_transformers into the runtime (no version is pinned here).
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# prep training data

from sentence_transformers import InputExample

# each song yields one positive pair: the text before and after its midpoint,
# where the midpoint is measured in characters
train_samples = []
for text in map(str, lyrics_deduped):
    midpoint = len(text) // 2
    pair = [text[:midpoint], text[midpoint:]]
    train_samples.append(InputExample(texts=pair))

In [None]:
from sentence_transformers import datasets

# number of training pairs per batch
batch_size = 32

# removes duplicate pairings
# NOTE(review): presumably this loader keeps the same text from appearing more
# than once within a single batch — confirm against the sentence_transformers
# documentation for NoDuplicatesDataLoader.
loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size)

In [None]:
from sentence_transformers import models, SentenceTransformer
import torch

# One-time setup, kept for reference: download the base checkpoint and save a
# local copy at the path that is loaded below.
# model = SentenceTransformer('all-MiniLM-L12-v2')
# model.save('./models/all-MiniLM-L12-v2')
model = SentenceTransformer('./models/all-MiniLM-L12-v2')
# per-input token limit (NOTE(review): longer inputs are presumably truncated —
# confirm against the sentence_transformers docs)
model.max_seq_length = 256

# use the GPU when one is available; fall back to the CPU instead of crashing
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    print('WARNING: CUDA is not available, the model will run on the CPU')
model = model.to(device)

model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
# specify loss

from sentence_transformers import losses

# mnr loss
# The training samples are unlabeled positive pairs only.
# NOTE(review): presumably MultipleNegativesRankingLoss uses the other pairs in
# a batch as negatives — confirm against the sentence_transformers docs.
loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
# model name for export/save
# (passed as output_path to model.fit in the training cell)
# NOTE(review): the "_256" suffix presumably mirrors max_seq_length = 256 — confirm.
model_export_name = './models/finetune_mnr_256'

In [None]:
# train and export model

epochs = 1

# warm-up covers the first 10% of all training steps
# (batches per epoch times number of epochs)
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# single training objective: the pair loader together with the loss defined
# above; output_path is where the trained model is written
model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path=model_export_name,
    show_progress_bar=True,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2971 [00:00<?, ?it/s]

In [None]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)