In [None]:
# check system specs

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print('Connected to a GPU')

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9

if ram_gb < 20:
  print('Not using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))
else:
  print('Using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))

Connected to a GPU
Not using a high-RAM runtime: 13.6 gigabytes of available RAM


In [None]:
import pandas as pd
import numpy as np

In [None]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 56.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 54.1 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 81.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 33.1 MB/s 
Building wheels for collected pa

In [None]:
# Mount Google Drive under /content/gdrive so files stored there can be
# accessed through the Colab filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Make the shared project folder the working directory so that the relative
# paths used later in this notebook ('./models/...', 'data/...') resolve.
# Note: this requires a shortcut to the shared project folder in the root
# directory of your Google Drive.
%cd /content/gdrive/MyDrive/ml2_project

/content/gdrive/.shortcut-targets-by-id/1WHLBzPq6pt_F7mh3d3goIQl4MwYlTfIh/ml2_project


In [None]:
from sentence_transformers import SentenceTransformer
import torch

# The pretrained base model was originally downloaded and saved locally with:
#   base_model = SentenceTransformer('all-MiniLM-L12-v2')
#   base_model.save('./all-MiniLM-L12-v2')
# NOTE(review): the saved copy is loaded from ./models/ below, so it was
# presumably moved there afterwards — confirm.

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on a CPU-only runtime instead of raising
# when the models are moved to a non-existent CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained base model, used as the comparison baseline.
base_model = SentenceTransformer('./models/all-MiniLM-L12-v2').to(device)

# Fine-tuned variants; judging by the directory names these differ in the
# maximum sequence length used (128 vs. 510) — TODO confirm against the
# training notebook.
ft_128_model = SentenceTransformer('./models/finetune_mnr_128seqlength').to(device)
ft_510_model = SentenceTransformer('./models/finetune_mnr_510seqlength').to(device)

### Validation Methodology
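Half of the validation songs are split into their two true halves and labelled with similarity 1 (positive pairs). For the other half, the first half of each song is paired with a second half taken from a randomly chosen song and labelled with similarity 0 (negative pairs).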

Reference: https://www.pinecone.io/learn/fine-tune-sentence-transformers-mnr/#compare-sentence-transformers

In [None]:
# Load the validation lyrics and keep only the lyrics column as a Series.
# NOTE(review): because `names=` is given, pandas treats the first line of the
# file as data, so this assumes the CSV has no header row — confirm.
lyrics_val = pd.read_csv('data/lyrics_val.csv', names=['index', 'lyrics'])['lyrics']

In [None]:
# Cut the validation set in two at its midpoint.
val_midpoint = len(lyrics_val) // 2

# First part: songs that will be split into their correct halves.
lyrics_val_pos = lyrics_val[:val_midpoint]

# Second part: songs that will be combined into random halves.
lyrics_val_neg = lyrics_val[val_midpoint:]

In [None]:
from sentence_transformers import InputExample

In [None]:
# Positive pairs: every song is cut at its character midpoint and its own two
# halves are paired with label 1.

val_samples_pos = []
for lyric_text in map(str, lyrics_val_pos):
    cut = len(lyric_text) // 2
    val_samples_pos.append(
        InputExample(texts=[lyric_text[:cut], lyric_text[cut:]], label=1)
    )

In [None]:
# prep negatives
#
# Each song in the negative split contributes its first half; the second half
# comes from a *different*, randomly chosen song of the same split, and the
# pair is labelled 0.
# NOTE: this sampling differs from earlier revisions of this cell, so scores
# computed with it are not comparable to scores from older runs.

import random

# Fixed seed so the sampled pairs (and therefore the evaluation score) are
# identical on every Restart & Run All. A private generator is used so the
# global `random` state is left untouched.
VAL_NEG_SEED = 42
val_neg_rng = random.Random(VAL_NEG_SEED)

n_val_neg = len(lyrics_val_neg)
if n_val_neg == 1:
    # With a single song there is no other song to pair it with.
    raise ValueError('need at least two songs to build a negative pair')

val_samples_neg = []
for position, song in enumerate(lyrics_val_neg):
    song = str(song)
    # Draw from the n-1 *other* positions: sample from range(n-1) and shift any
    # draw at or past the current position up by one. This keeps the choice
    # uniform while guaranteeing a song is never paired with itself (which
    # would yield a true pair mislabelled as 0).
    rand_index = val_neg_rng.randrange(n_val_neg - 1)
    if rand_index >= position:
        rand_index += 1
    other_song = str(lyrics_val_neg.iloc[rand_index])
    half_1 = song[:len(song)//2]
    # Cut the other song at its OWN midpoint; using the current song's length
    # could produce an empty string or an arbitrary tail instead of a half.
    half_2 = other_song[len(other_song)//2:]
    val_samples_neg.append(InputExample(
        texts=[half_1, half_2],
        label=0
    ))

In [None]:
# Full validation set: all positive pairs followed by all negative pairs.

val_samples = [*val_samples_pos, *val_samples_neg]

In [None]:
# Build the validation evaluator from the labelled pairs.
# Calling the evaluator with a model returns a single score; per the
# sentence-transformers documentation this is a correlation between the
# embedding similarity of each pair and its label — TODO confirm which
# correlation/similarity is reported for the installed version.

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# write_csv=False: only the returned score is wanted, no results file.
evaluator_val = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, write_csv=False)

In [None]:
# Validation score of the pretrained base model (the baseline).
evaluator_val(base_model)

0.782081161177257

In [None]:
# Validation score of the fine-tuned model loaded from 'finetune_mnr_128seqlength'.
evaluator_val(ft_128_model)

0.8575007884312303

In [None]:
# Validation score of the fine-tuned model loaded from 'finetune_mnr_510seqlength'.
evaluator_val(ft_510_model)

0.8636311716943952

### Final Test
The 128-sequence-length model is chosen for the final test because it performs nearly as well as the 510-sequence-length model on the validation set while requiring significantly less compute.

In [None]:
from sentence_transformers import InputExample
import random
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

In [None]:
# Load the test lyrics and keep only the lyrics column as a Series.
# NOTE(review): because `names=` is given, pandas treats the first line of the
# file as data, so this assumes the CSV has no header row — confirm.
lyrics_test = pd.read_csv('data/lyrics_test.csv', names=['index', 'lyrics'])['lyrics']

# Cut the test set in two at its midpoint.
test_midpoint = len(lyrics_test) // 2

# First part: songs that will be split into their correct halves.
lyrics_test_pos = lyrics_test[:test_midpoint]

# Second part: songs that will be combined into random halves.
lyrics_test_neg = lyrics_test[test_midpoint:]

# Positive pairs: every song is cut at its character midpoint and its own two
# halves are paired with label 1.

test_samples_pos = []
for lyric_text in map(str, lyrics_test_pos):
    cut = len(lyric_text) // 2
    test_samples_pos.append(
        InputExample(texts=[lyric_text[:cut], lyric_text[cut:]], label=1)
    )

# prep negatives
#
# Each song in the negative split contributes its first half; the second half
# comes from a *different*, randomly chosen song of the same split, and the
# pair is labelled 0.
# NOTE: this sampling differs from earlier revisions of this cell, so scores
# computed with it are not comparable to scores from older runs.

# Fixed seed so the sampled pairs (and therefore the reported test score) are
# identical on every Restart & Run All. A private generator is used so the
# global `random` state is left untouched.
TEST_NEG_SEED = 42
test_neg_rng = random.Random(TEST_NEG_SEED)

n_test_neg = len(lyrics_test_neg)
if n_test_neg == 1:
    # With a single song there is no other song to pair it with.
    raise ValueError('need at least two songs to build a negative pair')

test_samples_neg = []
for position, song in enumerate(lyrics_test_neg):
    song = str(song)
    # Draw from the n-1 *other* positions: sample from range(n-1) and shift any
    # draw at or past the current position up by one. This keeps the choice
    # uniform while guaranteeing a song is never paired with itself (which
    # would yield a true pair mislabelled as 0).
    rand_index = test_neg_rng.randrange(n_test_neg - 1)
    if rand_index >= position:
        rand_index += 1
    other_song = str(lyrics_test_neg.iloc[rand_index])
    half_1 = song[:len(song)//2]
    # Cut the other song at its OWN midpoint; using the current song's length
    # could produce an empty string or an arbitrary tail instead of a half.
    half_2 = other_song[len(other_song)//2:]
    test_samples_neg.append(InputExample(
        texts=[half_1, half_2],
        label=0
    ))

# Full test set: all positive pairs followed by all negative pairs.

test_samples = [*test_samples_pos, *test_samples_neg]

# Build the test evaluator from the labelled pairs.
# write_csv=False: only the returned score is wanted, no results file.

evaluator_test = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, write_csv=False)


In [None]:
# Final held-out test score of the selected model (ft_128_model).
evaluator_test(ft_128_model)

0.8589947387214997