In [1]:
# check system specs

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print('Connected to a GPU')

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9

if ram_gb < 20:
  print('Not using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))
else:
  print('Using a high-RAM runtime: {:.1f} gigabytes of available RAM'.format(ram_gb))

Connected to a GPU
Using a high-RAM runtime: 89.6 gigabytes of available RAM


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Mount Google Drive inside the Colab filesystem so project files can be
# reached through ordinary paths.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [4]:
# Switch the working directory to the shared project folder so that the
# relative paths used in later cells (./models/..., data/...) resolve.
# Note: this requires a shortcut to the shared project folder placed in the
# Google Drive root directory (MyDrive).
%cd /content/gdrive/MyDrive/ml2_project

/content/gdrive/.shortcut-targets-by-id/1WHLBzPq6pt_F7mh3d3goIQl4MwYlTfIh/ml2_project


In [5]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 27.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 75.4 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 72.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 75.4 MB/s 
Building wheels for collected pa

In [6]:
# load models

from sentence_transformers import SentenceTransformer
import torch

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One-time setup, kept for reference: download the pretrained checkpoint and
# store it at the local path that is loaded below.
# SentenceTransformer('all-MiniLM-L12-v2').save('./models/all-MiniLM-L12-v2')

# Baseline model, loaded from the local copy of all-MiniLM-L12-v2.
base_model = SentenceTransformer('./models/all-MiniLM-L12-v2').to(device)

# Second model, loaded from ./models/finetune_mnr_final
# (presumably the fine-tuned variant of the baseline - confirm).
ft_final_model = SentenceTransformer('./models/finetune_mnr_final').to(device)


### Small Dataset

In [7]:
# Read the small dataset from the project's data folder.
small_dataset_path = 'data/small_dataset.csv'
sds = pd.read_csv(small_dataset_path)

In [8]:
# Encode every lyric of the small dataset with the baseline model.
# A plain list is passed instead of the Series: encode() looks sentences up by
# integer position (sentences[i]), which on a Series is a label-based lookup
# and therefore only works while the index happens to be 0..n-1.
embeddings_base = base_model.encode(sds['lyrics'].tolist())

In [9]:
# Encode every lyric of the small dataset with the second (ft_final) model.
# A plain list is passed instead of the Series: encode() looks sentences up by
# integer position (sentences[i]), which on a Series is a label-based lookup
# and therefore only works while the index happens to be 0..n-1.
embeddings_ft_final = ft_final_model.encode(sds['lyrics'].tolist())

In [10]:
# Store both sets of embeddings as new columns of the small dataset.
# list() iterates each encode() result along its first axis, so every row
# receives one item.
for column_name, vectors in (
  ('embeddings_base', embeddings_base),
  ('embeddings_ft_final', embeddings_ft_final),
):
  sds[column_name] = list(vectors)

In [11]:
# Write the small dataset, including its embedding columns, back to disk.
# NOTE(review): with default arguments to_csv also writes the row index as an
# extra unnamed column, and each embedding cell is stored as its string form -
# confirm that downstream readers expect both.
small_output_path = 'data/small_dataset_w_embeddings.csv'
sds.to_csv(small_output_path)

### Large Dataset

In [20]:
# Load the cleaned dataset, keep only usable rows, and renumber them from 0.
df = (
  pd.read_csv('data/clean_dataset.csv')
  .loc[lambda frame: frame['tag'] != 'rap']       # remove rap
  .loc[lambda frame: frame['lyrics'].notnull()]   # remove null lyrics
  .reset_index()  # previous row labels are kept as an 'index' column
)
df.shape

(94810, 11)

In [22]:
# Encode every lyric of the large dataset with the baseline model.
# A plain list is passed instead of the Series: encode() looks sentences up by
# integer position (sentences[i]), which on a Series is a label-based lookup
# and therefore only works while the index happens to be 0..n-1.
# The progress bar is switched on explicitly because this is the long run.
embeddings_base = base_model.encode(df['lyrics'].tolist(), show_progress_bar=True)

In [23]:
# Encode every lyric of the large dataset with the second (ft_final) model.
# A plain list is passed instead of the Series: encode() looks sentences up by
# integer position (sentences[i]), which on a Series is a label-based lookup
# and therefore only works while the index happens to be 0..n-1.
# The progress bar is switched on explicitly because this is the long run.
embeddings_ft_final = ft_final_model.encode(df['lyrics'].tolist(), show_progress_bar=True)

In [24]:
# Store both sets of embeddings as new columns of the large dataset.
# list() iterates each encode() result along its first axis, so every row
# receives one item.
for column_name, vectors in (
  ('embeddings_base', embeddings_base),
  ('embeddings_ft_final', embeddings_ft_final),
):
  df[column_name] = list(vectors)

In [25]:
# Write the large dataset, including its embedding columns, back to disk.
# NOTE(review): with default arguments to_csv also writes the row index as an
# extra unnamed column, and each embedding cell is stored as its string form -
# confirm that downstream readers expect both.
large_output_path = 'data/clean_dataset_w_embeddings.csv'
df.to_csv(large_output_path)