In [4]:
from datasets import load_dataset
import numpy as np
import re

dataset = load_dataset('json',data_files='data.json').class_encode_column('artist')
artists_mappings = dataset['train'].features['artist'].names

Found cached dataset json (/home/ste/.cache/huggingface/datasets/json/default-901973c7d83af16c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
100%|██████████| 1/1 [00:00<00:00, 380.13it/s]
Loading cached processed dataset at /home/ste/.cache/huggingface/datasets/json/default-901973c7d83af16c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-3d9795bc45b8494f.arrow


In [5]:
def clean(example):
    allowed_parts = ['verse','break','chorus','intro', 'interlude', 'bridge', 'outro']
    for part in allowed_parts:
        example['lyrics']=re.sub("\[.*"+part+".*\]", f"[{part}]", example['lyrics'], flags=re.IGNORECASE)
    example['lyrics']=re.sub("\[(?!"+"|".join(allowed_parts)+").*?\]", "", example['lyrics'], flags=re.DOTALL)
    example['lyrics']=re.sub("[0-9]+embed", "", example['lyrics'], flags=re.IGNORECASE)
    return example

mapped_dataset = dataset.map(clean)


                                                                 

In [6]:
def list_song_parts(example):
    parts = re.findall(r'\[[^\[\]]+\]',example['lyrics']) # Capture everything enclosed in square brackets
    for i,part in enumerate(parts): 
        parts[i] = re.sub(r':.*(?=\])','',part) # Remove everything from : to the closing bracket ] (Most lyrics contain the name of the singer of these parts e.g. [Chorus: 2 Chainz])
    return {'parts': parts}
parts = mapped_dataset['train'].map(list_song_parts,remove_columns=dataset['train'].column_names)

parts:np.ndarray = np.unique([el for l in parts['parts'] for el in l ])

                                                                  

In [7]:
print(*parts)

[break] [bridge] [chorus] [interlude] [intro] [outro] [verse]


In [8]:
# print(mapped_dataset)
print(dataset['train'][1000]['lyrics'])
print("----------------")
print(mapped_dataset['train'][1000]['lyrics'])

XO TOUR Llif3 Lyrics[Intro]
Are you alright?
I'm alright, I'm quite alright
And my money's right
8… (Yeah)
Countin' them bands
All way to the top 'til they be fallin' over
(Yeah, yeah, yeah)
Countin' them bands
On my way to the top 'til we fallin' over

[Chorus]
I don't really care if you cry
On the real, you shoulda never lied
Shoulda saw the way she looked me in my eyes
She said, "Baby, I am not afraid to die"
Push me to the edge
All my friends are dead
Push me to the edge
All my friends are dead
Push me to the edge
All my friends are dead
Push me to the edge

[Verse 1]
Phantom that's all red, inside all white
Like somethin' you ride a sled down, I just want that head
My Brittany got mad, I'm barely her man now
Everybody got the same swag now
Watch the way that I tear it down
Stackin' my bands all the way to the top
All the way 'til my bands fallin' over
Every time that you leave your spot
Your girlfriend call me like, "Come on over!"
I like the way that she treat me
Gon' leave you, 

In [11]:
allowed_parts = ['verse','break','chorus','intro', 'interlude', 'bridge', 'outro']
def contains_one_of(allowed_parts,part:str):
    for p in allowed_parts:
        if p in part.lower(): return True
    return False
filtered_parts = np.unique([part for part in parts if contains_one_of(allowed_parts,part)])

In [12]:
sorted(filtered_parts)

['[break]', '[bridge]', '[chorus]', '[intro]', '[verse]']

# Preprocessing

In [27]:
counts = {}
def plot_dist(dataset):
    for example in dataset:
        if example['artist'] not in counts.keys():
            counts[example['artist']] = 0
        else:
            counts[example['artist']] += 1
    return counts
plot_dist(mapped_dataset['train'])

{15: 99,
 26: 99,
 61: 99,
 62: 99,
 31: 99,
 29: 99,
 39: 99,
 0: 99,
 42: 99,
 60: 99,
 38: 99,
 12: 98,
 51: 98,
 28: 99,
 33: 99,
 30: 99,
 57: 92,
 45: 99,
 3: 99,
 44: 99,
 1: 99,
 68: 99,
 49: 99,
 63: 99,
 59: 99,
 21: 99,
 14: 99,
 13: 99,
 32: 99,
 5: 99,
 56: 99,
 19: 97,
 37: 99,
 2: 99,
 52: 99,
 46: 99,
 8: 99,
 67: 99,
 41: 99,
 27: 99,
 4: 99,
 22: 99,
 47: 99,
 34: 96,
 54: 99,
 10: 99,
 6: 99,
 24: 99,
 48: 99,
 69: 99,
 70: 99,
 25: 99,
 11: 99,
 36: 99,
 16: 99,
 9: 99,
 35: 99,
 58: 99,
 43: 99,
 53: 99,
 23: 99,
 50: 98,
 20: 99,
 71: 99,
 64: 99,
 40: 99,
 7: 99,
 65: 99,
 18: 99,
 66: 99,
 55: 99,
 17: 99}

## Machine Learning

In [55]:
# This was made by ChatGPT, keep an eye for possible bugs

import torch
import transformers
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel,AutoConfig

# Define your dataset and data loader
class SimpleDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.data.items()}
        return item

    def __len__(self):
        return len(self.data['input_ids'])

# Define your triplet loss function
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        dist_pos = torch.norm(anchor - positive, p=2, dim=1)
        dist_neg = torch.norm(anchor - negative, p=2, dim=1)
        loss = torch.mean(torch.relu(dist_pos - dist_neg + self.margin))
        return loss
    
# Define your model
class TransformerModel(torch.nn.Module):
    def __init__(self, model_name):
        super(TransformerModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dense_layer = torch.nn.Linear(self.transformer.config.hidden_size, self.transformer.config.hidden_size)

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids, attention_mask)
        pooled_output = outputs[1]
        embeddings = self.dense_layer(pooled_output)
        return embeddings

# Define your hyperparameters
learning_rate = 2e-5
epochs = 3
batch_size = 32

# Load your pre-trained model and define your model
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TransformerModel(model_name)

# # Freeze all the parameters in the pre-trained model
# for param in model.parameters():
#     param.requires_grad = False

# Set the model to training mode
model.train()

# Define your training data
data = {'input_ids': [[1, 2, 3], [4, 5, 6], [7, 8, 9]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1]]}
train_dataset = SimpleDataset(data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
# Define your optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train your model
for epoch in range(epochs):
    for batch in train_loader:
        # Extract the input ids and attention masks from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Encode the inputs using the pre-trained model
        embeddings = model(input_ids=input_ids, attention_mask=attention_mask)
        print(embeddings.shape)

        # Split the embeddings into anchor, positive, and negative examples
        embeddings = embeddings.reshape(-1, 3, model.transformer.config.hidden_size)
        anchor = embeddings[:, 0]
        positive = embeddings[:, 1]
        negative = embeddings[:, 2]

        # Compute the triplet loss and update the parameters
        loss_fn = TripletLoss()
        loss = loss_fn(anchor, positive, negative)
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Print the loss every epoch
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss.item()))

torch.Size([3, 768])
Epoch [1/3], Loss: 3.9939
torch.Size([3, 768])
Epoch [2/3], Loss: 0.0000
torch.Size([3, 768])
Epoch [3/3], Loss: 0.0000
