# Model Building


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned lyrics DataFrame (presumably written by an earlier cleaning step — verify).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle("./pickles/02_cleaned_lyrics_df.pkl")

In [3]:
# Pull the lyric text and the tag column out of the frame as arrays.
lyrics, labels = df["lyrics"].values, df["tag"].values

In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the string tags as integer class ids; `encoder` is kept so predictions can be decoded later.
# NOTE(review): `labels` is overwritten here, so re-running this cell on its own would
# fit the encoder on the already-encoded values instead of the original tags.
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

In [5]:
# Display the class names the encoder learned from the tag column.
encoder.classes_

array(['country', 'misc', 'pop', 'rap', 'rb', 'rock'], dtype=object)

## Tokenization (Pretrained BERT Vocabulary)


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

In [7]:
# Token length that the tokenizer calls below pad/truncate every input to.
MAX_LENGTH = 128

In [9]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
# Sanity check: tokenize one short sentence and inspect the padded tensors.
text = "Hello, how are you?"
tokens = tokenizer(
    text,
    max_length=MAX_LENGTH,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
tokens

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [12]:
# Tokenize the whole corpus in a single batched call, padding/truncating every
# song to MAX_LENGTH tokens. The tokenizer is called directly (as in the other
# cells) because `batch_encode_plus` is deprecated in favour of `__call__`.
encoded_lyrics = tokenizer(
    list(lyrics),  # __call__ detects batches via list/tuple, so convert the array first
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
    max_length=MAX_LENGTH,
)

## Dataset and DataLoader


In [13]:
# Number of samples the DataLoader groups into each batch.
BATCH_SIZE = 16

In [14]:
class TextDataset(Dataset):
    """Map-style dataset pairing tokenized lyrics with labels and attention masks.

    The three sequences are stored exactly as given and indexed in parallel;
    the dataset length is the length of ``lyrics``.
    """

    def __init__(self, lyrics, labels, attention_masks):
        # Keep references only; nothing is copied or validated here.
        self.lyrics, self.labels, self.attention_masks = lyrics, labels, attention_masks

    def __len__(self):
        n_samples = len(self.lyrics)
        return n_samples

    def __getitem__(self, idx):
        # One sample as a dict; key order is input_ids, labels, attention_mask.
        sample = dict(
            input_ids=self.lyrics[idx],
            labels=self.labels[idx],
            attention_mask=self.attention_masks[idx],
        )
        return sample


# Wrap the encoded corpus together with its labels, then serve it in shuffled batches.
dataset = TextDataset(
    lyrics=encoded_lyrics["input_ids"],
    labels=labels,
    attention_masks=encoded_lyrics["attention_mask"],
)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=BATCH_SIZE)

## Model Definition


In [15]:
from transformers import BertForSequenceClassification

# Load pretrained BERT with a sequence-classification head that has one output per encoded class.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(encoder.classes_)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [17]:
# Move the model's parameters onto the selected device.
model = model.to(device)

## Training


In [18]:
from torch.optim import Adam
from tqdm import tqdm
from torch.nn import CrossEntropyLoss

In [19]:
# Step size passed to the Adam optimizer below.
LEARNING_RATE = 2e-5

# All model parameters are handed to the optimizer; the loss is cross-entropy,
# applied to the model's logits in the training loop.
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = CrossEntropyLoss()

In [20]:
# Fine-tune the classifier: one pass over `dataloader` per epoch, printing the
# mean batch loss and writing a checkpoint (model + tokenizer) after each epoch.
NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    model.train()  # make sure dropout is active even if an eval cell was run earlier
    total_loss = 0.0

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Use a loop-local name: assigning to `labels` here would overwrite the
        # full label array that the dataset cell depends on.
        batch_labels = batch["labels"].to(device=device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
        )

        loss = loss_fn(outputs.logits, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float. Adding the tensor itself would keep
        # autograd history alive for every batch of the epoch.
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1} - Loss: {avg_loss}")
    # Save Model
    model.save_pretrained("lyrics_model")
    tokenizer.save_pretrained("lyrics_model")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 1875/1875 [30:04<00:00,  1.04it/s]


Epoch 1 - Loss: 1.0476025342941284


100%|██████████| 1875/1875 [38:45<00:00,  1.24s/it]  


Epoch 2 - Loss: 0.790023684501648


100%|██████████| 1875/1875 [27:44<00:00,  1.13it/s]


Epoch 3 - Loss: 0.5592418909072876


## Evaluation

In [23]:
# Switch the model to evaluation mode before running predictions.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [31]:
# Sample lyrics kept inline as a string.
# NOTE(review): no cell visible in this notebook references `numb_lyrics` — confirm it is still needed.
numb_lyrics = """
I'm tired of being what you want me to be
Feeling so faithless, lost under the surface
I don't know what you're expecting of me
Put under the pressure of walking in your shoes
Every step that I take is another mistake to you
(Caught in the undertow, just caught in the undertow)
I've become so numb, I can't feel you there
Become so tired, so much more aware
I'm becoming this, all I want to do
Is be more like me and be less like you
Can't you see that you're smothering me?
Holding too tightly, afraid to lose control
'Cause everything that you thought I would be
Has fallen apart right in front of you
Every step that I take is another mistake to you
(Caught in the undertow, just caught in the undertow)
And every second I waste is more than I can take
I've become so numb, I can't feel you there
Become so tired, so much more aware
I'm becoming this, all I want to do
Is be more like me and be less like you
And I know
I may end up failing too
But I know
You were just like me, with someone disappointed in you
I've become so numb, I can't feel you there
Become so tired, so much more aware
I'm becoming this, all I want to do
Is be more like me and be less like you
I've become so numb, I can't feel you there
I'm tired of being what you want me to be
I've become so numb, I can't feel you there
I'm tired of being what you want me to be
"""

In [45]:
def classify_lyrics(lyrics: str, max_length: int = 128) -> str:
    """Predict the class label for a piece of lyrics text.

    The text is tokenized (padded/truncated to ``max_length`` tokens), run
    through the module-level ``model`` without gradient tracking, and the
    highest-scoring class id is decoded with the module-level ``encoder``.

    Args:
        lyrics: Raw lyrics text to classify.
        max_length: Token length to pad/truncate to. Defaults to 128, the
            value previously hard-coded here; it should match the length used
            when the training data was tokenized.

    Returns:
        The decoded class label as a string. The prediction is also printed.
    """
    encoding = tokenizer(
        lyrics,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Do not rely on an earlier cell having switched the model to eval mode.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    decoded = encoder.inverse_transform([predicted_class])
    print(f"Predicted class for custom input: {decoded}")
    # Return the label so the declared `-> str` holds and callers can use the result.
    return str(decoded[0])

In [46]:
def classify_lyrics_from_file(path: str) -> str:
    """Read a lyrics text file and classify its entire contents.

    Args:
        path: Location of a UTF-8 encoded text file containing the lyrics.

    Returns:
        Whatever ``classify_lyrics`` returns for the file's text.
    """
    # Explicit encoding: the platform default differs between systems and can
    # mis-decode or reject lyrics containing non-ASCII characters.
    with open(path, "r", encoding="utf-8") as text_file:
        return classify_lyrics(text_file.read())

In [48]:
# Folder containing the lyric text files classified in the cells below.
lyrics_folder = "./datasets/test_lyrics/"

In [52]:
# Spot check: classify a single lyrics file from the test folder.
classify_lyrics_from_file(lyrics_folder + "redbone-childish_gambino.txt")

Predicted class for custom input: ['rb']


In [53]:
# NOTE(review): the recorded output for this file is 'rock'; the file name suggests
# a Kendrick Lamar track, so this is presumably a misclassification — verify.
classify_lyrics_from_file(lyrics_folder + "euphoria-kendrick.txt")

Predicted class for custom input: ['rock']


In [55]:
# Spot check on another file from the test folder.
classify_lyrics_from_file(lyrics_folder + "flaws_and_sins-juice_wrld.txt")

Predicted class for custom input: ['rb']


In [56]:
# Spot check on another file from the test folder.
classify_lyrics_from_file(lyrics_folder + "hunting_wabbits-j_cole.txt")

Predicted class for custom input: ['rap']


In [58]:
# Spot check on another file from the test folder.
classify_lyrics_from_file(lyrics_folder + "intentions-justin_bieber.txt")

Predicted class for custom input: ['rb']


In [59]:
# Spot check on another file from the test folder.
classify_lyrics_from_file(lyrics_folder + "only_you-karri.txt")

Predicted class for custom input: ['pop']


In [60]:
# NOTE(review): "invisble" looks like a typo for "invisible"; left unchanged because
# the string must match the file name on disk — confirm before renaming.
classify_lyrics_from_file(lyrics_folder + "invisble-linkin_park.txt")

Predicted class for custom input: ['rock']
