# Model Building


## Loading Data


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned lyrics frame; later cells read its "lyrics" and "tag" columns.
# NOTE: read_pickle unpickles the file, so only load pickles from a trusted source.
df = pd.read_pickle("./pickles/02_cleaned_lyrics_df.pkl")

In [3]:
# Pull the model inputs (lyrics text) and the targets (genre tag) out of the frame.
lyrics, labels = (df[column].values for column in ("lyrics", "tag"))

In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# Fit on the source column instead of on `labels` itself. This cell rebinds
# `labels` to the integer codes, so fitting on `labels` would, on a re-run,
# learn the integers as classes and lose the genre names in `encoder.classes_`.
labels = encoder.fit_transform(df["tag"].values)

In [5]:
# Show the genre names learned by the encoder (also used as report row names later).
encoder.classes_

array(['country', 'misc', 'pop', 'rap', 'rb', 'rock'], dtype=object)

In [8]:
import pickle

# Persist the fitted encoder so integer predictions can be mapped back to genre names.
encoder_path = "./pickles/label_encoder.pkl"
with open(encoder_path, mode="wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

In [43]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split (and every metric computed from it) is
# reproducible after a kernel restart.
RANDOM_STATE = 42

# Stratify on the encoded genre so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    lyrics,
    labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=labels,
)

## Tokenization


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

In [19]:
# Token length passed as `max_length` to every tokenizer call below; inputs are
# padded or truncated to this many tokens.
MAX_LENGTH = 128

In [20]:
from transformers import BertTokenizer

# Tokenizer for the same "bert-base-uncased" checkpoint name that the model cell loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [31]:
# Sanity check: encode one short sentence with the same padding/truncation
# settings used for the real data and inspect the result.
text = "Hello, how are you?"
tokens = tokenizer(
    text,
    padding="max_length",  # pad up to MAX_LENGTH (the trailing zeros in the output)
    max_length=MAX_LENGTH,
    truncation=True,  # cut inputs that are longer than MAX_LENGTH
    return_tensors="pt",  # return PyTorch tensors
)

tokens

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [44]:
# Tokenize the training lyrics by calling the tokenizer directly (the same call
# style as the single-sentence demo); `batch_encode_plus` is documented as
# deprecated in favour of `__call__`. The direct call expects a list of strings,
# hence `list(...)`.
# NOTE: this rebinds X_train from raw text to the encoded batch, so re-run the
# train/test split cell before re-running this one.
X_train = tokenizer(
    list(X_train),
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
    max_length=MAX_LENGTH,
)

In [47]:
# Tokenize the held-out lyrics with exactly the same settings as the training
# set. The tokenizer is called directly because `batch_encode_plus` is
# documented as deprecated in favour of `__call__`; the direct call expects a
# list of strings, hence `list(...)`.
# NOTE: this rebinds X_test from raw text to the encoded batch, so re-run the
# train/test split cell before re-running this one.
X_test = tokenizer(
    list(X_test),
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
    max_length=MAX_LENGTH,
)

In [14]:
# Persist the tokenizer object with pickle.
# NOTE(review): the training cell also writes the tokenizer via
# tokenizer.save_pretrained("lyrics_model"); confirm which copy downstream code loads.
with open("./pickles/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

## Dataset And Dataloader


In [34]:
# Number of examples per batch for both the train and test DataLoaders.
BATCH_SIZE = 16

In [35]:
class TextDataset(Dataset):
    """Map-style dataset pairing encoded lyrics with labels and attention masks.

    Args:
        lyrics: Sized, indexable collection of encoded inputs, one per example.
        labels: Sized, indexable collection of targets aligned with ``lyrics``.
        attention_masks: Sized, indexable collection of masks aligned with
            ``lyrics``.

    Raises:
        ValueError: If the three collections do not all have the same length.
    """

    def __init__(self, lyrics, labels, attention_masks):
        # __len__ reports only len(lyrics), so a shorter labels/masks collection
        # would otherwise fail mid-epoch and a longer one would be silently
        # ignored. Check alignment once, up front.
        sizes = (len(lyrics), len(labels), len(attention_masks))
        if len(set(sizes)) != 1:
            raise ValueError(
                "lyrics, labels and attention_masks must have the same length, "
                f"got {sizes[0]}, {sizes[1]} and {sizes[2]}"
            )

        self.lyrics = lyrics
        self.labels = labels
        self.attention_masks = attention_masks

    def __len__(self):
        """Return the number of examples."""
        return len(self.lyrics)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict keyed the way the loops expect."""
        return {
            "input_ids": self.lyrics[idx],
            "labels": self.labels[idx],
            "attention_mask": self.attention_masks[idx],
        }




In [52]:
# Training data: reshuffled every epoch.
trainset = TextDataset(
    X_train["input_ids"],
    labels=y_train,
    attention_masks=X_train["attention_mask"],
)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation data: kept in its original order so the i-th prediction lines up
# with the i-th row of X_test / y_test. Shuffling gives nothing at evaluation
# time and only makes the prediction order differ from run to run.
testset = TextDataset(
    X_test["input_ids"],
    labels=y_test,
    attention_masks=X_test["attention_mask"],
)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

## Model Definition


In [36]:
from transformers import BertForSequenceClassification

# Load pretrained BERT with a sequence-classification head that has one output
# per genre in the label encoder. The warning printed below reports that the
# classifier weights are newly initialized, so the model must be trained before use.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(encoder.classes_)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


In [38]:
# Move the model onto the selected device; batches are moved to the same device in the loops below.
model = model.to(device)

## Training


In [53]:
from torch.optim import Adam
from tqdm import tqdm
from torch.nn import CrossEntropyLoss

In [19]:
# Step size used by the optimizer during fine-tuning.
LEARNING_RATE = 2e-5

# Cross-entropy between the model's logits and the integer genre ids.
loss_fn = CrossEntropyLoss()
# Adam updates every parameter of the model.
optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE)

In [20]:
# Number of full passes over the training set.
NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    model.train()  # training mode (dropout active)
    total_loss = 0.0

    for batch in tqdm(trainloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named `batch_labels` so the notebook-level `labels` array (the encoded
        # targets used by the split cell) is not overwritten by a batch tensor.
        # CrossEntropyLoss needs int64 class indices, hence the explicit dtype.
        batch_labels = batch["labels"].to(device=device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
        )

        loss = loss_fn(outputs.logits, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float. Adding the loss tensor itself would
        # keep autograd history attached to the running total for the whole epoch.
        total_loss += loss.item()

    avg_loss = total_loss / len(trainloader)
    print(f"Epoch {epoch + 1} - Loss: {avg_loss}")
    # Save Model
    # Checkpoint after every epoch; each save overwrites the same directory.
    model.save_pretrained("lyrics_model")
    tokenizer.save_pretrained("lyrics_model")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 1875/1875 [30:04<00:00,  1.04it/s]


Epoch 1 - Loss: 1.0476025342941284


100%|██████████| 1875/1875 [38:45<00:00,  1.24s/it]  


Epoch 2 - Loss: 0.790023684501648


100%|██████████| 1875/1875 [27:44<00:00,  1.13it/s]


Epoch 3 - Loss: 0.5592418909072876


## Evaluation


In [45]:
# Switch the model to evaluation mode before inference; the returned module is
# displayed as the cell output.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [54]:
# Idempotent: makes sure the model is in evaluation mode even if the previous
# cell was skipped or training was re-run in between.
model.eval()

y_true = []
y_pred = []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in tqdm(testloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)  # highest-scoring class per example

        y_pred.extend(predictions.cpu().tolist())
        # The targets are only collected, never fed to the model, so they stay
        # on the CPU. Reading them from the batch also avoids rebinding the
        # notebook-level `labels` array.
        y_true.extend(batch["labels"].tolist())


100%|██████████| 375/375 [01:57<00:00,  3.20it/s]


In [56]:
# Peek at the first ten predicted class ids.
y_pred[:10]

[3, 1, 2, 0, 1, 3, 5, 5, 3, 4]

In [60]:
from sklearn.metrics import classification_report

# Pass the class ids explicitly so each row name in `target_names` is tied to
# its encoder id. Without `labels`, the rows are inferred from whichever ids
# occur in y_true / y_pred, and the names no longer match if a class is missing.
class_ids = list(range(len(encoder.classes_)))
print(
    classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=encoder.classes_,
    )
)

              precision    recall  f1-score   support

     country       0.90      0.98      0.94       974
        misc       0.99      0.92      0.95      1075
         pop       0.90      0.77      0.83       994
         rap       0.96      0.93      0.95       956
          rb       0.79      0.93      0.85       972
        rock       0.91      0.90      0.91      1029

    accuracy                           0.91      6000
   macro avg       0.91      0.91      0.90      6000
weighted avg       0.91      0.91      0.91      6000

