# Inference

In [4]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification
import pickle

In [8]:
# Restore the pickled label encoder; classify_lyrics calls its
# inverse_transform to map a predicted class index back to a label.
# NOTE(review): pickle.load can run arbitrary code -- only load trusted files.
encoder_path = "./pickles/label_encoder.pkl"
with open(encoder_path, "rb") as encoder_file:
    encoder = pickle.load(encoder_file)

In [6]:
# Restore the pickled tokenizer that classify_lyrics uses to encode text.
# NOTE(review): pickle.load can run arbitrary code -- only load trusted files.
tokenizer_path = "./pickles/tokenizer.pkl"
with open(tokenizer_path, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [7]:
# Echo the unpickled tokenizer's repr as a quick check of what was loaded.
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Load the classifier identified by "lyrics_model" and place it on `device`
# (Module.to returns the module itself, so the call can be chained).
model = BertForSequenceClassification.from_pretrained("lyrics_model").to(device)
# Evaluation mode disables dropout; as the cell's last expression it also
# echoes the model architecture.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [20]:
def classify_lyrics(lyrics: str, max_length: int = 128) -> str:
    """Predict the label for a piece of lyrics text.

    The text is encoded with the module-level ``tokenizer``, padded or
    truncated to exactly ``max_length`` tokens, and scored by the module-level
    ``model`` on ``device``. The index of the largest logit is mapped back to
    a label through ``encoder.inverse_transform``.

    Args:
        lyrics: Raw lyrics text to classify.
        max_length: Token length each input is padded/truncated to. Defaults
            to 128, the value that used to be hard-coded here. Tokens beyond
            this length are dropped by the tokenizer's truncation.
            NOTE(review): presumably this should match the length used when
            the model was trained -- confirm before changing it.

    Returns:
        The decoded label of the highest-scoring class.
    """
    encoding = tokenizer(
        lyrics,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt'
    )

    # The encoded tensors must live on the same device as the model.
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # A single text is passed in, so argmax over dim=1 yields one index.
        predicted_class = torch.argmax(logits, dim=1).item()

    # inverse_transform takes a sequence of indices and returns a sequence of labels.
    decoded = encoder.inverse_transform([predicted_class])
    return decoded[0]

def classify_lyrics_from_file(path: str, encoding: str = "utf-8") -> str:
    """Read a text file and classify its entire contents as lyrics.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding;
            pass another codec for files saved differently.

    Returns:
        Whatever ``classify_lyrics`` returns for the file's text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # Without an explicit encoding, open() falls back to a locale-dependent
    # default, which can mis-decode or reject non-ASCII lyrics.
    with open(path, "r", encoding=encoding) as text_file:
        return classify_lyrics(text_file.read())

In [12]:
import os

In [14]:
# Directory whose entries are classified in the loop below.
folder = "./datasets/test_lyrics/"
# os.listdir returns names in arbitrary order; sorting keeps the printed
# report in the same order on every run and platform.
lyrics_files = sorted(os.listdir(folder))

In [21]:
# Classify every listed file and print one aligned "<file> = <label>" line each.
for filename in lyrics_files:
    # os.path.join stays correct whether or not `folder` ends with a separator,
    # unlike plain string concatenation.
    path = os.path.join(folder, filename)
    label = classify_lyrics_from_file(path)

    # Left-align the file name in a 50-character column so labels line up.
    print("{:<50} = {}".format(filename, label))

beauty-layto.txt                                   = rb
euphoria-kendrick.txt                              = rock
flaws_and_sins-juice_wrld.txt                      = rb
hunting_wabbits-j_cole.txt                         = rap
intentions-justin_bieber.txt                       = rb
invisble-linkin_park.txt                           = rock
lowlife.txt                                        = rap
only_you-karri.txt                                 = pop
redbone-childish_gambino.txt                       = rb
