In [1]:
import sys
import os
import pandas as pd
import re

In [2]:
sys.path.append("../")
from src.embeddings import remove_stop_words

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
DATA_FOLDER = "../data/genius_lyrics"
ARTISTS = ["Travis Scott", "Queen", "The Beatles"]

**getting the data into pandas**

In [4]:
df = pd.DataFrame()
for filename in os.listdir(DATA_FOLDER):
    artist_name = filename.split('.')[0]
    if artist_name in ARTISTS:
        temp_df = pd.read_csv(os.path.join(DATA_FOLDER, filename),
                usecols=['artist', 'title', 'lyrics'])
        df = pd.concat([df, temp_df])

df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,SICKO MODE,SICKO MODE Lyrics[Part I]\n\n[Intro: Drake]\nA...
1,Travis Scott,​goosebumps,​goosebumps Lyrics[Intro: Travis Scott]\nYeah\...
2,Travis Scott,BUTTERFLY EFFECT,BUTTERFLY EFFECT Lyrics[Intro]\nAll the commas...
3,Travis Scott,HIGHEST IN THE ROOM,HIGHEST IN THE ROOM Lyrics[Chorus]\nI got room...
4,Travis Scott,Antidote,Antidote Lyrics[Chorus]\nDon't you open up tha...


In [5]:
df.artist.value_counts()

artist
Travis Scott    100
The Beatles     100
Queen           100
Name: count, dtype: int64

In [6]:
def clean_up_text(sentence):
    sentence = str(sentence).lower()
    sentence = re.sub("\[.*\]", "", sentence)  # remove sections markings in brakets
    sentence = re.sub(r'[^\w]', ' ', sentence)  # romove punctuation
    sentence = re.sub(r'[0-9]', '', sentence)  # remove numbers
    sentence = re.sub(r'\s[a-z]\s', ' ', sentence)  # remove single characters
    sentence = re.sub(r'^[a-z]\s', '', sentence)  # remove single characters from the start
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # remove extra spaces

    sentence =  re.sub(r'.* lyrics', '', sentence)
    

    return sentence

df.lyrics = df.lyrics.apply(clean_up_text)
df.lyrics = df.lyrics.apply(remove_stop_words)

In [7]:
df = df.sort_values(by='title')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,16 Chapels,know feel like floating night slow move groove...
1,Travis Scott,3500,lyrics bandana wrapped wrapped round head band...
2,Travis Scott,5% TINT,creeping window fore come outside got took end...
3,Travis Scott,90210,lyrics dean poppin yo travis mhm ooh ooh ooh o...
4,The Beatles,A Day in the Life,read news today oh boy lucky man made grade th...


**lets get those embeddings**

In [29]:
import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
embedding_model = RobertaModel.from_pretrained('roberta-base')


def embed(sentence):
    tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=512, return_token_type_ids=True, padding="max_length", truncation=True)
    input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
    attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)
    with torch.no_grad():
        outputs = embedding_model(input_ids, attention_mask=attention_mask)
    return torch.mean(outputs[0], dim=1).squeeze()


df_dataset = pd.DataFrame()
df_dataset['vectors'] = df['lyrics'].apply(embed)
df_dataset.head()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,vectors
0,"[tensor(0.0428), tensor(-0.0517), tensor(0.122..."
1,"[tensor(-0.0514), tensor(-0.0310), tensor(0.10..."
2,"[tensor(-0.0152), tensor(-0.0518), tensor(0.13..."
3,"[tensor(0.0536), tensor(-0.0091), tensor(0.084..."
4,"[tensor(0.0712), tensor(-0.1449), tensor(0.043..."


**normalizing the data**

In [45]:
input_data = df_dataset['vectors'].to_list()
input_data = torch.stack(input_data)

xmax = torch.max(input_data)
xmin = torch.min(input_data)

def normalize(input_data):
    return (input_data - xmin) / (xmax - xmin)

df_dataset['vectors'] = df_dataset['vectors'].apply(normalize)
df_dataset.head()

Unnamed: 0,vectors
0,"[tensor(0.6068), tensor(0.5111), tensor(0.5536..."
1,"[tensor(0.4477), tensor(0.5359), tensor(0.5182..."
2,"[tensor(0.5088), tensor(0.5110), tensor(0.5693..."
3,"[tensor(0.6251), tensor(0.5623), tensor(0.4821..."
4,"[tensor(0.6547), tensor(0.3992), tensor(0.4049..."


In [46]:
df.artist.unique()

array(['Travis Scott', 'The Beatles', 'Queen'], dtype=object)

In [47]:
# string to tensor
stt = {s: torch.eye(len(ARTISTS))[n] for n, s in enumerate(ARTISTS)}
stt

{'Travis Scott': tensor([1., 0., 0.]),
 'Queen': tensor([0., 1., 0.]),
 'The Beatles': tensor([0., 0., 1.])}

In [48]:
# tesnor to string
tts = {str(v): k for k, v in stt.items()}
tts

{'tensor([1., 0., 0.])': 'Travis Scott',
 'tensor([0., 1., 0.])': 'Queen',
 'tensor([0., 0., 1.])': 'The Beatles'}

In [76]:
def encode_label(label: str):
    return stt[label]

def decode_label(label: torch.tensor) -> str:
    return tts[str(label)]

encode_label("Queen"), decode_label(torch.tensor([1., 0., 0.]))

(tensor([0., 1., 0.]), 'Travis Scott')

In [50]:
df_dataset['artist'] = df['artist'].apply(encode_label)
df_dataset.head()

Unnamed: 0,vectors,artist
0,"[tensor(0.6068), tensor(0.5111), tensor(0.5536...","[tensor(1.), tensor(0.), tensor(0.)]"
1,"[tensor(0.4477), tensor(0.5359), tensor(0.5182...","[tensor(1.), tensor(0.), tensor(0.)]"
2,"[tensor(0.5088), tensor(0.5110), tensor(0.5693...","[tensor(1.), tensor(0.), tensor(0.)]"
3,"[tensor(0.6251), tensor(0.5623), tensor(0.4821...","[tensor(1.), tensor(0.), tensor(0.)]"
4,"[tensor(0.6547), tensor(0.3992), tensor(0.4049...","[tensor(0.), tensor(0.), tensor(1.)]"


**model??**

let's see if you really trap

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [52]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [53]:
class Net(nn.Module):

    def __init__(self, input_units, output_units):
        super().__init__()
        self.fc1 = nn.Linear(input_units, 1024)
        self.fc2 = nn.Linear(1024, 2048)
        self.fc3 = nn.Linear(2048, 4096)
        self.fc4 = nn.Linear(4096, 2048)
        self.fc5 = nn.Linear(2048, 2048)
        self.fc6 = nn.Linear(2048, 84)
        self.fc7 = nn.Linear(84, output_units)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.softmax(self.fc7(x), dim=1)
        return x

**turn of the autotune(copilot)\
lets see if you really rap**

In [54]:
# initialize model
net = Net(768, 3).to(device)
epoch_count = 0

In [65]:
# CONSTANTS
torch.manual_seed(42)
train_size = 0.8
learning_rate = 0.000001
num_epochs = 20

In [56]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.stack(list(self.data[idx, 0])), torch.stack(list(self.data[idx, -1]))
    
split = int(train_size * len(df_dataset))
train_dataset = MyDataset(df_dataset[:split])
test_dataset = MyDataset(df_dataset[split:])

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [57]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

**training**

In [59]:
def multi_class_accuracy(preds: torch.Tensor, target: torch.Tensor) -> float:
    return ((torch.argmax(preds, dim=1) == torch.argmax(target, dim=1)) * 1).sum()\
        / target.shape[0]

In [60]:
def train_pass(
    model: torch.nn.Module,
    dataset: list,
    loss_function: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    accuracy_function,
    device: torch.device=device,
    display: bool=False) -> dict:
    
    model.train()
    train_acc, train_loss = 0, 0
    N = len(dataset)

    for X, y in dataset:
        X, y = X.to(device).float(), y.to(device).float()
        # forward pass
        y_pred = model(X)
        loss = loss_function(input=y_pred, target=y)

        train_acc += accuracy_function(preds=y_pred, target=y)
        train_loss += loss

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= N
    train_acc /= N

    if display:
        print(f'🔨Train loss: {train_loss:.4f} || Train acc: {train_acc:.4f}')

    results = {
        'train_loss': train_loss,
        'train_accuracy': train_acc
    }

    return results


def test_pass(
    dataset,
    model: torch.nn.Module,
    loss_function: torch.nn.Module,
    accuracy_function,
    device: torch.device=device,
    display: bool=False) -> dict:

    test_loss, test_acc = 0, 0
    N = len(dataset)

    for batch, (X, y) in enumerate(dataset):
        
        X, y = X.to(device).float(), y.to(device).float()

        test_pred = model(X)
        test_loss += loss_function(input=test_pred, target=y)

        test_acc += accuracy_function(preds=test_pred, target=y)

    test_loss /= N
    test_acc /= N
    if display:
        print(f'🧐Test loss: {test_loss:.4f} || Test acc: {test_acc:.4f}')

    results = {
        'test_loss': test_loss.cpu().detach().float(),
        'test_accuracy': test_acc.cpu().detach().float()
    }

    return results

In [67]:
for n, epoch in enumerate(range(num_epochs)):
    print(f'Epoch {epoch_count + n+1}/{epoch_count + num_epochs}')
    train_results = train_pass(
        model=net,
        dataset=train_dataloader,
        loss_function=criterion,
        optimizer=optimizer,
        accuracy_function=multi_class_accuracy,
        device=device,
        display=True
    )

    test_results = test_pass(
        dataset=test_dataloader,
        model=net,
        loss_function=criterion,
        accuracy_function=multi_class_accuracy,
        device=device,
        display=True
    )

Epoch 1/20
🔨Train loss: 0.7294 || Train acc: 0.8477
🧐Test loss: 0.7743 || Test acc: 0.7679
Epoch 2/20
🔨Train loss: 0.7152 || Train acc: 0.8672
🧐Test loss: 0.7715 || Test acc: 0.7679
Epoch 3/20
🔨Train loss: 0.7214 || Train acc: 0.8516
🧐Test loss: 0.7661 || Test acc: 0.7723
Epoch 4/20
🔨Train loss: 0.7205 || Train acc: 0.8555
🧐Test loss: 0.7923 || Test acc: 0.7656
Epoch 5/20
🔨Train loss: 0.7531 || Train acc: 0.7852
🧐Test loss: 0.7720 || Test acc: 0.7701
Epoch 6/20
🔨Train loss: 0.7469 || Train acc: 0.8008
🧐Test loss: 0.8580 || Test acc: 0.7188
Epoch 7/20
🔨Train loss: 0.7261 || Train acc: 0.8320
🧐Test loss: 0.7797 || Test acc: 0.7679
Epoch 8/20
🔨Train loss: 0.7330 || Train acc: 0.8281
🧐Test loss: 0.7927 || Test acc: 0.7500
Epoch 9/20
🔨Train loss: 0.7498 || Train acc: 0.8125
🧐Test loss: 0.7852 || Test acc: 0.7522
Epoch 10/20
🔨Train loss: 0.7030 || Train acc: 0.8867
🧐Test loss: 0.7931 || Test acc: 0.7522
Epoch 11/20
🔨Train loss: 0.7529 || Train acc: 0.7930
🧐Test loss: 0.7798 || Test acc: 0.77

**markus? which markus, our markus?**

**lets decode**

**lyrics -> embeddings -> model -> class(one hot) -> artist name**

In [68]:
mafia_path = "../data/inference_data/MAFIA.txt"
all_my_love_path = "../data/inference_data/all_my_loving.txt"
bohemian_path = "../data/inference_data/in_39.txt"

**issue with this prediction**

In [89]:
def predict(model, text, device=device):
    X = normalize(embed(clean_up_text(text)))
    model.eval()
    with torch.no_grad():
        X = X.unsqueeze(dim=0).to(device).float()
        y_pred = model(X)
        print(y_pred)
        y_pred = torch.zeros_like(y_pred).scatter_(
            1, torch.argmax(y_pred, dim=1).unsqueeze(dim=1), 1)
        return decode_label(y_pred.squeeze(dim=0))

In [90]:
travis_lyrics = open(mafia_path, 'r').read()
beatles_lyrics = open(all_my_love_path, 'r').read()
queen_lyrics = open(bohemian_path, 'r').read()

print('Artist: Travis Scott, prediction:', predict(net, travis_lyrics))
print('Artis: The Beatles, prediction:', predict(net, beatles_lyrics))
print('Artist: Queen, prediction:', predict(net, queen_lyrics))

tensor(1.2837) tensor(0.3108)
tensor([[0.1432, 0.8512, 0.0056]])
Artist: Travis Scott, prediction: Queen
tensor(1.2908) tensor(0.3309)
tensor([[0.0211, 0.9715, 0.0074]])
Artis: The Beatles, prediction: Queen
tensor(1.3154) tensor(0.3190)
tensor([[0.0250, 0.9677, 0.0073]])
Artist: Queen, prediction: Queen


In [84]:
lyrics = '''
Mama, just killed a man
Put a gun against his head, pulled my trigger, now he's dead
Mama, life had just begun
But now I've gone and thrown it all away
Mama, ooh, didn't mean to make you cry
If I'm not back again this time tomorrow
Carry on, carry on as if nothing really matters

[Verse 2]
Too late, my time has come
Sends shivers down my spine, body's aching all the time
Goodbye, everybody, I've got to go
Gotta leave you all behind and face the truth
Mama, ooh (Any way the wind blows)
I don't wanna die
I sometimes wish I'd never been born at all
[Guitar Solo]

[Verse 3]
I see a little silhouetto of a man
Scaramouche, Scaramouche, will you do the Fandango?
Thunderbolt and lightning, very, very frightening me
(Galileo) Galileo, (Galileo) Galileo, Galileo Figaro magnifico
But I'm just a poor boy, nobody loves me
He's just a poor boy from a poor family
Spare him his life from this monstrosity
Easy come, easy go, will you let me go?
Bismillah! No, we will not let you go
(Let him go) Bismillah! We will not let you go
(Let him go) Bismillah! We will not let you go
(Let me go) Will not let you go
(Let me go) Will not let you go
(Never, never, never, never let me go) Ah
No, no, no, no, no, no, no
(Oh, mamma mia, mamma mia) Mamma mia, let me go
Beelzebub has a devil put aside for me, for me, for me!

[Verse 4]
So you think you can stone me and spit in my eye?
So you think you can love me and leave me to die?
Oh, baby, can't do this to me, baby!
Just gotta get out, just gotta get right outta here
[Outro]
(Ooh)
(Ooh, yeah, ooh, yeah)
Nothing really matters, anyone can see
Nothing really matters
Nothing really matters to me
Any way the wind blows674Embed
'''

predict(net, lyrics)

tensor([[0.0765, 0.9172, 0.0063]])


'Queen'

**let's save the model**

In [59]:
torch.save(net.state_dict(), '../models/lyrics_classifier_level_1.pt')