In [2]:
import sys
import os
import pandas as pd
import re

In [6]:
sys.path.append("../")
from src.embeddings import remove_stop_words

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package stopwords to /Users/sbore/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
DATA_FOLDER = "../data/genius_lyrics"
ARTISTS = ["Travis Scott", "Queen", "The Beatles"]

**getting the data into pandas**

In [19]:
df = pd.DataFrame()
for filename in os.listdir(DATA_FOLDER):
    artist_name = filename.split('.')[0]
    if artist_name in ARTISTS:
        temp_df = pd.read_csv(os.path.join(DATA_FOLDER, filename),
                usecols=['artist', 'title', 'lyrics'])
        df = pd.concat([df, temp_df])

df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,SICKO MODE,SICKO MODE Lyrics[Part I]\n\n[Intro: Drake]\nA...
1,Travis Scott,​goosebumps,​goosebumps Lyrics[Intro: Travis Scott]\nYeah\...
2,Travis Scott,BUTTERFLY EFFECT,BUTTERFLY EFFECT Lyrics[Intro]\nAll the commas...
3,Travis Scott,HIGHEST IN THE ROOM,HIGHEST IN THE ROOM Lyrics[Chorus]\nI got room...
4,Travis Scott,Antidote,Antidote Lyrics[Chorus]\nDon't you open up tha...


In [14]:
df.artist.value_counts()

artist
Travis Scott    100
The Beatles     100
Queen           100
Name: count, dtype: int64

In [20]:
def clean_up_text(sentence):
    sentence = str(sentence).lower()
    sentence = re.sub("\[.*\]", "", sentence)  # remove sections markings in brakets
    sentence = re.sub(r'[^\w]', ' ', sentence)  # romove punctuation
    sentence = re.sub(r'[0-9]', '', sentence)  # remove numbers
    # sentence = re.sub(r'\s[a-z]\s', ' ', sentence)  # remove single characters
    # sentence = re.sub(r'^[a-z]\s', '', sentence)  # remove single characters from the start
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # remove extra spaces

    sentence =  re.sub(r'.*lyrics', '', sentence)
    

    return sentence

df.lyrics = df.lyrics.apply(clean_up_text)
# df.lyrics = df.lyrics.apply(remove_stop_words)

In [21]:
df = df.sort_values(by='title')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,16 Chapels,i don t know about you feel like i m floating...
1,Travis Scott,3500,bandana wrapped wrapped round your head the b...
2,Travis Scott,5% TINT,who s that creeping through my window fore yo...
3,Travis Scott,90210,dean what s poppin yo travis mhm ooh ooh ooh ...
4,The Beatles,A Day in the Life,i read the news today oh boy about a lucky ma...


In [17]:
import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
embedding_model = RobertaModel.from_pretrained('roberta-base')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**lets cut the songs into pieces**

In [34]:
song_contents = df['lyrics'].values.tolist()[:5]

def get_number_of_tokens(song_text):
    s = tokenizer.encode_plus(song_text, truncation=False)
    return len(s.input_ids)


df['number_of_tokens'] = df['lyrics'].apply(get_number_of_tokens)

df['number_of_tokens'].describe()

count     300.000000
mean      203.543333
std       141.991597
min         3.000000
25%       104.750000
50%       153.500000
75%       277.250000
max      1095.000000
Name: number_of_tokens, dtype: float64

**lets get those embeddings**

In [35]:
def embed(sentence):
    tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=256, return_token_type_ids=True, padding="max_length", truncation=True)
    input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
    attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)
    with torch.no_grad():
        outputs = embedding_model(input_ids, attention_mask=attention_mask)
    return torch.mean(outputs[0], dim=1).squeeze()


df_dataset = pd.DataFrame()
df_dataset['vectors'] = df['lyrics'].apply(embed)
df_dataset.head()

Unnamed: 0,vectors
0,"[tensor(0.0347), tensor(-0.0162), tensor(0.136..."
1,"[tensor(-0.0329), tensor(-0.0136), tensor(0.10..."
2,"[tensor(0.0066), tensor(-0.0761), tensor(0.123..."
3,"[tensor(0.0437), tensor(-0.0171), tensor(0.089..."
4,"[tensor(0.0498), tensor(-0.1275), tensor(0.058..."


In [64]:
df_dataset['vectors'].values[0].shape

torch.Size([768])

In [43]:
df.artist.unique()

array(['Travis Scott', 'The Beatles', 'Queen'], dtype=object)

In [44]:
# string to tensor
stt = {s: torch.eye(len(ARTISTS))[n] for n, s in enumerate(ARTISTS)}
stt

{'Travis Scott': tensor([1., 0., 0.]),
 'Queen': tensor([0., 1., 0.]),
 'The Beatles': tensor([0., 0., 1.])}

In [45]:
# tesnor to string
tts = {str(v): k for k, v in stt.items()}
tts

{'tensor([1., 0., 0.])': 'Travis Scott',
 'tensor([0., 1., 0.])': 'Queen',
 'tensor([0., 0., 1.])': 'The Beatles'}

In [46]:
def encode_label(label: str):
    return stt[label]

def decode_label(label: torch.tensor) -> str:
    return tts[str(label)]

encode_label("Queen"), decode_label(torch.tensor([1., 0., 0.]))

(tensor([0., 1., 0.]), 'Travis Scott')

In [47]:
df_dataset['artist'] = df['artist'].apply(encode_label)
df_dataset.head()

Unnamed: 0,vectors,artist
0,"[tensor(0.0347), tensor(-0.0162), tensor(0.136...","[tensor(1.), tensor(0.), tensor(0.)]"
1,"[tensor(-0.0329), tensor(-0.0136), tensor(0.10...","[tensor(1.), tensor(0.), tensor(0.)]"
2,"[tensor(0.0066), tensor(-0.0761), tensor(0.123...","[tensor(1.), tensor(0.), tensor(0.)]"
3,"[tensor(0.0437), tensor(-0.0171), tensor(0.089...","[tensor(1.), tensor(0.), tensor(0.)]"
4,"[tensor(0.0498), tensor(-0.1275), tensor(0.058...","[tensor(0.), tensor(0.), tensor(1.)]"


**model??**

let's see if you really trap

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [50]:
class Net(nn.Module):

    def __init__(self, input_units, output_units):
        super().__init__()
        self.fc1 = nn.Linear(input_units, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 4096)
        self.fc4 = nn.Linear(4096, 4096)
        self.fc5 = nn.Linear(4096, 4096)
        self.fc6 = nn.Linear(4096, 4096)
        self.fc7 = nn.Linear(4096, output_units)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.softmax(self.fc7(x), dim=1)
        return x

**turn of the autotune(copilot)\
lets see if you really rap**

In [65]:
# initialize model
net = Net(768, 3).to(device)
epoch_count = 0

In [99]:
# CONSTANTS
torch.manual_seed(42)
train_size = 0.8
learning_rate = 0.1e-5
num_epochs = 200

In [100]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.stack(list(self.data[idx, 0])), torch.stack(list(self.data[idx, -1]))
    
split = int(train_size * len(df_dataset))
train_dataset = MyDataset(df_dataset[:split])
test_dataset = MyDataset(df_dataset[split:])

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [101]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

**training**

In [102]:
def multi_class_accuracy(preds: torch.Tensor, target: torch.Tensor) -> float:
    return ((torch.argmax(preds, dim=1) == torch.argmax(target, dim=1)) * 1).sum()\
        / target.shape[0]

In [103]:
def train_pass(
    model: torch.nn.Module,
    dataset: list,
    loss_function: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    accuracy_function,
    device: torch.device=device,
    display: bool=False) -> dict:
    
    model.train()
    train_acc, train_loss = 0, 0
    N = len(dataset)

    for X, y in dataset:
        X, y = X.to(device).float(), y.to(device).float()
        # forward pass
        y_pred = model(X)
        loss = loss_function(input=y_pred, target=y)

        train_acc += accuracy_function(preds=y_pred, target=y)
        train_loss += loss

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= N
    train_acc /= N

    if display:
        print(f'🔨Train loss: {train_loss:.4f} || Train acc: {train_acc:.4f}')

    results = {
        'train_loss': train_loss,
        'train_accuracy': train_acc
    }

    return results


def test_pass(
    dataset,
    model: torch.nn.Module,
    loss_function: torch.nn.Module,
    accuracy_function,
    device: torch.device=device,
    display: bool=False) -> dict:

    test_loss, test_acc = 0, 0
    N = len(dataset)

    for batch, (X, y) in enumerate(dataset):
        
        X, y = X.to(device).float(), y.to(device).float()

        test_pred = model(X)
        test_loss += loss_function(input=test_pred, target=y)

        test_acc += accuracy_function(preds=test_pred, target=y)

    test_loss /= N
    test_acc /= N
    if display:
        print(f'🧐Test loss: {test_loss:.4f} || Test acc: {test_acc:.4f}')

    results = {
        'test_loss': test_loss.cpu().detach().float(),
        'test_accuracy': test_acc.cpu().detach().float()
    }

    return results

In [104]:
model_path = "../models/lyrics_classifier_level_IMPROVED.pt"
# model_path = os.path.join(None)


accuracies_over_time = {
    "train": [],
    "test": []
}

if os.path.exists(model_path):
    net.load_state_dict(torch.load(model_path))
else: 
    print("Trainin' the NET...")
    for n, epoch in enumerate(range(num_epochs)):
        print(f'Epoch {epoch_count + n+1}/{epoch_count + num_epochs}')
        train_results = train_pass(
            model=net,
            dataset=train_dataloader,
            loss_function=criterion,
            optimizer=optimizer,
            accuracy_function=multi_class_accuracy,
            device=device,
            display=True
        )

        test_results = test_pass(
            dataset=test_dataloader,
            model=net,
            loss_function=criterion,
            accuracy_function=multi_class_accuracy,
            device=device,
            display=True
        )

        accuracies_over_time["train"].append(train_results["train_accuracy"])
        accuracies_over_time["test"].append(test_results["test_accuracy"])

Trainin' the NET...
Epoch 1/200
🔨Train loss: 0.8635 || Train acc: 0.6758
🧐Test loss: 0.8218 || Test acc: 0.7121
Epoch 2/200
🔨Train loss: 0.8530 || Train acc: 0.6953
🧐Test loss: 0.8217 || Test acc: 0.7121
Epoch 3/200
🔨Train loss: 0.8591 || Train acc: 0.6914
🧐Test loss: 0.8171 || Test acc: 0.7321
Epoch 4/200
🔨Train loss: 0.8583 || Train acc: 0.6875
🧐Test loss: 0.8169 || Test acc: 0.7366
Epoch 5/200
🔨Train loss: 0.8501 || Train acc: 0.6992
🧐Test loss: 0.8196 || Test acc: 0.7277
Epoch 6/200
🔨Train loss: 0.8576 || Train acc: 0.6836
🧐Test loss: 0.8162 || Test acc: 0.7344
Epoch 7/200
🔨Train loss: 0.8568 || Train acc: 0.6836
🧐Test loss: 0.8185 || Test acc: 0.7299
Epoch 8/200
🔨Train loss: 0.8484 || Train acc: 0.7031
🧐Test loss: 0.8173 || Test acc: 0.7344
Epoch 9/200
🔨Train loss: 0.8518 || Train acc: 0.7109
🧐Test loss: 0.8180 || Test acc: 0.7299
Epoch 10/200
🔨Train loss: 0.8541 || Train acc: 0.6914
🧐Test loss: 0.8139 || Test acc: 0.7366
Epoch 11/200
🔨Train loss: 0.8459 || Train acc: 0.7070
🧐Test

In [105]:
test_results

{'test_loss': tensor(0.7469), 'test_accuracy': tensor(0.7812)}

**markus? which markus, our markus?**

**lets decode**

**lyrics -> embeddings -> model -> class(one hot) -> artist name**

In [106]:
mafia_path = "../data/inference_data/MAFIA.txt"
all_my_love_path = "../data/inference_data/all_my_loving.txt"
bohemian_path = "../data/inference_data/in_39.txt"

**issue with this prediction**

In [107]:
def predict(model, text, device=device):
    X = embed(clean_up_text(text))
    model.eval()
    with torch.no_grad():
        X = X.unsqueeze(dim=0).to(device).float()
        y_pred = model(X)
        print(y_pred)
        y_pred = torch.zeros_like(y_pred).scatter_(
            1, torch.argmax(y_pred, dim=1).unsqueeze(dim=1), 1)
        return decode_label(y_pred.squeeze(dim=0))

In [108]:
travis_lyrics = open(mafia_path, 'r').read()
beatles_lyrics = open(all_my_love_path, 'r').read()
queen_lyrics = open(bohemian_path, 'r').read()

print('Artist: Travis Scott, prediction:', predict(net, travis_lyrics))
print('Artis: The Beatles, prediction:', predict(net, beatles_lyrics))
print('Artist: Queen, prediction:', predict(net, queen_lyrics))

tensor([[9.9926e-01, 7.4045e-04, 3.9181e-06]])
Artist: Travis Scott, prediction: Travis Scott
tensor([[4.1391e-04, 2.3587e-01, 7.6372e-01]])
Artis: The Beatles, prediction: The Beatles
tensor([[0.0062, 0.9759, 0.0179]])
Artist: Queen, prediction: Queen


In [109]:
lyrics = '''
Mama, just killed a man
Put a gun against his head, pulled my trigger, now he's dead
Mama, life had just begun
But now I've gone and thrown it all away
Mama, ooh, didn't mean to make you cry
If I'm not back again this time tomorrow
Carry on, carry on as if nothing really matters

[Verse 2]
Too late, my time has come
Sends shivers down my spine, body's aching all the time
Goodbye, everybody, I've got to go
Gotta leave you all behind and face the truth
Mama, ooh (Any way the wind blows)
I don't wanna die
I sometimes wish I'd never been born at all
[Guitar Solo]

[Verse 3]
I see a little silhouetto of a man
Scaramouche, Scaramouche, will you do the Fandango?
Thunderbolt and lightning, very, very frightening me
(Galileo) Galileo, (Galileo) Galileo, Galileo Figaro magnifico
But I'm just a poor boy, nobody loves me
He's just a poor boy from a poor family
Spare him his life from this monstrosity
Easy come, easy go, will you let me go?
Bismillah! No, we will not let you go
(Let him go) Bismillah! We will not let you go
(Let him go) Bismillah! We will not let you go
(Let me go) Will not let you go
(Let me go) Will not let you go
(Never, never, never, never let me go) Ah
No, no, no, no, no, no, no
(Oh, mamma mia, mamma mia) Mamma mia, let me go
Beelzebub has a devil put aside for me, for me, for me!

[Verse 4]
So you think you can stone me and spit in my eye?
So you think you can love me and leave me to die?
Oh, baby, can't do this to me, baby!
Just gotta get out, just gotta get right outta here
[Outro]
(Ooh)
(Ooh, yeah, ooh, yeah)
Nothing really matters, anyone can see
Nothing really matters
Nothing really matters to me
Any way the wind blows674Embed
'''

predict(net, lyrics)

tensor([[0.0391, 0.9493, 0.0116]])


'Queen'

**let's save the model**

In [None]:
torch.save(net.state_dict(), '../models/lyrics_classifier_level_1.pt')