In [3]:
import sys
import os
import pandas as pd
import re

In [4]:
sys.path.append("../")
from src.tokenize_data import create_tokenizer, tokenize_text, remove_stop_words

2023-05-03 21:30:55.207721: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-03 21:30:55.423690: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-03 21:30:55.425146: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
DATA_FOLDER = "../data/genius_lyrics"
ARTISTS = ["Travis Scott", "Queen", "The Beatles"]

**getting the data into pandas**

In [6]:
df = pd.DataFrame()
for filename in os.listdir(DATA_FOLDER):
    artist_name = filename.split('.')[0]
    if artist_name in ARTISTS:
        temp_df = pd.read_csv(os.path.join(DATA_FOLDER, filename),
                usecols=['artist', 'title', 'lyrics'])
        df = pd.concat([df, temp_df])

df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,SICKO MODE,SICKO MODE Lyrics[Part I]\n\n[Intro: Drake]\nA...
1,Travis Scott,​goosebumps,​goosebumps Lyrics[Intro: Travis Scott]\nYeah\...
2,Travis Scott,BUTTERFLY EFFECT,BUTTERFLY EFFECT Lyrics[Intro]\nAll the commas...
3,Travis Scott,HIGHEST IN THE ROOM,HIGHEST IN THE ROOM Lyrics[Chorus]\nI got room...
4,Travis Scott,Antidote,Antidote Lyrics[Chorus]\nDon't you open up tha...


In [7]:
df.artist.value_counts()

artist
Travis Scott    100
The Beatles     100
Queen           100
Name: count, dtype: int64

In [8]:
sentence = "Mercy Lyrics[Intro: Fuzzy Jones]\nWell, it is"
sentence = re.sub("\[.*\]", "", sentence)  
sentence

'Mercy Lyrics\nWell, it is'

In [9]:
def clean_up_text(sentence):
    sentence = str(sentence).lower()
    sentence = re.sub("\[.*\]", "", sentence)  # remove sections markings in brakets
    sentence = re.sub(r'[^\w]', ' ', sentence)  # romove punctuation
    sentence = re.sub(r'[0-9]', '', sentence)  # remove numbers
    sentence = re.sub(r'\s[a-z]\s', ' ', sentence)  # remove single characters
    sentence = re.sub(r'^[a-z]\s', '', sentence)  # remove single characters from the start
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # remove extra spaces

    sentence =  re.sub(r'.* lyrics', '', sentence)
    

    return sentence

df.lyrics = df.lyrics.apply(clean_up_text)
df.lyrics = df.lyrics.apply(remove_stop_words)

In [10]:
df = df.sort_values(by='title')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,16 Chapels,know feel like floating night slow move groove...
1,Travis Scott,3500,lyrics bandana wrapped wrapped round head band...
2,Travis Scott,5% TINT,creeping window fore come outside got took end...
3,Travis Scott,90210,lyrics dean poppin yo travis mhm ooh ooh ooh o...
4,The Beatles,A Day in the Life,read news today oh boy lucky man made grade th...


**lets get those embeddings**

In [19]:
import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')


def embed(sentence):
    tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=512, return_token_type_ids=True, padding="max_length", truncation=True)
    input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
    attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    sentence_embedding = torch.mean(outputs[0], dim=1).squeeze()
    return sentence_embedding


df_dataset = pd.DataFrame()
df_dataset['vectors'] = df['lyrics'].apply(embed)
df_dataset.head()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,vectors
0,"[tensor(0.0428), tensor(-0.0517), tensor(0.122..."
1,"[tensor(-0.0514), tensor(-0.0310), tensor(0.10..."
2,"[tensor(-0.0152), tensor(-0.0518), tensor(0.13..."
3,"[tensor(0.0536), tensor(-0.0091), tensor(0.084..."
4,"[tensor(0.0712), tensor(-0.1449), tensor(0.043..."


In [20]:
df.artist.unique()

array(['Travis Scott', 'The Beatles', 'Queen'], dtype=object)

In [21]:
stt = {
    'Travis Scott': torch.tensor([1, 0, 0]),
    'The Beatles': torch.tensor([0, 1, 0]),
    'Queen': torch.tensor([0, 0, 1])
}

tts = {
    str(torch.tensor([1, 0, 0])): 'Travis Scott',
    str(torch.tensor([1, 0, 0])): 'The Beatles',
    str(torch.tensor([1, 0, 0])): 'Queen'
}

def encode_label(label: str):
    return stt[label]

def decode_label(label: torch.tensor) -> str:
    return tts[str(label)]

encode_label("Queen")

tensor([0, 0, 1])

In [22]:
df_dataset['artist'] = df['artist'].apply(encode_label)
df_dataset.head()

Unnamed: 0,vectors,artist
0,"[tensor(0.0428), tensor(-0.0517), tensor(0.122...","[tensor(1), tensor(0), tensor(0)]"
1,"[tensor(-0.0514), tensor(-0.0310), tensor(0.10...","[tensor(1), tensor(0), tensor(0)]"
2,"[tensor(-0.0152), tensor(-0.0518), tensor(0.13...","[tensor(1), tensor(0), tensor(0)]"
3,"[tensor(0.0536), tensor(-0.0091), tensor(0.084...","[tensor(1), tensor(0), tensor(0)]"
4,"[tensor(0.0712), tensor(-0.1449), tensor(0.043...","[tensor(0), tensor(1), tensor(0)]"


**model??**

let's see if you really trap

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [54]:
class Net(nn.Module):

    def __init__(self, input_units, output_units):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_units, 1024)
        self.fc2 = nn.Linear(1024, 2048)
        self.fc3 = nn.Linear(2048, 4096)
        self.fc4 = nn.Linear(4096, 2048)
        self.fc5 = nn.Linear(2048, 2048)
        self.fc6 = nn.Linear(2048, 84)
        self.fc7 = nn.Linear(84, output_units)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.softmax(self.fc7(x), dim=1)
        return x
    

model = Net(768, 3)
x = torch.rand(1, 768)

y = model(x)
y

tensor([[0.3321, 0.3422, 0.3257]], grad_fn=<SoftmaxBackward0>)

**turn of the autotune(copilot)\
lets see if you really rap**

In [44]:
# CONSTANTS
torch.manual_seed(42)
learning_rate = 0.00001
num_epochs = 40

In [55]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.stack(list(self.data[idx, 0])), torch.stack(list(self.data[idx, -1]))
    

dataset = MyDataset(df_dataset)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

**training**

In [58]:
# training loop
for epoch in range(num_epochs):
    avg_loss = 0
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        pred_y = model(batch_x.float())
        loss = criterion(pred_y, batch_y.float())
        avg_loss += loss.item()
        loss.backward()
        optimizer.step()

    print('Epoch [{}/{}], Loss: {:.4f}'
          .format(epoch+1, num_epochs, avg_loss / len(dataloader)))


Epoch [1/40], Loss: 0.6097
Epoch [2/40], Loss: 0.6090
Epoch [3/40], Loss: 0.6238
Epoch [4/40], Loss: 0.6133
Epoch [5/40], Loss: 0.6067
Epoch [6/40], Loss: 0.6071
Epoch [7/40], Loss: 0.6156
Epoch [8/40], Loss: 0.6061
Epoch [9/40], Loss: 0.6081
Epoch [10/40], Loss: 0.6085
Epoch [11/40], Loss: 0.6023
Epoch [12/40], Loss: 0.6169
Epoch [13/40], Loss: 0.6023
Epoch [14/40], Loss: 0.6033
Epoch [15/40], Loss: 0.6056
Epoch [16/40], Loss: 0.6008
Epoch [17/40], Loss: 0.6099
Epoch [18/40], Loss: 0.6039
Epoch [19/40], Loss: 0.5970
Epoch [20/40], Loss: 0.5965
Epoch [21/40], Loss: 0.6016
Epoch [22/40], Loss: 0.5955
Epoch [23/40], Loss: 0.6010
Epoch [24/40], Loss: 0.6000
Epoch [25/40], Loss: 0.5945
Epoch [26/40], Loss: 0.5998
Epoch [27/40], Loss: 0.5963
Epoch [28/40], Loss: 0.6018
Epoch [29/40], Loss: 0.5975
Epoch [30/40], Loss: 0.5917
Epoch [31/40], Loss: 0.5952
Epoch [32/40], Loss: 0.5973
Epoch [33/40], Loss: 0.5958
Epoch [34/40], Loss: 0.5951
Epoch [35/40], Loss: 0.5904
Epoch [36/40], Loss: 0.5959
E