In [8]:
import sys
import os
import pandas as pd
import re

In [9]:
sys.path.append("../")
from src.tokenize_data import create_tokenizer, tokenize_text, remove_stop_words

2023-05-03 01:31:44.424040: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
DATA_FOLDER = "../data/genius_lyrics"
ARTISTS = ["Travis Scott", "Queen", "The Beatles"]

**getting the data into pandas**

In [11]:
df = pd.DataFrame()
for filename in os.listdir(DATA_FOLDER):
    artist_name = filename.split('.')[0]
    if artist_name in ARTISTS:
        temp_df = pd.read_csv(os.path.join(DATA_FOLDER, filename),
                usecols=['artist', 'title', 'lyrics'])
        df = pd.concat([df, temp_df])

df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,SICKO MODE,SICKO MODE Lyrics[Part I]\n\n[Intro: Drake]\nA...
1,Travis Scott,​goosebumps,​goosebumps Lyrics[Intro: Travis Scott]\nYeah\...
2,Travis Scott,BUTTERFLY EFFECT,BUTTERFLY EFFECT Lyrics[Intro]\nAll the commas...
3,Travis Scott,HIGHEST IN THE ROOM,HIGHEST IN THE ROOM Lyrics[Chorus]\nI got room...
4,Travis Scott,Antidote,Antidote Lyrics[Chorus]\nDon't you open up tha...


In [12]:
df.artist.value_counts()

artist
Travis Scott    100
The Beatles     100
Queen           100
Name: count, dtype: int64

In [13]:
sentence = "Mercy Lyrics[Intro: Fuzzy Jones]\nWell, it is"
sentence = re.sub("\[.*\]", "", sentence)  
sentence

'Mercy Lyrics\nWell, it is'

In [14]:
def clean_up_text(sentence):
    sentence = str(sentence).lower()
    sentence = re.sub("\[.*\]", "", sentence)  # remove sections markings in brakets
    sentence = re.sub(r'[^\w]', ' ', sentence)  # romove punctuation
    sentence = re.sub(r'[0-9]', '', sentence)  # remove numbers
    sentence = re.sub(r'\s[a-z]\s', ' ', sentence)  # remove single characters
    sentence = re.sub(r'^[a-z]\s', '', sentence)  # remove single characters from the start
    sentence = re.sub(r'\s+', ' ', sentence).strip()  # remove extra spaces

    sentence =  re.sub(r'.* lyrics', '', sentence)
    

    return sentence

df.lyrics = df.lyrics.apply(clean_up_text)
df.lyrics = df.lyrics.apply(remove_stop_words)

In [15]:
df = df.sort_values(by='title')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,artist,title,lyrics
0,Travis Scott,16 Chapels,know feel like floating night slow move groove...
1,Travis Scott,3500,lyrics bandana wrapped wrapped round head band...
2,Travis Scott,5% TINT,creeping window fore come outside got took end...
3,Travis Scott,90210,lyrics dean poppin yo travis mhm ooh ooh ooh o...
4,The Beatles,A Day in the Life,read news today oh boy lucky man made grade th...


**lets get those embeddings**

In [23]:
import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Define your input sentence
input_sentence = "This is an example sentence to be tokenized and embedded using RoBERTa."

def embed(sentence):
    tokens = tokenizer.encode_plus(input_sentence, add_special_tokens=True, max_length=512, return_token_type_ids=True, padding="max_length", truncation=True)
    input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
    attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    sentence_embedding = torch.mean(outputs[0], dim=1).squeeze()
    return sentence_embedding


df_dataset = pd.DataFrame()
df_dataset['vectors'] = df['lyrics'].apply(embed)
df_dataset.head()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,vectors
0,"[tensor(0.0499), tensor(0.0723), tensor(0.0283..."
1,"[tensor(0.0499), tensor(0.0723), tensor(0.0283..."
2,"[tensor(0.0499), tensor(0.0723), tensor(0.0283..."
3,"[tensor(0.0499), tensor(0.0723), tensor(0.0283..."
4,"[tensor(0.0499), tensor(0.0723), tensor(0.0283..."


In [24]:
df.artist.unique()

array(['Travis Scott', 'The Beatles', 'Queen'], dtype=object)

In [25]:
stt = {
    'Travis Scott': torch.tensor([1, 0, 0]),
    'The Beatles': torch.tensor([0, 1, 0]),
    'Queen': torch.tensor([0, 0, 1])
}

tts = {
    str(torch.tensor([1, 0, 0])): 'Travis Scott',
    str(torch.tensor([1, 0, 0])): 'The Beatles',
    str(torch.tensor([1, 0, 0])): 'Queen'
}

def encode_label(label: str):
    return stt[label]

def decode_label(label: torch.tensor) -> str:
    return tts[str(label)]

encode_label("Queen")

tensor([0, 0, 1])

In [26]:
df_dataset['artist'] = df['artist'].apply(encode_label)
df_dataset.head()

Unnamed: 0,vectors,artist
0,"[tensor(0.0499), tensor(0.0723), tensor(0.0283...","[tensor(1), tensor(0), tensor(0)]"
1,"[tensor(0.0499), tensor(0.0723), tensor(0.0283...","[tensor(1), tensor(0), tensor(0)]"
2,"[tensor(0.0499), tensor(0.0723), tensor(0.0283...","[tensor(1), tensor(0), tensor(0)]"
3,"[tensor(0.0499), tensor(0.0723), tensor(0.0283...","[tensor(1), tensor(0), tensor(0)]"
4,"[tensor(0.0499), tensor(0.0723), tensor(0.0283...","[tensor(0), tensor(1), tensor(0)]"


**model??**

let's see if you really trap

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [28]:
class Net(nn.Module):

    def __init__(self, input_units, output_units):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_units, 1024)
        self.fc2 = nn.Linear(1024, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    

model = Net(512, 3)
x = torch.rand(1, 512)

model(x)

tensor([[ 0.0088, -0.0224,  0.1655]], grad_fn=<AddmmBackward0>)

**turn of the autotune(copilot)\
lets see if you really rap**

In [None]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx, :-1]), torch.tensor(self.data[idx, -1])
    

# Create the dataset and dataloader
# TODO: repair the dataloader
dataset = MyDataset(df_dataset)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())


# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        pred_y = model(batch_x.float())
        loss = criterion(pred_y.flatten(), batch_y.float())
        loss.backward()
        optimizer.step()

    print('Epoch [{}/{}], Loss: {:.4f}'
          .format(epoch+1, num_epochs, loss.item()))
