In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import WhitespaceTokenizer

import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score

import torch
from transformers import BertTokenizer, BertConfig, BertModel

from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
df = pd.read_csv('./data/lyrics_def_noDupl.csv')
df.drop('language', axis=1, inplace=True)
df.drop(df[df.year < 1968].index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.genre.value_counts()

Rock          89998
Pop           28759
Metal         20658
Hip-Hop       20166
Country       11708
Electronic     6167
Jazz           4833
Indie          2777
R&B            2733
Folk           1589
Name: genre, dtype: int64

In [None]:
STOPWORDS = nltk.corpus.stopwords.words("english") 
lemmatizer = nltk.stem.WordNetLemmatizer()

STOPWORDS.extend(["i'm","eh", "oh", "aren’t","are not","can’t","cannot","couldn’t","could not","didn’t","did not","doesn’t","does not","don’t","do not","hadn’t","had not","hasn’t","has not","haven’t","have not","he’d","he had","he would","he’ll","he will","he shall","he’s","he is","he has","I’d","I had","I would","I’ll","I will","I shall","I’m","I am","I’ve","I have","isn’t","is not","let’s","let us","mightn’t","might not","mustn’t","must not","shan’t","shall not","she’d","she had","she would","she’ll","she will","she shall","she’s","she is","she has","shouldn’t","should not","that’s","that is","that has","there’s","there is","there has","they’d","they had","they would","they’ll","they will","they shall","they’re","they are","they’ve","they have","we’d","we had","we would","we’re","we are","we’ve","we have","weren’t","were not","what’ll","what will","what shall","what’re","what are","what’s","what is","what has","what’ve","what have","where’s","where is","where has","who’d","who had","who would","who’ll","who will","who shall","who’re","who are","who’s","who is","who has","who’ve","who have","won’t","will not","wouldn’t","would not","you’d","you had","you would","you’ll","you will","you shall","you’re","you are","you’ve","you have"])

In [None]:
def clearLyrics(df):
    lyrics_cleaned = []

    for idx, row in tqdm(df.iterrows(), desc='Cleaning Lyrics', total=len(df)):
        #if idx % 50000 == 0:
        #    print(f'Song number {idx} cleaned')
            
        new_lyric = ""
        new_lyric = re.sub("[^\x00-\x7F]+", ' ', row.lyrics) ## elimina caratteri strani a causa della codifica in ASCII
        new_lyric = re.sub("[\(\[].*?[\)\]]", ' ', new_lyric) ## elimina le parole contenute dentro le parentesi
        new_lyric = re.sub("x[0-9]+", " ", new_lyric) ## elimina x2, x3, ecc.
        new_lyric = re.sub("[0-9]x+", " ", new_lyric) ## elimina 2x, 3x, ecc.
        new_lyric = re.sub("[^\w\s^']", '', new_lyric) ##elimina punteggiatura tranne che apostrofi
        new_lyric = re.sub("\\n", " ", new_lyric) ## elimina gli /n
        new_lyric = re.sub(' {2,}', " ", new_lyric) ## elimina gli spazi superflui
        new_lyric = new_lyric.lower()
        #new_lyric = ' '.join((clean_abbreviation(tok.lower()) for tok in new_lyric))
        #new_lyric = ' '.join([lemmatizer.lemmatize(i) for i in new_lyric if i not in STOPWORDS])

        lyrics_cleaned.append(new_lyric)

    df['lyrics_cleaned'] = lyrics_cleaned

    return df

In [None]:
df = clearLyrics(df)

Cleaning Lyrics:   0%|          | 0/189388 [00:00<?, ?it/s]

In [None]:
torch.cuda.is_available()

True

## Epochs 5 | Dropout 0.3 | Undersampling Training Set

In [None]:
torch.cuda.empty_cache()

In [None]:
np.random.seed(42)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

X = df_train.copy(deep=True)
X.drop('genre', axis=1, inplace=True)
y = df_train['genre']

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
df_X, df_y = rus.fit_resample(X, y)
df_train = pd.concat([df_X, df_y], axis=1)
print(len(df_train),len(df_val), len(df_test))

151510 18939 18939
12550 18939 18939


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = dict(zip(list(df_train.genre.unique()), [n for n in range(10)]))

In [None]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['genre']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 400, truncation=True,
                                return_tensors="pt") for text in df['lyrics_cleaned']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.3):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 10)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 6275/6275 [19:24<00:00,  5.39it/s]


Epochs: 1 | Train Loss:  1.075                 | Train Accuracy:  0.229                 | Val Loss:  1.111                 | Val Accuracy:  0.215


100%|██████████| 6275/6275 [19:23<00:00,  5.39it/s]


Epochs: 2 | Train Loss:  0.947                 | Train Accuracy:  0.348                 | Val Loss:  0.963                 | Val Accuracy:  0.301


100%|██████████| 6275/6275 [19:23<00:00,  5.39it/s]


Epochs: 3 | Train Loss:  0.878                 | Train Accuracy:  0.408                 | Val Loss:  0.991                 | Val Accuracy:  0.279


100%|██████████| 6275/6275 [19:23<00:00,  5.39it/s]


Epochs: 4 | Train Loss:  0.824                 | Train Accuracy:  0.442                 | Val Loss:  0.931                 | Val Accuracy:  0.317


100%|██████████| 6275/6275 [19:23<00:00,  5.39it/s]


Epochs: 5 | Train Loss:  0.763                 | Train Accuracy:  0.492                 | Val Loss:  0.928                 | Val Accuracy:  0.324


In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, df_test)

Test Accuracy:  0.327


## Epochs 10 | Dropout 0.4 | Undersampling Training Set

In [None]:
torch.cuda.empty_cache()

In [None]:
np.random.seed(42)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

X = df_train.copy(deep=True)
X.drop('genre', axis=1, inplace=True)
y = df_train['genre']

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
df_X, df_y = rus.fit_resample(X, y)
df_train = pd.concat([df_X, df_y], axis=1)
print(len(df_train),len(df_val), len(df_test))

151510 18939 18939
12550 18939 18939


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = dict(zip(list(df_train.genre.unique()), [n for n in range(10)]))

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['genre']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 400, truncation=True,
                                return_tensors="pt") for text in df['lyrics_cleaned']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.4):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 10)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 10
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 6275/6275 [18:34<00:00,  5.63it/s]


Epochs: 1 | Train Loss:  1.114                 | Train Accuracy:  0.180                 | Val Loss:  1.073                 | Val Accuracy:  0.245


100%|██████████| 6275/6275 [18:35<00:00,  5.63it/s]


Epochs: 2 | Train Loss:  0.971                 | Train Accuracy:  0.329                 | Val Loss:  0.970                 | Val Accuracy:  0.274


100%|██████████| 6275/6275 [18:34<00:00,  5.63it/s]


Epochs: 3 | Train Loss:  0.889                 | Train Accuracy:  0.382                 | Val Loss:  0.961                 | Val Accuracy:  0.281


100%|██████████| 6275/6275 [18:34<00:00,  5.63it/s]


Epochs: 4 | Train Loss:  0.832                 | Train Accuracy:  0.433                 | Val Loss:  0.944                 | Val Accuracy:  0.295


100%|██████████| 6275/6275 [18:33<00:00,  5.63it/s]


Epochs: 5 | Train Loss:  0.774                 | Train Accuracy:  0.477                 | Val Loss:  0.923                 | Val Accuracy:  0.300


100%|██████████| 6275/6275 [18:33<00:00,  5.63it/s]


Epochs: 6 | Train Loss:  0.713                 | Train Accuracy:  0.524                 | Val Loss:  0.958                 | Val Accuracy:  0.298


100%|██████████| 6275/6275 [18:33<00:00,  5.63it/s]


Epochs: 7 | Train Loss:  0.641                 | Train Accuracy:  0.582                 | Val Loss:  0.931                 | Val Accuracy:  0.326


100%|██████████| 6275/6275 [18:33<00:00,  5.64it/s]


Epochs: 8 | Train Loss:  0.558                 | Train Accuracy:  0.644                 | Val Loss:  0.977                 | Val Accuracy:  0.310


100%|██████████| 6275/6275 [18:33<00:00,  5.64it/s]


Epochs: 9 | Train Loss:  0.469                 | Train Accuracy:  0.713                 | Val Loss:  1.018                 | Val Accuracy:  0.300


100%|██████████| 6275/6275 [18:33<00:00,  5.64it/s]


Epochs: 10 | Train Loss:  0.377                 | Train Accuracy:  0.785                 | Val Loss:  1.064                 | Val Accuracy:  0.322


In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, df_test)

Test Accuracy:  0.326
