In [1]:
import sys
import os

# Ruta relativa desde el notebook a la carpeta de scripts
sys.path.append(os.path.abspath("../scripts"))

In [2]:
import numpy as np
import pandas as pd
import torch as t
from datasets import Dataset
from dataset_utils import MiSonGynyDataset, split_songs_into_verses
import task1_config as config

  from .autonotebook import tqdm as notebook_tqdm


### Dataset Creation

In [3]:
df = pd.read_csv("../datasets/subtask1_train.csv")

In [4]:
df.columns

Index(['id', 'lyrics', 'label'], dtype='object')

In [34]:
# T1_TRAIN_1918
# LiveGet tickets as low as $39

df[ df['id'] == 'T1_TRAIN_2013']

Unnamed: 0,id,lyrics,label
2012,T1_TRAIN_2013,"[Letra de ""Quiero Hacerte El Amor""]\n\n[Pre-Co...",M


In [33]:
print(df['lyrics'][df['id'] == 'T1_TRAIN_2051'].to_numpy().tolist()[0])

a
Eres perfecta
Ajá
[Letra de "Corazón Acelera'o"]

[Refrán]
¿Qué importa la edad? (W)
Contigo quiero escaparme
El sabor de tus labios tienen una magia tan interesante
(Los Vaqueros: La trilogía)

[Pre-Coro]
Sé que me piensas
Por tu mirada con desvelo
Permítame otra noche intensa
Que nos volvamos locos
Loco'-loco'-locos los dos

[Coro]
Me estoy muriendo
Agonizando a fuego lento
Por el vaivén que trae tu cuerpo
Que me tiene acelera'o
El corazón acelera'o

Quisiera devolver el tiempo
Para grabar tus movimientos
Que me traen acelera'o
El corazón acelera'o
Uoh-uoh
El corazón acelera'o
Uoh-uoh

(Yo me le pego y me da taquicardia)
(Doble U)

[Verso 1]
Tú me aceleras (heh), me desesperas
Esta historia es como de una novela mi centinela (pam-pam-pam)
Yo y ella nos gustamos desde la escuela (ajá)
Yo la beso en el cuello y rápido se revela
Me prende (hey), con un rose me enciende
Cuando estamos bailando, la miro y me entiende (duro-duro-duro)
Ella es loca conmigo, por que yo soy el que la atiend

Baseline dataset

In [8]:
songs = df['lyrics'].to_numpy()
labels = df['label'].apply(lambda x: 0 if x == 'NM' else 1).to_numpy()
ids = df['id'].to_numpy()

In [21]:
dataset = Dataset.from_dict({"songs": songs, "ids": ids, "labels": labels})
dataset.save_to_disk('../datasets/task1_mil_v0')

Saving the dataset (1/1 shards): 100%|██████████| 2104/2104 [00:00<00:00, 268215.17 examples/s]


Dataset with verses

In [37]:
import re 

def split_songs_into_verses(song_list, verse_size=1, num_verses=20, sentence_split_token=" "):
    """
    1. Get all sentences per song
    2. Clean each sentence by removing 'as low as', sentences with a single token, parenthesis, and squared brackets 
    3. Remove repeated sentences (chorus, and so on)
    4. Split them by chunks (according to chunk size)
    5. Get only first k chunks per song maximum

    if verse_size is greater than the actual number of sentences, it will be ignored
    """
    songs = []

    for idx, song in enumerate(song_list):
        sentences = song.split("\n")

        sentences = [ clean_sentence(s) for s in sentences ]
        sentences = [ s for s in sentences if 'as low as $' not in s and ' ' in s and len(s.strip()) >= 1 ]
        
        # Get only the unique sentences preserving the order of appareance
        sentences = get_most_repeated_sentences(sentences)
        sentences = [ s[0] for s in sentences ]
        
        if len(sentences) > verse_size:
            verses = [ sentence_split_token.join(sentences[i:i + verse_size]).strip() for i in range(0, len(sentences), verse_size)]
            verses = [ clean_sentence(v) for v in verses if len(clean_sentence(v)) > 0 ]
            songs.append(verses[:num_verses])
        else:
            verses = [ sentence.strip() for sentence in sentences ]
            songs.append(verses[:num_verses])

    return songs

def get_most_repeated_sentences(sentences_list):
    results = {}
    for sentence in sentences_list:
        if sentence not in results:
            results[sentence] = 0
        results[sentence] += 1

    sorted_results = sorted(results.items(), key=lambda x:x[1], reverse=True)

    return sorted_results

def clean_sentence(sentence):
    """
    Remove brackets, parenthesis and extra spaces
    """
    sentence = re.sub(r'\[.*\]', '', sentence)
    sentence = re.sub(r'\(.*\)', '', sentence)
    # Delete camel case
    sentence = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', sentence)
    sentence = re.sub(r'\,', ', ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence.strip()

In [38]:
songs  = split_songs_into_verses(df['lyrics'].to_numpy(), num_verses=100, verse_size=1)
labels = df['label'].apply(lambda x: 0 if x == 'NM' else 1).to_numpy()
ids    = df['id'].to_numpy()

In [39]:
sentence_sizes = [ len(sentence.split(" ")) for verses in songs for sentence in verses ]
np.mean(sentence_sizes), np.max(sentence_sizes), np.min(sentence_sizes), np.quantile(sentence_sizes, q=.98)

(np.float64(6.663219644861882), np.int64(213), np.int64(2), np.float64(13.0))

In [42]:
verse_sizes = [ len(verses) for verses in songs ]
np.mean(verse_sizes), np.max(verse_sizes), np.min(verse_sizes), np.quantile(verse_sizes, q=.95)

(np.float64(30.058460076045627), np.int64(100), np.int64(1), np.float64(62.0))

In [41]:
[ (id, verses) for id, verses in zip(ids, songs) if len(verses) > 77 ]

[('T1_TRAIN_0004',
  ['Flor de vida',
   'Es una parte de mí que no puedo controlar',
   'Una parte de mi cuerpo que me impide respirar',
   '¿Quién te enseña a volar? No tengo apoyo',
   'No está muerto y por las noches duerme dentro de un hoyo',
   'Los paisajes cambian',
   'Flor de vida, flor de vida, flor de vida',
   'Las personas también',
   'Un chico de carácter agradable',
   'de suicidio, puede grabarlo en vídeo',
   'de suicidio',
   'Puede grabarlo en vídeo',
   'Algo te atrae sin porqués, no se ve, cambia tu ser',
   'Da sed, caes en un cuarto sin pared',
   'No es de ayer, te consume, sube en tu interior',
   'Es la maldad y el dolor que afloran sin condición',
   'Hay quien llora cuando nota su alma sucia y rota',
   'La calma mora y flota, no se coge ni se toca',
   'No se encoge, pero explota, idiota, el rencor brota',
   'Es peor que la farlopa, su olor queda en tu ropa',
   'Está detrás de todo, de cualquier modo',
   'Sin más, te puede encontrar',
   'Sólo duele po

In [8]:
max( [len(verses) for verses in songs] ), min( [len(verses) for verses in songs] )

(10, 1)

In [32]:
n_verses = len([ (id, verses) for id, verses in zip(ids, songs) if len(verses) < 15 ])
print(f"number of verses with length less than 10: {n_verses}/{len(songs)}")

number of verses with length less than 10: 2070/2104


In [37]:
len([ (id, verses) for id, verses in zip(ids, songs) if len(verses) < 15 ])

2070

In [43]:
dataset = Dataset.from_dict({"songs": songs, "ids": ids, "labels": labels})

In [44]:
dataset.save_to_disk('../datasets/task1_mil_v3')

Saving the dataset (1/1 shards): 100%|██████████| 2104/2104 [00:00<00:00, 209829.89 examples/s]


In [55]:
from huggingface_hub import login

login(token="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")

In [None]:
from transformers import AutoModel
from transformers import AutoTokenizer

modelname = 'meta-llama/Llama-3.1-8B'

tokenizer = AutoTokenizer.from_pretrained(modelname)

model = AutoModel.from_pretrained(modelname
                                             ,num_labels=2
                                             ,output_attentions=False
                                             ,output_hidden_states=False)



  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import torch as t 

inputs = tokenizer(['This is a sentences that I want to'])
inputs = { k: t.tensor(v) for k,v in inputs.items() }

In [None]:
outputs = model(**inputs)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [52]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False