In [1]:
import sys
import os

# Relative path from the notebook to the scripts folder; it is appended to
# sys.path so that modules stored there can be imported by later cells.
sys.path.append(os.path.abspath("../scripts"))

In [2]:
import numpy as np
import pandas as pd
import torch as t
from datasets import Dataset
from dataset_utils import MiSonGynyDataset, split_songs_into_verses
import task1_config as config

  from .autonotebook import tqdm as notebook_tqdm


### Dataset Creation

In [3]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../datasets/subtask1_train.csv")

In [4]:
# List the column names of the loaded frame.
df.columns

Index(['id', 'lyrics', 'label'], dtype='object')

In [23]:
# Scratch notes kept from exploration:
#   T1_TRAIN_1918
#   LiveGet tickets as low as $39
# (presumably an ad line found in that song's lyrics -- TODO confirm)

# Show the single row whose id matches.
df.loc[df['id'] == 'T1_TRAIN_1810']

Unnamed: 0,id,lyrics,label
1809,T1_TRAIN_1810,"¿qué están fumando?\n[Letra de ""Humo en la Tra...",M


In [24]:
# Print the full lyrics of one song, selected by its id.
lyrics_of_song = df.loc[df['id'] == 'T1_TRAIN_2086', 'lyrics']
print(lyrics_of_song.iloc[0])


[Letra de "Cielo Eterno - Spotify Singles"]

[Intro: Jasiel Nuñez]
A veces pensamos que el amor no existe
Tal vez por malas experiencias fallidas
Pero qué equivocados estamos, porque sí existe
Y eso nos lo enseña la vida

[Interludio Instrumental]

[Verso 1: Jasiel Nuñez & DannyLux]
Gota tras gota, la lluvia nos moja
Afuera aquí hace frío, pero eso nuestros cuerpos ya ni lo notan
Se nos pasaron las copas
No entiendo por qué le hacemos caso al corazón, aunque él también se equivoca
Pero cuando toca, toca
Tiene malicia tu boca
Y creo que el viento me sopla, quе esto te vuelvе loca

[Coro: Jasiel Nuñez & DannyLux]
Oh, vámonos de aquí, mi amor
Que solo estemos tú y yo
El tiempo paralizó
Esta hermosa situación
Pero qué equivocación
No creía en Cupido
Y con su flecha nos juntó
Así algo nuevo nació
Oh, woh-oh
Oh-oh

[Interludio Instrumental]

[Verso 2: DannyLux & Jasiel Nuñez]
Quiero que esto no sea fugaz como esa estrella
La que le pedimos la noche aquella
Yo ser tu príncipe y tú mi doncell

Baseline dataset

In [8]:
# Column-wise arrays for the baseline dataset: raw lyrics, song ids and binary labels.
songs = df['lyrics'].to_numpy()
ids = df['id'].to_numpy()
# Label encoding: 'NM' -> 0, any other value -> 1.
labels = (df['label'] != 'NM').astype('int64').to_numpy()

In [21]:
# Assemble the baseline dataset from the three arrays and persist it to disk.
baseline_columns = {"songs": songs, "ids": ids, "labels": labels}
dataset = Dataset.from_dict(baseline_columns)
dataset.save_to_disk('../datasets/task1_mil_v0')

Saving the dataset (1/1 shards): 100%|██████████| 2104/2104 [00:00<00:00, 268215.17 examples/s]


Dataset with verses

In [None]:
import re 

def split_songs_into_verses(song_list, verse_size=1, num_verses=20, sentence_split_token=" "):
    """
    Split every song into a list of cleaned, de-duplicated verses.

    Steps applied to each song:
    1. Split the lyrics into sentences on newlines.
    2. Clean each sentence with ``clean_sentence`` and drop the ones that
       contain 'as low as $' or that contain no space (empty or single-token
       sentences).
    3. Remove repeated sentences (chorus, and so on), keeping the first
       occurrence of each one at its original position.
    4. Join every ``verse_size`` consecutive sentences into one verse using
       ``sentence_split_token``.
    5. Keep at most the first ``num_verses`` verses per song.

    If ``verse_size`` is greater than or equal to the number of sentences left
    in a song, it is ignored and every sentence becomes its own verse.

    Args:
        song_list: iterable of lyrics strings, one per song.
        verse_size: number of consecutive sentences joined into one verse.
        num_verses: maximum number of verses kept per song.
        sentence_split_token: separator used when joining sentences. Joined
            verses are passed through ``clean_sentence`` again, so whitespace
            inside the separator is normalized by that call.

    Returns:
        A list with one entry per song (same order as ``song_list``); each
        entry is a list of verse strings.
    """
    songs = []

    for song in song_list:
        sentences = [clean_sentence(s) for s in song.split("\n")]
        # A cleaned sentence that contains a space is necessarily non-empty,
        # so no extra length check is needed.
        sentences = [s for s in sentences if 'as low as $' not in s and ' ' in s]

        # Keep only the unique sentences, preserving the order of appearance
        # (dicts keep insertion order). Ranking by repetition count instead
        # would move chorus lines to the front and make the chunking below
        # join sentences that were never adjacent in the song.
        sentences = list(dict.fromkeys(sentences))

        if len(sentences) > verse_size:
            verses = [
                sentence_split_token.join(sentences[i:i + verse_size]).strip()
                for i in range(0, len(sentences), verse_size)
            ]
            verses = [clean_sentence(v) for v in verses if len(v) > 0]
        else:
            verses = [sentence.strip() for sentence in sentences]

        songs.append(verses[:num_verses])

    return songs

def get_most_repeated_sentences(sentences_list):
    """
    Count the occurrences of each sentence and rank them by frequency.

    Returns a list of ``(sentence, count)`` tuples sorted by count in
    descending order. The sort is stable, so sentences with the same count
    keep their order of first appearance.
    """
    counts = {}
    for sentence in sentences_list:
        counts[sentence] = counts.get(sentence, 0) + 1

    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def clean_sentence(sentence):
    """
    Remove bracketed/parenthesized annotations and normalize whitespace.

    - Every ``[...]`` and ``(...)`` group is removed individually, so text
      located between two groups on the same line is kept.
    - A space is inserted at each boundary between an ASCII lowercase letter
      and an ASCII uppercase letter (e.g. 'quieroEmbed' -> 'quiero Embed').
    - Runs of whitespace are collapsed to a single space and the result is
      stripped.

    Returns the cleaned string (possibly empty).
    """
    # Non-greedy matching: a greedy `.*` would span from the first opening
    # bracket to the last closing one and delete the lyrics in between.
    sentence = re.sub(r'\[.*?\]', '', sentence)
    sentence = re.sub(r'\(.*?\)', '', sentence)
    # Split words that were glued together in camelCase fashion.
    sentence = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence.strip()

In [26]:
# Verse-level inputs: each song becomes a list of at most 100 single-sentence verses.
songs = split_songs_into_verses(df['lyrics'].to_numpy(), verse_size=1, num_verses=100)
# Label encoding: 'NM' -> 0, any other value -> 1.
labels = (df['label'] != 'NM').astype('int64').to_numpy()
ids = df['id'].to_numpy()

UnboundLocalError: cannot access local variable 'sentences' where it is not associated with a value

In [20]:
# Number of space-separated tokens in every verse of every song.
sentence_sizes = [
    len(line.split(" "))
    for song_verses in songs
    for line in song_verses
]
# (mean, max, min, 98th percentile)
(
    np.mean(sentence_sizes),
    np.max(sentence_sizes),
    np.min(sentence_sizes),
    np.quantile(sentence_sizes, q=.98),
)

(np.float64(6.651811508406346), np.int64(213), np.int64(2), np.float64(13.0))

In [21]:
# Number of verses kept for each song.
verse_sizes = list(map(len, songs))
# (mean, max, min, 98th percentile)
(
    np.mean(verse_sizes),
    np.max(verse_sizes),
    np.min(verse_sizes),
    np.quantile(verse_sizes, q=.98),
)

(np.float64(30.106939163498097),
 np.int64(100),
 np.int64(1),
 np.float64(77.94000000000005))

In [22]:
# (id, verses) pairs of the songs that have more than 30 verses.
[
    (song_id, song_verses)
    for song_id, song_verses in zip(ids, songs)
    if len(song_verses) > 30
]

[('T1_TRAIN_0002',
  ['Viajera, te vas de puerta en puerta',
   'Buscando quien te quiera',
   'Viajera flor machita y destrichada',
   'Que has perdido tu belleza',
   'Sin dejar la primavera',
   'Viajera, hoy reflejas en tu rostro',
   'Huellas de un triste pasado',
   'Por vivir, a tu manera',
   'Entregandole a cualquiera',
   'Tus caricias pasajeras',
   'Sin sentir ningun amor',
   'Viajera, hoy te miro con gran pena',
   'Te acompana por donde quiera',
   'El beso de un gran dolor',
   'Ni la sombra te pareces',
   'De lo que fuistes un dia',
   'Hoy te tengo compasion',
   'Oigame mi amor!',
   '- Viajera hoy siento por ti gran pena',
   'Flor machita y destrichada, has perdido tu belleza, viajera',
   'Cada vez que yo te miro! Que pena me da mirarte mi nena',
   'Sufrimiento y soledad en tu vida si... eso lo que te espera',
   'Entregandole a cualquiera, tus caricias ... mira pasajeras',
   'Me expreso mi compasion, por tu condicion, y ya no te quiero viajera',
   'Te vas de 

In [8]:
# Largest and smallest number of verses across all songs.
max(map(len, songs)), min(map(len, songs))

(10, 1)

In [32]:
# Count the songs that have fewer than `max_verses` verses. The threshold is
# defined once so the printed message can never disagree with the filter
# (the previous message said 10 while the filter used 15).
max_verses = 15
# Despite its name, `n_verses` holds a number of songs; the name is kept in
# case other cells refer to it.
n_verses = sum(1 for _, song_verses in zip(ids, songs) if len(song_verses) < max_verses)
print(f"number of songs with fewer than {max_verses} verses: {n_verses}/{len(songs)}")

number of verses with length less than 10: 2070/2104


In [37]:
# Number of songs with fewer than 15 verses.
sum(1 for _, song_verses in zip(ids, songs) if len(song_verses) < 15)

2070

In [33]:
# Build a `Dataset` from the current `songs`, `ids` and `labels` variables.
dataset = Dataset.from_dict({"songs": songs, "ids": ids, "labels": labels})

In [34]:
# Persist the dataset to disk (path is relative to the notebook's working directory).
dataset.save_to_disk('../datasets/task1_mil_v2')

Saving the dataset (1/1 shards): 100%|██████████| 2104/2104 [00:00<00:00, 156882.82 examples/s]


In [55]:
import os

from huggingface_hub import login

# Never hard-code access tokens in a notebook: they leak through version
# control and shared outputs. The token is read from the HF_TOKEN environment
# variable; when it is unset, `token=None` makes `login()` prompt for it.
# NOTE(review): the token that used to be written in this cell is exposed and
# must be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoModel
from transformers import AutoTokenizer

# Checkpoint name; both the tokenizer and the model below are loaded from it.
modelname = 'meta-llama/Llama-3.1-8B'

tokenizer = AutoTokenizer.from_pretrained(modelname)

# NOTE(review): `AutoModel` is used here rather than a classification-specific
# class -- confirm that `num_labels` has the intended effect.
model = AutoModel.from_pretrained(
    modelname,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)



  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import torch as t 

# Ask the tokenizer for PyTorch tensors directly instead of converting the
# returned Python lists by hand; the result is still unpacked with `**inputs`.
inputs = tokenizer(['This is a sentences that I want to'], return_tensors="pt")

In [None]:
# Inference only: disable autograd so no computation graph is kept in memory
# for the forward pass.
with t.no_grad():
    outputs = model(**inputs)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [52]:
# Display the loaded model's module structure.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False