# Setup

In [1]:
!pip install transformers
!pip install datasets
!pip install sacrebleu
!pip install sentencepiece

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [2]:
import transformers
import torch
import logging
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from datasets import Dataset, load_metric
import os
from tqdm.notebook import tqdm
import numpy as np

In [3]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
if not torch.cuda.is_available():
    logging.warning(
        'GPU device not found. Go to Runtime > Change Runtime type and set Hardware accelerator to "GPU"'
    )
    logging.warning("We will use the CPU, this will be very slow")
else:
    print(f"Cuda device found: {torch.cuda.get_device_name(0)}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Cuda device found: Tesla T4


# Training Data

In [5]:
data_path = "drive/MyDrive/EHU/Apps1/final_project/data"

In [6]:
train_df = pd.read_csv(f"{data_path}/train.csv")
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6484 entries, 0 to 6483
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   artist   6484 non-null   object
 1   song     6484 non-null   object
 2   tag      6484 non-null   object
 3   content  6484 non-null   object
 4   tag_2    6075 non-null   object
dtypes: object(5)
memory usage: 253.4+ KB


Unnamed: 0,artist,song,tag,content,tag_2
0,5 Seconds of Summer,Easier,"[Outro: All, Luke]",I love you so much that I hate you (Hate you)\...,[Outro]
1,5 Seconds of Summer,Easier,[Verse 1: Luke],Why do we always gotta run away?\nAnd we wind ...,[Verse]
2,5 Seconds of Summer,Easier,[Bridge: Luke],The hardest part of all (Da-da-da-dum-da-da-da...,[Bridge]
3,5 Seconds of Summer,Easier,"[Pre-Chorus: All, Luke]","I love you so much that I hate you\nRight now,...",[Pre-Chorus]
4,5 Seconds of Summer,Easier,[Intro: Luke],Is it easier to stay? Is it easier to go?\nI d...,[Intro]


In [7]:
val_df = pd.read_csv(f"{data_path}/val.csv")
test_df = pd.read_csv(f"{data_path}/test.csv")

In [8]:
def generate_prompt(part_of_song: str, song_name: str, artist_name: str):
    return f"""Generate the {part_of_song} of a song called '{song_name}' by {artist_name}"""


train_df["prompt"] = train_df.apply(
    lambda row: generate_prompt(row["tag_2"], row["song"], row["artist"]), axis=1
)
val_df["prompt"] = val_df.apply(
    lambda row: generate_prompt(row["tag_2"], row["song"], row["artist"]), axis=1
)
test_df["prompt"] = test_df.apply(
    lambda row: generate_prompt(row["tag_2"], row["song"], row["artist"]), axis=1
)

In [9]:
generate_prompt("[Verse]", "Someone like you", "Adele")

"Generate the [Verse] of a song called 'Someone like you' by Adele"

In [10]:
print(f"{len(train_df)=}")
print(f"{len(val_df)=}")
print(f"{len(test_df)=}")

len(train_df)=6484
len(val_df)=1621
len(test_df)=961


# Prepare data

In [11]:
model_name = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
learning_rate = 1e-4
train_batch_size = 8
gradient_accumulation_steps = 4
validation_batch_size = 8
num_training_epochs = 10
num_beams = 1

In [13]:
max_source_length = 256  # Max number of tokens after tokenization
max_target_length = 256  # Max number of tokens after tokenization


def preprocess_function(examples):
    inputs = examples["prompt"]
    targets = examples["content"]

    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        padding=False,  # We will padd the data latter
        truncation=True,
    )

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=False, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [14]:
# train from pandas to HF
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.map(
    preprocess_function,  # Preprocess function to apply
    batch_size=8,
    batched=True,  # Operate in batches instead of preprocessing all the data at once, very usefull to avoid running out of memory when preprocessing huge datasets
    num_proc=os.cpu_count(),  # Number of CPU core to use, more cores = faster preprocessing
    remove_columns=[
        "tag",
        "song",
        "content",
        "tag_2",
        "artist",
        "prompt",
    ],  # Remove the text columns, we only need model inputs
    load_from_cache_file=False,  # Do not load the dataset from Cache
    desc="Running tokenizer on dataset",
)

train_dataset

Running tokenizer on dataset (num_proc=2):   0%|          | 0/6484 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6484
})

In [15]:
# validation from pandas to HF
validation_dataset = Dataset.from_pandas(val_df)
validation_dataset = validation_dataset.map(
    preprocess_function,
    batch_size=1,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=[
        "tag",
        "song",
        "content",
        "tag_2",
        "artist",
        "prompt",
    ],  # Remove the text columns, we only need model inputs
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
validation_dataset

Running tokenizer on dataset (num_proc=2):   0%|          | 0/1621 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1621
})

In [16]:
# test from pandas to HF
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(
    preprocess_function,
    batch_size=1,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=[
        "tag",
        "song",
        "content",
        "tag_2",
        "artist",
        "prompt",
    ],  # Remove the text columns, we only need model inputs
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
test_dataset

Running tokenizer on dataset (num_proc=2):   0%|          | 0/961 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 961
})

In [17]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=8,  # GPUs prefer inputs with a size multiple of 8, it can speed up computation
)

In [18]:
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=train_batch_size,
)
validation_dataloader = DataLoader(
    validation_dataset,
    shuffle=False,
    collate_fn=data_collator,
    batch_size=validation_batch_size,
)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    collate_fn=data_collator,
    batch_size=validation_batch_size,  # we don't really need to set a new batch size for test
)

In [19]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "gamma", "beta"]
optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay_rate": 0.01,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay_rate": 0.0,
    },
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [20]:
metric = load_metric("sacrebleu")

  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [21]:
model = model.to(device)  # Load the model in the GPU

In [22]:
import math

max_train_steps = (
    math.ceil(len(train_dataloader) / gradient_accumulation_steps) * num_training_epochs
)

# Training

In [47]:
num_training_epochs = 5

In [None]:
running_loss = 0
num_batches = 0
completed_steps = 0

with tqdm(
    total=num_training_epochs,
    ascii=True,
    desc="Training",
) as train_progress_bar:

    for epoch in range(num_training_epochs):
        # Training

        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        for step, batch in enumerate(train_dataloader):

            # Forward Pass
            outputs = model(**batch.to(device))
            # If we pass the labels in the batch the models automatically
            # computes the loss and returns it. See: https://huggingface.co/docs/transformers/v4.17.0/en/model_doc/t5#transformers.T5ForConditionalGeneration.forward.labels
            loss = outputs.loss

            # Divide the loss by the gradient accumulation steps
            loss = loss / gradient_accumulation_steps

            # Backward pass
            loss.backward()
            running_loss += loss.item()
            num_batches += 1
            train_progress_bar.set_description(
                f"Training. Running loss: {round(loss.item() / num_batches,5)}"
            )
            if (
                step % gradient_accumulation_steps == 0
                or step == len(train_dataloader) - 1
            ):
                # Run step in the accumulated gradients
                optimizer.step()

                # Clear out the gradients
                optimizer.zero_grad()

                train_progress_bar.update(1)
                completed_steps += 1

        # ========================================================
        # Validation

        # Put model in evaluation mode
        model.eval()

        with tqdm(
            total=len(validation_dataloader),
            ascii=True,
            desc="Validation",
        ) as validation_progress_bar:

            # Parameters to control generation
            # For a list of all available parameters see: https://huggingface.co/docs/transformers/v4.17.0/en/main_classes/model#transformers.generation_utils.GenerationMixin.generate

            gen_kwargs = {
                "max_length": max_target_length,
                "num_beams": num_beams,
            }

            for step, batch in enumerate(validation_dataloader):
                with torch.no_grad():  # Do not compute gradients

                    # In inference we do not use the forward pass, we use the generate method
                    # This method will generate words one by one. To generate the next word
                    # the model will see the input sequence and the previous generated words
                    generated_tokens = model.generate(
                        batch["input_ids"].to(device),
                        attention_mask=batch["attention_mask"].to(device),
                        **gen_kwargs,
                    )

                    # Copy the data back to the CPU
                    labels = batch["labels"].cpu().numpy()
                    generated_tokens = generated_tokens.cpu().numpy()

                    # Decode the model generated ids
                    decoded_preds = tokenizer.batch_decode(
                        generated_tokens,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=True,
                    )

                    # Decode the encoded labels ids
                    decoded_labels = tokenizer.batch_decode(
                        labels,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=True,
                    )

                    # Add batch to the metric
                    metric.add_batch(
                        predictions=decoded_preds,
                        references=[[s] for s in decoded_labels],
                    )

                    validation_progress_bar.update(1)

        # Calculate metric after all batches have been processed
        eval_metric = metric.compute()
        print(f"Validation epoch {epoch}: {eval_metric['score']}")

Training:   0%|          | 0/5 [00:00<?, ?it/s]

Validation:   0%|          | 0/203 [00:00<?, ?it/s]

In [None]:
model.save_pretrained(f"{data_path}/model", from_pt=True)

# Evaluation

In [26]:
# we take the parameters from before
max_target_length = 256
num_beams = 5

gen_kwargs = {
    "max_length": max_target_length,
    "num_beams": num_beams,
}

sentences_len = []

In [27]:
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(
    preprocess_function,
    batch_size=1000,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=["HS","CN"],
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=8,  # GPUs prefer inputs with a size multiple of 8, it can speed up computation
)

In [30]:
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    collate_fn=data_collator,
    batch_size=1,
)

In [31]:
predictions_list = []
labels_list = []

with tqdm(
    total=len(test_df) // 8,  # length of data divided into 8 batches
    ascii=True,
) as progress_bar:
    for batch in test_dataloader:
        generated_tokens = model.generate(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            **gen_kwargs,
        )

        # Copy the data back to the CPU
        labels = batch["labels"].cpu().numpy()
        generated_tokens = generated_tokens.cpu().numpy()

        # Decode the model generated ids
        decoded_preds = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        predictions_list.append(decoded_preds)

        # Decode the encoded labels ids
        decoded_labels = tokenizer.batch_decode(
            labels,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        labels_list.append(decoded_labels)

        # Add batch to the metric
        metric.add_batch(
            predictions=decoded_preds, references=[[s] for s in decoded_labels]
        )
        progress_bar.update(1)

    # Calculate metric after all batches have been processed
    eval_metric = metric.compute()
    print(eval_metric["score"])

  0%|          | 0/120 [00:00<?, ?it/s]

0.5333568698734513


In [43]:
print(f'Test BLEU: {eval_metric["score"]}')

Test BLEU: 0.5333568698734513


# Generate your own lyrics

In [33]:
def generate_prediction(model, tokenizer, input_sequence):
    model_input = tokenizer(
        [input_sequence],
        return_tensors="pt",
        padding=False,
        max_length=256,
        truncation=True,
    ).input_ids.to(device)

    gen_kwargs = {
        "max_length": 256,
        "num_beams": 5,
        "num_return_sequences": 3,  # Output the top 3 most probable sequences
    }

    outputs = model.generate(input_ids=model_input, **gen_kwargs)
    generated_sequences = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return generated_sequences

In [36]:
input_sequence = input("Write your sentences:")
generated_sequences = generate_prediction(model, tokenizer, input_sequence)
for n, s in enumerate(generated_sequences):
    print(f"\x1b[32m{n}\x1b[0m: {s}")

Write your sentences:Generate the [VERSE] of a song called 'I'm a girl' by Taylor Swift
[32m0[0m: I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm a girl, I'm 
[32m1[0m: I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl) I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl) I'm a girl (I'm a girl (I'm a girl) I'm a girl (I'm
[32m2[0m: I'm 

In [45]:
input_sequence = "Generate the [Chorus] of a song called 'Subway' by The Weeknd"
generated_sequences = generate_prediction(model, tokenizer, input_sequence)
for n, s in enumerate(generated_sequences):
    print(f"\x1b[32m{n}\x1b[0m: {s}")

[32m0[0m: 'Cause I'm a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, 'Cause I don't know why I'm a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, 
[32m1[0m: 'Cause I'm a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, 'Cause I don't know what I'm a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, a girl, 

In [46]:
input_sequence = "Generate the [Bridge] of a song called 'I am a boy' by Justin Bieber"
generated_sequences = generate_prediction(model, tokenizer, input_sequence)
for n, s in enumerate(generated_sequences):
    print(f"\x1b[32m{n}\x1b[0m: {s}")

[32m0[0m: I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am a boy I am 
[32m1[0m: I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a boy I'm a
[32m2[0m: I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I'm a boy, I am a boy


In [44]:
input_sequence = "Generate the [Refrain] of a song called 'Vampire' by Ed Sheeran"
generated_sequences = generate_prediction(model, tokenizer, input_sequence)
for n, s in enumerate(generated_sequences):
    print(f"\x1b[32m{n}\x1b[0m: {s}")

[32m0[0m: 'Cause I'm a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire
[32m1[0m: 'Cause I'm a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire
[32m2[0m: 'Cause I'm a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire, a vampire,...
