## Taylor Swift's Lyrics Generator

Given the dataset of Taylor Swift songs, write a song generator that will continue writing a song given the first 2-3 verses in the style of the mentioned artist.

### Import

In [None]:
#Import necessary libraries
import os
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re #https://docs.python.org/3/library/re.html

import torch
import transformers
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from torch.utils.data import Dataset, DataLoader

#Evaluation of the results
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu

In [None]:
#Import the data 
directory = 'data/Albums/'

lyrics_list = [] #list with all the lyrics

for album_name in os.listdir(directory):
    album_path = os.path.join(directory, album_name)
    for music_name in os.listdir(album_path):
        music_path = os.path.join(album_path, music_name)
        with open(music_path, 'r', encoding='utf-8') as file:
            music_lyrics = file.read()
            lyrics_list.append(music_lyrics)

#Create a dataframe with all the lyrics from all the albums from Taylor Swift
lyrics_df = pd.DataFrame({'Lyrics': lyrics_list})

print("Number of lyrics: ", lyrics_df.shape[0]) 
print(lyrics_df) #print examples of lyrics

In [None]:
#TODO remove this. this was used due to time constraints but must be removed for complete training
#Shuffle and choose some for quicker training
#lyrics_df = lyrics_df.sample(frac = 1) #shuffle the songs
#lyrics_df = lyrics_df.head(200) 

### Clean Data

In [None]:
#Show one example
print(lyrics_df["Lyrics"].iloc[0])

#We should remove the first sentence "104 ContributorsTranslationsEspañolPortuguêsFrançaisClean" that does not belong to the music
#We should remove the last sentence "161Embed" #See how it varies for different examples
#We should remove things that are between [] as they do not belong to the lyrics

In [None]:
#There is a prefix in every lyrics that must be removed
def remove_prefix(lyrics):
    """Remove the prefix the text contains before the beggining of the lyrics"""
    prefix_position = lyrics.find("Lyrics") #the position of the first word in Lyrics
    return lyrics[prefix_position + 6:] #Remove eeverything before, and including, "Lyrics"

#Remove labels
def remove_reg_expressions(lyrics):
    """Remove labels such as [Verse 1], [Guitar], etc and remove last sentence emb/kemb"""
    lyrics = re.sub(r'\[.*?\]', '', lyrics)
    lyrics = re.sub('[0-9]+KEmbed', '', lyrics)
    lyrics = re.sub('[0-9]+Embed', '', lyrics)
    return lyrics


lyrics_df['Lyrics'] = lyrics_df['Lyrics'].apply(remove_prefix)
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].apply(remove_reg_expressions)

In [None]:
print(lyrics_df['Lyrics'].iloc[1]) #Show one clean example

In [None]:
#Check for missing values 
lyrics_df.isna().sum() 

In [None]:
#Check for duplicates
print(f"Number of duplicates: {lyrics_df['Lyrics'].duplicated().sum()}") #How many duplicates were there? 

#Remove duplicates
lyrics_df = lyrics_df.drop_duplicates(subset='Lyrics', keep='first') #Remove duplicates
print(f"Number of lyrics after duplicates removal: {lyrics_df.shape[0]}") #Size after duplicates removal

In [None]:
#Shuffle and split into training and test set
lyrics_df = lyrics_df.sample(frac = 1) #shuffle the songs
df_train, df_test = train_test_split(lyrics_df, test_size=0.01) #A very small amount of lyrics will be separated for "test": the first 2/3 verses will be used as a prompt to the lyrics generator; This step may be removed if we want to create prompts by hand.
df_train, df_val = train_test_split(df_train, test_size=0.1) #Define a train and validation sets for parameter selection

In [None]:
#For the test set, create a column with the first 3 verses of the lyrics that will be used as a initializing prompt for the word generator
def select_first_3_verses(lyrics):
    """Select the first 3 verses from the lyrics."""
    lines = lyrics.split("\n")
    first_3_verses = "\n".join(lines[:4]) 
    return first_3_verses


df_test['First_3_verses'] = df_test['Lyrics'].apply(select_first_3_verses)
print(df_test["First_3_verses"].iloc[0])

### Transform the data

In [None]:
class CustomImageDataset(Dataset):
    #CustomImageDataset extends Dataset from https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
    #It must implement three functions, __init__, __len__ and __getitem__

    def __init__(self, all_lyrics, tokenizer):
        """Using the data_set/list all_lyrics and the tokenizer, we get two torch tensors, 
         the input_ids and the attention_mask"""
        self.tokenizer = tokenizer

        self.input_ids = []
        self.attention_mask = []

        for lyrics in all_lyrics:
            encoding = tokenizer('<|startoftext|>'+ lyrics + '<|endoftext|>',  #'<|startoftext|>'+ 
                                truncation=True, 
                                max_length=tokenizer.model_max_length, #1024
                                padding="max_length",
                                ) #pad until it reaches maximum lenght

            #encoding['attention_mask'][0] = 0 
            self.input_ids.append(torch.tensor(encoding['input_ids']))
            self.attention_mask.append(torch.tensor(encoding['attention_mask'])) #will be 0 for the padded elements


    def __len__(self):
        """The __len__ function returns the number of samples in our dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """The __getitem__ function loads and returns a sample from the dataset at the given index index."""
        image = self.input_ids[index]
        label = self.attention_mask[index] 
        return image, label

In [None]:
#Define the tokenizer to be used. Define the pad_token as for GPT there is no default
model_name = "gpt2" 
tokenizer = GPT2Tokenizer.from_pretrained(model_name) #, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
tokenizer.pad_token = tokenizer.eos_token #using a token of end of text to pad
#tokenizer.pad_token ='[PAD]'

#Get the dataset using CustomImageDataset
dataset_train = CustomImageDataset(df_train["Lyrics"], tokenizer)
dataset_val = CustomImageDataset(df_val["Lyrics"], tokenizer)

In [None]:
#Dataloader: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
#While training a model, we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting
#DataLoader is an iterable that abstracts this complexity

batch_size = 1

train_dataloader = DataLoader(dataset_train, 
                              batch_size=batch_size, 
                              shuffle=True,
                              pin_memory=True) #pin-memory 

val_dataloader = DataLoader(dataset_train, 
                            batch_size=batch_size, 
                            shuffle=True,
                            pin_memory=True)

### Fine-tune the model

**Choosing the specific model**

GPT2Model is the base GPT-2 model and it is usefull for several tasks, including feature extraction, but is not indicated for text generation. 

**GPT2LMHeadModel** is specifically designed for language modeling and text generation tasks. It has a LM Head, or Language Model Head, which is a component added to a pre-trained model to adapt it for language generation tasks. It’s essentially a linear layer that predicts the next token in a sequence, given the previous tokens.

GPT2DoubleHeadsModel is a more flexible model that can handle both language modeling and other tasks, making it suitable for multitask learning scenarios. It is a more complex model due to its dual heads, harder to work with and can consume more computational resources.

GPT2LMHeadModel is primarily built for PyTorch, while TFGPT2LMHeadModel is also based on GPT-2 but is intended for TensorFlow. We will use GPT2LMHeadModel. 

In [None]:
#Choose if we want to do something in the configuration and create model
#configuration = GPT2Config.from_pretrained('gpt2', vocab_size = 50257, n_positions = 1024) #using the default. Change the params here. 
model = GPT2LMHeadModel.from_pretrained(model_name) #, pad_token_id=tokenizer.eos_token_id) #config=configuration

In [None]:
#Choose the best optimizer
#https://pytorch.org/docs/stable/optim.html

#Define the parameters
learning_rate = 1e-3 
epsilon = 1e-8 #term added to the denominator to improve numerical stability 
weight_decay = 1e-2 #weight decay coefficient = L2 regularization applied. Adding a penalty term to the loss function during training that discourages large values for the model's parameters (weights). The regularization term is defined as the sum of squared values of all model parameters. 

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
#optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate, eps = epsilon)

In [None]:
epochs = 3
train_loss_per_epoch = [] #Initialize the train loss list per epoch 
val_loss_per_epoch = [] #Initialize the validation loss list per epoch 

for epoch in range(epochs):
    print(epoch)
    
    ## Training ##
    total_train_loss = 0 #Initialize the loss for this epoch as 0
    model.train() #Set the model in training mode. In this mode, dropout and batch normalization layers introduce randomness during training to prevent overfitting.

    for batch in train_dataloader:
        #From each batch, take the input_ids and the attention_mask
        input_ids = batch[0]
        attention_mask = batch[1]

        #Clean the information from previous batches 
        model.zero_grad()  #Clear any previously calculated gradients before performing a backward pass
        total_loss = 0 #Set loss to 0
        
        #Forward pass
        outputs = model(input_ids,
                        labels=input_ids, 
                        attention_mask = attention_mask)   

        #Get the loss values
        loss = outputs.loss #Loss for this bacht
        total_train_loss += loss.item() #Get the value with item() and add the loss of this batch to the list of losses 

    #Get the average loss for this epoch: Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)    
    train_loss_per_epoch.append(avg_train_loss)

    loss.backward() #Backpropagation: Perform a backward pass to calculate the gradients. 
    optimizer.step() #Parameter updating based on the computed gradients, learning rate, optimizer



    ## Validation ##
    total_val_loss = 0
    model.eval() #Set the model in evaluation mode. Dropout layers do not drop any units during evaluation and batch normalization layers use population statistics (averages and variances collected during training) instead of batch-specific statistics. This ensures consistent and deterministic behavior during inference.

    for batch in val_dataloader:
        #From each batch, take the input_ids and the attention_mask
        input_ids = batch[0]
        attention_mask = batch[1]

        #Clean the information from previous batches 
        model.zero_grad()  #Clear any previously calculated gradients before performing a backward pass
        total_loss = 0 #Set loss to 0
        
        #Forward pass
        outputs = model(input_ids,
                        labels=input_ids, 
                        attention_mask = attention_mask)     

        #Calculate the loss for this batch
        loss = outputs.loss
        #Add the loss of this batch to the list of losses 
        total_val_loss += loss.item() #Get the value

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_loss_per_epoch.append(avg_train_loss)



In [None]:
#Plot the loss function for each epoch
results = {"epoch_list_ks": np.arange(1, epochs + 1, 1),
           "loss_function": train_loss_per_epoch,
           "validation_loss_function": val_loss_per_epoch}

#We create a dataframe so we can apreciate the evolution of the loss function through the epochs
df_loss = pd.DataFrame(data = results)[["loss_function", "validation_loss_function"]]

#Create a plot
fig, ax = plt.subplots(figsize=(7, 4))
sns.lineplot(data=df_loss)
plt.title('Value of the loss function for each epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss function value')
sns.despine()

### Save the fine-tuned model

In [379]:
#Save the model
output_dir = './model/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.json',
 './model/merges.txt',
 './model/added_tokens.json')

In [380]:
#If you want to use the model pre-trained, you can find the link in the README file.

#Load the saved model
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

### Generate lyrics

In [None]:
df_train["Lyrics"].iloc[1]

In [None]:
dataset_test = CustomImageDataset(df_test["Lyrics"], tokenizer)
tensor = dataset_test[0][0]
num_tokens = tensor.size(0) 
num_tokens

In [382]:
#If we want to re-write some of TS songs based on their first 2 or 3 verses, use this:
test_example = 0
text_to_insert = df_test["First_3_verses"].iloc[test_example]
print(text_to_insert)


You booked the night train for a reason
So you could sit there in this hurt
Bustling crowds or silent sleepers


In [383]:
model.eval() #model.eval() is used to ensure consistent and deterministic behavior of the model during text generation.

#Create the prompt and tokenize it
prompt = text_to_insert
input_ids = torch.tensor(tokenizer.encode(prompt, add_special_tokens=True)).unsqueeze(0)

#Generate text
output = model.generate(input_ids, #starting point
                        pad_token_id=tokenizer.eos_token_id,
                        min_new_tokens = 150, #Here we can use a range that is common for TS songs
                        max_new_tokens = 300,
                        do_sample=True, #it enables random sampling of tokens during generation, adding some randomness to the result obtained
                        temperature=10.0, #Controls the randomness of the generated text. A higher temperature makes the text more random by making a picked probability distribution of the words, creative, diverse, while a lower temperature makes it more deterministic and focused, flatter prob distribution
                        #top_k=3, #The model considers the top-k most likely tokens at each step.
                        #top_p=100, #cumulative probability >p in the ordered results
                        num_return_sequences=1, #Returning only one text for now
                        )

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output[0]) #Show the sequence of generated tokens
print(generated_text)

tensor([  198,  1639, 21765,   262,  1755,  4512,   329,   257,  1738,   198,
         2396,   345,   714,  1650,   612,   287,   428,  5938,   198,    33,
          436,  1359, 15779,   393, 10574,  3993,   364,   318,   691,  2219,
          706,   257,  6388,   319,   257,   922,  7374,  3931,   618,   345,
          389,  1654,  2506, 14759,   986,   628,   366, 10995,   616,  6621,
          508,  2925,   351,   502,   736,   284,  3240,   262,  1285,   357,
          271, 10091,   481,  1282,   351,   674,  3656,   532,   674,   604,
           11,   607,  1115, 11875,   287,  2166,  1222,   502,   530,   503,
          492,     1, 11485,  1106,   986,   356,  1541,   423,   534, 13008,
          492,     7,    82, 17403,   798, 23029,    40,   731,   534,   362,
          812,  2084,   389,   655,  8523,  2877,   379,   604,   393,  2029,
          314,   714,   307,   287,   767,  2250,    11,   616,  2802,   373,
        11029,   362,    12,    16,    10,  1528,   981,   428, 

In [384]:
#Save the generated lyrics 
with open("generated_lyrics/generated_lyrics", "w") as text_file:
    text_file.write(generated_text)

### Evaluation

In [385]:
#Remove the first three verses that were given 
original_text = df_test["Lyrics"].iloc[test_example]

original_without_beginning = original_text[len(prompt):]
generated_without_beggining = generated_text[len(prompt):]

In [389]:
#Scores
#Rouge for summarization
#– summaries, compares to one or more reference summaries, -recall, -precision
#-does not care about the order if using unigrams, for bigrams we care about the order of pairs of words #-Rouge-L – longest common sequence
#GLUE scores are usually computed on natural language understanding tasks,


#Rouge Score
#https://huggingface.co/spaces/evaluate-metric/rouge
#https://pypi.org/project/rouge-score/

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True) #rouge1 - n-gram scoring; rougeL - longest common seq
rouge_score = scorer.score(original_without_beginning, 
                      generated_without_beggining) #compare generated text with the original lyrics
rouge_score

{'rouge1': Score(precision=0.42038216560509556, recall=0.22837370242214533, fmeasure=0.2959641255605381),
 'rouge2': Score(precision=0.019230769230769232, recall=0.010416666666666666, fmeasure=0.013513513513513514),
 'rougeL': Score(precision=0.16560509554140126, recall=0.08996539792387544, fmeasure=0.11659192825112107)}

In [388]:
#Bleu is commonly used for machine translation evaluation but can be adapted for text generation.
#Range from 0 to 1: higher scores indicating better quality and similarity 

# Calculate BLEU score for a single sentence
bleu_score = sentence_bleu(original_without_beginning, generated_without_beggining)
bleu_score

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


8.057672682463324e-232

### References

References:

https://huggingface.co/docs/transformers/model_doc/gpt2

https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html

https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py

https://colab.research.google.com/drive/13dZVYEOMhXhkXWfvSMVM1TTtUDrT6Aeh?usp=sharing#scrollTo=v4XhewaV93-_

https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/



