In [1]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader, Dataset

from peft import get_peft_model, LoraConfig
from sentence_transformers import SentenceTransformer, util

from nltk.translate.bleu_score import sentence_bleu
! pip install rouge-score nltk
from rouge_score import rouge_scorer

import numpy as np

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6e3640aea619baec3dd54f36e9abba8b5826ebe737fd7027acb63846454137e5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
# import os
# os.chdir('C:/Users/reese/OneDrive/Documents/MIDS/Fall 2024/DATASCI 266/Final/Data')

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


### Scoring Functions

In [3]:
# Load the pre-trained Sentence-BERT model
STS_model = SentenceTransformer('all-MiniLM-L6-v2')

# Semantic similarity calculation
def compute_semantic_similarity(generated_lyrics, reference_lyrics):
    # Encode the sentences into embeddings
    generated_embedding = STS_model.encode(generated_lyrics, convert_to_tensor=True)
    reference_embedding = STS_model.encode(reference_lyrics, convert_to_tensor=True)

    # Compute cosine similarity between the embeddings
    similarity_score = util.pytorch_cos_sim(generated_embedding, reference_embedding)
    return similarity_score.item()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# BLEU score calculation
def compute_bleu_score(generated_lyrics, reference_lyrics):
    reference_tokens = reference_lyrics.split()  # Tokenize the reference lyrics
    generated_tokens = generated_lyrics.split()  # Tokenize the generated lyrics
    bleu_score = sentence_bleu([reference_tokens], generated_tokens)  # Compute BLEU score
    return bleu_score

In [5]:
# ROUGE score calculation
def compute_rouge_score(generated_lyrics, reference_lyrics):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_lyrics, generated_lyrics)
    return scores

### Dataset & Data Loader

In [6]:
# Prepare Dataset and DataLoader
class LyricsDataset(Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.input_ids[idx]

### Johnny Cash

In [7]:
# Load the JSON data
with open('/content/drive/My Drive/w266 Final Project/data/Johnny_cleaned_songs.json', 'r') as file: #Choose between Johhny Cash, Adele, Elvis
    songs = json.load(file)

# Combine all lyrics into a single string
lyrics_data = "\n\n".join([song['lyrics'] for song in songs if song['lyrics'] is not None])
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Split the lyrics data into smaller chunks
chunk_size = 1024  # Keep chunk size within 1024 tokens
lyrics_chunks = [lyrics_data[i:i + chunk_size] for i in range(0, len(lyrics_data), chunk_size)]

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Configure LoRA adaptation
config = LoraConfig(r = 8, lora_alpha = 32, lora_dropout = 0.1)
model = get_peft_model(model, config)

# Tokenize each chunk
tokenized_inputs = []
for chunk in lyrics_chunks:
    inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True)
    tokenized_inputs.append(inputs)

print(f"Number of chunks processed: {len(tokenized_inputs)}")

# Flatten tokenized inputs for Dataset
flattened_input_ids = [item['input_ids'].squeeze() for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-Tune the GPT-2 Model with LoRA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Generate New Lyrics with the LoRA-adapted Model
model.eval()

Total length of lyrics data: 502398 characters


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Number of chunks processed: 491
Epoch 1 Loss: 4.022095203399658
Epoch 1 Loss: 3.815650463104248
Epoch 1 Loss: 3.6639230251312256
Epoch 1 Loss: 4.090695381164551
Epoch 1 Loss: 3.50703763961792
Epoch 1 Loss: 3.583636522293091
Epoch 1 Loss: 3.7187013626098633
Epoch 1 Loss: 4.439051151275635
Epoch 1 Loss: 3.3586971759796143
Epoch 1 Loss: 4.047092914581299
Epoch 1 Loss: 2.9114418029785156
Epoch 1 Loss: 3.491962194442749
Epoch 1 Loss: 3.928727865219116
Epoch 1 Loss: 3.759618043899536
Epoch 1 Loss: 4.603238105773926
Epoch 1 Loss: 3.534290075302124
Epoch 1 Loss: 3.50844407081604
Epoch 1 Loss: 3.6834516525268555
Epoch 1 Loss: 4.411977767944336
Epoch 1 Loss: 4.077694416046143
Epoch 1 Loss: 3.3883414268493652
Epoch 1 Loss: 2.002924919128418
Epoch 1 Loss: 2.7383270263671875
Epoch 1 Loss: 3.023052215576172
Epoch 1 Loss: 3.577672243118286
Epoch 1 Loss: 3.584977388381958
Epoch 1 Loss: 3.7104415893554688
Epoch 1 Loss: 4.217568874359131
Epoch 1 Loss: 3.7376599311828613
Epoch 1 Loss: 3.0473246574401855


PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (l

In [8]:
prompts = [song['lyrics'].split('\n')[1] for song in songs if song['lyrics'] is not None]
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

semantic_similarities = []
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i, prompt in enumerate(prompts):
    if not prompt.strip():  # Check if prompt is empty or contains only whitespace
        continue  # Skip empty prompts
    # Generate text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids,
                                max_length=500,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                top_k=20,
                                top_p=0.7,
                                temperature=0.7,
                                early_stopping=True)

    generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute scores
    semantic_similarities.append(compute_semantic_similarity(generated_lyrics, reference_lyrics[i]))
    bleu_scores.append(compute_bleu_score(generated_lyrics, reference_lyrics[i]))
    scores = compute_rouge_score(generated_lyrics, reference_lyrics[i])
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute averages
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print overall scores
print(f"Average Semantic Similarity: {avg_semantic_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask an

Average Semantic Similarity: 0.4742122830827194
Average BLEU Score: 0.04122592687950909
Average ROUGE-1: 0.3073353129897605
Average ROUGE-2: 0.07771728645250606
Average ROUGE-L: 0.16108554554645002


In [9]:
# Prompt for generation
prompt = "You wired me awake"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to getlonger songs
model.generate(input_ids,
               max_length=500,
               num_return_sequences=1,
               no_repeat_ngram_size=2,
               top_k=20,
               top_p=0.7,
               temperature=0.7,
               early_stopping=True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


That you've been seen, you're not alone
You've seen me, I've heard you
I've known you, and I know you'll be there

[Chorus]
Oh, my God, oh, God
My God!
And I'm going to be here
But I'll never be alone,
No, no, not ever
Never, ever, never
Just a little bit of love
A little little love, a lot of joy
That's all I can say
Well, it's just a matter of time
Then I will be back
There's no way I won't be
It's a long way to go
Until I get back to my home
So I hope you all love me
Love, love love you too



### Elvis

In [10]:
# Load the JSON data
with open('/content/drive/My Drive/w266 Final Project/data/Elvis_cleaned_songs.json', 'r') as file: #Choose between Johhny Cash, Adele, Elvis
    songs = json.load(file)

# Combine all lyrics into a single string
lyrics_data = "\n\n".join([song['lyrics'] for song in songs if song['lyrics'] is not None])
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Split the lyrics data into smaller chunks
chunk_size = 1024  # Keep chunk size within 1024 tokens
lyrics_chunks = [lyrics_data[i:i + chunk_size] for i in range(0, len(lyrics_data), chunk_size)]

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Configure LoRA adaptation
config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, config)

# Tokenize each chunk
tokenized_inputs = []
for chunk in lyrics_chunks:
    inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True)
    tokenized_inputs.append(inputs)

print(f"Number of chunks processed: {len(tokenized_inputs)}")

# Flatten tokenized inputs for Dataset
flattened_input_ids = [item['input_ids'].squeeze() for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-Tune the GPT-2 Model with LoRA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Generate New Lyrics with the LoRA-adapted Model
model.eval()

Total length of lyrics data: 389230 characters
Number of chunks processed: 381
Epoch 1 Loss: 2.7346370220184326
Epoch 1 Loss: 3.0977351665496826
Epoch 1 Loss: 2.752974033355713
Epoch 1 Loss: 3.2441790103912354
Epoch 1 Loss: 3.34065580368042
Epoch 1 Loss: 3.1764779090881348
Epoch 1 Loss: 3.268885850906372
Epoch 1 Loss: 3.997481346130371
Epoch 1 Loss: 3.4176478385925293
Epoch 1 Loss: 4.418014049530029
Epoch 1 Loss: 2.7680282592773438
Epoch 1 Loss: 4.242470741271973
Epoch 1 Loss: 3.088244676589966
Epoch 1 Loss: 2.624859094619751
Epoch 1 Loss: 3.018771171569824
Epoch 1 Loss: 3.364112138748169
Epoch 1 Loss: 3.327284097671509
Epoch 1 Loss: 3.190556287765503
Epoch 1 Loss: 2.4125771522521973
Epoch 1 Loss: 2.9172236919403076
Epoch 1 Loss: 2.7850887775421143
Epoch 1 Loss: 3.2505621910095215
Epoch 1 Loss: 2.0973613262176514
Epoch 1 Loss: 3.5500035285949707
Epoch 1 Loss: 2.0187225341796875
Epoch 1 Loss: 3.8717401027679443
Epoch 1 Loss: 2.6454684734344482
Epoch 1 Loss: 3.54815673828125
Epoch 1 Loss

PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (l

In [11]:
prompts = [song['lyrics'].split('\n')[1] for song in songs if song['lyrics'] is not None]
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

semantic_similarities = []
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i, prompt in enumerate(prompts):
    if not prompt.strip():  # Check if prompt is empty or contains only whitespace
        continue  # Skip empty prompts
    # Generate text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids,
                                max_length=500,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                top_k=20,
                                top_p=0.7,
                                temperature=0.7,
                                early_stopping=True)

    generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute scores
    semantic_similarities.append(compute_semantic_similarity(generated_lyrics, reference_lyrics[i]))
    bleu_scores.append(compute_bleu_score(generated_lyrics, reference_lyrics[i]))
    scores = compute_rouge_score(generated_lyrics, reference_lyrics[i])
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute averages
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print overall scores
print(f"Average Semantic Similarity: {avg_semantic_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Average Semantic Similarity: 0.5195928968775084
Average BLEU Score: 0.029198471456403845
Average ROUGE-1: 0.2812790341071367
Average ROUGE-2: 0.06572129437699684
Average ROUGE-L: 0.14909636418166697


In [12]:
# Prompt for generation
prompt = "When a boy like me meets a girl like you"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to getlonger songs
model.generate(input_ids,
               max_length=500,
               num_return_sequences=1,
               no_repeat_ngram_size=2,
               top_k=20,
               top_p=0.7,
               temperature=0.7,
               early_stopping=True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(Young dreams of love, young dreams of love)
Young dream of a happy life

[Verse 1]
I'm going to be a good girl
And I'm gonna be happy
But I'll be lonely
So I can't be alone
You know, I've been lonely for a long time
My heart's been beating so hard
That I don't know how to get out
Well, you know what I mean
Just let me go
Let me get back to my life, let's go back home
We'll see each other again
(Verve 1) I love you
Love you so much
Don't you love me so badly
Oh, my God, love
It's so good
To see you again, to see me again again (Ververve 2) Love you, so so well
The love that you have for me
Is so strong
In my heart
When I see your love in my eyes
All the love I have
For you and me, and for you (Chorus) Oh, God
God, oh God (chorus 2-3) God love so very much,
O God of all love (singing) O God that loves so dearly
He loves me and I, O Lord of the world
Lord of my world, Lord, lord of mine
Haven't I seen you before
A little girl, a little boy, little man
Who's got a heart of gold
Where's th

### Adele

In [13]:
# Load the JSON data
with open('/content/drive/My Drive/w266 Final Project/data/Adele_cleaned_songs.json', 'r') as file: #Choose between Johhny Cash, Adele, Elvis
    songs = json.load(file)

# Combine all lyrics into a single string
lyrics_data = "\n\n".join([song['lyrics'] for song in songs if song['lyrics'] is not None])
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Split the lyrics data into smaller chunks
chunk_size = 1024  # Keep chunk size within 1024 tokens
lyrics_chunks = [lyrics_data[i:i + chunk_size] for i in range(0, len(lyrics_data), chunk_size)]

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Configure LoRA adaptation
config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, config)

# Tokenize each chunk
tokenized_inputs = []
for chunk in lyrics_chunks:
    inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True)
    tokenized_inputs.append(inputs)

print(f"Number of chunks processed: {len(tokenized_inputs)}")

# Flatten tokenized inputs for Dataset
flattened_input_ids = [item['input_ids'].squeeze() for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-Tune the GPT-2 Model with LoRA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Generate New Lyrics with the LoRA-adapted Model
model.eval()

Total length of lyrics data: 88122 characters
Number of chunks processed: 87
Epoch 1 Loss: 2.894925594329834
Epoch 1 Loss: 2.971682548522949
Epoch 1 Loss: 3.6312758922576904
Epoch 1 Loss: 2.2032926082611084
Epoch 1 Loss: 2.3273425102233887
Epoch 1 Loss: 3.7976737022399902
Epoch 1 Loss: 3.8205976486206055
Epoch 1 Loss: 3.196150302886963
Epoch 1 Loss: 3.0405638217926025
Epoch 1 Loss: 2.70858097076416
Epoch 1 Loss: 3.694937229156494
Epoch 1 Loss: 3.0684890747070312
Epoch 1 Loss: 3.3280181884765625
Epoch 1 Loss: 3.2484753131866455
Epoch 1 Loss: 2.0252881050109863
Epoch 1 Loss: 2.088021993637085
Epoch 1 Loss: 1.9698636531829834
Epoch 1 Loss: 3.3630282878875732
Epoch 1 Loss: 2.853713035583496
Epoch 1 Loss: 2.1482481956481934
Epoch 1 Loss: 2.3228330612182617
Epoch 1 Loss: 3.3237431049346924
Epoch 1 Loss: 3.493793249130249
Epoch 1 Loss: 2.871731996536255
Epoch 1 Loss: 2.9033446311950684
Epoch 1 Loss: 2.7133893966674805
Epoch 1 Loss: 2.1598165035247803
Epoch 1 Loss: 3.4962077140808105
Epoch 1 L

PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (l

In [14]:
prompts = [song['lyrics'].split('\n')[1] for song in songs if song['lyrics'] is not None]
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

semantic_similarities = []
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i, prompt in enumerate(prompts):
    if not prompt.strip():  # Check if prompt is empty or contains only whitespace
        continue  # Skip empty prompts
    # Generate text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids,
                                max_length=500,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                top_k=20,
                                top_p=0.7,
                                temperature=0.7,
                                early_stopping=True)

    generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute scores
    semantic_similarities.append(compute_semantic_similarity(generated_lyrics, reference_lyrics[i]))
    bleu_scores.append(compute_bleu_score(generated_lyrics, reference_lyrics[i]))
    scores = compute_rouge_score(generated_lyrics, reference_lyrics[i])
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute averages
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print overall scores
print(f"Average Semantic Similarity: {avg_semantic_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Average Semantic Similarity: 0.36788110721569794
Average BLEU Score: 0.013703102038503938
Average ROUGE-1: 0.2377795204057581
Average ROUGE-2: 0.06891373239302008
Average ROUGE-L: 0.1524868597126249


In [15]:
# Prompt for generation
prompt = "I will leave my heart at the door"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to getlonger songs
model.generate(input_ids,
               max_length=500,
               num_return_sequences=1,
               no_repeat_ngram_size=2,
               top_k=20,
               top_p=0.7,
               temperature=0.7,
               early_stopping=True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You're driving me away, give me a reason to stay.

I'm not going to let you down,
You know what I mean?
Cause I'm going back to the way I was before you
And I'll never let that happen again
Because I know you're going through a lot
But I don't want to see you go through that again, I want you to know that I love you, and I can't let it happen to you again. I just want it to be over. And I hope you don' wanna see me go back.
