In [None]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_scheduler
import torch
from torch.utils.data import DataLoader, Dataset

from peft import get_peft_model, LoraConfig
from sentence_transformers import SentenceTransformer, util

from nltk.translate.bleu_score import sentence_bleu

! pip install rouge-score nltk
from rouge_score import rouge_scorer

import numpy as np

from itertools import product



In [None]:
# Mount Google Drive so the project data stored there is reachable from Colab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load the JSON data
# Point the path at the artist file to train on (Johnny Cash, Adele or Elvis).
# The encoding is given explicitly so the file is decoded as UTF-8 regardless
# of the platform's default text encoding.
with open('/content/drive/My Drive/w266 Final Project/data/Johnny_cleaned_songs.json', 'r', encoding='utf-8') as file:
    songs = json.load(file)

# Combine all lyrics into a single string, skipping songs whose lyrics are None
lyrics_data = "\n\n".join(song['lyrics'] for song in songs if song['lyrics'] is not None)
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Split the lyrics data into smaller chunks.
# NOTE: chunk_size counts *characters*, not tokens. The slices are cut at fixed
# offsets, so a chunk can start or end in the middle of a word or line.
chunk_size = 1024
lyrics_chunks = [lyrics_data[start:start + chunk_size]
                 for start in range(0, len(lyrics_data), chunk_size)]

Total length of lyrics data: 502398 characters


In [None]:
# Fetch the pretrained GPT-2 weights and the matching tokenizer; both are
# resolved from the same checkpoint name so they stay in sync.
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Wrap the base model with LoRA adapters: rank 8, alpha 16, adapter dropout 0.2.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell without
# re-running the model-loading cell wraps an already wrapped model.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.2,
)
model = get_peft_model(model, config)



In [None]:
# Run the tokenizer over every text chunk, one call per chunk. Each result is
# kept as returned (PyTorch tensors, truncated to at most 1024 tokens).
tokenized_inputs = [
    tokenizer(text_chunk, return_tensors='pt', max_length=1024, truncation=True)
    for text_chunk in lyrics_chunks
]

print(f"Number of chunks processed: {len(tokenized_inputs)}")

Number of chunks processed: 491


In [None]:
# Dataset wrapper used to feed tokenized lyric chunks to a DataLoader
class LyricsDataset(Dataset):
    """Serve each stored sequence as an ``(inputs, labels)`` pair.

    Both elements of the pair are the very same stored object: the sequence
    is used as its own label.
    """

    def __init__(self, input_ids):
        # Indexable collection of sequences; stored as given, not copied.
        self.input_ids = input_ids

    def __len__(self):
        """Number of stored sequences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sequence at ``idx`` twice: once as input, once as label."""
        sequence = self.input_ids[idx]
        return sequence, sequence

# Flatten tokenized inputs for Dataset.
# squeeze(0) removes only the leading axis of each 'input_ids' tensor. A bare
# squeeze() would also collapse a length-1 sequence axis, turning a one-token
# chunk into a 0-d tensor.
# NOTE(review): assumes each 'input_ids' tensor is (1, seq_len), as produced by
# tokenizing a single string with return_tensors='pt' — confirm.
flattened_input_ids = [item['input_ids'].squeeze(0) for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
# batch_size=1: no padding or collate function is set up here, and the chunks
# are not guaranteed to tokenize to equal lengths.
dataloader = DataLoader(dataset, batch_size = 1, shuffle = True)

In [None]:
# Fine-Tune the GPT-2 Model with LoRA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5, weight_decay = 0.01)

epoch = 6  # number of full passes over the dataloader

for e in range(epoch):
    # Accumulate the per-batch losses so that one mean value is reported per
    # epoch instead of one line per optimisation step.
    running_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        # The dataset yields (inputs, labels) pairs; both are the same sequence.
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Guard against an empty dataloader (nothing to average).
    if num_batches:
        print(f"Epoch {e+1} Loss: {running_loss / num_batches}")

Epoch 1 Loss: 3.4038591384887695
Epoch 1 Loss: 3.7428054809570312
Epoch 1 Loss: 4.356794357299805
Epoch 1 Loss: 3.515784978866577
Epoch 1 Loss: 4.049907207489014
Epoch 1 Loss: 3.191981077194214
Epoch 1 Loss: 4.179463863372803
Epoch 1 Loss: 4.051504135131836
Epoch 1 Loss: 4.144811630249023
Epoch 1 Loss: 3.864670991897583
Epoch 1 Loss: 4.4451751708984375
Epoch 1 Loss: 4.265930652618408
Epoch 1 Loss: 3.6272776126861572
Epoch 1 Loss: 3.969184637069702
Epoch 1 Loss: 3.4688377380371094
Epoch 1 Loss: 3.2966268062591553
Epoch 1 Loss: 4.238895416259766
Epoch 1 Loss: 3.958503484725952
Epoch 1 Loss: 3.5566954612731934
Epoch 1 Loss: 4.615248680114746
Epoch 1 Loss: 3.3637263774871826
Epoch 1 Loss: 2.4125967025756836
Epoch 1 Loss: 3.9060676097869873
Epoch 1 Loss: 3.702125072479248
Epoch 1 Loss: 2.0222721099853516
Epoch 1 Loss: 4.000223159790039
Epoch 1 Loss: 3.963681221008301
Epoch 1 Loss: 4.241659164428711
Epoch 1 Loss: 3.5838024616241455
Epoch 1 Loss: 4.090770721435547
Epoch 1 Loss: 4.071255207061

In [None]:
# Generate New Lyrics with the LoRA-adapted Model
# Switch the model to evaluation mode before generating. Because this call is
# the last expression in the cell, the notebook also displays the returned
# module's repr.
model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (l

In [None]:
# Sentence embedding model shared by every similarity computation below
STS_model = SentenceTransformer('all-MiniLM-L6-v2')

# Semantic similarity calculation
def compute_semantic_similarity(generated_lyrics, reference_lyrics):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with ``STS_model`` (generated text first,
    then the reference) and the single similarity value is returned as a
    Python number via ``.item()``.
    """
    embeddings = [
        STS_model.encode(text, convert_to_tensor=True)
        for text in (generated_lyrics, reference_lyrics)
    ]
    return util.pytorch_cos_sim(*embeddings).item()

In [None]:
# BLEU score calculation
def compute_bleu_score(generated_lyrics, reference_lyrics, smoothing_function=None):
    """Compute sentence-level BLEU of generated lyrics against one reference.

    Both texts are tokenized by splitting on whitespace.

    Args:
        generated_lyrics: Hypothesis text.
        reference_lyrics: Reference text; used as the only reference.
        smoothing_function: Optional smoothing callable forwarded unchanged to
            ``sentence_bleu`` (for example a method of NLTK's
            ``SmoothingFunction``). The default ``None`` keeps the unsmoothed
            score, which is what this function returned before the parameter
            existed. Unsmoothed BLEU drops to about 0 as soon as one n-gram
            order has no overlap, which is common for free-form generated text.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    reference_tokens = reference_lyrics.split()
    generated_tokens = generated_lyrics.split()
    return sentence_bleu(
        [reference_tokens],
        generated_tokens,
        smoothing_function=smoothing_function,
    )

# Example usage
#bleu_score1 = compute_bleu_score(generated_lyrics1, reference_lyrics1)
#print(f"BLEU Score 1: {bleu_score1}")


In [None]:
# ROUGE score calculation
def compute_rouge_score(generated_lyrics, reference_lyrics):
    """Score generated lyrics against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming scorer is built on every call. The reference is passed as the
    first argument to ``score`` and the generated text as the second. The
    scorer's result is returned unchanged; the notebook indexes it by
    'rouge1' / 'rouge2' / 'rougeL' and reads ``fmeasure`` from each entry.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(
        reference_lyrics, generated_lyrics
    )

# Example usage
#rouge_scores1 = compute_rouge_score(generated_lyrics1, reference_lyrics1)
#print(f"ROUGE Scores 1: {rouge_scores1}")

In [None]:
# Candidate values for each decoding parameter explored by the grid search
param_grid = {
    "top_k": [10, 20, 30, 40],
    "top_p": [0.7, 0.8, 0.9],
    "temperature": [0.7, 0.9, 1.1],
    "max_length": [250, 500],
}

# Cartesian product of the searched axes, as (top_k, top_p, temperature) tuples.
# NOTE: "max_length" is declared in the grid but is not one of the axes
# combined here.
_searched_axes = ("top_k", "top_p", "temperature")
param_combinations = list(product(*(param_grid[axis] for axis in _searched_axes)))


In [None]:
# Initialize variables
best_params = None
best_score = -float("inf")
results = []

# References: the full lyrics of every song that has lyrics.
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

# Prompts: the second line of each song, built from the same list so that
# prompts[i] and reference_lyrics[i] always belong to the same song.
# Lyrics with fewer than two lines get an empty prompt (skipped in the loop
# below) instead of raising IndexError.
prompts = []
for lyrics in reference_lyrics:
    lyric_lines = lyrics.split('\n')
    prompts.append(lyric_lines[1] if len(lyric_lines) > 1 else "")

# Iterate through parameter combinations
for top_k, top_p, temperature in param_combinations:
    print(f"Testing parameters: top_k={top_k}, top_p={top_p}, temperature={temperature}")

    semantic_similarities = []
    bleu_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    # Evaluate each prompt
    for i, prompt in enumerate(prompts):
        if not prompt.strip():  # Check if prompt is empty or contains only whitespace
            continue  # Skip empty prompts

        # Generate text. The attention mask and pad token id are passed
        # explicitly (GPT-2 has no pad token, so EOS is used) rather than left
        # for generate() to infer. `early_stopping` is not passed because it
        # only applies to beam search, and this call samples (do_sample=True).
        # max_length is fixed at 500 here; the "max_length" entry of
        # param_grid is not part of param_combinations.
        encoded = tokenizer(prompt, return_tensors='pt').to(device)
        output_ids = model.generate(input_ids=encoded.input_ids,
                                    attention_mask=encoded.attention_mask,
                                    max_length=500,
                                    num_return_sequences=1,
                                    no_repeat_ngram_size=2,
                                    top_k=top_k,
                                    top_p=top_p,
                                    temperature=temperature,
                                    pad_token_id=tokenizer.eos_token_id,
                                    do_sample=True)  # sample to allow varied outputs

        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Compute metrics against the full lyrics of the song the prompt came from
        semantic_score = compute_semantic_similarity(generated_text, reference_lyrics[i])
        bleu_score = compute_bleu_score(generated_text, reference_lyrics[i])
        rouge_scores = compute_rouge_score(generated_text, reference_lyrics[i])

        # Collect scores
        semantic_similarities.append(semantic_score)
        bleu_scores.append(bleu_score)
        rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
        rouge2_scores.append(rouge_scores["rouge2"].fmeasure)
        rougeL_scores.append(rouge_scores["rougeL"].fmeasure)

    # Without any evaluated prompt the means below would be NaN; skip instead.
    if not semantic_similarities:
        print("No usable prompts; skipping this parameter combination.")
        continue

    # Compute averages
    avg_semantic_similarity = np.mean(semantic_similarities)
    avg_bleu = np.mean(bleu_scores)
    avg_rouge1 = np.mean(rouge1_scores)
    avg_rouge2 = np.mean(rouge2_scores)
    avg_rougeL = np.mean(rougeL_scores)

    print(f"Semantic Similarity Score: {avg_semantic_similarity}")

    # Log results
    results.append({
        "params": {"top_k": top_k, "top_p": top_p, "temperature": temperature},
        "semantic_similarity": avg_semantic_similarity,
        "bleu": avg_bleu,
        "rouge1": avg_rouge1,
        "rouge2": avg_rouge2,
        "rougeL": avg_rougeL
    })

    # Update best parameters (selection is by mean semantic similarity only)
    if avg_semantic_similarity > best_score:
        best_score = avg_semantic_similarity
        best_params = {"top_k": top_k, "top_p": top_p, "temperature": temperature}

# Display best results
print(f"Best Parameters: {best_params}")
print(f"Best Semantic Similarity: {best_score}")

# Display all results
for res in results:
    print(f"Params: {res['params']} -> Semantic: {res['semantic_similarity']}, BLEU: {res['bleu']}, ROUGE-1: {res['rouge1']}, ROUGE-2: {res['rouge2']}, ROUGE-L: {res['rougeL']}")


In [None]:
# Prompt for generation
prompt = "You wired me awake"
encoded_prompt = tokenizer(prompt, return_tensors='pt').to(device)
input_ids = encoded_prompt.input_ids

# Generate lyrics with the best decoding parameters found by the grid search.
# Increase max_length to get longer songs.
# The attention mask and pad token id are passed explicitly (GPT-2 has no pad
# token, so EOS is used). `early_stopping` is not passed because it only
# applies to beam search, and this call samples (do_sample=True).
output_ids = model.generate(input_ids=input_ids,
                            attention_mask=encoded_prompt.attention_mask,
                            max_length=500,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2,
                            top_k = best_params['top_k'],
                            top_p = best_params['top_p'],
                            temperature = best_params['temperature'],
                            pad_token_id = tokenizer.eos_token_id,
                            do_sample = True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)