In [1]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader, Dataset
import os
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu

! pip install rouge-score nltk
from rouge_score import rouge_scorer




In [2]:
# import os
# os.chdir('C:/Users/reese/OneDrive/Documents/MIDS/Fall 2024/DATASCI 266/Final/Data')

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Load the pre-trained Sentence-BERT model
STS_model = SentenceTransformer('all-MiniLM-L6-v2')

# Semantic similarity calculation
def compute_semantic_similarity(generated_lyrics, reference_lyrics):
    # Encode the sentences into embeddings
    generated_embedding = STS_model.encode(generated_lyrics, convert_to_tensor=True)
    reference_embedding = STS_model.encode(reference_lyrics, convert_to_tensor=True)

    # Compute cosine similarity between the embeddings
    similarity_score = util.pytorch_cos_sim(generated_embedding, reference_embedding)
    return similarity_score.item()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# BLEU score calculation
def compute_bleu_score(generated_lyrics, reference_lyrics):
    reference_tokens = reference_lyrics.split()  # Tokenize the reference lyrics
    generated_tokens = generated_lyrics.split()  # Tokenize the generated lyrics
    bleu_score = sentence_bleu([reference_tokens], generated_tokens)  # Compute BLEU score
    return bleu_score

In [5]:
# ROUGE score calculation
def compute_rouge_score(generated_lyrics, reference_lyrics):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_lyrics, generated_lyrics)
    return scores

## Johnny Cash

In [6]:
# Load the JSON data
with open('/content/drive/My Drive/w266 Final Project/data/Johnny_cleaned_songs.json', 'r') as file: #Choose between Johhny Cash, Adele, Elvis
    songs = json.load(file)

# Combine all lyrics into a single string
lyrics_data = "\n\n".join([song['lyrics'] for song in songs if song['lyrics'] is not None])
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Split the lyrics data into smaller chunks
chunk_size = 1024  # You can adjust this size but it has to stay at max 1024 in size
lyrics_chunks = [lyrics_data[i:i + chunk_size] for i in range(0, len(lyrics_data), chunk_size)]

# Tokenize each chunk
tokenized_inputs = []
for chunk in lyrics_chunks:
    inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True)
    tokenized_inputs.append(inputs)

print(f"Number of chunks processed: {len(tokenized_inputs)}")

# Prepare Dataset and DataLoader
class LyricsDataset(Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.input_ids[idx]

# Flatten tokenized inputs for Dataset
flattened_input_ids = [item['input_ids'].squeeze() for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-Tune the GPT-2 Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Generate New Lyrics
model.eval()
#%%
# Prompt for generation
prompt1 = "You wired me awake" #Rusty Cage
input_ids1 = tokenizer(prompt1, return_tensors='pt').input_ids.to(device)

prompt2 = "Well they're building a gallows outside my cell" #25 Minutes to Go
input_ids2 = tokenizer(prompt2, return_tensors='pt').input_ids.to(device)

prompt3 = "Every town has its town bum" # Abner Brown
input_ids3 = tokenizer(prompt3, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to get longer songs
output_ids1 = model.generate(input_ids1, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)
output_ids2 = model.generate(input_ids2, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)
output_ids3 = model.generate(input_ids3, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print
generated_lyrics1 = tokenizer.decode(output_ids1[0], skip_special_tokens=True)
generated_lyrics2 = tokenizer.decode(output_ids2[0], skip_special_tokens=True)
generated_lyrics3 = tokenizer.decode(output_ids3[0], skip_special_tokens=True)



Total length of lyrics data: 502398 characters


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Number of chunks processed: 491
Epoch 1 Loss: 3.868964910507202
Epoch 1 Loss: 3.7420225143432617
Epoch 1 Loss: 3.777437686920166
Epoch 1 Loss: 3.377272605895996
Epoch 1 Loss: 3.1890878677368164
Epoch 1 Loss: 3.6093716621398926
Epoch 1 Loss: 4.244319915771484
Epoch 1 Loss: 3.2866673469543457
Epoch 1 Loss: 2.862004518508911
Epoch 1 Loss: 3.250272035598755
Epoch 1 Loss: 4.001898288726807
Epoch 1 Loss: 2.3530080318450928
Epoch 1 Loss: 3.4974725246429443
Epoch 1 Loss: 3.453845500946045
Epoch 1 Loss: 3.04011607170105
Epoch 1 Loss: 3.7648472785949707
Epoch 1 Loss: 3.0353150367736816
Epoch 1 Loss: 3.5372064113616943
Epoch 1 Loss: 2.546428918838501
Epoch 1 Loss: 3.829205274581909
Epoch 1 Loss: 2.9929442405700684
Epoch 1 Loss: 2.5899057388305664
Epoch 1 Loss: 2.956357002258301
Epoch 1 Loss: 3.4570698738098145
Epoch 1 Loss: 3.2864251136779785
Epoch 1 Loss: 3.8198390007019043
Epoch 1 Loss: 3.962730646133423
Epoch 1 Loss: 3.259167194366455
Epoch 1 Loss: 4.130068778991699
Epoch 1 Loss: 3.67839956283

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Epoch 3 Loss: 2.75286602973938


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [7]:
prompts = [song['lyrics'].split('\n')[1] for song in songs if song['lyrics'] is not None]
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

semantic_similarities = []
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i, prompt in enumerate(prompts):
    if not prompt.strip():  # Check if prompt is empty or contains only whitespace
        continue  # Skip empty prompts
    # Generate text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids,
                                max_length=500,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                top_k=20,
                                top_p=0.7,
                                temperature=0.7,
                                early_stopping=True)

    generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute scores
    semantic_similarities.append(compute_semantic_similarity(generated_lyrics, reference_lyrics[i]))
    bleu_scores.append(compute_bleu_score(generated_lyrics, reference_lyrics[i]))
    scores = compute_rouge_score(generated_lyrics, reference_lyrics[i])
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute averages
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print overall scores
print(f"Average Semantic Similarity: {avg_semantic_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Average Semantic Similarity: 0.4931264477069521
Average BLEU Score: 0.04081852620557161
Average ROUGE-1: 0.3081357267141548
Average ROUGE-2: 0.07561454698180604
Average ROUGE-L: 0.15783110966527703


In [8]:
# Prompt for generation
prompt = "You wired me awake"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to getlonger songs
model.generate(input_ids,
               max_length=500,
               num_return_sequences=1,
               no_repeat_ngram_size=2,
               top_k=20,
               top_p=0.7,
               temperature=0.7,
               early_stopping=True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


That you've been seen
You've seen me

[Chorus]
I've got a little bit of fun
But I'm not the kind to go out and do nothing
And I don't like to be seen by nobody
So I've had to learn to walk away
Because I know I'll be remembered
For what I did
In the end
The only thing that matters
Is that you're gone
There's no reason to doubt
That I will
If you'll forgive me for what you did to me,
Then I won't be the sort to leave
Cause I have to know that I can
Just as long as you don’t try to hurt me again
Until I do
Oh, I love you
Love you, love me and don
Don't try and hurt
Like I said
When I die
It's gonna be all right
To say that it's all over
Well, it won t be
All right, but I m not gonna
Go away, Lord, and I ain‟t gonna go
No, no, don ive got to
Be ivin' in ixiv xiv
Or ize yiv, ie iz es
Ain't ia i il is io ip it ii iii 
Now ire ir ise ite ist ih iter its jus k
Til I ile ern ish ita itt im est iel ips irs ites
Mister ich enn ies en ing  ins ed s ings
Lord, if ier er ers ents ent een e hr ew n


## Elvis

In [9]:
# Load the JSON data
with open('/content/drive/My Drive/w266 Final Project/data/Elvis_cleaned_songs.json', 'r') as file: #Choose between Johhny Cash, Adele, Elvis
    songs = json.load(file)

# Combine all lyrics into a single string
lyrics_data = "\n\n".join([song['lyrics'] for song in songs if song['lyrics'] is not None])
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Split the lyrics data into smaller chunks
chunk_size = 1024  # You can adjust this size but it has to stay at max 1024 in size
lyrics_chunks = [lyrics_data[i:i + chunk_size] for i in range(0, len(lyrics_data), chunk_size)]

# Tokenize each chunk
tokenized_inputs = []
for chunk in lyrics_chunks:
    inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True)
    tokenized_inputs.append(inputs)

print(f"Number of chunks processed: {len(tokenized_inputs)}")

# Prepare Dataset and DataLoader
class LyricsDataset(Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.input_ids[idx]

# Flatten tokenized inputs for Dataset
flattened_input_ids = [item['input_ids'].squeeze() for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-Tune the GPT-2 Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Generate New Lyrics
model.eval()
#%%
# Prompt for generation
prompt1 = "When a boy like me meets a girl like you" #A Boy Like Me, A Girl Like You
input_ids1 = tokenizer(prompt1, return_tensors='pt').input_ids.to(device)

prompt2 = "Cowboy, cowboy marry me" #A Cane and a High Starched Collar
input_ids2 = tokenizer(prompt2, return_tensors='pt').input_ids.to(device)

prompt3 = "Now after loving you" #After Loving You
input_ids3 = tokenizer(prompt3, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to get longer songs
output_ids1 = model.generate(input_ids1, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)
output_ids2 = model.generate(input_ids2, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)
output_ids3 = model.generate(input_ids3, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print
generated_lyrics1 = tokenizer.decode(output_ids1[0], skip_special_tokens=True)
generated_lyrics2 = tokenizer.decode(output_ids2[0], skip_special_tokens=True)
generated_lyrics3 = tokenizer.decode(output_ids3[0], skip_special_tokens=True)

Total length of lyrics data: 389230 characters
Number of chunks processed: 381
Epoch 1 Loss: 3.1886322498321533
Epoch 1 Loss: 3.286513090133667
Epoch 1 Loss: 3.083163261413574
Epoch 1 Loss: 3.0564768314361572
Epoch 1 Loss: 3.4815874099731445
Epoch 1 Loss: 2.41302752494812
Epoch 1 Loss: 3.5387144088745117
Epoch 1 Loss: 1.6989465951919556
Epoch 1 Loss: 2.7911293506622314
Epoch 1 Loss: 3.2321910858154297
Epoch 1 Loss: 2.769406318664551
Epoch 1 Loss: 2.5125179290771484
Epoch 1 Loss: 2.943068265914917
Epoch 1 Loss: 3.12272310256958
Epoch 1 Loss: 2.488133192062378
Epoch 1 Loss: 3.1906495094299316
Epoch 1 Loss: 2.818291187286377
Epoch 1 Loss: 2.9487459659576416
Epoch 1 Loss: 2.3158910274505615
Epoch 1 Loss: 3.520522117614746
Epoch 1 Loss: 2.538862466812134
Epoch 1 Loss: 2.5444939136505127
Epoch 1 Loss: 2.306584358215332
Epoch 1 Loss: 2.715588331222534
Epoch 1 Loss: 2.3233654499053955
Epoch 1 Loss: 2.5759377479553223
Epoch 1 Loss: 3.4162240028381348
Epoch 1 Loss: 2.2429916858673096
Epoch 1 Los

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Epoch 3 Loss: 1.4777450561523438


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [10]:
prompts = [song['lyrics'].split('\n')[1] for song in songs if song['lyrics'] is not None]
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

semantic_similarities = []
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i, prompt in enumerate(prompts):
    if not prompt.strip():  # Check if prompt is empty or contains only whitespace
        continue  # Skip empty prompts
    # Generate text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids,
                                max_length=500,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                top_k=20,
                                top_p=0.7,
                                temperature=0.7,
                                early_stopping=True)

    generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute scores
    semantic_similarities.append(compute_semantic_similarity(generated_lyrics, reference_lyrics[i]))
    bleu_scores.append(compute_bleu_score(generated_lyrics, reference_lyrics[i]))
    scores = compute_rouge_score(generated_lyrics, reference_lyrics[i])
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute averages
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print overall scores
print(f"Average Semantic Similarity: {avg_semantic_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Average Semantic Similarity: 0.5219295160765034
Average BLEU Score: 0.027856411155436182
Average ROUGE-1: 0.274350409865926
Average ROUGE-2: 0.06296809576096409
Average ROUGE-L: 0.14251845003675195


In [12]:
# Prompt for generation
prompt = "When a boy like me meets a girl like you"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to getlonger songs
model.generate(input_ids,
               max_length=500,
               num_return_sequences=1,
               no_repeat_ngram_size=2,
               top_k=20,
               top_p=0.7,
               temperature=0.7,
               early_stopping=True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


(Young dreams of love, young dreams of love)

[Verse 1]
I'm a little boy, I'm just a boy
And I love to play with dolls
But I can't stand the sight of a doll
That's just me, baby
You know I've got a lot of fun
So I'll just pretend
Like a baby, pretend like a child
'Cause I know that I have a ton of
Fun
(Fun, fun, yeah) I got to pretend, like baby (Yeah) (Yes) baby dolly
Yeah, dollies, dolls, yes
Dollies are my baby's toy
My baby dolls are the only ones I ever loved
Yes, they're my favorite
They're the ones that make me happy
When I see them, my heart goes crazy
Oh, and I get to go to the park
To the dollhouse, to see the dolls that my little girl
Won't you come along?
Come along, come on, let's go
Let's get cozy, we can play together
We can make out, make love in the morning
Well, you know, when I look at you
It's like I said, 'cause I just got so
Gonna try to make you happy, too
Now, if you'll excuse me
If you're gonna go, go on
Go on and on with your dollie
Because I want to be with y

## Adele

In [13]:
# Load the JSON data
with open('/content/drive/My Drive/w266 Final Project/data/Adele_cleaned_songs.json', 'r') as file: #Choose between Johhny Cash, Adele, Elvis
    songs = json.load(file)

# Combine all lyrics into a single string
lyrics_data = "\n\n".join([song['lyrics'] for song in songs if song['lyrics'] is not None])
print(f"Total length of lyrics data: {len(lyrics_data)} characters")

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Split the lyrics data into smaller chunks
chunk_size = 1024  # You can adjust this size but it has to stay at max 1024 in size
lyrics_chunks = [lyrics_data[i:i + chunk_size] for i in range(0, len(lyrics_data), chunk_size)]

# Tokenize each chunk
tokenized_inputs = []
for chunk in lyrics_chunks:
    inputs = tokenizer(chunk, return_tensors='pt', max_length=1024, truncation=True)
    tokenized_inputs.append(inputs)

print(f"Number of chunks processed: {len(tokenized_inputs)}")

# Prepare Dataset and DataLoader
class LyricsDataset(Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.input_ids[idx]

# Flatten tokenized inputs for Dataset
flattened_input_ids = [item['input_ids'].squeeze() for item in tokenized_inputs]
dataset = LyricsDataset(flattened_input_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Fine-Tune the GPT-2 Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss: {loss.item()}")

# Generate New Lyrics
model.eval()
#%%
# Prompt for generation
prompt1 = "I will leave my heart at the door" #All I Ask
input_ids1 = tokenizer(prompt1, return_tensors='pt').input_ids.to(device)

prompt2 = "Wait, do you see my heart on my sleeve?" #Best for Last
input_ids2 = tokenizer(prompt2, return_tensors='pt').input_ids.to(device)

prompt3 = "Pave me a path to follow" #Can I Get It
input_ids3 = tokenizer(prompt3, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to get longer songs
output_ids1 = model.generate(input_ids1, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)
output_ids2 = model.generate(input_ids2, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)
output_ids3 = model.generate(input_ids3, max_length=210, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print
generated_lyrics1 = tokenizer.decode(output_ids1[0], skip_special_tokens=True)
generated_lyrics2 = tokenizer.decode(output_ids2[0], skip_special_tokens=True)
generated_lyrics3 = tokenizer.decode(output_ids3[0], skip_special_tokens=True)

Total length of lyrics data: 88122 characters
Number of chunks processed: 87
Epoch 1 Loss: 2.44923734664917
Epoch 1 Loss: 1.971820592880249
Epoch 1 Loss: 2.22863507270813
Epoch 1 Loss: 2.463425636291504
Epoch 1 Loss: 2.611232280731201
Epoch 1 Loss: 3.4799935817718506
Epoch 1 Loss: 2.469205856323242
Epoch 1 Loss: 3.0482378005981445
Epoch 1 Loss: 2.2428455352783203
Epoch 1 Loss: 3.1195991039276123
Epoch 1 Loss: 2.4615721702575684
Epoch 1 Loss: 3.07855486869812
Epoch 1 Loss: 2.338122844696045
Epoch 1 Loss: 2.17063307762146
Epoch 1 Loss: 2.2754127979278564
Epoch 1 Loss: 2.895221710205078
Epoch 1 Loss: 2.272202253341675
Epoch 1 Loss: 1.561142086982727
Epoch 1 Loss: 2.609318971633911
Epoch 1 Loss: 2.274970769882202
Epoch 1 Loss: 2.7759013175964355
Epoch 1 Loss: 1.9393969774246216
Epoch 1 Loss: 2.181821823120117
Epoch 1 Loss: 2.978675127029419
Epoch 1 Loss: 2.45615291595459
Epoch 1 Loss: 2.6457557678222656
Epoch 1 Loss: 2.121992826461792
Epoch 1 Loss: 2.7032504081726074
Epoch 1 Loss: 2.699750

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Epoch 3 Loss: 1.651416540145874


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [14]:
prompts = [song['lyrics'].split('\n')[1] for song in songs if song['lyrics'] is not None]
reference_lyrics = [song['lyrics'] for song in songs if song['lyrics'] is not None]

semantic_similarities = []
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for i, prompt in enumerate(prompts):
    if not prompt.strip():  # Check if prompt is empty or contains only whitespace
        continue  # Skip empty prompts
    # Generate text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids,
                                max_length=500,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                top_k=20,
                                top_p=0.7,
                                temperature=0.7,
                                early_stopping=True)

    generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute scores
    semantic_similarities.append(compute_semantic_similarity(generated_lyrics, reference_lyrics[i]))
    bleu_scores.append(compute_bleu_score(generated_lyrics, reference_lyrics[i]))
    scores = compute_rouge_score(generated_lyrics, reference_lyrics[i])
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Compute averages
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print overall scores
print(f"Average Semantic Similarity: {avg_semantic_similarity}")
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Average Semantic Similarity: 0.5717865807505754
Average BLEU Score: 0.03631658011701559
Average ROUGE-1: 0.37256802394927774
Average ROUGE-2: 0.08646717603408056
Average ROUGE-L: 0.17246120848531118


In [15]:
# Prompt for generation
prompt = "I will leave my heart at the door"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Generate lyrics - you can change max_length to getlonger songs
model.generate(input_ids,
               max_length=500,
               num_return_sequences=1,
               no_repeat_ngram_size=2,
               top_k=20,
               top_p=0.7,
               temperature=0.7,
               early_stopping=True)

# Decode and print
generated_lyrics = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_lyrics)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You're driving me away, give me a reason to stay

[Verse 1]
I'm so tired of being in the dark
It's so hard to be in control
You can't control what you do
When you're so close to me, I can feel your heartbeat
And it's like you've been there for hours
But I'm scared to leave
Because I know you'll be there
If you stay, you will be my keeper
So stay strong
Don't let go
Just stay where you are
Keep on driving
Like I said, stay on
Love you, love you

