In [35]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [36]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.0


In [37]:
import pandas as pd
import numpy as np
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm  # For progress bars
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.preprocessing import KBinsDiscretizer
from sacrebleu.metrics import BLEU

In [13]:
file_path = 'lyrics_train.tsv'
file_path_2 = 'lyrics_dev.tsv'

# Adjusted to load the dataset with ' | ' separation
df_first = pd.read_csv(file_path, sep=' \| ', engine='python', header=None, names=['Artist', 'Title', 'Popularity', 'Duration_ms', 'Explicit', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Time_Signature', 'Track_Genre', 'Lyrics'])
df_second = pd.read_csv(file_path_2, sep=' \| ', engine='python', header=None, names=['Artist', 'Title', 'Popularity', 'Duration_ms', 'Explicit', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Time_Signature', 'Track_Genre', 'Lyrics'])

df_first = df_first.replace('□', '', regex=True)
df_second = df_second.replace('□', '', regex=True)

df = pd.concat([df_first, df_second], axis=0)
df = df[~df['Artist'].str.contains('BTS')]
df['Lyrics'] = df['Lyrics'].apply(lambda x: x.replace('   ', '\n') if isinstance(x, str) else x)

df.drop(columns=['Time_Signature'], inplace=True)
df.drop(columns=['Key'], inplace=True)
df.drop(columns=['Mode'], inplace=True)
df = df.rename(columns={'Duration_ms': 'Duration'})
df = df.rename(columns={'Track_Genre': 'Genre'})
df['Explicit'] = df['Explicit'].replace({'True': 1, 'False': 0})

# SETUP CATEGORICAL VS NUMERICAL FEATURES
categorical_columns = ['Artist', 'Title', 'Genre', 'Lyrics']
numerical_columns = ['Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo']

categorical_df = df[categorical_columns]
numerical_df = df[numerical_columns]
categorical_df[['Before_Tab', 'After_Tab']] = categorical_df['Genre'].str.split('\t', n=1, expand=True)
categorical_df.drop(columns=['Lyrics'], inplace=True)
categorical_df.drop(columns=['Genre'], inplace=True)

categorical_df = categorical_df.rename(columns={'Before_Tab': 'Genre'})
categorical_df = categorical_df.rename(columns={'After_Tab': 'Lyrics'})
categorical_df['Lyrics'] = categorical_df['Lyrics'].str.strip()
categorical_df['Lyrics'] = categorical_df['Lyrics'].fillna('').astype(str)

complete_df = pd.concat([categorical_df, numerical_df], axis=1)

print(len(complete_df))

1045


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_df[['Before_Tab', 'After_Tab']] = categorical_df['Genre'].str.split('\t', n=1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_df[['Before_Tab', 'After_Tab']] = categorical_df['Genre'].str.split('\t', n=1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_df.drop(columns=['Lyrics'], inplace=True)


In [29]:
# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add [PAD] token to the tokenizer and update pad_token_id
pad_token = '[PAD]'
if pad_token not in tokenizer.get_added_vocab():
    tokenizer.add_special_tokens({'pad_token': pad_token})

# Check if the pad_token is recognized correctly
print("Pad token:", tokenizer.pad_token, "ID:", tokenizer.pad_token_id)

# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Resize model embeddings to account for the new token
model.resize_token_embeddings(len(tokenizer))
# IMPORTANT: Set the model's pad_token_id to match the tokenizer's pad_token_id


Pad token: [PAD] ID: 50257


Embedding(50258, 768)

In [15]:
input_sequences = []
for idx, row in complete_df.iterrows():
    features_text = ' '.join([f"|Artist| {row['Artist']}", f"|Title| {row['Title']}", f"|Genre| {row['Genre']}"] +
                             [f"|{feature}| {row[feature]}" for feature in numerical_columns])
    input_sequence = f"{features_text} |Lyrics| {row['Lyrics']}"
    input_sequences.append(input_sequence)

In [16]:
# Batch tokenize all input sequences, apply padding, and generate attention masks
inputs = tokenizer(input_sequences,
                   padding=True,  # Pad to the longest sequence
                   truncation=True,  # Truncate to max model length
                   return_tensors="pt",  # Return PyTorch tensors
                   max_length=512)  # Max length for truncation

# Extract padded token IDs and attention masks
input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']

In [17]:
# Wrap input_ids and attention_masks in a TensorDataset
dataset = TensorDataset(input_ids, attention_masks)

# Split the dataset into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 5
# Calculate the total number of training steps
total_steps = len(train_loader) * epochs

# Initialize the scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # No warm-up
                                            num_training_steps=total_steps)




In [38]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
        b_input_ids, b_attention_mask = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)

        model.zero_grad()

        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_input_ids)
        loss = outputs.loss

        loss.backward()
        total_loss += loss.item()

        optimizer.step()
        scheduler.step()  # Update the learning rate

        if (batch_idx + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item()}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss}")

    # Validation step with BLEU score calculation
    model.eval()
    val_loss = 0
    hypotheses = []  # Generated sequences
    references = []  # Actual sequences
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            b_input_ids, b_attention_mask = batch
            b_input_ids = b_input_ids.to(device)
            b_attention_mask = b_attention_mask.to(device)

            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_input_ids)
            loss = outputs.loss
            val_loss += loss.item()

            # Generate predictions
            predictions = model.generate(b_input_ids, max_length=512)

            # Convert predictions and references to text
            hyp_texts = [tokenizer.decode(g, skip_special_tokens=True) for g in predictions]
            ref_texts = [tokenizer.decode(g, skip_special_tokens=True) for g in b_input_ids]

            hypotheses.extend(hyp_texts)
            references.extend([[r] for r in ref_texts])  # BLEU expects a list of references

    avg_val_loss = val_loss / len(val_loader)
    val_bleu = corpus_bleu(hypotheses, references).score
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}, Validation BLEU: {val_bleu}")


Epoch 1:   1%|          | 1/118 [00:01<03:18,  1.69s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 784.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 133.06 MiB is free. Process 9206 has 14.62 GiB memory in use. Of the allocated memory 13.80 GiB is allocated by PyTorch, and 703.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [20]:
model_save_path = './gpt2_lyrics_model'
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer_save_path = './gpt2_lyrics_tokenizer'
tokenizer.save_pretrained(tokenizer_save_path)

('./gpt2_lyrics_tokenizer/tokenizer_config.json',
 './gpt2_lyrics_tokenizer/special_tokens_map.json',
 './gpt2_lyrics_tokenizer/vocab.json',
 './gpt2_lyrics_tokenizer/merges.txt',
 './gpt2_lyrics_tokenizer/added_tokens.json')

In [30]:
model = GPT2LMHeadModel.from_pretrained(model_save_path)

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_save_path)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the token embeddings in case new tokens were added
model.resize_token_embeddings(len(tokenizer))
# IMPORTANT: Set the model's pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

print("Model pad_token_id:", model.config.pad_token_id)

# Make sure to move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Model pad_token_id: 50257


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [23]:
def generate_lyrics(prompt_text, max_length=100):
    # Encode the prompt text
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)
    attention_mask = torch.ones(encoded_prompt.shape, dtype=torch.long, device=device)  # Assuming no actual padding is needed here

    # Generate a sequence of tokens following the prompt
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        attention_mask=attention_mask,
        max_length=100,
        temperature=1.0,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1,
    )

    # Decode the generated tokens to text
    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

    # Remove the prompt text from the output
    text = text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)):]

    return text.strip()

# Example usage
prompt_text = "The moon shines brightly in the night, "
lyrics = generate_lyrics(prompt_text, max_length=100)
print("Generated Lyrics:\n" + lyrics)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Lyrics:
[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD] day[PAD] the top of your hand comes on a black metal[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD] day the[PAD][PAD][PAD][PAD][PAD][PAD][PAD] year you took out some new drugs and we got a new life for that money in this city
 The[PAD][PAD][PAD][PAD][PAD][PAD] day the top is my way to


In [24]:
prompt_text = "|Artist| Justin Bieber |Genre| Pop |Energy| 0.502 |Tempo| 120 |Lyrics| "
lyrics = generate_lyrics(prompt_text, max_length=100)
print("Generated Lyrics:\n" + lyrics)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Lyrics:
༽_Dance it up please cause i'll never forget a song that's like nothing else you've done oh right now is this what makes me wanna be king do we ever once? I'm sorry when your shit starts to[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD] I thee howst all my life would have been so much better if not for those two


In [25]:
prompt_text = "It's about time we broke up my love"
lyrics = generate_lyrics(prompt_text, max_length=100)
print("Generated Lyrics:\n" + lyrics)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Lyrics:
[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD] the[PAD][PAD][PAD] the the one you[PAD] the[PAD][PAD][PAD][PAD][PAD][PAD][PAD] one the one the[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]


In [None]:
# Example thematic keywords related to "love" songs
thematic_keywords = ['love', 'heart', 'forever', 'together', 'passion']
def thematic_keyword_match(lyrics, keywords):
    """
    Counts the number of thematic keywords present in the generated lyrics.
    """
    matches = sum(1 for word in keywords if word in lyrics.lower())
    return matches

# Example usage with generated lyrics
generated_lyrics = "This love has taken its toll on me, She said goodbye too many times before."
matches = thematic_keyword_match(generated_lyrics, thematic_keywords)
print(f"Thematic Keywords Found: {matches} out of {len(thematic_keywords)}")

def evaluate_thematic_content(model, tokenizer, dataloader, device, keywords):
    model.eval()
    keyword_matches = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(dataloader)):
            b_input_ids = batch[0].to(device)
            generated_outputs = model.generate(b_input_ids, max_length=512)
            for output in generated_outputs:
                lyrics = tokenizer.decode(output, skip_special_tokens=True)
                matches = thematic_keyword_match(lyrics, keywords)
                keyword_matches.append(matches)

    avg_matches = sum(keyword_matches) / len(keyword_matches)
    print(f"Average Thematic Keywords Found: {avg_matches}")
    return avg_matches

# Calculate the average number of thematic keyword matches in the validation set
avg_matches = evaluate_thematic_content(model, tokenizer, val_loader, device, thematic_keywords)

In [None]:
import matplotlib.pyplot as plt

epochs = range(1, len(train_losses) + 1)

plt.figure(figsize=(10, 6))
plt.plot(epochs, train_losses, 'bo-', label='Training Loss')
plt.plot(epochs, val_losses, 'ro-', label='Validation Loss')
plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [None]:
plt.figure(figsize=(10, 6))
plt.plot(epochs, bleu_scores, 'go-', label='Validation BLEU Score')
plt.title('BLEU Score Progression Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('BLEU Score')
plt.legend()

plt.show()


In [None]:
plt.figure(figsize=(10, 6))
plt.plot(epochs, avg_keyword_matches, 'mo-', label='Average Thematic Keyword Matches')
plt.title('Thematic Keyword Match Frequency Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Average Matches')
plt.legend()

plt.show()


In [None]:
plt.figure(figsize=(10, 6))
plt.plot(epochs, learning_rates, 'ko-', label='Learning Rate')
plt.title('Learning Rate Schedule Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Learning Rate')
plt.yscale('log')  # Use logarithmic scale if learning rates vary widely
plt.legend()

plt.show()