In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import get_scheduler
import torch
from torch.optim.lr_scheduler import StepLR


In [70]:
# Fixed seed so the train/validation split is identical on every Restart & Run All.
SEED = 42

df = pd.read_csv('final_lyrics_features_combined.csv')
df = df.drop_duplicates()  # remove duplicate rows
# Rows with no lyrics would otherwise be stringified to the literal text "nan"
# when the columns are joined below, so drop them up front.
df = df.dropna(subset=['Lyrics'])
# Replace the three-space delimiter with the "|" character. regex=False makes the
# literal match explicit instead of relying on the pandas-version-dependent default.
df['Lyrics'] = df['Lyrics'].str.replace('   ', '|', regex=False)
column_rename_map = {
    'Artist': 'artist',
    'Title': 'title',
    'Lyrics': 'lyrics'
}

# Lowercase the names of the three text columns for convenience
# (any other columns keep their original names).
df = df.rename(columns=column_rename_map)

# One training string per row: every column, cast to str and joined with spaces.
df['input_text'] = df.apply(lambda row: ' '.join(row.astype(str)), axis=1)
# 90% of the rows go to train_texts; the remaining 10% (val_texts) are held out.
train_texts, val_texts = train_test_split(df['input_text'], test_size=0.1, random_state=SEED)
# train_texts is 90% of features, val_texts is the remaining 10% left for testing

In [71]:
# Load the tokenizer that matches the pretrained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' in the dataset has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

In [72]:
class SongLyricsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of strings (list, tuple, pandas Series, ...). It is
            copied into a list so items are always looked up by position.
        tokenizer: Callable accepting a string plus the keyword arguments
            ``return_tensors``, ``max_length``, ``truncation`` and ``padding``,
            and returning a mapping of tensors with a leading batch dimension
            of size 1.
        max_length: Length every item is truncated/padded to.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        # A pandas Series indexes by label, so after a shuffled split
        # `texts[idx]` would raise KeyError or return the wrong row.
        # Materializing a list guarantees positional access.
        self.texts = list(texts)
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict mapping each tokenizer output key to a tensor with the
            leading batch dimension removed.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )
        # squeeze(0) removes only the batch dimension. A bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        return {key: val.squeeze(0) for key, val in encoding.items()}

In [73]:
# Wrap each split in a SongLyricsDataset; both share the tokenizer and length cap.
max_length = 512  # Adjust based on your GPU memory
train_dataset, val_dataset = (
    SongLyricsDataset(texts=split.tolist(), tokenizer=tokenizer, max_length=max_length)
    for split in (train_texts, val_texts)
)

In [74]:
# Batch both splits with the same collator; only the training split is shuffled.
batch_size = 4  # Adjust based on your GPU memory
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
shared_loader_args = {'batch_size': batch_size, 'collate_fn': data_collator}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)

In [75]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# NOTE(review): the visible cells add no new tokens (pad reuses eos), so this
# is presumably a no-op here -- confirm before removing.
model.resize_token_embeddings(len(tokenizer))

initial_lr = 3e-5

# Optimizer only; the learning-rate schedule is created alongside the training loop.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the values the transformers implementation
# used by default, so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=initial_lr,
    eps=1e-6,
    weight_decay=0.0,
)



In [76]:
num_epochs = 3  # Adjust based on your dataset size and desired training duration
log_every = 50  # Print the mean training loss once per this many optimizer steps
num_training_steps = num_epochs * len(train_loader)
# Linear decay of the learning rate to zero over all training steps, no warmup.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
steps_per_epoch = len(train_loader)
for epoch in range(num_epochs):
    # Re-enter training mode every epoch because the validation pass below
    # switches the model to eval mode.
    model.train()
    running_loss = 0.0
    steps_in_window = 0
    for step, batch in enumerate(train_loader, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}

        # `labels` are expected to be supplied by the data collator
        # (DataCollatorForLanguageModeling with mlm=False), so the model
        # returns a language-modeling loss directly.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Report a windowed mean instead of printing every step, which
        # floods the notebook output.
        running_loss += loss.item()
        steps_in_window += 1
        if step % log_every == 0 or step == steps_per_epoch:
            print(f"Epoch: {epoch}, Step: {step}/{steps_per_epoch}, Loss: {running_loss / steps_in_window}")
            running_loss = 0.0
            steps_in_window = 0

    # Held-out loss after each epoch; val_loader was otherwise never used.
    model.eval()
    val_loss_total = 0.0
    with torch.no_grad():
        for val_batch in val_loader:
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            val_loss_total += model(**val_batch).loss.item()
    # max(..., 1) guards against an empty validation loader.
    val_loss = val_loss_total / max(len(val_loader), 1)
    print(f"Epoch: {epoch}, Validation loss: {val_loss}")

KeyboardInterrupt: 