In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, Dataset, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import get_scheduler
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config
from torch.optim.lr_scheduler import StepLR
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch.nn.functional as F



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the combined lyrics + audio-feature table and drop exact duplicate rows.
df = pd.read_csv('final_lyrics_features_combined.csv').drop_duplicates()

# track_id is not among the features used later in this notebook, so drop it.
df = df.drop(columns=['track_id'])

# Bring the three capitalised column names in line with the other (lowercase) columns.
column_rename_map = {
    'Artist': 'artist',
    'Title': 'title',
    'Lyrics': 'lyrics'
}
df = df.rename(columns=column_rename_map)

# Rows without lyrics have to be removed *before* the string cast below:
# astype(str) turns a missing value into the literal text "nan", which would
# otherwise be tokenized and trained on as if it were a lyric.
df = df.dropna(subset=['lyrics'])

# The raw lyrics use a run of three spaces as the line delimiter; turn each
# delimiter into a real newline (plain substring match, not a regex).
df['lyrics'] = df['lyrics'].astype(str).str.replace('   ', '\n', regex=False)

# Store the explicit flag as an integer.
df['explicit'] = df['explicit'].astype(int)

In [5]:
# Stand-alone encoder instance; the encoders actually applied to the
# categorical columns are the per-column ones stored in `label_encoders`.
label_encoder = LabelEncoder()

categorical_features = ['artist', 'title', 'track_genre']
numerical_features = [
    'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
    'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
]

# Replace every categorical column with integer codes, using a dedicated
# encoder per column. The fitted encoders are kept so the codes can be mapped
# back to (or looked up from) the original labels later.
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder().fit(df[feature])
    df[feature] = le.transform(df[feature])
    label_encoders[feature] = le

# Standardise the numeric columns in place; `scaler` keeps the fitted
# statistics for reuse on new inputs.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [6]:
# Turn the preprocessed DataFrame columns into tensors.

# Numeric columns as floats.
numerical_tensor = torch.tensor(df.loc[:, numerical_features].to_numpy(), dtype=torch.float32)

# Categorical codes as int64, the dtype embedding lookups index with.
categorical_tensor = torch.tensor(df.loc[:, categorical_features].to_numpy(), dtype=torch.int64)

# Every feature (numeric first, then categorical codes) in one float matrix.
features_tensor = torch.tensor(
    df.loc[:, numerical_features + categorical_features].to_numpy(),
    dtype=torch.float32,
)

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token so that the
# padding='max_length' request below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize every lyric to exactly 512 ids (truncating longer texts, padding
# shorter ones) and return PyTorch tensors.
tokenized_lyrics = tokenizer(
    df['lyrics'].tolist(),
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
input_ids, attention_mask = tokenized_lyrics['input_ids'], tokenized_lyrics['attention_mask']

# Lyrics-only dataset (token ids + attention mask), served in random order.
batch_size = 8  # tune to the available hardware
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=batch_size, sampler=RandomSampler(dataset))

In [13]:
class CustomLyricGenerator(nn.Module):
    """GPT-2 based lyric model conditioned on track metadata.

    The categorical features (artist, title, genre) are embedded, the numeric
    features are linearly projected, and the concatenation is mapped to a
    single vector in GPT-2's embedding space. That vector is fed to GPT-2 as
    a prefix in front of the lyric token embeddings, and a linear decoder
    turns GPT-2's hidden states into vocabulary logits.

    Args:
        num_artist: Number of distinct artist codes. One extra embedding row
            is allocated beyond this count.
        num_title: Number of distinct title codes (one extra row allocated).
        num_genre: Number of distinct genre codes (one extra row allocated).
        num_numerical_features: Width of the numeric feature vector.
        gpt2_model_name: Name passed to ``from_pretrained`` for both the
            GPT-2 config and the GPT-2 backbone.
    """

    def __init__(self, num_artist, num_title, num_genre, num_numerical_features, gpt2_model_name='gpt2'):
        super().__init__()

        cat_embed_dim = 10          # embedding width of each categorical feature
        numerical_hidden_dim = 128  # width of the projected numeric features

        # Embeddings for the categorical features (one spare row each).
        self.artist_embedding = nn.Embedding(num_artist + 1, cat_embed_dim)
        self.title_embedding = nn.Embedding(num_title + 1, cat_embed_dim)
        self.genre_embedding = nn.Embedding(num_genre + 1, cat_embed_dim)

        # Projection of the numeric features.
        self.numerical_processor = nn.Linear(num_numerical_features, numerical_hidden_dim)

        # GPT-2 backbone and its configuration.
        self.gpt2_config = GPT2Config.from_pretrained(gpt2_model_name)
        self.gpt2_model = GPT2Model.from_pretrained(gpt2_model_name)

        # Maps the concatenated feature representation into GPT-2's embedding space.
        self.feature_combiner = nn.Linear(3 * cat_embed_dim + numerical_hidden_dim, self.gpt2_config.n_embd)

        # Projects GPT-2 hidden states to vocabulary logits.
        self.decoder = nn.Linear(self.gpt2_config.n_embd, self.gpt2_config.vocab_size)

    def forward(self, categorical_features, numerical_features, input_ids, attention_mask):
        """Compute per-position vocabulary logits for a batch of lyrics.

        The sequence given to GPT-2 is ``[feature_prefix, tok_0, ..., tok_{T-2}]``,
        i.e. the lyric tokens shifted right by one with the metadata vector in
        the first slot. The output therefore keeps the input length ``T``, and
        the logits at position ``t`` are computed from the metadata prefix and
        ``input_ids[:, :t]`` only. They can be scored directly against
        ``input_ids[:, t]`` without any extra shifting. The last token of
        ``input_ids`` is never fed to GPT-2; it is only ever a target.

        Args:
            categorical_features: Integer tensor indexed as ``[:, 0]`` artist,
                ``[:, 1]`` title, ``[:, 2]`` genre.
            numerical_features: Float tensor whose last dimension is
                ``num_numerical_features``.
            input_ids: Token ids, indexed as ``(batch, seq_len)``.
            attention_mask: Mask aligned with ``input_ids`` or ``None``.

        Returns:
            Logits with one vocabulary-sized vector per position of ``input_ids``.
        """
        # Embed the categorical features and concatenate them.
        artist_embeds = self.artist_embedding(categorical_features[:, 0])
        title_embeds = self.title_embedding(categorical_features[:, 1])
        genre_embeds = self.genre_embedding(categorical_features[:, 2])
        cat_embeds = torch.cat((artist_embeds, title_embeds, genre_embeds), dim=1)

        # Project the numeric features and merge everything into one
        # conditioning vector per sample, shaped as a length-1 sequence.
        num_features_processed = self.numerical_processor(numerical_features)
        combined_features = torch.cat((cat_embeds, num_features_processed), dim=1)
        prefix_embeds = self.feature_combiner(combined_features).unsqueeze(1)

        # Token embeddings for every token except the last one: together with
        # the prefix this gives exactly seq_len positions. Previously only the
        # length of input_ids was used, so GPT-2 never saw the lyric tokens.
        token_embeds = self.gpt2_model.get_input_embeddings()(input_ids[:, :-1])
        inputs_embeds = torch.cat((prefix_embeds, token_embeds), dim=1)

        # Shift the mask the same way: the prefix is always attended to.
        if attention_mask is None:
            shifted_mask = None
        else:
            prefix_mask = torch.ones_like(attention_mask[:, :1])
            shifted_mask = torch.cat((prefix_mask, attention_mask[:, :-1]), dim=1)

        # Run GPT-2 on the embeddings and decode each position to the vocabulary.
        gpt2_outputs = self.gpt2_model(inputs_embeds=inputs_embeds, attention_mask=shifted_mask)
        sequence_output = gpt2_outputs.last_hidden_state
        logits = self.decoder(sequence_output)

        return logits


In [14]:
# Bundle the categorical codes, numeric features, token ids and attention
# masks so that each dataset item carries all four tensors for one track.
full_dataset = TensorDataset(categorical_tensor, numerical_tensor, input_ids, attention_mask)

# 90% / 10% train / validation split.
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Seed the split so that re-running the notebook (or restarting the kernel)
# assigns the same tracks to the validation set every time.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

# Training batches are reshuffled every epoch; validation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [15]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sizes the model needs: number of distinct codes per categorical column and
# the width of the numeric feature vector.
num_artist = df['artist'].nunique()
num_title = df['title'].nunique()
num_genre = df['track_genre'].nunique()
num_numerical_features = len(numerical_features)

# Instantiate the model and move it to the selected device.
model = CustomLyricGenerator(
    num_artist=num_artist,
    num_title=num_title,
    num_genre=num_genre,
    num_numerical_features=num_numerical_features,
    gpt2_model_name='gpt2'
).to(device)

num_epochs = 3

# Optimizer. torch.optim.AdamW replaces the deprecated transformers.AdamW;
# eps and weight_decay are pinned to the values the transformers class used
# by default so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Linear learning-rate decay (no warmup) over every optimizer step of the run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)

# Cross-entropy over the vocabulary; targets equal to the pad token id are
# skipped. The pad token was set to the EOS token when the tokenizer was
# created, so EOS targets are skipped as well.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)



In [16]:
# Sanity check: report the largest code of each categorical column. The
# matching embedding layer in the model needs at least max_value + 1 rows.
for feature, max_value in df[categorical_features].max().items():
    print(f"Max index for {feature}: {max_value}")


Max index for artist: 33
Max index for title: 964
Max index for track_genre: 110


In [19]:
def train(model, dataloader, optimizer, scheduler=None, epoch=None):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Relies on the notebook-level ``device`` and ``loss_fn`` objects.

    Args:
        model: Module called as ``model(cat_feats, num_feats, input_ids, attn_mask)``
            and returning logits.
        dataloader: Yields batches of four tensors in the order
            (categorical features, numeric features, input ids, attention mask).
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional learning-rate scheduler, stepped once per batch.
        epoch: Optional zero-based epoch index, used only to label the
            per-batch log line. It is a parameter rather than a global lookup
            so the function also works when called outside the epoch loop.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = len(dataloader)
    log_prefix = f"Epoch {epoch+1}, " if epoch is not None else ""

    for batch_idx, batch in enumerate(dataloader):
        cat_feats, num_feats, input_ids, attn_mask = [b.to(device) for b in batch]

        optimizer.zero_grad()
        logits = model(cat_feats, num_feats, input_ids, attn_mask)

        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) and score each
        # position against the token id at that same position.
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), input_ids.reshape(-1))
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Print the current loss for this batch
        print(f"{log_prefix}Batch {batch_idx+1}/{num_batches}, Current Loss: {batch_loss:.4f}")

    avg_loss = total_loss / num_batches
    return avg_loss



# Training epochs
num_epochs = 3

for epoch in range(num_epochs):
    avg_train_loss = train(model, train_loader, optimizer, scheduler, epoch=epoch)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")


Epoch 1, Batch 1/671, Current Loss: 11.4308


KeyboardInterrupt: 

In [None]:
# This cell reuses the objects fitted/trained in the cells above
# (`label_encoders`, `scaler`, `numerical_features`, `model`, `tokenizer`,
# `device`) instead of overwriting them with placeholders.

def _sample_next_token(logits, previous_ids, temperature, top_k, top_p, repetition_penalty):
    """Draw one token id from a 1-D logits vector.

    Applies, in order: a repetition penalty on ids already generated,
    temperature scaling, top-k filtering and nucleus (top-p) filtering, then
    samples from the remaining distribution.
    """
    logits = logits.clone().float()

    if previous_ids and repetition_penalty != 1.0:
        seen = torch.tensor(sorted(set(previous_ids)), dtype=torch.long, device=logits.device)
        seen_logits = logits[seen]
        # Push already-used tokens towards "less likely" whatever their sign.
        logits[seen] = torch.where(
            seen_logits > 0, seen_logits / repetition_penalty, seen_logits * repetition_penalty
        )

    logits = logits / temperature

    if top_k > 0:
        kth_best = torch.topk(logits, min(top_k, logits.numel())).values[-1]
        logits = logits.masked_fill(logits < kth_best, float('-inf'))

    if top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        drop = cumulative > top_p
        # Shift right so the token that crosses the threshold is still kept
        # and at least one candidate always survives.
        drop[1:] = drop[:-1].clone()
        drop[0] = False
        logits[sorted_idx[drop]] = float('-inf')

    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()


def generate_lyrics(genre, bpm, danceability, max_length=512, temperature=1.0,
                    top_k=50, top_p=0.95, repetition_penalty=1.2):
    """Sample lyrics from the trained model for the requested track features.

    Args:
        genre: Genre label exactly as it appeared in the ``track_genre``
            column before encoding; the fitted encoder rejects unknown labels.
        bpm: Tempo in beats per minute.
            NOTE(review): mapped to the ``tempo`` feature - confirm that this
            column is expressed in BPM.
        danceability: Value for the ``danceability`` feature (unscaled).
        max_length: Maximum number of tokens to generate.
        temperature, top_k, top_p, repetition_penalty: Sampling controls.

    Returns:
        The decoded lyrics as a string.
    """
    # Categorical inputs in the order the model indexes them: artist, title, genre.
    # Artist and title are not specified by the caller, so the spare embedding
    # row (index == number of known labels) is used for both.
    # NOTE(review): that spare row gets no training signal from the data - confirm
    # this is an acceptable "unspecified" placeholder.
    genre_id = int(label_encoders['track_genre'].transform([genre])[0])
    unknown_artist = len(label_encoders['artist'].classes_)
    unknown_title = len(label_encoders['title'].classes_)
    cat_feats = torch.tensor(
        [[unknown_artist, unknown_title, genre_id]], dtype=torch.long, device=device
    )

    # The scaler was fitted on all numeric columns at once, so it has to be
    # given a complete row. Unspecified features start at their training mean
    # (which scales to 0); the two requested ones are then overwritten.
    raw_row = pd.DataFrame([scaler.mean_], columns=numerical_features)
    raw_row['tempo'] = bpm
    raw_row['danceability'] = danceability
    num_feats = torch.tensor(scaler.transform(raw_row), dtype=torch.float, device=device)

    placeholder_id = tokenizer.eos_token_id
    generated = []

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            # The training loop scores logits[:, t] against input_ids[:, t], so
            # the logits at the last position are the prediction for the slot
            # being filled. The model needs an id at every position, hence the
            # placeholder in that final slot.
            ids = torch.tensor([generated + [placeholder_id]], dtype=torch.long, device=device)
            mask = torch.ones_like(ids)
            next_logits = model(cat_feats, num_feats, ids, mask)[0, -1]

            next_id = _sample_next_token(
                next_logits, generated, temperature, top_k, top_p, repetition_penalty
            )
            if next_id == tokenizer.eos_token_id:
                break
            generated.append(next_id)

    # Decode generated lyrics
    generated_lyrics = tokenizer.decode(generated, skip_special_tokens=True)

    return generated_lyrics

# Example usage
# NOTE(review): the label must match a value of the original track_genre
# column exactly (e.g. "indie pop" vs "indie-pop") - confirm the spelling.
genre = "indie pop"
bpm = 120
danceability = 0.8
lyrics = generate_lyrics(genre, bpm, danceability)
print("Generated Lyrics:", lyrics)


In [None]:
def validate(model, dataloader):
    """Return the mean batch loss of ``model`` over ``dataloader``.

    The model is put in eval mode and gradients are disabled. Uses the
    notebook-level ``device`` and ``loss_fn``. Each batch must unpack into
    (categorical features, numeric features, input ids, attention mask).
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in dataloader:
            cat_feats, num_feats, input_ids, attn_mask = (t.to(device) for t in batch)
            logits = model(cat_feats, num_feats, input_ids, attn_mask)
            vocab_size = logits.size(-1)
            # Same objective as training: each position is scored against the
            # token id at that position.
            batch_loss = loss_fn(logits.view(-1, vocab_size), input_ids.view(-1))
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

avg_val_loss = validate(model, val_loader)
print(f"Validation Loss: {avg_val_loss:.4f}")


In [None]:
# NOTE(review): this definition reuses the name `generate_lyrics`; once this
# cell runs it replaces any earlier function of that name in the kernel.
def generate_lyrics(model, tokenizer, features_prompt, max_length=50):
    """Generate text from a plain-text feature prompt.

    The prompt is tokenized, handed to ``model.generate`` and the first
    returned sequence is decoded with special tokens stripped. Uses the
    notebook-level ``device``.

    NOTE(review): ``model`` must provide a ``generate`` method. The
    CustomLyricGenerator class in this notebook only defines ``__init__`` and
    ``forward`` - confirm which model is meant to be passed here.
    """
    model.eval()
    with torch.no_grad():
        # Encode the feature prompt as a batch of one.
        prompt_ids = tokenizer.encode(features_prompt, return_tensors='pt').to(device)

        # Let the model continue the prompt up to max_length tokens.
        sequences = model.generate(input_ids=prompt_ids, max_length=max_length)

        # Turn the first generated sequence back into text.
        first_sequence = sequences[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example feature prompt
features_prompt = "genre: indie-pop bpm: 120 danceability: 0.8"
generated_lyrics = generate_lyrics(model, tokenizer, features_prompt)
print(generated_lyrics)
