# RhymeLM v2 ‚Äî Dual-Corpus Character Language Model

**Architecture Philosophy:**
- Learn *what words look like* from the English dictionary (vocabulary grounding)
- Learn *how artists flow* from rap lyrics (style, structure, rhyme schemes)
- Combine both to generate coherent 16-bar verses

**Key Upgrades from v1:**
- LSTM backbone (better sequence modeling than feedforward)
- Dual corpus: English dictionary + lyrics interleaved
- Scaled dimensions: 256 embed, 512 hidden, 2 layers
- CMU Pronouncing Dictionary integration for rhyme-aware vocabulary

---
## 1. Environment Setup

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from collections import Counter
import random
import os

# Device selection (MPS for Mac, CUDA for GPU, else CPU)
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple Silicon (MPS)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Reproducibility
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

Using CUDA: NVIDIA GeForce RTX 3080 Ti


---
## 2. Load & Prepare the Dual Corpus

In [3]:
# -----------------------------------------
# 2a. Load English Dictionary (Vocabulary Grounding)
# -----------------------------------------
import nltk
nltk.download('words', quiet=True)
nltk.download('cmudict', quiet=True)

from nltk.corpus import words as nltk_words
from nltk.corpus import cmudict

# Get all English words
all_english = set(w.lower() for w in nltk_words.words())

# Get CMU dictionary (words with pronunciation = rhymeable)
cmu_dict = cmudict.dict()
rhymeable_words = set(cmu_dict.keys())

# Prioritize rhymeable words, but include common English words too
# Filter to reasonable length (2-15 chars) for efficiency
vocab_words = [w for w in rhymeable_words if 2 <= len(w) <= 15]
vocab_words += [w for w in all_english if 2 <= len(w) <= 12 and w not in rhymeable_words]

print(f"Dictionary corpus: {len(vocab_words):,} words")

Dictionary corpus: 286,522 words


In [4]:
# -----------------------------------------
# 2b. Load Lyrics Corpus
# -----------------------------------------
CSV_PATH = "lyrics_raw.csv"  # Adjust path as needed

df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df):,} tracks")
print(f"Columns: {list(df.columns)}")
print(f"Artists: {df['artist'].nunique()} unique")

Loaded 530 tracks
Columns: ['track_name', 'artist', 'raw_lyrics', 'artist_verses']
Artists: 11 unique


In [5]:
# -----------------------------------------
# 2c. Extract 16-Bar Verses from Lyrics
# -----------------------------------------
def extract_verses(texts, min_bars=8, max_bars=16):
    """Split lyrics into chunks of 8-16 bars (lines)."""
    verses = []
    for txt in texts:
        if not isinstance(txt, str):
            continue
        lines = [ln.strip() for ln in txt.splitlines() if ln.strip()]
        
        # Clean out common noise
        lines = [ln for ln in lines if not ln.startswith('See ') 
                 and 'tickets as low as' not in ln
                 and 'You might also like' not in ln]
        
        for i in range(0, len(lines), max_bars):
            chunk = lines[i:i + max_bars]
            if len(chunk) >= min_bars:
                verses.append("\n".join(chunk))
    return verses

lyrics_texts = df["artist_verses"].dropna().tolist()
sixteen_bar_verses = extract_verses(lyrics_texts)

print(f"Extracted {len(sixteen_bar_verses):,} verse chunks (8-16 bars each)")

Extracted 2,189 verse chunks (8-16 bars each)


In [6]:
# Preview a sample verse
print("=" * 50)
print("SAMPLE VERSE:")
print("=" * 50)
print(random.choice(sixteen_bar_verses))

SAMPLE VERSE:
Young Money raised me, grew up out in Baisley
Southside Jamaica, Queens, and it's crazy
'Cause I'm still hood, Hollywood couldn't change me
Shout out to my haters, sorry that you couldn't faze me
Ain't being cocky, we just vindicated
Best believe that when we done, this moment will be syndicated
I don't know, this night just remind me of
Everything they deprived me of (Yeah)
Put your drinks up
It's a celebration every time we link up
We done did everything they can think of
Greatness is what we on the brink of
I wish that I could have this moment for life, for life, for life
'Cause in this moment, I just feel so alive, alive, alive
I wish that I could have this moment for life, for life, for life
This is my moment, I just feel so alive, alive, alive


In [7]:
# -----------------------------------------
# 2d. Build Combined Corpus
# -----------------------------------------
# Strategy: Interleave dictionary words with lyrics
# This teaches the model both "what words look like" and "how verses flow"

def build_dual_corpus(verses, dictionary_words, dict_ratio=0.3):
    """
    Build a training corpus that interleaves:
    - Verse chunks (primary content)
    - Dictionary word blocks (vocabulary grounding)
    
    dict_ratio: fraction of corpus that should be dictionary words
    """
    corpus_parts = []
    
    # Shuffle dictionary words
    shuffled_dict = dictionary_words.copy()
    random.shuffle(shuffled_dict)
    
    # Calculate how many dictionary blocks we need
    num_verses = len(verses)
    dict_blocks_needed = int(num_verses * dict_ratio / (1 - dict_ratio))
    
    # Create dictionary blocks (groups of ~50 words separated by newlines)
    words_per_block = 50
    dict_blocks = []
    for i in range(0, min(len(shuffled_dict), dict_blocks_needed * words_per_block), words_per_block):
        block = "\n".join(shuffled_dict[i:i + words_per_block])
        dict_blocks.append(block)
    
    print(f"Created {len(dict_blocks):,} dictionary blocks")
    
    # Interleave: verse, dict, verse, dict, ...
    verse_idx = 0
    dict_idx = 0
    
    while verse_idx < len(verses):
        # Add a verse
        corpus_parts.append(verses[verse_idx])
        verse_idx += 1
        
        # Occasionally add a dictionary block (based on ratio)
        if random.random() < dict_ratio and dict_idx < len(dict_blocks):
            corpus_parts.append(dict_blocks[dict_idx])
            dict_idx += 1
    
    return "\n\n".join(corpus_parts)

corpus = build_dual_corpus(sixteen_bar_verses, vocab_words, dict_ratio=0.25)
print(f"\nTotal corpus length: {len(corpus):,} characters")

Created 729 dictionary blocks

Total corpus length: 1,614,351 characters


In [8]:
# -----------------------------------------
# 2e. Build Character Vocabulary
# -----------------------------------------
chars = sorted(list(set(corpus)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

print(f"Character vocabulary size: {vocab_size}")
print(f"Characters: {''.join(chars[:50])}... (showing first 50)")

Character vocabulary size: 261
Characters: 
 !"#$%&'()*+,-./:;<?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\^... (showing first 50)


In [9]:
# -----------------------------------------
# 2f. Encode Corpus to Tensor
# -----------------------------------------
encoded = torch.tensor([stoi[c] for c in corpus], dtype=torch.long)
print(f"Encoded tensor shape: {encoded.shape}")

# Train/val split (90/10)
split_idx = int(len(encoded) * 0.9)
train_data = encoded[:split_idx]
val_data = encoded[split_idx:]

print(f"Train: {len(train_data):,} chars | Val: {len(val_data):,} chars")

Encoded tensor shape: torch.Size([1614351])
Train: 1,452,915 chars | Val: 161,436 chars


---
## 3. Batch Generator

In [10]:
# -----------------------------------------
# Configurable batch generator
# -----------------------------------------
BLOCK_SIZE = 128  # Context length (chars the model sees at once)

def get_batch(split='train', batch_size=64):
    """Generate a batch of training examples."""
    data = train_data if split == 'train' else val_data
    
    # Random starting indices
    ix = torch.randint(len(data) - BLOCK_SIZE - 1, (batch_size,))
    
    x = torch.stack([data[i:i + BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i + 1:i + BLOCK_SIZE + 1] for i in ix])
    
    return x.to(device), y.to(device)

# Test batch
x_test, y_test = get_batch()
print(f"Batch shapes: x={x_test.shape}, y={y_test.shape}")

Batch shapes: x=torch.Size([64, 128]), y=torch.Size([64, 128])


---
## 4. Model Architecture ‚Äî LSTM-Based RhymeLM

In [11]:
class RhymeLM(nn.Module):
    """
    LSTM-based character language model.
    
    Architecture:
    - Character embedding layer
    - Multi-layer LSTM (captures sequential dependencies)
    - Dropout for regularization
    - Linear projection to vocabulary
    """
    
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, num_layers=2, dropout=0.2):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Embedding: characters ‚Üí vectors
        self.embed = nn.Embedding(vocab_size, embed_dim)
        
        # LSTM backbone
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        # Regularization
        self.dropout = nn.Dropout(dropout)
        
        # Output projection
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Xavier initialization for better training dynamics."""
        for name, param in self.named_parameters():
            if 'weight' in name and param.dim() > 1:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)
    
    def forward(self, x, hidden=None):
        """
        Forward pass.
        
        Args:
            x: (batch, seq_len) token indices
            hidden: optional (h_0, c_0) tuple for LSTM
            
        Returns:
            logits: (batch, seq_len, vocab_size)
            hidden: updated (h_n, c_n) tuple
        """
        # Embed characters
        emb = self.embed(x)  # (B, T, E)
        emb = self.dropout(emb)
        
        # LSTM
        if hidden is None:
            lstm_out, hidden = self.lstm(emb)
        else:
            lstm_out, hidden = self.lstm(emb, hidden)
        
        # Project to vocabulary
        lstm_out = self.dropout(lstm_out)
        logits = self.fc(lstm_out)  # (B, T, V)
        
        return logits, hidden
    
    def init_hidden(self, batch_size):
        """Initialize hidden state with zeros."""
        h = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        c = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return (h, c)

In [12]:
# -----------------------------------------
# Instantiate Model
# -----------------------------------------
model = RhymeLM(
    vocab_size=vocab_size,
    embed_dim=256,
    hidden_dim=512,
    num_layers=2,
    dropout=0.2
).to(device)

# Count parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {num_params:,}")
print(model)

Model parameters: 3,878,917
RhymeLM(
  (embed): Embedding(261, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=261, bias=True)
)


---
## 5. Training Loop

In [13]:
# -----------------------------------------
# Training Configuration
# -----------------------------------------
LEARNING_RATE = 1e-3
BATCH_SIZE = 64
NUM_STEPS = 50000
EVAL_INTERVAL = 1000
SAMPLE_INTERVAL = 5000
CHECKPOINT_INTERVAL = 10000

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_STEPS)

In [14]:
# -----------------------------------------
# Loss Estimation Function
# -----------------------------------------
@torch.no_grad()
def estimate_loss(eval_iters=100):
    """Estimate loss on train and val sets."""
    model.eval()
    losses = {}
    
    for split in ['train', 'val']:
        batch_losses = []
        for _ in range(eval_iters):
            x, y = get_batch(split, BATCH_SIZE)
            logits, _ = model(x)
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B * T, V), y.view(B * T))
            batch_losses.append(loss.item())
        losses[split] = np.mean(batch_losses)
    
    model.train()
    return losses

In [15]:
# -----------------------------------------
# Generation Function (for sampling during training)
# -----------------------------------------
@torch.no_grad()
def generate_verse(start_text=" ", num_bars=16, temperature=0.8, max_chars=2000):
    """
    Generate a verse with the specified number of bars (lines).
    
    Args:
        start_text: Initial characters to seed generation
        num_bars: Number of lines to generate
        temperature: Sampling temperature (lower = more conservative)
        max_chars: Safety limit on generation length
    """
    model.eval()
    
    # Encode starting text
    tokens = [stoi.get(c, 0) for c in start_text]
    x = torch.tensor([tokens], dtype=torch.long).to(device)
    
    hidden = model.init_hidden(1)
    generated = list(start_text)
    bar_count = 0
    
    while bar_count < num_bars and len(generated) < max_chars:
        logits, hidden = model(x, hidden)
        logits = logits[:, -1, :] / temperature
        
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        
        next_char = itos[next_token.item()]
        generated.append(next_char)
        
        if next_char == '\n':
            bar_count += 1
        
        x = next_token
    
    model.train()
    return ''.join(generated)

In [16]:
# -----------------------------------------
# Checkpoint Functions
# -----------------------------------------
def save_checkpoint(path="rhyme_lm_v2.pt", step=0):
    """Save model checkpoint."""
    torch.save({
        'step': step,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'scheduler_state': scheduler.state_dict(),
        'stoi': stoi,
        'itos': itos,
        'vocab_size': vocab_size,
        'block_size': BLOCK_SIZE,
        'config': {
            'embed_dim': 256,
            'hidden_dim': 512,
            'num_layers': 2,
            'dropout': 0.2
        }
    }, path)
    print(f"üíæ Saved checkpoint to {path}")

def load_checkpoint(path="rhyme_lm_v2.pt"):
    """Load model checkpoint."""
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optimizer_state'])
    scheduler.load_state_dict(checkpoint['scheduler_state'])
    print(f"‚úÖ Loaded checkpoint from {path} (step {checkpoint['step']})")
    return checkpoint['step']

In [17]:
# -----------------------------------------
# Main Training Loop
# -----------------------------------------
print("Starting training...")
print(f"Config: {NUM_STEPS} steps, batch_size={BATCH_SIZE}, lr={LEARNING_RATE}")
print("=" * 60)

train_losses = []
val_losses = []

for step in range(1, NUM_STEPS + 1):
    # Get batch
    x, y = get_batch('train', BATCH_SIZE)
    
    # Forward pass
    logits, _ = model(x)
    B, T, V = logits.shape
    loss = F.cross_entropy(logits.view(B * T, V), y.view(B * T))
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    
    # Logging
    if step % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])
        lr = scheduler.get_last_lr()[0]
        print(f"Step {step:>6} | train loss: {losses['train']:.4f} | val loss: {losses['val']:.4f} | lr: {lr:.6f}")
    
    # Sample generation
    if step % SAMPLE_INTERVAL == 0:
        print("\n" + "=" * 40)
        print("üé§ SAMPLE VERSE:")
        print("=" * 40)
        print(generate_verse(num_bars=8, temperature=0.7))
        print("=" * 40 + "\n")
    
    # Checkpointing
    if step % CHECKPOINT_INTERVAL == 0:
        save_checkpoint(f"rhyme_lm_v2_step{step}.pt", step)

print("\n‚úÖ Training complete!")
save_checkpoint("rhyme_lm_v2_final.pt", NUM_STEPS)

Starting training...
Config: 50000 steps, batch_size=64, lr=0.001
Step   1000 | train loss: 1.6452 | val loss: 1.7123 | lr: 0.000999
Step   2000 | train loss: 1.4305 | val loss: 1.6247 | lr: 0.000996
Step   3000 | train loss: 1.3171 | val loss: 1.6045 | lr: 0.000991
Step   4000 | train loss: 1.2179 | val loss: 1.6133 | lr: 0.000984
Step   5000 | train loss: 1.1498 | val loss: 1.6213 | lr: 0.000976

üé§ SAMPLE VERSE:
 expedent
Watch the new York state of mind
We can see the roof comes off, 'til the roof comes off, 'til the truth I did
And I'm a friend and the place
They say the police in the death of the streets
When they see a lot of guys, we lookin' for her
The new state of the real motherfuckers as white tits, it's like Kobe marrie
My daddy crack me at the court, the millions are real


Step   6000 | train loss: 1.1025 | val loss: 1.6326 | lr: 0.000965
Step   7000 | train loss: 1.0540 | val loss: 1.6485 | lr: 0.000952
Step   8000 | train loss: 1.0167 | val loss: 1.6681 | lr: 0.00093

---
## 6. Generation Interface

In [18]:
# -----------------------------------------
# Generate a Full 16-Bar Verse
# -----------------------------------------
def write_16(prompt="I ", temperature=0.8):
    """
    Generate a 16-bar verse.
    
    Args:
        prompt: Starting text to seed the generation
        temperature: Controls randomness (0.5=conservative, 1.0=creative)
    """
    verse = generate_verse(start_text=prompt, num_bars=16, temperature=temperature)
    print("üé§ " + "=" * 50)
    print(verse)
    print("=" * 53)
    return verse

# Try it out!
write_16(prompt="Yeah, ", temperature=0.75)

Yeah, oh-oh-oh
Oh-oh, oh-oh, oh-oh
Oh-oh-oh
Oh-oh, oh-oh, oh-oh
Oh-oh-oh
Oh-oh, oh-oh, oh-oh
Bitch, don't get too comfortable
Better not get too comfortable
Better not get too comfortable
Four or five hoes, stuff 'em in a Lamborghini, yeah (Move your feet)
We been on the low for a while
I been seeing the setter from the hood, grrah, brother
Ball in his way seems so much from the same thing that we were in the back
Close to the bag on, and we just did addicting the globe

Bitch, don't get too comfortable



"Yeah, oh-oh-oh\nOh-oh, oh-oh, oh-oh\nOh-oh-oh\nOh-oh, oh-oh, oh-oh\nOh-oh-oh\nOh-oh, oh-oh, oh-oh\nBitch, don't get too comfortable\nBetter not get too comfortable\nBetter not get too comfortable\nFour or five hoes, stuff 'em in a Lamborghini, yeah (Move your feet)\nWe been on the low for a while\nI been seeing the setter from the hood, grrah, brother\nBall in his way seems so much from the same thing that we were in the back\nClose to the bag on, and we just did addicting the globe\n\nBitch, don't get too comfortable\n"

In [19]:
# -----------------------------------------
# Temperature Comparison
# -----------------------------------------
print("TEMPERATURE COMPARISON")
print("=" * 60)

for temp in [0.5, 0.7, 0.9, 1.0]:
    print(f"\nüå°Ô∏è Temperature: {temp}")
    print("-" * 40)
    verse = generate_verse(start_text="I ", num_bars=4, temperature=temp)
    print(verse)
    print()

TEMPERATURE COMPARISON

üå°Ô∏è Temperature: 0.5
----------------------------------------
I ever say girl there's no longer than me
I wanna see the family way back to the floor now
I know they tryna make it but I did it (Huh?)
I see no case and she said "No comments"



üå°Ô∏è Temperature: 0.7
----------------------------------------
I start a few outlaw like selfish
Maybe that's your words to see your man
You the one who did that girl in the margin
Tryna see the corner put on some and strapped with a golden chare



üå°Ô∏è Temperature: 0.9
----------------------------------------
I take it, I got a flo life
Say she reople that family hustler, you're still out here
But it's in a party's side, when we bigger white like Dallaze




üå°Ô∏è Temperature: 1.0
----------------------------------------
I didn't deal with the best when I gotta start tlied struggle, and I got to come up
When the hood runnin' out the park study and bitches
I moved in those S with the G case and drink, I'm Halid

---
## 7. Training Visualization

In [None]:
import matplotlib.pyplot as plt

if train_losses and val_losses:
    plt.figure(figsize=(10, 5))
    steps = [i * EVAL_INTERVAL for i in range(1, len(train_losses) + 1)]
    
    plt.plot(steps, train_losses, label='Train Loss', alpha=0.8)
    plt.plot(steps, val_losses, label='Val Loss', alpha=0.8)
    
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.title('RhymeLM v2 Training Progress')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('training_curve.png', dpi=150)
    plt.show()
else:
    print("No training history to plot yet.")

---
## 8. Resume Training

In [None]:
# Uncomment to continue training from checkpoint:
# start_step = load_checkpoint("rhyme_lm_v2_step10000.pt")

def train_more(additional_steps=10000, batch_size=64):
    """Continue training for more steps."""
    print(f"Training for {additional_steps:,} more steps...")
    
    for step in range(1, additional_steps + 1):
        x, y = get_batch('train', batch_size)
        
        logits, _ = model(x)
        B, T, V = logits.shape
        loss = F.cross_entropy(logits.view(B * T, V), y.view(B * T))
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        if step % 1000 == 0:
            print(f"Step {step}/{additional_steps}, loss: {loss.item():.4f}")
        
        if step % 5000 == 0:
            print("\n--- Sample ---")
            print(generate_verse(num_bars=4, temperature=0.7))
            print("-" * 30 + "\n")
    
    print("Done!")

# train_more(20000)

In [1]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'N/A'}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("‚úÖ You're good to go with NVIDIA!")
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    print("‚úÖ Apple Silicon (MPS) available")
else:
    print("\n‚ö†Ô∏è  No GPU detected - running on CPU")
    print("\nFor NVIDIA on Fedora, try:")
    print("  sudo dnf install akmod-nvidia xorg-x11-drv-nvidia-cuda")
    print("  pip install torch --index-url https://download.pytorch.org/whl/cu118")

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
GPU: NVIDIA GeForce RTX 3080 Ti
‚úÖ You're good to go with NVIDIA!
