In [1]:
!pip install datasets
# !pip install ipywidgets
!pip install tqdm
# !pip install tensorflow



In [79]:
#from datasets import load_dataset

dataset_raw = load_dataset("sebastiandizon/genius-song-lyrics")
#dataset = load_dataset("sebastiandizon/genius-song-lyrics", split="train[:100000]")

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [136]:
dataset = dataset_raw["train"].shuffle(seed=42).select(range(10000))

In [137]:
dataset

Dataset({
    features: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language'],
    num_rows: 10000
})

In [138]:
# drop id
dataset = dataset.remove_columns(['id'])
#drop views
dataset = dataset.remove_columns(['views'])

In [139]:
import multiprocessing
num_cpus = multiprocessing.cpu_count()
num_cpus

4

In [140]:
# drop non-english songs
dataset = dataset.filter(
    lambda song: song['language'] == 'en' and song['language_ft'] == 'en' and song['language_cld3'] == 'en',
    num_proc = num_cpus
)
dataset = dataset.remove_columns(['language_cld3', 'language_ft', 'language', 'artist', 'year', 'features'])

Filter (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [141]:
# see how many observations/rows in dataset along with our features
dataset

Dataset({
    features: ['title', 'tag', 'lyrics'],
    num_rows: 6613
})

In [142]:
import re

def generate_tokens(batch):
    """ Tokenizer that assigns each word to a song section correctly. """
    tokenized_lyrics = []

    for lyrics in batch["lyrics"]:
        tokens = re.findall(r"[\w']+|[.,!?;(){}\[\]]", lyrics)  # Tokenize with punctuation
        section = "Unknown"  # Default section for words before the first section label
        in_section_label = False  # Flag for detecting section headers
        sectioned_tokens = []  # Store (word, section) pairs

        for token in tokens:
            if token == "[":  
                in_section_label = True  # Start detecting section name
                continue
            elif in_section_label:
                section = token  # Set new section name
                in_section_label = False  # Reset flag
                continue
            elif token == "]":
                continue  # Ignore the closing bracket
            
            # Only store tokens that are actual words, not stray brackets or empty sections
            if token.isalnum():  
                sectioned_tokens.append((token, section))

        tokenized_lyrics.append(sectioned_tokens)

    return {"lyric_tokens": tokenized_lyrics}

dataset = dataset.map(generate_tokens, batched=True, num_proc=num_cpus)

Map (num_proc=4):   0%|          | 0/6613 [00:00<?, ? examples/s]

In [143]:
dataset

Dataset({
    features: ['title', 'tag', 'lyrics', 'lyric_tokens'],
    num_rows: 6613
})

In [144]:
# check our tags/genres
set(dataset['tag'])

{'country', 'misc', 'pop', 'rap', 'rb', 'rock'}

In [145]:
import numpy as np

class HMM:
    def __init__(self, states, tag, vocab_size=10000, smoothing=1):
        self.states = states  # Song sections (Verse, Chorus, etc.)
        self.tag = tag  # Genre or category
        self.S = len(states)  # Number of states
        self.vocab_size = vocab_size  # Max vocab size
        self.smoothing = smoothing  # Laplace smoothing

        # Transition Matrix: P(next_section | current_section)
        self.transition_matrix = np.zeros((self.S, self.S))

        # Emission Probabilities: P(word | section)
        self.obs_vals = []  # Vocabulary
        self.word_counts = defaultdict(lambda: defaultdict(int))  # word_counts[section][word] = count

        self.T = 0  # Number of words in vocab (set later)
        self.emission_probs = None  # Will be initialized in `fit()`

    def fit(self, dataset):
        """ Train HMM using song lyrics dataset. """
        transition_counts = defaultdict(lambda: defaultdict(int))
        section_counts = defaultdict(int)  # Counts occurrences of each section
        word_totals = defaultdict(int)  # Counts total words across all sections
        vocab_counter = Counter()  # Global word frequency

        # Collect transition and emission counts
        for song in dataset:
            if song['tag'] != self.tag:
                continue

            prev_section = None  # Track previous section

            for data in song['lyric_tokens']:
                if len(data) != 2:
                    print("Error - tuple greater than length 2")
                    print(data)
                    break
                word, section = data
                word = word.lower()  # Normalize case
                vocab_counter[word] += 1  # Track word count
                self.word_counts[section][word] += 1  # Track in section
                section_counts[section] += 1  # Track section occurrences
                word_totals[word] += 1  # Total occurrences

                if prev_section is not None:
                    transition_counts[prev_section][section] += 1  # Update transition count
                prev_section = section  # Update previous

        # Keep most common words
        self.obs_vals = [word for word, _ in vocab_counter.most_common(self.vocab_size)]
        self.T = len(self.obs_vals)
        self.emission_probs = np.zeros((self.S, self.T))  # Initialize emission matrix

        # Map section names and words to indices
        section_to_idx = {s: i for i, s in enumerate(self.states)}
        word_to_idx = {w: i for i, w in enumerate(self.obs_vals)}

        # Compute Transition Probabilities (with Laplace smoothing)
        for s1 in self.states:
            total_transitions = sum(transition_counts[s1].values()) + (self.S * self.smoothing)
            for s2 in self.states:
                self.transition_matrix[section_to_idx[s1], section_to_idx[s2]] = (
                    (transition_counts[s1][s2] + self.smoothing) / total_transitions
                )

        # Compute Emission Probabilities (with Laplace smoothing)
        for section in self.states:
            section_idx = section_to_idx[section]
            total_words = section_counts[section] + (self.T * self.smoothing)

            for word in self.obs_vals:
                word_idx = word_to_idx[word]
                self.emission_probs[section_idx, word_idx] = (
                    (self.word_counts[section][word] + self.smoothing) / total_words
                )

    def viterbi_generate(self, start_section="Verse", length=50, temperature=1.0):
        """ Generate lyrics using a pure Viterbi approach (without modifying transition probabilities). """
        if not self.emission_probs.any():
            raise ValueError("Model must be trained using fit() before generating.")
    
        generated_lyrics = []
        section_sequence = []
    
        # Map section names and words to indices
        section_to_idx = {s: i for i, s in enumerate(self.states)}
        word_to_idx = {w: i for i, w in enumerate(self.obs_vals)}
        idx_to_word = {i: w for w, i in word_to_idx.items()}
        idx_to_section = {i: s for s, i in section_to_idx.items()}
    
        # Start at the given section
        current_section = start_section
        section_sequence.append(current_section)
    
        for _ in range(length):
            section_idx = section_to_idx[current_section]
    
            # Sample a word instead of always choosing the most probable one
            word_probs = self.emission_probs[section_idx]
            word_probs = np.exp(word_probs / temperature)  # Apply temperature scaling
            word_probs /= np.sum(word_probs)  # Normalize
            word_idx = np.random.choice(self.T, p=word_probs)  # Sample a word
            
            generated_lyrics.append(idx_to_word[word_idx])
    
            # Use the raw transition probabilities to sample the next section
            next_section_probs = self.transition_matrix[section_idx]
            next_section_probs = np.exp(next_section_probs / temperature)  # Apply softmax scaling
            next_section_probs /= np.sum(next_section_probs)  # Normalize
            
            # Sample the next section
            next_section_idx = np.random.choice(self.S, p=next_section_probs)
            current_section = idx_to_section[next_section_idx]
            section_sequence.append(current_section)
    
        return " ".join(generated_lyrics), section_sequence

In [146]:
states = [
    "Verse",
    "Chorus",
    "Refrain",
    "Hook",
    "Bridge",
    "Breakdown",
    "Intro",
    "Outro",
    "Skit",
    "Pre-Chorus",
    "Post-Chorus",
    "Interlude",
    "Segue",
    "Instrumental",
    "Instrumental Break",
    "Snippet",
]

In [149]:
hmm_model = HMM(states, tag="rap", vocab_size=500)
hmm_model.fit(dataset)

# Generate lyrics with Viterbi
generated_song, section_sequence = hmm_model.viterbi_generate(start_section="Verse", length=100, temperature=0.25)

print("Generated Lyrics:")
print(generated_song)
print("\nSection Sequence:")
print(section_sequence)

Generated Lyrics:
cool out yea used tryna until any going crazy on feel something since t other right right shot take roll did when don la bro though hot yuh want break never most world pull diamonds only of red gon dont rich every maybe not pack all after living hard when words knew truth thought gun huh cash trap heard some around mind good streets go pain thing pass looking boys fast how black rappers getting green yea home listen trust a em broke hoes shoot hook these that finna yes rap friend every move about weed rock them pain body

Section Sequence:
['Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Verse', 'Post-Chorus', 'Bridge', 'Snippet', 'Pre-Chorus', 'Chorus', 'Chorus', 'Chorus', 'Chorus', 'Chorus', 'Segue', 'Refrain', 'Refrain', 'Pre-Chorus', 'Outro', 'Outro', 'Outro', 'Verse', 'Verse', 'Segue', 'Instrumental Break', 'Verse', 'Verse', 'Verse', 'Skit', 'Skit', 'Skit', 'Skit', 'Segue', 'Interlude'

In [66]:
hmm_model.transition_matrix

array([[9.87548638e-01, 2.33463035e-03, 3.89105058e-04, 3.11284047e-03,
        1.16731518e-03, 3.89105058e-04, 3.89105058e-04, 3.89105058e-04,
        3.89105058e-04, 3.89105058e-04, 3.89105058e-04, 3.89105058e-04,
        3.89105058e-04, 3.89105058e-04, 3.89105058e-04, 3.89105058e-04],
       [8.13835198e-03, 9.74567650e-01, 1.01729400e-03, 1.01729400e-03,
        1.01729400e-03, 1.01729400e-03, 1.01729400e-03, 2.03458800e-03,
        1.01729400e-03, 1.01729400e-03, 1.01729400e-03, 1.01729400e-03,
        1.01729400e-03, 1.01729400e-03, 1.01729400e-03, 1.01729400e-03],
       [6.25000000e-02, 6.25000000e-02, 6.25000000e-02, 6.25000000e-02,
        6.25000000e-02, 6.25000000e-02, 6.25000000e-02, 6.25000000e-02,
        6.25000000e-02, 6.25000000e-02, 6.25000000e-02, 6.25000000e-02,
        6.25000000e-02, 6.25000000e-02, 6.25000000e-02, 6.25000000e-02],
       [7.66871166e-03, 1.53374233e-03, 1.53374233e-03, 9.64723926e-01,
        1.53374233e-03, 1.53374233e-03, 1.53374233e-03, 4.601