In [1]:
!pip install datasets
# !pip install ipywidgets
!pip install tqdm
# !pip install tensorflow

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [3]:
from datasets import load_dataset

dataset_raw = load_dataset("sebastiandizon/genius-song-lyrics")
#dataset = load_dataset("sebastiandizon/genius-song-lyrics", split="train[:100000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


song_lyrics%202.csv:   0%|          | 0.00/9.07G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
dataset = dataset_raw["train"].shuffle(seed=42).select(range(10000))

In [5]:
dataset

Dataset({
    features: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language'],
    num_rows: 10000
})

In [6]:
# drop id
dataset = dataset.remove_columns(['id'])
#drop views
dataset = dataset.remove_columns(['views'])

In [7]:
import multiprocessing
num_cpus = multiprocessing.cpu_count()
num_cpus

2

In [8]:
# drop non-english songs
dataset = dataset.filter(
    lambda song: song['language'] == 'en' and song['language_ft'] == 'en' and song['language_cld3'] == 'en',
    num_proc = num_cpus
)
dataset = dataset.remove_columns(['language_cld3', 'language_ft', 'language', 'artist', 'year', 'features'])

Filter (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# see how many observations/rows in dataset along with our features
dataset

Dataset({
    features: ['title', 'tag', 'lyrics'],
    num_rows: 6613
})

In [10]:
import re

def generate_tokens(batch):
    """ Tokenizer that assigns each word to a song section correctly. """
    tokenized_lyrics = []

    for lyrics in batch["lyrics"]:
        tokens = re.findall(r"[\w']+|[.,!?;(){}\[\]]", lyrics)  # Tokenize with punctuation
        section = "Unknown"  # Default section for words before the first section label
        in_section_label = False  # Flag for detecting section headers
        sectioned_tokens = []  # Store (word, section) pairs

        for token in tokens:
            if token == "[":
                in_section_label = True  # Start detecting section name
                continue
            elif in_section_label:
                section = token  # Set new section name
                in_section_label = False  # Reset flag
                continue
            elif token == "]":
                continue  # Ignore the closing bracket

            # Only store tokens that are actual words, not stray brackets or empty sections
            if token.isalnum():
                sectioned_tokens.append((token, section))

        tokenized_lyrics.append(sectioned_tokens)

    return {"lyric_tokens": tokenized_lyrics}

dataset = dataset.map(generate_tokens, batched=True, num_proc=num_cpus)

Map (num_proc=2):   0%|          | 0/6613 [00:00<?, ? examples/s]

In [11]:
dataset

Dataset({
    features: ['title', 'tag', 'lyrics', 'lyric_tokens'],
    num_rows: 6613
})

In [12]:
# check our tags/genres
set(dataset['tag'])

{'country', 'misc', 'pop', 'rap', 'rb', 'rock'}

In [35]:
import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm

class HMM:
    def __init__(self, states, tag, vocab_size=10000, smoothing=1):
        self.states = states # song sections (ex. Chorus, Intro, Hook)
        self.tag = tag
        self.S = len(states)
        self.vocab_size = vocab_size
        self.smoothing = smoothing  # laplace smoothing

        # transitions: P(next_section | current_section)
        self.transition_matrix = np.zeros((self.S, self.S))

        # emission probs: P(word | section)
        self.vocab = []
        self.word_counts = defaultdict(lambda: defaultdict(int))  # word_counts[section][word] = count

        self.T = 0  # num words in vocab (set later)
        self.emission_probs = None  # Will be initialized in `fit()`

    def fit(self, dataset):
        transition_counts = defaultdict(lambda: defaultdict(int))
        section_counts = defaultdict(int)
        word_totals = defaultdict(int)
        vocab_counter = Counter()

        # calculate transition and emission counts
        for _, song in tqdm(enumerate(dataset), total=len(dataset)):
            if song['tag'] != self.tag:
                continue

            prev_section = None
            for data in song['lyric_tokens']:
                word, section = data
                word = word.lower()
                vocab_counter[word] += 1
                self.word_counts[section][word] += 1
                section_counts[section] += 1
                word_totals[word] += 1

                if prev_section is not None:
                    transition_counts[prev_section][section] += 1  # update transition count
                prev_section = section  # update previous

        # keep most common words
        self.vocab = [word for word, _ in vocab_counter.most_common(self.vocab_size)]
        self.T = len(self.vocab)
        self.emission_probs = np.zeros((self.S, self.T))  # Initialize emission matrix

        section_to_idx = {s: i for i, s in enumerate(self.states)}
        word_to_idx = {w: i for i, w in enumerate(self.vocab)}

        # compute Transition Probabilities (with Laplace smoothing)
        for _, s1 in tqdm(enumerate(self.states), total=len(self.states)):
            total_transitions = sum(transition_counts[s1].values()) + self.S * self.smoothing
            for s2 in self.states:
                self.transition_matrix[section_to_idx[s1], section_to_idx[s2]] = (transition_counts[s1][s2] + self.smoothing) / total_transitions

        # compute Emission Probabilities (with Laplace smoothing)
        for _, section in tqdm(enumerate(self.states), total=len(self.states)):
            section_idx = section_to_idx[section]
            total_words = section_counts[section] + self.T * self.smoothing

            for word in self.vocab:
                word_idx = word_to_idx[word]
                self.emission_probs[section_idx, word_idx] = (self.word_counts[section][word] + self.smoothing) / total_words

    def viterbi_generate(self, start_section="Intro", length=50, temperature=1.0):
        generated_lyrics = []
        section_sequence = []

        section_to_idx = {s: i for i, s in enumerate(self.states)}
        word_to_idx = {w: i for i, w in enumerate(self.vocab)}
        idx_to_word = {i: w for w, i in word_to_idx.items()}
        idx_to_section = {i: s for s, i in section_to_idx.items()}

        # start at the given section
        current_section = start_section
        section_sequence.append(current_section)

        # randomly sample the next word instead of greedily choosing next word to produce less predictable results
        for _ in tqdm(range(length)):
            section_idx = section_to_idx[current_section]

            # randomly sample next word
            word_probs = np.exp(self.emission_probs[section_idx] / temperature) / np.sum(np.exp(self.emission_probs[section_idx] / temperature)) # apply softmax with temperature to get probs
            word_idx = np.random.choice(self.T, p=word_probs)
            generated_lyrics.append(idx_to_word[word_idx])

            # use the raw transition probabilities to sample the next section
            next_section_probs = np.exp(self.transition_matrix[section_idx] / temperature) / np.sum(np.exp(self.transition_matrix[section_idx] / temperature))
            next_section_idx = np.random.choice(self.S, p=next_section_probs)
            current_section = idx_to_section[next_section_idx]
            section_sequence.append(current_section)

        return " ".join(generated_lyrics), section_sequence

    def get_log_likelihood(self, lyrics):
        section_to_idx = {s: i for i, s in enumerate(self.states)}
        word_to_idx = {w: i for i, w in enumerate(self.vocab)}

        log_transitions = np.log(self.transition_matrix)
        log_emissions = np.log(self.emission_probs)

        L = len(lyrics)

        # get initial probs
        initial_log_probs = np.zeros(self.S)
        for section in self.states:
            initial_log_probs[section_to_idx[section]] = np.log((sum(self.word_counts[section].values()) + self.vocab_size) / self.vocab_size ** 2)

        alpha = np.zeros((L, self.S))
        for section in range(self.S):
            alpha[0, section] = initial_log_probs[section] + log_emissions[section, word_to_idx[lyrics[0]]]

        # use forward algorithm to calculate the log likelihood of the given lyrics
        for word_index in range(1, L):
            cur_word = lyrics[word_index]
            for cur_state in range(self.S):
                for prev_state in range(self.S):
                    log_obs = log_emissions[cur_state, word_to_idx[cur_word]] if cur_word in self.vocab else np.log(1 / self.vocab_size)
                    alpha[word_index, cur_state] += log_obs + np.log(self.transition_matrix[prev_state, cur_state]) + alpha[word_index - 1, prev_state]
        #print(alpha)

        return np.sum(alpha[L - 1])

In [36]:
states = [
    "Verse",
    "Chorus",
    "Refrain",
    "Hook",
    "Bridge",
    "Breakdown",
    "Intro",
    "Outro",
    "Skit",
    "Pre-Chorus",
    "Post-Chorus",
    "Interlude",
    "Segue",
    "Instrumental",
    "Instrumental Break",
    "Snippet",
]

In [37]:
hmm_model = HMM(states, tag="pop", vocab_size=500)
hmm_model.fit(dataset)

100%|██████████| 6613/6613 [00:11<00:00, 567.33it/s]
100%|██████████| 16/16 [00:00<00:00, 30883.05it/s]
100%|██████████| 16/16 [00:00<00:00, 2347.45it/s]


In [38]:
# generate lyrics with viterbi
generated_song, section_sequence = hmm_model.viterbi_generate(start_section="Verse", length=100, temperature=0.25)

print("\n\nGenerated Lyrics:")
print(generated_song)
print("\nSection Sequence:")
print(section_sequence)

100%|██████████| 100/100 [00:00<00:00, 7078.76it/s]



Generated Lyrics:
woah were easy line gone from seems find talk be now till leave little must full or everybody ah done smile night okay until full ask chorus moon upon gonna such got change your hate or head getting how won for we shine here world somebody fuck started dead seems eyes hit free feel just past ll arms hey tried these to his his more do its going fucking thought seems my everybody call think dark beautiful will matter cause those o being air together cool blow hand he things brain behind la do other trying last thinking how tomorrow

Section Sequence:
['Verse', 'Verse', 'Verse', 'Verse', 'Pre-Chorus', 'Refrain', 'Intro', 'Intro', 'Intro', 'Intro', 'Intro', 'Instrumental Break', 'Instrumental', 'Instrumental', 'Refrain', 'Refrain', 'Segue', 'Pre-Chorus', 'Intro', 'Intro', 'Intro', 'Refrain', 'Refrain', 'Refrain', 'Verse', 'Verse', 'Verse', 'Interlude', 'Post-Chorus', 'Segue', 'Chorus', 'Chorus', 'Chorus', 'Bridge', 'Bridge', 'Bridge', 'Bridge', 'Bridge', 'Bridge', 'Outr




In [39]:
# to evaluate our models, we will use lyrics from Gotye's "Somebody That I Used to Know"
test_lyrics = ['now', 'and', 'then', ',', 'i', 'think', 'of', 'all', 'the', 'times', 'you', 'screwed', 'me', 'over']
hmm_model.get_log_likelihood(test_lyrics)

-1.578536794109031e+18

In [40]:
# to evaluate our models, we will use lyrics from Whitney Houston's "I Wanna Dance With Somebody"
test_lyrics = ['i', 'wanna', 'dance', 'with', 'somebody']
hmm_model.get_log_likelihood(test_lyrics)

-23170310.177259937

In [41]:
# simple case to evaluate
test_lyrics = ['i', 'love', 'you']
hmm_model.get_log_likelihood(test_lyrics)

-87293.94117342455