In [1]:
!pip install datasets
# !pip install ipywidgets
!pip install tqdm
# !pip install tensorflow

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [2]:
from datasets import load_dataset

# Load the Genius song-lyrics dataset from the Hugging Face Hub. No split is
# requested, so the result is indexed by split name (the "train" split is
# picked out in a later cell).
dataset = load_dataset("sebastiandizon/genius-song-lyrics")
# Alternative kept for reference: request only the first 100,000 rows of the
# train split directly.
#dataset = load_dataset("sebastiandizon/genius-song-lyrics", split="train[:100000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


song_lyrics%202.csv:   0%|          | 0.00/9.07G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [3]:
# Sampling configuration: a fixed seed makes the shuffle reproducible, and
# SAMPLE_SIZE is the number of songs kept for the rest of the notebook.
SEED = 42
SAMPLE_SIZE = 100

# Shuffle the train split and keep its first SAMPLE_SIZE rows.
# NOTE: this rebinds `dataset` from the split mapping to a single split, so the
# cell cannot be re-run without first re-running the loading cell.
dataset = dataset["train"].shuffle(seed=SEED).select(range(SAMPLE_SIZE))

In [4]:
dataset

Dataset({
    features: ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language'],
    num_rows: 100
})

In [5]:
# remove the 'id' column from the dataset
dataset = dataset.remove_columns(column_names=['id'])

In [6]:
# remove the 'views' column from the dataset
dataset = dataset.remove_columns(column_names=['views'])

In [7]:
import multiprocessing
# Number of CPUs reported for this machine; passed as num_proc to the
# dataset filter/map calls in the following cells.
num_cpus = multiprocessing.cpu_count()
# Display the value as the cell output.
num_cpus

2

In [8]:
def is_english(song):
    """Return True only if all three language columns label the song as 'en'."""
    language_columns = ('language', 'language_ft', 'language_cld3')
    return all(song[column] == 'en' for column in language_columns)


# keep English songs only; the work is spread over num_cpus processes
dataset = dataset.filter(is_english, num_proc=num_cpus)

Filter (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# see how many observations/rows in dataset along with our features
dataset

Dataset({
    features: ['title', 'tag', 'artist', 'year', 'features', 'lyrics', 'language_cld3', 'language_ft', 'language'],
    num_rows: 73
})

In [10]:
import re

def generate_tokens(batch):
    """Tokenize every lyric string of a batch.

    A token is either a run of word characters and apostrophes, or one single
    punctuation mark out of ``. , ! ? ; ( ) { } [ ]``. Anything else
    (whitespace, dashes, colons, ...) is dropped.

    Args:
        batch: Mapping whose "lyrics" entry is a list of strings.

    Returns:
        A dict with the single key "lyric_tokens", holding one list of string
        tokens per input lyric, in the same order.
    """
    token_pattern = r"[\w']+|[.,!?;(){}\[\]]"
    tokenized = []
    for lyrics in batch["lyrics"]:
        tokenized.append(re.findall(token_pattern, lyrics))
    return {"lyric_tokens": tokenized}

# add the "lyric_tokens" column, tokenizing batches of songs across num_cpus processes
dataset = dataset.map(
    generate_tokens,
    batched=True,
    num_proc=num_cpus,
)

Map (num_proc=2):   0%|          | 0/73 [00:00<?, ? examples/s]

In [11]:
dataset

Dataset({
    features: ['title', 'tag', 'artist', 'year', 'features', 'lyrics', 'language_cld3', 'language_ft', 'language', 'lyric_tokens'],
    num_rows: 73
})

In [12]:
# check our tags/genres
set(dataset['tag'])

{'country', 'misc', 'pop', 'rap', 'rb', 'rock'}

In [18]:
import numpy as np
from collections import defaultdict, Counter

class HMM:
    """Hidden Markov model over song lyrics for a single genre tag.

    The hidden states are song sections and the observations are lowercased
    lyric tokens. The model is intended for generating lyrics and scoring the
    log likelihood of lyrics; at present only the vocabulary-building part of
    ``fit`` is implemented.
    """

    def __init__(self, states, tag, vocab_size=10000):
        """Create an untrained model.

        Args:
            states: Sequence of hidden-state labels.
            tag: Genre tag; ``fit`` ignores songs whose 'tag' differs from it.
            vocab_size: Maximum number of distinct tokens kept as observations.
        """
        self.states = states
        self.tag = tag

        # S is the number of hidden states; the S x S transition matrix stays
        # all-zero until transition probabilities are estimated.
        self.S = len(states)
        self.transition_matrix = np.zeros((self.S, self.S))

        # Observation vocabulary (filled by fit) and its size cap.
        self.obs_vals = []
        self.vocab_size = vocab_size
        # Two-level counter defaulting to 0; not written to by fit() yet.
        # Presumably state -> word -> count -- TODO confirm once it is used.
        self.word_counts = defaultdict(lambda: defaultdict(int))

        # T is the number of rows of emission_probs; it is 0 until fit() has
        # built the vocabulary.
        self.T = 0
        self.emission_probs = np.zeros((self.T, self.S))

    def fit(self, dataset):
        """Build the observation vocabulary from the songs tagged ``self.tag``.

        Tokens are lowercased and counted over all matching songs; the
        ``vocab_size`` most frequent ones become ``obs_vals`` (most frequent
        first). The vocabulary is rebuilt from scratch on every call, so
        calling ``fit`` repeatedly does not accumulate duplicate entries.

        Args:
            dataset: Iterable of songs, each a mapping with a 'tag' entry and
                a 'lyric_tokens' entry (an iterable of strings).
        """
        # Count lowercased tokens over the songs of this model's tag only.
        token_counts = Counter()
        for song in dataset:
            if song['tag'] != self.tag:
                continue
            token_counts.update(token.lower() for token in song['lyric_tokens'])

        # Rebuild instead of appending: appending made a second fit() call
        # (e.g. re-running the cell) duplicate every vocabulary entry.
        self.obs_vals = [word for word, _ in token_counts.most_common(self.vocab_size)]

        # Re-size the (still all-zero) emission table to the vocabulary.
        # NOTE(review): assumes T is meant to be the number of observation
        # values, i.e. one emission row per vocabulary word -- confirm.
        self.T = len(self.obs_vals)
        self.emission_probs = np.zeros((self.T, self.S))

        # TODO: transition probabilities (not implemented yet):
        #   P(next_state | prev_state)
        #       = (prev_state -> next_state count) / (prev_state -> any state count)

        # TODO: emission probabilities (not implemented yet):
        #   P(word | state) = P(word, state) / P(state)
        #   - for each song in the dataset, assign each word to a state
        #   - P(state) = (words in state) / (total words)
        #   - P(word | state) = (occurrences of word in state) / (words in state)



In [19]:
# Hidden states are the different sections of a song - intro, chorus, bridge, etc.
# Each entry is the opening of a section header from Genius' section header guide:
# https://genius.com/Genius-song-sections-and-headers-guide-annotated
# The closing "]" is left off every entry.
# NOTE(review): "[Instrumental" is a prefix of "[Instrumental Break"; if headers
# are matched by prefix in list order, the longer entry can never be selected --
# confirm how these strings are matched.
# NOTE(review): generate_tokens emits "[" as a token of its own, so none of these
# strings occurs verbatim in 'lyric_tokens' -- confirm they are matched against
# the raw 'lyrics' text.
states = [
    "[Verse",
    "[Chorus",
    "[Refrain",
    "[Hook",
    "[Bridge",
    "[Breakdown",
    "[Intro",
    "[Outro",
    "[Skit",
    "[Pre-Chorus",
    "[Post-Chorus",
    "[Interlude",
    "[Segue",
    "[Instrumental",
    "[Instrumental Break",
    "[Snippet",
]

In [21]:
# create an (untrained) HMM that only considers songs tagged "pop"
pop_hmm = HMM(states=states, tag="pop")