To run this code you need to first create a hugging face account. 

## Install necessary libraries

In [1]:
#!pip install datasets transformers
#!pip install wandb

In [2]:
# Import the load_dataset function from the 'datasets' library for dataset loading
from datasets import load_dataset

# Import AutoTokenizer from the 'transformers' library for handling tokenization
from transformers import AutoTokenizer

# Import Tokenizer, WordLevel model, WhitespaceSplit pre_tokenizer, and WordLevelTrainer from the 'tokenizers' library
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer

# Import PreTrainedTokenizerFast from the 'transformers' library for efficient tokenization
from transformers import PreTrainedTokenizerFast

# Import notebook_login from the 'huggingface_hub' library for logging into the Hugging Face Model Hub
from huggingface_hub import notebook_login

# Import pandas for data manipulation
import pandas as pd

# Import wandb for logging experiments with Weights & Biases
import wandb

In [3]:
# Set parameters for WandB (Weights & Biases) integration
wandb_project = "lmd_musicgen"
entity = "musicgen"
data_processed = "lmd_processed"

## Download Dataset from Hugging Face

In [4]:
# Import necessary library or module for loading datasets
from datasets import load_dataset

# Load the dataset named "juancopi81/mmm_track_lmd_8bars_nots" with the "train" split
ds = load_dataset("juancopi81/mmm_track_lmd_8bars_nots", split="train")

# Split the loaded dataset into training and testing sets
# Set test_size to 0.1, indicating that 10% of the data will be used for testing
# Shuffle the data to ensure randomness in the selection of training and testing samples
raw_datasets = ds.train_test_split(test_size=0.1, shuffle=True)

# Display the resulting raw datasets, which now consist of training and testing sets
raw_datasets


Found cached dataset parquet (C:/Users/naomi/.cache/huggingface/datasets/juancopi81___parquet/juancopi81--mmm_track_lmd_8bars_nots-449984f4d6dfe7ce/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 159810
    })
    test: Dataset({
        features: ['text'],
        num_rows: 17757
    })
})

## Train the tokenizer

Let's start by seeing how the default GPT-2 tokenizer works on our dataset

In [5]:
# Extract the "text" field from the "train" split of the raw_datasets

# Select the 11th sample (index 10 since indexing starts at 0) from the "train" split
sample_10 = raw_datasets["train"]["text"][10]

# Extract a substring from the selected sample, taking the first 242 characters
sample = sample_10[:242]

# Display the resulting substring (sample)
sample


'PIECE_START  GENRE=OTHER TRACK_START INST=0 DENSITY=0 BAR_START BAR_END BAR_START BAR_END BAR_START BAR_END BAR_START BAR_END BAR_START BAR_END BAR_START BAR_END BAR_START BAR_END BAR_START BAR_END TRACK_END TRACK_START INST=43 DENSITY=0 BAR_'

In [6]:
# Load the pre-trained GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Print the tokens obtained by tokenizing the 'sample' using the GPT-2 tokenizer
# The 'sample' is a text string, and the .tokens() method returns the list of tokens
print(tokenizer(sample).tokens())


['PI', 'EC', 'E', '_', 'ST', 'ART', 'Ġ', 'ĠGEN', 'RE', '=', 'OTHER', 'ĠTR', 'ACK', '_', 'ST', 'ART', 'ĠINST', '=', '0', 'ĠD', 'ENS', 'ITY', '=', '0', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠBAR', '_', 'ST', 'ART', 'ĠBAR', '_', 'END', 'ĠTR', 'ACK', '_', 'END', 'ĠTR', 'ACK', '_', 'ST', 'ART', 'ĠINST', '=', '43', 'ĠD', 'ENS', 'ITY', '=', '0', 'ĠBAR', '_']


Not the best since it's not familiar with this English vocabulary, the tokenizer is using quite a few subwords. The tokenizer process involves four steps: 

1. Normalization, 
2. Pretokenization, 
3. Applying the tokenizer model, and 
4. Postprocessing. 

Let's break down each step in our example.

### 1. Normalization

In the normalization step, we perform *"general cleanup, such as eliminating unnecessary whitespace, converting to lowercase, and/or removing accents..."* as per the HF course.

Since our vocabulary is already normalized, there's no requirement to eliminate whitespace, convert to lowercase, or perform any additional cleanup. This step can be bypassed.

### 2. Pretokenization

*"As outlined in the upcoming sections, training a tokenizer solely on raw text is not feasible. The initial step involves breaking down the texts into smaller units, such as words. This is where the pre-tokenization step becomes crucial. As demonstrated in Chapter 2, a word-based tokenizer can easily segment raw text into words based on whitespace and punctuation."* HF course.

In our scenario, this step is straightforward; our pretokenization aims to segment our text into "words" since our dataset is already a sequence of tokens. Therefore, a Whitespace pre_tokenizer would be suitable here. Once again, the chosen model is "WordLevel".

In [7]:
# Initialize a new tokenizer with the WordLevel model
# The 'unk_token="[UNK]"' parameter sets the token to be used for unknown or out-of-vocabulary words
new_tokenizer = Tokenizer(model=WordLevel(unk_token="[UNK]"))

In [8]:
# Assign the WhitespaceSplit pre_tokenizer to the 'pre_tokenizer' attribute of the new_tokenizer
new_tokenizer.pre_tokenizer = WhitespaceSplit()

In [9]:
# Let's test our pre_tokenizer
new_tokenizer.pre_tokenizer.pre_tokenize_str(sample)

[('PIECE_START', (0, 11)),
 ('GENRE=OTHER', (13, 24)),
 ('TRACK_START', (25, 36)),
 ('INST=0', (37, 43)),
 ('DENSITY=0', (44, 53)),
 ('BAR_START', (54, 63)),
 ('BAR_END', (64, 71)),
 ('BAR_START', (72, 81)),
 ('BAR_END', (82, 89)),
 ('BAR_START', (90, 99)),
 ('BAR_END', (100, 107)),
 ('BAR_START', (108, 117)),
 ('BAR_END', (118, 125)),
 ('BAR_START', (126, 135)),
 ('BAR_END', (136, 143)),
 ('BAR_START', (144, 153)),
 ('BAR_END', (154, 161)),
 ('BAR_START', (162, 171)),
 ('BAR_END', (172, 179)),
 ('BAR_START', (180, 189)),
 ('BAR_END', (190, 197)),
 ('TRACK_END', (198, 207)),
 ('TRACK_START', (208, 219)),
 ('INST=43', (220, 227)),
 ('DENSITY=0', (228, 237)),
 ('BAR_', (238, 242))]

### 3. Tokenizer model training

In [10]:
# This function will yield the samples to train our tokenizer
# Define a function named 'get_training_corpus' to generate training data in chunks
def get_training_corpus():
  # Access the "train" split of the raw_datasets
  dataset = raw_datasets["train"]
  
  # Iterate through the dataset in chunks of 1000 samples
  for i in range(0, len(dataset), 1000):
    # Yield the "text" field of each chunk, creating a generator for training corpus
    yield dataset[i : i + 1000]["text"]

In [11]:
# Initialize a WordLevelTrainer for training a WordLevel tokenizer
# The trainer is configured with special tokens including "[UNK]", "[CLS]", "[SEP]", "[PAD]", and "[MASK]"
trainer = WordLevelTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

**[UNK] (Unknown Token):**
Represents unknown or out-of-vocabulary words. When the tokenizer encounters a word not present in its vocabulary, it replaces the unknown word with "[UNK]" during tokenization.Handles words that are not part of the model's training vocabulary, ensuring that even unseen words can be represented.<br>
**[CLS] (Classification Token):**
Represents the beginning of a sequence or a classification task. It is often used in conjunction with sequence classification tasks or sentence-level embeddings. Indicates the start of a sequence, helping models understand the structure of input data and facilitating tasks like sentence classification.<br>
**[SEP] (Separator Token)**
Represents the separation between two segments in a sequence. Commonly used to separate sentences or segments in tasks like question-answering or text generation. Helps the model distinguish between different parts of the input sequence, guiding the model to understand relationships between segments.<br>
**[PAD] (Padding Token)**
Represents padding in sequences. It is used to make sequences of variable lengths equal in size by adding padding tokens to shorter sequences. Ensures that input sequences have consistent lengths, allowing for efficient batch processing during training and inference.<br>
**[MASK] (Mask Token)**
Used in masked language modeling tasks. During training, some words are replaced with "[MASK]" tokens, and the model is tasked with predicting the original words based on the context. Supports the pre-training of language models by training the model to predict missing or masked words, enhancing its ability to understand context and relationships between words.<br>

In [12]:
# Train the 'new_tokenizer' using the training data generated by the 'get_training_corpus' function
# The training is performed with the specified 'trainer', which is a WordLevelTrainer
new_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

### 4. Post processing and save it to the hub

In [13]:
# Save the trained 'new_tokenizer' to a file named "tokenizer.json"
new_tokenizer.save("tokenizer.json")

# Load the saved tokenizer from "tokenizer.json" and create a new instance of PreTrainedTokenizerFast
new_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")

# Add a special token, '[PAD]', to the loaded tokenizer
new_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# We will receive '0' as output if saving was successful with no errors

0

You must create an 'Access Token' in hugging face with the same name as the new tokenizer to run the below. i.e. pop909_tokenizer.

In [14]:
# Log in to the Hugging Face Model Hub 
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
# Push the trained 'new_tokenizer' to the Hugging Face Model Hub 
new_tokenizer.push_to_hub("aimusicgen/lmd_tokenizer")

CommitInfo(commit_url='https://huggingface.co/aimusicgen/lmd_tokenizer/commit/fce7dab92a08aa2925bdfa46c94f8dadbd78e141', commit_message='Upload tokenizer', commit_description='', oid='fce7dab92a08aa2925bdfa46c94f8dadbd78e141', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
# Load a pre-trained tokenizer using the AutoTokenizer class from the Hugging Face Model Hub
# The tokenizer is loaded from the repository named "aimusicgen/lmd_tokenizer"
load_tokenizer = AutoTokenizer.from_pretrained("aimusicgen/lmd_tokenizer")

Downloading tokenizer_config.json:   0%|          | 0.00/146 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/599k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

In [30]:
# Tokenize the 'sample' text using the pre-trained 'load_tokenizer'
# The 'sample' text is expected to be processed into a sequence of tokens
load_tokenizer(sample)

{'input_ids': [194, 780, 41, 173, 95, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 40, 41, 1779, 95, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

What we can see here is the tokenized text in the form of dictionaries.<br>

**input_ids:** Represents the tokenized input sequence where each number corresponds to a specific token.Each number corresponds to a token in the vocabulary.<br>

**token_type_ids:** Represents the segment or sentence to which each token belongs. All tokens in the sequence belong to the same segment or sentence (segment 0). <br>

**attention_mask:** Represents the attention mask indicating which tokens should be attended to (have a value of 1) and which should be ignored (have a value of 0). All tokens in the sequence are attended to, indicating that none should be ignored.

In [31]:
# Tokenize the 'sample' text using the pre-trained 'load_tokenizer'
# Retrieve and display the list of tokens obtained from the tokenization process
load_tokenizer(sample).tokens()

['PIECE_START',
 'GENRE=OTHER',
 'TRACK_START',
 'INST=0',
 'DENSITY=0',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'BAR_START',
 'BAR_END',
 'TRACK_END',
 'TRACK_START',
 'INST=43',
 'DENSITY=0',
 '[UNK]']

In [32]:
# Load a pre-trained tokenizer using the AutoTokenizer class from the Hugging Face Model Hub
# The tokenizer is loaded from the repository named "aimusicgen/lmd_tokenizer"
tokenizer = AutoTokenizer.from_pretrained("aimusicgen/lmd_tokenizer")

# Tokenize the 'sample' text using the loaded tokenizer
# Print and display the list of tokens obtained from the tokenization process
print(tokenizer(sample).tokens())

['PIECE_START', 'GENRE=OTHER', 'TRACK_START', 'INST=0', 'DENSITY=0', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'BAR_START', 'BAR_END', 'TRACK_END', 'TRACK_START', 'INST=43', 'DENSITY=0', '[UNK]']


In [33]:
# Obtain the vocabulary of the tokenizer using the 'get_vocab' method
vocab = tokenizer.get_vocab()

# Display and store the obtained vocabulary
vocab

{'TIME_DELTA=6.135416666666667': 5536,
 'TIME_DELTA=14.864583333333332': 11884,
 'TIME_DELTA=3.3125000000000004': 4853,
 'TIME_DELTA=3.0999999999999988': 13364,
 'TIME_DELTA=0.18749999999999956': 2013,
 'TIME_DELTA=0.5312500000000018': 4786,
 'TIME_DELTA=0.2777777777777781': 9629,
 'TIME_DELTA=4.137931034482759': 13461,
 'TIME_DELTA=0.031249999999998224': 3006,
 'TIME_DELTA=8.583333333333334': 2229,
 'TIME_DELTA=0.25925925925925924': 10412,
 'TIME_DELTA=2.816666666666667': 3153,
 'TIME_DELTA=4.031249999999999': 8485,
 'TIME_DELTA=8.591666666666667': 8425,
 'TIME_DELTA=3.302083333333334': 3955,
 'TIME_DELTA=10.366666666666667': 4558,
 'TIME_DELTA=1.6562499999999996': 7024,
 'TIME_DELTA=3.6916666666666664': 3988,
 'TIME_DELTA=0.36666666666666675': 12510,
 'TIME_DELTA=1.9666666666666686': 5257,
 'TIME_DELTA=3.5083333333333337': 7046,
 'TIME_DELTA=21.925': 13293,
 'TIME_DELTA=0.9375000000000009': 2264,
 'TIME_DELTA=14.697916666666666': 6921,
 'TIME_DELTA=6.9333333333333345': 13669,
 'TIME_

The vocabulary of the tokenizer is the equalivant as a metadata dictionary for the tokenizer we created. It maps tokens to numerical indices, providing information about the unique elements (tokens) present in the dataset.It outlines the structure and organization of the tokenization process, detailing how words or subwords are represented by numerical indices.

Create a dataframe with the tokenizer's vocabulary to upload it to weights & bias.

In [34]:
# Create a DataFrame 'df' using a list comprehension, where each row contains a token and its corresponding index from
# the tokenizer's vocabulary
df = pd.DataFrame([{"Token": token, "Index": idx} for token, idx in vocab.items()]).sort_values(by="Index")

# Display the DataFrame 'df', which presents the tokens and their associated indices in the tokenizer's vocabulary
df

Unnamed: 0,Token,Index
12314,[UNK],0
2918,[CLS],1
10915,[SEP],2
12469,[PAD],3
6134,[MASK],4
...,...,...
4743,TIME_DELTA=9.925000000000002,13853
1343,TIME_DELTA=9.933333333333332,13854
12177,TIME_DELTA=9.974999999999998,13855
11076,TIME_DELTA=9.975000000000001,13856


### Upload vocab to W&B

In [35]:
# Initialize a Weights & Biases (wandb) run with project name is set to "lmd_pretokenization", 
# and the job type is set to "upload"
run = wandb.init(project=wandb_project, job_type="upload")

In [36]:
# Create table with vocab
vocab_table = wandb.Table(data=df)

"If your framework uses or produces models or datasets, you can log them for full traceability and have wandb automatically monitor your entire pipeline through W&B Artifacts." - https://docs.wandb.ai/guides/integrations/add-wandb-to-any-library

In [37]:
# Create artifact for raw data
processed_data_at = wandb.Artifact(name=data_processed, type="processed_data")

In [38]:
# Add 'vocab_table' to 'processed_data_at' artifact with the name "vocab_table"
processed_data_at.add(vocab_table, name="vocab_table")

ArtifactManifestEntry(path='vocab_table.table.json', digest='41KqUPBY4eP+tHqAGdZPrw==', size=542886, local_path='C:\\Users\\naomi\\AppData\\Local\\wandb\\wandb\\artifacts\\staging\\tmpiptf_in0')

In [39]:
# Log the 'processed_data_at' artifact to the Weights & Biases run
run.log_artifact(processed_data_at)

<Artifact lmd_processed>

In [40]:
# Complete and finish the Weights & Biases run
run.finish()

VBox(children=(Label(value='0.001 MB of 0.026 MB uploaded\r'), FloatProgress(value=0.04424456501948217, max=1.…