In [None]:
# Download the POP909 dataset repository into the current working directory.
!git clone https://github.com/music-x-lab/POP909-Dataset.git
# NOTE(review): `!cd` runs in a throwaway subshell, so it does not change the
# working directory seen by later cells; `%cd POP909-Dataset` would, if that
# is what is intended.
!cd POP909-Dataset

In [1]:
from symusic import Score
from miditok import REMI
import os

# Module-level REMI tokenizer created with its default configuration; it is
# called by extract_tokens_from_midi to turn a loaded score into tokens.
tokenizer = REMI()

def extract_tokens_from_midi(file_path: str):
    """Tokenize a single MIDI file with the module-level ``tokenizer``.

    Args:
        file_path: Path of the MIDI file to load.

    Returns:
        The ``tokens`` attribute of the first sequence the tokenizer produced,
        or an empty list when the tokenizer output is empty.
    """
    per_track = tokenizer(Score.from_file(file_path))

    # Only the first entry is used; any further sequences are ignored.
    return per_track[0].tokens if per_track else []


def collect_pop909_tokens(base_dir: str = r"C:\\Documents\\CompSci\\CSE153\\assignment2\\POP909-Dataset\\POP909"):
    """Walk the POP909 folder structure and return a flat list of REMI tokens for all MIDI files.

    Sub-folders named ``001`` .. ``909`` directly under ``base_dir`` are visited
    in numeric order; folders that do not exist are skipped. Inside each folder
    the ``.mid`` / ``.midi`` files are processed in sorted filename order, so the
    resulting token stream is identical on every run and platform
    (``os.listdir`` on its own returns entries in arbitrary order).

    Files that fail to tokenize are reported with a ``[WARN]`` line and skipped
    so that one bad file does not abort the whole walk.

    Args:
        base_dir: Root directory of the POP909 dataset.
            NOTE(review): the default is a machine-specific absolute path;
            pass the dataset location explicitly when running elsewhere.

    Returns:
        list[str]: A flat list of all REMI tokens from the dataset.
    """
    all_tokens: list[str] = []

    for folder_id in range(1, 910):
        folder_path = os.path.join(base_dir, f"{folder_id:03d}")

        if not os.path.isdir(folder_path):
            continue

        # Sort for a deterministic order: the flat token list feeds both the
        # training windows and the generation seed.
        for fname in sorted(os.listdir(folder_path)):
            if not fname.lower().endswith((".mid", ".midi")):
                continue

            fpath = os.path.join(folder_path, fname)
            try:
                all_tokens.extend(extract_tokens_from_midi(fpath))
            except Exception as exc:
                # Best effort: report the failing file and keep going.
                print(f"[WARN] Failed to tokenize {fpath}: {exc}")

    return all_tokens

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenize the whole dataset (default base_dir) into one flat list of token strings.
token_sequence = collect_pop909_tokens()
# Preview only the first 100 tokens to keep the cell output short.
print(token_sequence[:100])

['Bar_None', 'Bar_None', 'Bar_None', 'Bar_None', 'Bar_None', 'Position_25', 'Pitch_61', 'Velocity_115', 'Duration_0.1.8', 'Position_27', 'Pitch_63', 'Velocity_111', 'Duration_0.1.8', 'Position_29', 'Pitch_66', 'Velocity_119', 'Duration_0.1.8', 'Position_31', 'Pitch_68', 'Velocity_111', 'Duration_0.1.8', 'Bar_None', 'Position_1', 'Pitch_70', 'Velocity_111', 'Duration_0.1.8', 'Position_5', 'Pitch_66', 'Velocity_111', 'Duration_0.2.8', 'Position_9', 'Pitch_63', 'Velocity_119', 'Duration_0.3.8', 'Position_13', 'Pitch_68', 'Velocity_115', 'Duration_1.4.8', 'Bar_None', 'Position_1', 'Pitch_68', 'Velocity_111', 'Duration_0.3.8', 'Position_5', 'Pitch_65', 'Velocity_115', 'Duration_0.2.8', 'Position_9', 'Pitch_61', 'Velocity_115', 'Duration_0.2.8', 'Position_13', 'Pitch_66', 'Velocity_119', 'Duration_1.1.8', 'Position_25', 'Pitch_61', 'Velocity_111', 'Duration_0.1.8', 'Position_27', 'Pitch_63', 'Velocity_111', 'Duration_0.1.8', 'Position_29', 'Pitch_66', 'Velocity_115', 'Duration_0.1.8', 'Posit

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Seed Python, NumPy and TensorFlow so weight initialisation, batch shuffling
# and later sampling are repeatable on a fresh "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# 1. Build vocab
# Sorting gives every token a stable id regardless of set iteration order.
unique_tokens = sorted(set(token_sequence))
token_to_id = {tok: i for i, tok in enumerate(unique_tokens)}
id_to_token = {i: tok for tok, i in token_to_id.items()}

# 2. Encode tokens to ids
encoded_sequence = [token_to_id[tok] for tok in token_sequence]

seq_length = 20  # how many tokens in input sequence

# Fail early with a clear message instead of an obscure shape error in fit().
if len(encoded_sequence) <= seq_length:
    raise ValueError(
        f"Need more than {seq_length} tokens to build training windows, "
        f"got {len(encoded_sequence)}."
    )

# 3. Build (input, target) pairs with a stride-1 sliding window.
# Each window holds seq_length + 1 ids: the first seq_length are the input and
# the same window shifted by one position is the per-step target. Doing this
# in NumPy avoids materialising millions of small Python lists.
encoded_array = np.asarray(encoded_sequence, dtype=np.int32)
windows = np.lib.stride_tricks.sliding_window_view(encoded_array, seq_length + 1)
inputs = np.ascontiguousarray(windows[:, :-1])
targets = np.ascontiguousarray(windows[:, 1:])

vocab_size = len(unique_tokens)
embedding_dim = 64
rnn_units = 128

# 4. Next-token model: embedding -> LSTM -> per-step distribution over the vocab.
# `input_length` is no longer passed to Embedding: it is deprecated in recent
# Keras releases and the sequence length is inferred from the data.
model = tf.keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim),
    layers.LSTM(rnn_units, return_sequences=True),
    # Softmax output (probabilities, not logits) is what generate_tokens expects.
    layers.Dense(vocab_size, activation='softmax')
])

# Targets are integer ids, hence the sparse variant of cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.fit(inputs, targets, epochs=5, batch_size=64)



Epoch 1/5
20435/20435 ━━━━━━━━━━━━━━━━━━━━ 381s 19ms/step - loss: 2.2089
Epoch 2/5
14729/20435 ━━━━━━━━━━━━━━━━━━━━ 1:56 20ms/step - loss: 1.7568

In [None]:
def generate_tokens(model, seed_sequence, gen_length=1000, temperature=1.0):
    """Autoregressively sample token ids from a trained next-token model.

    At every step the last ``seq_length`` ids (module-level setting) are passed
    to ``model.predict`` and the distribution predicted for the final position
    is used to choose the next id.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns an array that
            can be indexed as ``[0, -1]`` to obtain a 1-D vector of
            probabilities over the vocabulary.
        seed_sequence: Iterable of ids to start from; it is copied, not mutated.
        gen_length: Number of ids to append to the seed.
        temperature: Sampling temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it. ``0`` or a negative value
            selects the most likely id at every step (greedy decoding).

    Returns:
        list: The seed ids followed by ``gen_length`` generated ids, each a
        plain ``int``.
    """
    generated = list(seed_sequence)
    for _ in range(gen_length):
        input_seq = np.array(generated[-seq_length:])[None, :]  # batch size 1
        # verbose=0: otherwise a progress bar is printed for every single step.
        # float64 keeps the renormalised probabilities within the tolerance
        # that np.random.choice enforces on their sum.
        probs = np.asarray(model.predict(input_seq, verbose=0)[0, -1], dtype=np.float64)

        if temperature <= 0:
            # Dividing by a zero temperature would yield NaNs; its limit is argmax.
            next_id = int(np.argmax(probs))
        else:
            # log(0) = -inf is fine here (it maps back to probability 0), so
            # silence the divide warning instead of perturbing the distribution.
            with np.errstate(divide="ignore"):
                logits = np.log(probs) / temperature
            # Shift by the max so exp() cannot overflow at low temperatures.
            logits -= np.max(logits)
            weights = np.exp(logits)
            next_id = int(np.random.choice(len(weights), p=weights / weights.sum()))

        generated.append(next_id)
    return generated

# Start generation with the first sequence as seed
seed_seq = encoded_sequence[:seq_length]
# gen_length and temperature are left at the defaults of generate_tokens.
generated_ids = generate_tokens(model, seed_seq)

# Map ids back to token strings using the vocabulary built before training.
generated_tokens = [id_to_token[i] for i in generated_ids]
print(generated_tokens)

In [None]:
# Show the generated token strings before converting them.
print(generated_tokens)

# The tokens are wrapped in an outer list before being handed to the tokenizer.
# NOTE(review): presumably one inner list per track — confirm against the
# miditok version in use.
score = tokenizer([generated_tokens])  # this is equivalent to tokenizer.decode()

# Save to MIDI
score.dump_midi("generated_piece.mid")