# Piano Playalong Generation from MIDI

## Description

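This notebook prepares piano MIDI data for training a play-along generation model: it tokenizes the MIDI files with MidiTok, learns a BPE vocabulary, and wraps the tokenized files in a PyTorch `Dataset`/`DataLoader`. The training loop at the end is still a placeholder.

**TODO:** extend this description once the model and training procedure are in place.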

## Tokenization

We use the MidiTok library to convert our MIDI files into token sequences:

https://miditok.readthedocs.io/




### Imports

In [2]:
import numpy as np
import pandas as pd
import os
import json
from pathlib import Path
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset, ConcatDataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer

import partitura as pt

from miditok import Structured, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator


### Create Tokenizer
We use the `Structured` tokenizer from MidiTok.

In [3]:
# Tokenizer settings, collected in one mapping so they can be unpacked into TokenizerConfig
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    beat_res={(0, 4): 8, (4, 12): 4},
    num_velocities=32,
    special_tokens=["PAD", "BOS", "EOS", "MASK"],
    # every optional token type is switched off
    use_chords=False,
    use_rests=False,
    use_tempos=False,
    use_time_signatures=False,
    use_programs=False,
    # tempo settings are still supplied although use_tempos is False
    num_tempos=32,  # number of tempo bins
    tempo_range=(40, 200),  # (min, max)
)

# Build the configuration object from the settings above
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Instantiate the tokenizer from that configuration
tokenizer = Structured(config)


### Load data and tokenize
Here you can filter out MIDI files that should be excluded from training (e.g. tuning tracks). Use the `lookup.json` file to inspect which title belongs to which file.

In [4]:
# All MIDI files found under data/ (kept for inspection; the filtered list below is what gets used)
midi_paths = list(Path("data").glob("**/*.mid"))
lookup_path = Path("data", "lookup.json")
tokenizer_path = Path("data", "tokenizer", "tokenizer.json")

# --- Filter the file list -------------------------------------------------
# This runs unconditionally: the later cells need `midi_paths_cleaned` even
# when the tokenizer is loaded from disk. Previously the list was only built
# in the "train a new tokenizer" branch, so a fresh kernel with an existing
# tokenizer.json failed with a NameError further down.

# files whose title contains this string will be excluded
lookup_str = "tuning"
# manually excluded keys: verbal instructions and whole CDs that
# could not be converted to MIDI (too long)
manual_exclude = {"0021", "0361", "0362"}

# lookup.json is iterated as key -> title pairs; keys name the files as data/<key>.mid
lookup = json.loads(lookup_path.read_text())

idx_del = []             # keys of excluded files
midi_paths_cleaned = []  # paths of the files that are kept
for key, title in lookup.items():
    if lookup_str in title.lower() or str(key) in manual_exclude:
        idx_del.append(key)
    else:
        midi_paths_cleaned.append(Path("data", str(key) + ".mid"))

print(f"Excluded {len(idx_del)} files, {len(midi_paths_cleaned)} remaining.")

# --- Load or train the tokenizer ------------------------------------------
if tokenizer_path.exists():
    # Reuse the saved tokenizer. NOTE: it is loaded as-is, so delete the file
    # to retrain after changing TOKENIZER_PARAMS or the exclusion rules.
    tokenizer = Structured(params=tokenizer_path)
else:
    # Builds the vocabulary with BPE
    print("Learning BPE...")
    tokenizer.learn_bpe(vocab_size=30000, files_paths=midi_paths_cleaned)
    print(f"Saving tokenizer to {tokenizer_path}")
    # make sure the target directory exists before saving (no-op if it already does)
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save_params(tokenizer_path)
    print("Finished.")


Excluded 55 files, 1260 remaining.
Learning BPE...



Saving tokenizer to data/tokenizer/tokenizer.json
Finished.


In [5]:
# Sanity check: show the first path that survived the filtering step
midi_paths_cleaned[0]

PosixPath('data/0001.mid')

In [9]:
from partitura import load_performance_midi

# Load the first cleaned file; the recorded output shows this is a partitura Performance object
first_path = midi_paths_cleaned[0]
score = load_performance_midi(first_path)

# Fetch the note array once and reuse it for both the count and the last-note preview
notes = score.note_array()
print("Number of notes in the MIDI file:", len(notes))

# Show the final note record (presumably its onset gives a rough idea of the piece length; verify field order)
notes[-1]

<partitura.performance.Performance object at 0x7c685a302810>
Number of notes in the MIDI file: 2188


(219.03645, 3.1432292, 168220, 2414, 71, 28, 0, 0, 'n2187')

In [10]:
# Tokenize a single file to check that the tokenizer works end to end
sample_path = midi_paths_cleaned[0]
midi = tokenizer(sample_path)

# Length of the first element of the result (presumably the first track's token sequence; TODO confirm)
len(midi[0])

2559

### Dataloader and Collator

In [4]:
# Sequence-length bounds handed to the dataset
MIN_SEQ_LEN = 100
MAX_SEQ_LEN = 1024

print("Tokenizing")

# Dataset over the filtered MIDI files, tokenized with the tokenizer prepared above
dataset = DatasetMIDI(
    files_paths=midi_paths_cleaned,
    min_seq_len=MIN_SEQ_LEN,
    max_seq_len=MAX_SEQ_LEN,
    tokenizer=tokenizer,
)

# Look up the special-token entries once, then pass them to the collator in PAD, BOS, EOS order
pad_token = tokenizer["PAD_None"]
bos_token = tokenizer["BOS_None"]
eos_token = tokenizer["EOS_None"]
collator = DataCollator(pad_token, bos_token, eos_token)

# No batch_size is given, so DataLoader uses its default
data_loader = DataLoader(dataset=dataset, collate_fn=collator)




Loading data: data: 100%|██████████| 1260/1260 [01:52<00:00, 11.16it/s]

Tokenizing





## Training the model

In [None]:


# Placeholder training loop: walks the data loader batch by batch and only
# reports the batch index; no model is updated yet
progress = tqdm(data_loader)
for step, batch in enumerate(progress):
    print(f"Training model on batch {step}...")
