# Piano Playalong Generation from MIDI

## Description

Add Description

## Tokenization

We use the MIDITok Tokenizer to create Tokens from our MIDI files:

https://miditok.readthedocs.io/




### Imports

In [32]:
import numpy as np
import pandas as pd
import os
import json
from pathlib import Path
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset, ConcatDataset, DataLoader, random_split
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import lightning as L
import partitura as pt

from miditok import Structured, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator, split_midis_for_training


### Create Tokenizer
Using the "Structured"-Tokenizer from MidiTok

In [7]:
# parameters
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "num_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": False,
    "use_rests": False,
    "use_tempos": False,
    "use_time_signatures": False,
    "use_programs": False,
    "num_tempos": 32,  # number of tempo bins
    "tempo_range": (40, 200),  # (min, max)
}
# Set to True if you want to use BPE
USE_BPE = False

config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = Structured(config)


### Load data and tokenize
Here you can filter out Midi-files you want to exclude (e.g. Tuning Tracks - Use lookup.json file to inspect data)

In [8]:
midi_paths = list(Path("data").glob("**/*.mid"))
lookup_path = Path("data", "lookup.json")

# files to exclude:
idx_del = []
lookup = json.loads(lookup_path.read_text())
midi_paths_cleaned = [] 
# files containing this string will be excluded
lookup_str = "tuning"
for key, title in lookup.items():
    if lookup_str in title.lower():
        idx_del.append(key)
    # manually specify keys to exclude 
    elif str(key) in ["0021","0361","0362"]:   # verbal instructions and whole cds that couldnt be converted to midi (too long)
        idx_del.append(key)
    else:
        midi_paths_cleaned.append(Path("data", str(key)+".mid"))

print(f"Loaded {len(midi_paths_cleaned)} valid files, {len(idx_del)} invalid files excluded.")


Loaded 1260 valid files, 55 invalid files excluded.


In [9]:

tokenizer_path = Path("data", "tokenizer", "tokenizer.json")

# load tokenizer if it already exists
if os.path.exists(tokenizer_path):
    tokenizer = Structured(params=tokenizer_path)

else:

    
    # Builds the vocabulary with BPE
    if USE_BPE:
        print(f"Learning BPE...")
        tokenizer.learn_bpe(vocab_size=30000, files_paths=midi_paths_cleaned)
        print(f"Saving tokenizer with BPE to {tokenizer_path}")
        tokenizer.save_params(tokenizer_path)
        print("Finished.")
    # Saves tokenizer without BPE
    else:
        print(f"Saving tokenizer to {tokenizer_path}")
        tokenizer.save_params(tokenizer_path)
        print("Done.")


In [10]:
midi_paths_cleaned[0]

PosixPath('data/0001.mid')

In [11]:
from partitura import load_performance_midi
score = load_performance_midi(midi_paths_cleaned[0])
# Get the duration of the MIDI file in seconds

print("Number of notes in the MIDI file:", len(score.note_array()))
score.note_array()[-1]

Number of notes in the MIDI file: 2188


(219.03645, 3.1432292, 168220, 2414, 71, 28, 0, 0, 'n2187')

In [12]:
midi = tokenizer(midi_paths_cleaned[0])
len(midi[0])

8752

### Split MIDIs into subsequences

In [23]:
# Split MIDIs into smaller chunks for training
MAX_SEQUENCE_LENGTH = 128
dataset_chunks_dir = Path("data", "midi_chunks")

if not os.path.exists(dataset_chunks_dir) or not os.listdir(dataset_chunks_dir):
    midi_paths_chunks = split_midis_for_training(
        files_paths=midi_paths_cleaned,
        tokenizer=tokenizer,
        save_dir=dataset_chunks_dir,
        max_seq_len=MAX_SEQUENCE_LENGTH,
    )
else: 
    midi_paths_chunks = sorted([Path(p) for p in dataset_chunks_dir.iterdir() if p.is_file()])[1:] #first object is some hidden file

In [24]:
print(f"Total number of files after splitting into chunks: ",len(os.listdir(dataset_chunks_dir)))
midi_paths_chunks[0:10]

Total number of files after splitting into chunks:  148667


[PosixPath('data/midi_chunks/0001_0.mid'),
 PosixPath('data/midi_chunks/0001_1.mid'),
 PosixPath('data/midi_chunks/0001_10.mid'),
 PosixPath('data/midi_chunks/0001_11.mid'),
 PosixPath('data/midi_chunks/0001_12.mid'),
 PosixPath('data/midi_chunks/0001_13.mid'),
 PosixPath('data/midi_chunks/0001_14.mid'),
 PosixPath('data/midi_chunks/0001_15.mid'),
 PosixPath('data/midi_chunks/0001_16.mid'),
 PosixPath('data/midi_chunks/0001_17.mid')]

### Dataloading and Collator

In [36]:
# Load midi chunks into dataset
dataset = DatasetMIDI(
    files_paths=midi_paths_chunks,
    max_seq_len=MAX_SEQUENCE_LENGTH,
    tokenizer=tokenizer,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"]
)
dataset_train, dataset_val = random_split(dataset, [0.8, 0.2])

# set PAD, BOS, EOS for collator - add attention masks
# and pads (on left for generation - models usually cant generate from padding tokens)
## TODO : does this make sense?
collator = DataCollator(
    tokenizer["PAD_None"], pad_on_left=True, )

# Set up dataloader
data_loader_train = DataLoader(dataset=dataset_train, batch_size=64, collate_fn=collator,shuffle=True)
data_loader_val = DataLoader(dataset=dataset_val, batch_size=64, collate_fn=collator,shuffle=False)
print(f"Dataloader created.")
print(f"N samples in train/val : {len(data_loader_train)} / {len(data_loader_val)}")

Dataloader created.
N samples in train/val : 1859 / 465


In [37]:
# Inspect elements in batch
first_batch = next(iter(data_loader_train))
first_batch


{'input_ids': tensor([[193,  38, 112,  ...,  56, 108, 125],
         [189,  43, 113,  ...,  52, 111, 126],
         [192,  49, 103,  ...,  50, 108, 128],
         ...,
         [190,  61, 106,  ...,  42, 107, 125],
         [192,  50, 108,  ...,  52, 110, 124],
         [198,  62, 109,  ...,  59, 105, 129]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}

## Training the model

### Transformer model class

In [38]:
import pytorch_lightning as pl
from torch.nn import functional as F
from transformers import GPT2LMHeadModel, AutoConfig, Trainer, TrainingArguments
from transformers.optimization import AdamW 

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


#for i, batch in enumerate(tqdm(data_loader)):
#    pass
    #print(f"Training model on batch {i}...")


Seed set to 42


Device: cpu


In [30]:
# load config and setup untrained model of gpt2
config = AutoConfig.from_pretrained("gpt2",vocab_size = len(tokenizer),
                                    n_ctx=MAX_SEQUENCE_LENGTH, 
                                    bos_token_id = tokenizer["BOS_None"], 
                                    eos_token_id = tokenizer["EOS_None"],
                                    )

model = GPT2LMHeadModel(config)
"Number of Parameters in model:", sum(p.numel() for p in model.parameters()),model.config



('Number of Parameters in model:',
 86083584,
 GPT2Config {
   "_name_or_path": "gpt2",
   "activation_function": "gelu_new",
   "architectures": [
     "GPT2LMHeadModel"
   ],
   "attn_pdrop": 0.1,
   "bos_token_id": 1,
   "embd_pdrop": 0.1,
   "eos_token_id": 2,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
   "model_type": "gpt2",
   "n_ctx": 128,
   "n_embd": 768,
   "n_head": 12,
   "n_inner": null,
   "n_layer": 12,
   "n_positions": 1024,
   "reorder_and_upcast_attn": false,
   "resid_pdrop": 0.1,
   "scale_attn_by_inverse_layer_idx": false,
   "scale_attn_weights": true,
   "summary_activation": null,
   "summary_first_dropout": 0.1,
   "summary_proj_to_labels": true,
   "summary_type": "cls_index",
   "summary_use_proj": true,
   "task_specific_params": {
     "text-generation": {
       "do_sample": true,
       "max_length": 50
     }
   },
   "transformers_version": "4.39.3",
   "use_cache": true,
   "vocab_size": 314
 })

In [31]:
training_args = TrainingArguments(
    output_dir = "./model/gpt-2",
    evaluation_strategy = "epoch",
    auto_find_batch_size=True,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,
    logging_steps=10,
    )
trainer = Trainer(
    model=model, 
    tokenizer= tokenizer, 
    args = training_args,
    data_collator=collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,    
)

# Train the model
trainer.train()

SyntaxError: invalid syntax (3318070914.py, line 1)

In [14]:


class BartModel(pl.LightningModule):
    def __init__(self, learning_rate=1e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = BartForConditionalGeneration(BartConfig())

    def forward(self, x):
        return self.model(x).logits

    def training_step(self, batch, batch_idx):
        
        loss = self.model(**batch).loss
        
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.model(**batch).loss
        self.log('valid_loss', loss, on_step=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(),
                          self.hparams.learning_rate)
        return optimizer



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `BartModel`

In [None]:
next(iter(data_loader))

TypeError: 'NoneType' object is not subscriptable