In [1]:
import os
import sys
import torch
import torch.utils.data as data
import miditok
from miditok import MIDILike
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from pathlib import Path
import glob

sys.path.append('MusicTransformer-Pytorch')

In [2]:
dataset_path = Path('maestro-v3.0.0').resolve()
files_paths = [file_path.resolve() for file_path in dataset_path.glob("**/*.midi")]
# print(files_paths)

In [3]:
tokenizer = MIDILike()
dataset_chunks_dir = Path('/storage/vsub851/neural-architecture-search/music-gen/maestro-v3.0.0').resolve()
print(dataset_chunks_dir)

/storage/vsub851/neural-architecture-search/music-gen/maestro-v3.0.0


In [4]:
split_files_for_training(
    files_paths=files_paths,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=1024,
)
dataset = DatasetMIDI(
    files_paths=files_paths,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer['BOS_None'],
    eos_token_id=tokenizer['EOS_None'],
)

Splitting music files (/storage/vsub851/neural-architecture-search/music-gen/maestro-v3.0.0): 100%|██████████████████████| 118492/118492 [02:00<00:00, 984.21it/s]


In [5]:
collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True)
dataloader = data.DataLoader(dataset, batch_size=64, collate_fn=collator)

In [6]:
for batch in dataloader:
    print(batch)
    break

{'input_ids': tensor([[  1, 212,  66,  ...,  65, 198,   0],
        [  1, 212,  69,  ..., 108,  32,   0],
        [  1,  16, 198,  ...,   0,   0,   0],
        ...,
        [ 63, 197, 212,  ...,  51, 191,  55],
        [  1, 213,  16,  ..., 212,  52,   0],
        [  1, 231,  72,  ..., 198, 212,   0]]), 'labels': tensor([[   1,  212,   66,  ...,   65,  198, -100],
        [   1,  212,   69,  ...,  108,   32, -100],
        [   1,   16,  198,  ..., -100, -100, -100],
        ...,
        [  63,  197,  212,  ...,   51,  191,   55],
        [   1,  213,   16,  ...,  212,   52, -100],
        [   1,  231,   72,  ...,  198,  212, -100]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0]], dtype=torch.int32)}


In [7]:
from model.music_transformer import MusicTransformer




In [8]:
MusicTransformer

model.music_transformer.MusicTransformer

In [9]:
model = MusicTransformer(n_layers=12, num_heads=12,
            d_model=768, dim_feedforward=2048, dropout=0.1,
            max_sequence=2048, rpr=True)

In [10]:
model

MusicTransformer(
  (dummy): DummyDecoder()
  (embedding): Embedding(390, 768)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoderRPR(
      (layers): ModuleList(
        (0-11): 12 x TransformerEncoderLayerRPR(
          (self_attn): MultiheadAttentionRPR(
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (linear1): Linear(in_features=768, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=768, bias=True)
          (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    

In [11]:
for batch in dataloader:
    print(batch)
    print(model(batch['input_ids']))
    break

{'input_ids': tensor([[  1, 212,  66,  ...,  65, 198,   0],
        [  1, 212,  69,  ..., 108,  32,   0],
        [  1,  16, 198,  ...,   0,   0,   0],
        ...,
        [ 63, 197, 212,  ...,  51, 191,  55],
        [  1, 213,  16,  ..., 212,  52,   0],
        [  1, 231,  72,  ..., 198, 212,   0]]), 'labels': tensor([[   1,  212,   66,  ...,   65,  198, -100],
        [   1,  212,   69,  ...,  108,   32, -100],
        [   1,   16,  198,  ..., -100, -100, -100],
        ...,
        [  63,  197,  212,  ...,   51,  191,   55],
        [   1,  213,   16,  ...,  212,   52, -100],
        [   1,  231,   72,  ...,  198,  212, -100]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0]], dtype=torch.int32)}
tensor([[[ 0.0358, -0.1839, -0.5190,  ...,  0.2532,  0.0804,  0.0048],
         [-0.5835,  0.5661, -0.