In [5]:
!pip install --upgrade -q mido

In [6]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

import time
import glob
import os
import pickle

from enum import IntEnum
from collections import deque

from torch.utils.data import Dataset, DataLoader
from mido import MidiFile, MidiTrack, Message, MetaMessage

In [7]:
!nvidia-smi

Thu May 12 10:28:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
MAX_TOKENS = 4096

In [10]:
# import serializer definition
%run /content/gdrive/MyDrive/Colab\ Notebooks/serializer-single.ipynb

In [11]:
class MaestroDataset(Dataset):

    def __init__(self, ds_path: str, device: str):
        self.ds_path = ds_path
        self.length = len(glob.glob(os.path.join(self.ds_path, "*.pt")))
        self.device = device


    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if isinstance(idx, int):
          idx = [idx]

        tensors = []
        for i in idx:
          ts_path = os.path.join(self.ds_path, str(i) + ".pt")
          tsor = torch.load(ts_path, map_location=self.device)[:MAX_TOKENS+1]
          if tsor[-1] != SpecialTokens.PADDING.value:
            tsor[-1] = SpecialTokens.END.value
          tensors.append(tsor)

        return torch.cat(tensors)

In [12]:
DS_TYPE = "single" # chord or single representation
DS_PATH = "/content/gdrive/MyDrive/BP/datasets/MAESTRO-" + DS_TYPE
BATCH_SIZE = 1
with open(os.path.join(DS_PATH, "serializer.pickle"), "rb") as f:
  serializer = pickle.load(f)
train_ds = MaestroDataset(os.path.join(DS_PATH, "train"), "cuda:0")
test_ds = MaestroDataset(os.path.join(DS_PATH, "test"), "cuda:0")
valid_ds = MaestroDataset(os.path.join(DS_PATH, "validation"), "cuda:0")

In [13]:
training_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

In [14]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [15]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [16]:
vocab_size = serializer.vocab_size() # the size of vocabulary
d_model = 512 # embedding dimension
dff = 2048 # the dimension of the feedforward network model in nn.TransformerEncoder
num_layers = 6 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8 # the number of heads in the multiheadattention models
dropout = 0.1 # the dropout value
model = TransformerModel(vocab_size, d_model, nhead, dff, num_layers, dropout).to("cuda:0")

In [17]:
class CategoricalAccuracy(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input: torch.Tensor, target: torch.Tensor):
        """
        :param input: [B, T, V]
        :param target: [B, T]
        :return:
        """
        input = input.softmax(-1)
        categorical_input = input.argmax(-1)
        bool_acc = categorical_input.long() == target.long()
        return bool_acc.sum().to(torch.float) / bool_acc.numel()

In [18]:
criterion = nn.CrossEntropyLoss()
accuracy_metric = CategoricalAccuracy()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train():
    model.train() # Turn on the train mode
    vocab_size = serializer.vocab_size()
    total_loss = 0.
    total_acc = 0.
    for batch in training_loader:
        batch_x = batch[:, :-1].contiguous()
        batch_y = batch[:, 1:].contiguous()
        optimizer.zero_grad()
        output = model(batch_x)

        loss = criterion(output.view(-1, vocab_size), batch_y.flatten())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        total_acc += float(accuracy_metric(output, batch_y))

    return total_loss / len(training_loader), total_acc / len(training_loader)

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    total_acc = 0.
    vocab_size = serializer.vocab_size()
    with torch.no_grad():
        for eval_batch in data_source:
            eval_x = eval_batch[:, :-1].contiguous()
            eval_y = eval_batch[:, 1:].contiguous()
            output = eval_model(eval_x)
            output_flat = output.view(-1, vocab_size)
            total_loss += criterion(output_flat, eval_y.flatten()).item()
            total_acc += float(accuracy_metric(output_flat, eval_y))

    return total_loss / len(data_source), total_acc / len(data_source)

In [19]:
EPOCHS = 100

In [None]:
epoch = -1
cumulative_epoch_time = 0

In [None]:
metric_log_path = '/content/gdrive/MyDrive/BP/logs/vanilla-transformer/' + DS_TYPE + '/training.csv'
time_log_path = '/content/gdrive/MyDrive/BP/logs/vanilla-transformer/' + DS_TYPE + '/time.csv'
best_test_loss = float("inf")
best_model = None
best_model_epoch = epoch

# Train Start
for _ in range(EPOCHS):
    epoch += 1
    epoch_start_time = time.time()
    train_loss, train_accuracy = train()
    test_loss, test_accuracy = evaluate(model, test_loader)
    time_elapsed = time.time() - epoch_start_time
    cumulative_epoch_time += time_elapsed
    print('-' * 112)
    print('| end of epoch {} | time: {:.2f}s | train loss {:.2f} | train accuracy {:.2f} | test loss {:.2f} | test accuracy {:.2f} |'.format(epoch+1, time_elapsed, train_loss, train_accuracy, test_loss, test_accuracy))
    print('-' * 112)

    # write csv metric information
    with open(metric_log_path, "a") as logf:
      logf.write(",".join([str(epoch+1), str(train_loss), str(train_accuracy), str(test_loss), str(test_accuracy)]) + "\n")
    
    # write learning time information
    gpu_name = torch.cuda.get_device_name(torch.device('cuda:0'))
    with open(time_log_path, "a") as logf:
      logf.write(",".join([str(epoch+1), str(cumulative_epoch_time), gpu_name]) + "\n")

    if test_loss < best_test_loss:
        best_test_loss = test_loss
        best_model = model
        best_model_epoch = epoch

    scheduler.step()
    torch.save(model.state_dict(), '/content/gdrive/MyDrive/BP/checkpoints/vanilla-transformer/' + DS_TYPE + '/epoch-{}.pth'.format(epoch))

torch.save(model.state_dict(), '/content/gdrive/MyDrive/BP/checkpoints/vanilla-transformer/' + DS_TYPE + '/epoch-{}-final.pth'.format(epoch))
torch.save(best_model.state_dict(), '/content/gdrive/MyDrive/BP/checkpoints/vanilla-transformer/' + DS_TYPE + '/best-model-epoch-{}.pth'.format(best_model_epoch))

----------------------------------------------------------------------------------------------------------------
| end of epoch 1 | time: 283.12s | train loss 4.75 | train accuracy 0.15 | test loss 3.46 | test accuracy 0.19 |
----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------
| end of epoch 2 | time: 111.06s | train loss 3.57 | train accuracy 0.18 | test loss 3.52 | test accuracy 0.20 |
----------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------
| end of epoch 3 | time: 110.75s | train loss 3.51 | train accuracy 0.18 | test loss 3.40 | test accuracy 0.20 |
------------------------------------------------------------------------------------------------

In [None]:
def generate(model, primer=None, length=MAX_TOKENS):
  model.eval()
  inputs = primer
  if inputs is None:
    inputs = [SpecialTokens.START.value]
  
  for _ in range(length - len(inputs) + 1):
    np_arr = np.array([inputs])
    tensor_inp = torch.from_numpy(np_arr).to("cuda:0")
    
    result = model(tensor_inp)[0, -1]
    token = int(result.softmax(-1).argmax(-1))

    if token == SpecialTokens.END.value:
      break
    
    inputs.append(token)
  inputs.append(SpecialTokens.END.value)
  return inputs

In [None]:
generated = generate(best_model)
deserialized = serializer.deserialize(generated)
deserialized.save("/content/generated.midi")