Connect to Google Drive

In [None]:
# Mount Google Drive (forcing a remount) so the shared project folder is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
# Switch into the shared workspace: later cells use paths relative to it
# (e.g. Path('MAESTRO','train'), Path('tokens_BPE')).
%cd /content/drive/Shareddrives/EC523_Project/CodeWorkspace

Mounted at /content/drive
/content/drive/Shareddrives/EC523_Project/CodeWorkspace


Imports for Tokenizer

In [None]:
!pip install miditok
!pip install tokenizers
!pip install setuptools_rust

In [None]:
import miditok
import miditoolkit
from miditok import *

Initialize the tokenizer

In [None]:
# Settings intended for the REMI tokenizer.
pitch_range = range(21, 109)
beat_res = {(0, 4): 8, (4, 12): 4}
nb_velocities = 32
additional_tokens = {'Chord': True, 'Rest': False, 'Tempo': True, 'Program': False, 'TimeSignature': False,
                     'rest_range': (2, 8),  # (half, 8 beats)
                     'nb_tempos': 32,  # nb of tempo bins
                     'tempo_range': (40, 250)}  # (min, max)
special_tokens = ["PAD", "BOS", "EOS", "MASK"]
# NOTE(review): only pitch_range is passed to REMI here. beat_res, nb_velocities,
# additional_tokens and special_tokens are defined above but never handed to the tokenizer
# in this cell, so it presumably uses the library defaults for them -- confirm this is intended
# (changing it would change the vocabulary of every token file produced below).
tokenizer = REMI(pitch_range)


Code to Move MAESTRO Files into respective folders to separate between training, validation, and testing

Don't run twice

In [None]:
import csv
import shutil
from pathlib import Path

src_dir = '/content/drive/Shareddrives/EC523_Project/maestro-v3.0.0/'
dst_dir_train = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/MAESTRO/train'
dst_dir_valid = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/MAESTRO/validation'
dst_dir_test = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/MAESTRO/test'

# Value of the CSV's third column (row[2]) -> folder that receives the file named in row[4].
split_dirs = {
    'train': dst_dir_train,
    'validation': dst_dir_valid,
    'test': dst_dir_test,
}

# shutil.copy2 only copies *into* a destination that is an existing directory. If the folder
# were missing, every copy would instead overwrite one single file named after the folder.
for dst_dir in split_dirs.values():
    Path(dst_dir).mkdir(parents=True, exist_ok=True)

# newline='' is the csv-module recommended way to open a file handed to csv.reader.
with open(src_dir + 'maestro-v3.0.0.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for i, row in enumerate(reader):
        # Rows that are too short or whose split value is not one of the three known splits
        # (e.g. a header line) are ignored, exactly as they matched no branch before.
        if len(row) < 5 or row[2] not in split_dirs:
            continue
        filename = src_dir + row[4]
        if filename.endswith('.wav'):
            # Audio files are not copied; report the CSV row index that was skipped.
            print('Skipping ', i)
        else:
            shutil.copy2(filename, split_dirs[row[2]])

Code to train the base tokenizer on the training dataset. There is no need to run this again; once the tokenizer is trained, it can be applied to the validation and test sets.

In [None]:
from pathlib import Path
# Recursively collect every *.midi file under MAESTRO/train (relative to the current working directory).
midi_paths = list(Path('MAESTRO','train').glob('**/*.midi'))


In [None]:
# Tokenize every training MIDI and write the resulting token files into tokens_BPE_train.
# NOTE(review): the recorded output includes an "Applying BPE to dataset" pass, so this run
# presumably happened after BPE had already been learned/loaded -- on a fresh kernel the
# output of this cell may differ. Confirm the intended execution order.
tokenizer.tokenize_midi_dataset(midi_paths, '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE_train')


Tokenizing MIDIs (CodeWorkspace/tokens_BPE_train): 100%|██████████| 962/962 [08:02<00:00,  1.99it/s]
Applying BPE to dataset: 100%|██████████| 961/961 [00:41<00:00, 22.88it/s]


In [None]:
# Set vocab size of the BPE tokenizer
# (the GPT model built later in this notebook sets model_config.vocab_size = 500 as well;
# keep the two values in sync)
vocab_size = 500

In [None]:
# Learn BPE merges up to `vocab_size` from the token files found under tokens_noBPE,
# with tokens_BPE given as the output directory.
# NOTE(review): no visible cell writes to 'tokens_noBPE' (the tokenization cell above writes
# to tokens_BPE_train) -- confirm where these un-merged token files come from.
tokenizer.learn_bpe(
    vocab_size=vocab_size,
    tokens_paths=list(Path('tokens_noBPE').glob("**/*.json")),
    out_dir=Path('tokens_BPE'),
)

Loading token files: 100%|██████████| 961/961 [00:05<00:00, 179.21it/s]


In [None]:
# Apply the learned BPE to the token files in tokens_noBPE, with tokens_BPE as the second
# (presumably output) directory -- both paths are relative to the current working directory.
tokenizer.apply_bpe_to_dataset(Path('tokens_noBPE'), Path('tokens_BPE'))

Applying BPE to dataset: 100%|██████████| 961/961 [00:46<00:00, 20.65it/s]


In [None]:
# Using the trained tokenizer, tokenize the test-split MIDI files into tokens_BPE_test
# (the recorded output shows a BPE pass being applied as part of this call).
# NOTE: this rebinds midi_paths, which previously held the training files.
from pathlib import Path
midi_paths = list(Path('MAESTRO','test').glob('**/*.midi'))
tokenizer.tokenize_midi_dataset(midi_paths, '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE_test')

Tokenizing MIDIs (CodeWorkspace/tokens_BPE_test): 100%|██████████| 177/177 [01:03<00:00,  2.80it/s]
Applying BPE to dataset: 100%|██████████| 177/177 [00:05<00:00, 33.61it/s]


In [None]:
# Using the trained tokenizer, tokenize the validation-split MIDI files into tokens_BPE_valid
# (the recorded output shows a BPE pass being applied as part of this call).
# NOTE: this rebinds midi_paths again, now to the validation files.
from pathlib import Path
midi_paths = list(Path('MAESTRO','validation').glob('**/*.midi'))
tokenizer.tokenize_midi_dataset(midi_paths, '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE_valid')

Tokenizing MIDIs (CodeWorkspace/tokens_BPE_valid): 100%|██████████| 137/137 [00:54<00:00,  2.51it/s]
Applying BPE to dataset: 100%|██████████| 137/137 [00:04<00:00, 29.34it/s]


Saving and Loading Trained Tokenizer

In [None]:
import json

# Run after training the tokenizer on the training set: writes the tokenizer parameters
# to REMItokenizer.json so that later sessions can reload them instead of retraining.
tokenizer.save_params('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/REMItokenizer.json')

In [None]:
# Run to use the same tokenizer again: reloads the parameters saved in REMItokenizer.json
# into the existing `tokenizer` object (which must already have been created).
tokenizer.load_params('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/REMItokenizer.json')

In [None]:
# Sanity check: print the vocabulary held by the tokenizer's BPE model.
# NOTE(review): _bpe_model is a private attribute (leading underscore); it may not be
# stable across library versions.
vocab = tokenizer._bpe_model.get_vocab()
print(vocab)

{'ý': 220, 'J': 41, '\x86': 101, '\x8a£': 314, 'T\x8e\x9e': 335, 'w': 86, '\x94\x9f': 251, 'R\x8e\x9e': 366, 'I': 40, 'º': 153, '\x91\x9e': 226, '\\\x90\x9e': 430, 'Y\x8d\x9e': 364, 'O\x8d\x9e': 336, '½': 156, 'R\x90\x9e': 420, '\x8e¤': 395, 'î': 205, 's': 82, 'R': 49, '\x87¤': 346, 'M\x8a\x9e': 385, '\x8c¤': 370, 'V\x8c\x9e': 389, 'Q\x90\x9e': 407, 'µ': 148, 'U\x90\x9e': 458, '*': 9, 'U\x8e\x9e': 426, 'G': 38, '\x88\x9e': 232, 'ç': 198, 'x': 87, 'p': 79, 'X': 55, '\x94\xa0': 279, '\x82': 97, '¸': 151, 'c': 66, '\x96': 117, 'Y\x8f\x9e': 347, 'L\x8c\x9e': 462, ']\x8f\x9e': 369, '8': 23, '[': 58, '«': 138, 'W\x8e\x9e': 398, 'ä': 195, '\x8e': 109, 'ã': 194, '\x8c\x9e': 225, '\x85£': 495, '\x90\xa0': 255, '\\': 59, '\x96\x9e': 249, '¯': 142, ']\x90\x9e': 387, '`\x90\x9e': 396, '\x89¤': 339, '#': 2, '\x92¡': 291, '\x85¡': 313, 'Û': 186, '\x90¡': 285, '£': 130, 'Ì': 171, 'U\x8f\x9e': 435, 'R\x8b\x9e': 381, '\x86£': 328, 'H\x8f\x9e': 472, '\x85¢': 359, '\x98': 119, '×': 182, 'N\x8d\x9e': 463,

In [None]:
# Round-trip check: load the 'ids' stored in one BPE token file and ask the tokenizer to
# convert them back into a MIDI object, then print its summary.
import json
with open('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE/MIDI-UNPROCESSED_14-15_R1_2014_MID--AUDIO_14_R1_2014_wav--1.json', 'r') as f:
  # The whole 'ids' value is passed on (the dataset code later in the notebook uses ['ids'][0]).
  data = json.load(f)['ids']
midi = tokenizer.tokens_to_midi(data)
print(midi)

ticks per beat: 384
max tick: 730752
tempo changes: 1
time sig: 0
key sig: 0
markers: 0
lyrics: False
instruments: 1


Imports from Bachsformer - Codebase for GPT model

In [None]:
# Fetch the bachsformer repository into the current working directory.
# NOTE(review): re-running this cell when a 'bachsformer' folder already exists will
# presumably make `git clone` fail -- confirm, or guard the clone.
!git clone https://github.com/pier-maker92/bachsformer.git
# Work from the repo's transformer_decoder_only folder; the next import cell does
# `from utils import CfgNode`, which is presumably resolved from this directory.
%cd /content/drive/Shareddrives/EC523_Project/CodeWorkspace/bachsformer/transformer_decoder_only

Cloning into 'bachsformer'...
remote: Enumerating objects: 118, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 118 (delta 27), reused 95 (delta 11), pack-reused 0[K
Receiving objects: 100% (118/118), 21.58 MiB | 8.67 MiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/drive/Shareddrives/EC523_Project/CodeWorkspace/bachsformer/transformer_decoder_only


In [None]:
import math

import torch
import torch.nn as nn
from torch.nn import functional as F
from utils import CfgNode as CN

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
class NewGELU(nn.Module):
    """
    Tanh-based approximation of the Gaussian Error Linear Unit, the variant used in the
    Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        # gelu(x) ~= 0.5 * x * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) ))
        scale = math.sqrt(2.0 / math.pi)
        cubic_term = 0.044715 * torch.pow(x, 3.0)
        gate = torch.tanh(scale * (x + cubic_term))
        return 0.5 * x * (1.0 + gate)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention with a lower-triangular (causal) mask, followed by an output
    projection. Written out explicitly instead of relying on torch.nn.MultiheadAttention.
    """

    def __init__(self, config):
        super().__init__()
        # the embedding must split evenly across the heads
        assert config.n_embd % config.n_head == 0
        # one linear layer produces query, key and value for every head at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # projection applied to the concatenated head outputs
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # dropout on the attention weights and on the projected output
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # lower-triangular mask: position i may only attend to positions <= i
        causal_mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("bias", causal_mask.view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        # x: (batch, sequence length, n_embd)
        B, T, C = x.size()
        head_dim = C // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, nh, T, hs): the head axis moves next to the batch axis
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (split_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # future positions get -inf so that softmax assigns them zero weight
        scores = scores.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # weighted sum of values, then put the heads back side by side: (B, T, C)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        return self.resid_dropout(self.c_proj(merged))

class Block(nn.Module):
    """
    One pre-LayerNorm Transformer block: causal self-attention followed by a position-wise
    MLP, each wrapped in a residual connection.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            act     = NewGELU(),
            dropout = nn.Dropout(config.resid_pdrop),
        ))

    def mlpf(self, x):
        """
        MLP forward pass: expand to 4*n_embd, apply GELU, project back, apply dropout.

        This is a regular method rather than a lambda stored on the instance: a stored lambda
        cannot be pickled, and after copy.deepcopy it would keep pointing at the *original*
        block's MLP modules instead of the copy's.
        """
        m = self.mlp
        return m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        # LayerNorm is applied before each sub-layer; the result is added back onto x
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x

class GPT(nn.Module):
    """
    GPT Language Model

    A decoder-only Transformer: token + learned position embeddings, a stack of `Block`s,
    a final LayerNorm and a linear head producing logits over the vocabulary.
    """

    @staticmethod
    def get_default_config():
        """
        Return a config node pre-filled with defaults.

        vocab_size and block_size are left as None and must be set by the caller.
        NOTE(review): the default model_type is 'gpt', which is not one of the keys in the
        preset table used by __init__, so it has to be overridden (or set to None together
        with explicit n_layer/n_head/n_embd) before constructing the model.
        """
        C = CN()
        # either model_type or (n_layer, n_head, n_embd) must be given in the config
        C.model_type = 'gpt'
        C.n_layer = None
        C.n_head = None
        C.n_embd =  None
        # these options must be filled in externally
        C.vocab_size = None
        C.block_size = None
        # dropout hyperparameters
        C.embd_pdrop = 0.1
        C.resid_pdrop = 0.1
        C.attn_pdrop = 0.1
        return C

    def __init__(self, config):
        """
        Build the model from `config`.

        Exactly one of config.model_type or the triple (n_layer, n_head, n_embd) must be
        given; when model_type is given, the triple is filled in from the preset table below
        (this mutates `config` via merge_from_dict). Prints the parameter count of the
        transformer body (lm_head excluded).
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        type_given = config.model_type is not None
        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        assert type_given ^ params_given # exactly one of these (XOR)
        if type_given:
            # translate from model_type to detailed configuration
            config.merge_from_dict({
                # names follow the huggingface naming conventions
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # (there are a number more...)
                # I made these tiny models up
                'gpt-mini':     dict(n_layer=12, n_head=12, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
                'gpt-bach':     dict(n_layer=4, n_head=8, n_embd=128),
            }[config.model_type])

        # wte: token embedding, wpe: learned position embedding (one row per position < block_size),
        # h: the stack of Transformer blocks, ln_f: final LayerNorm
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        """
        Per-module initializer used with self.apply: Linear and Embedding weights are drawn
        from N(0, 0.02), Linear biases are zeroed, LayerNorm is reset to weight=1 / bias=0.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    @classmethod
    def from_pretrained(cls, model_type):
        """
        Initialize a pretrained GPT model by copying over the weights
        from a huggingface/transformers checkpoint.

        Only the four GPT-2 sizes are accepted. Requires the `transformers` package.
        NOTE(review): the `len(keys) == len(sd)` assertion below depends on the set of
        entries the installed transformers version puts in its state dict -- verify against
        the version in use.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        # create a from-scratch initialized minGPT model
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        model = GPT(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
        # this means that we have to transpose these weights when we import them
        assert len(keys) == len(sd)
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        Reads train_config.weight_decay, train_config.learning_rate and train_config.betas,
        and returns a torch.optim.AdamW instance.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        """
        Run the model on token indices `idx` of shape (b, t), with t <= block_size.

        Returns (logits, loss). loss is None unless `targets` is given, in which case it is
        the cross-entropy between the flattened logits and flattened targets, with target
        value -1 ignored.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        With do_sample=False the most likely token is taken at every step; with do_sample=True
        the next token is drawn from the (optionally top_k-truncated) distribution.
        Returns the input sequence with the new tokens appended along dim 1.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


In [None]:
import time
from collections import defaultdict

import torch
from torch.utils.data.dataloader import DataLoader

In [None]:
class Trainer:
    """
    Minimal iteration-based training harness: owns the model, the optimizer and the training
    dataset, and fires user callbacks after every batch.
    """

    @staticmethod
    def get_default_config():
        """Return a config node pre-filled with the default training hyperparameters."""
        C = CN()
        # device to train on
        C.device = 'auto'
        # dataloader parameters
        C.num_workers = 4
        # optimizer parameters
        C.max_iters = None
        C.batch_size = 64
        C.learning_rate = 3e-4
        C.betas = (0.9, 0.95)
        C.weight_decay = 0.1 # only applied on matmul weights
        C.grad_norm_clip = 1.0
        return C

    def __init__(self, config, model, train_dataset):
        """
        Store the config/model/dataset and move the model to the training device.

        config.device == 'auto' selects 'cuda' when available, otherwise 'cpu'; any other
        value is used as given.
        """
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)

        # determine the device we'll train on
        if config.device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = config.device
        self.model = self.model.to(self.device)
        print("running on device", self.device)

        # variables that will be assigned to trainer class later for logging and etc
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0

    def add_callback(self, onevent: str, callback):
        """Register an additional callback for the event `onevent`."""
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        """Replace all callbacks registered for `onevent` with this single one."""
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        """Call every callback registered for `onevent`, passing this trainer."""
        for callback in self.callbacks.get(onevent, []):
            callback(self)

    def trainloader_setup(self,config):
        """
        Build a fresh DataLoader over the training dataset.

        Samples are drawn with replacement, len(dataset) of them per pass. If the dataset
        provides a `shuffle_it()` method it is called first; datasets without that hook
        (such as a plain torch Dataset) are used as they are instead of raising
        AttributeError.
        """
        shuffle_hook = getattr(self.train_dataset, 'shuffle_it', None)
        if callable(shuffle_hook):
            shuffle_hook()
        # setup the dataloader
        train_loader = DataLoader(
            self.train_dataset,
            sampler=torch.utils.data.RandomSampler(self.train_dataset, replacement=True, num_samples=len(self.train_dataset)),
            shuffle=False,
            pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )
        return train_loader

    def run(self):
        """
        Train until config.max_iters iterations have been performed.

        NOTE: when config.max_iters is None the loop never terminates on its own.
        The optimizer is (re)created on every call.
        """
        model, config = self.model, self.config

        train_loader = self.trainloader_setup(config)

        # setup the optimizer
        self.optimizer = model.configure_optimizers(config)

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()
        data_iter = iter(train_loader)
        while True:

            # fetch the next batch (x, y) and re-init iterator if needed
            try:
                batch = next(data_iter)
            except StopIteration:
                # one pass over the loader is exhausted: rebuild it and keep going
                train_loader = self.trainloader_setup(config)
                data_iter = iter(train_loader)
                batch = next(data_iter)
            batch = [t.to(self.device) for t in batch]
            x, y = batch

            # forward the model
            logits, self.loss = model(x, y)

            # backprop and update the parameters
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            # callbacks see self.loss and the iteration counter *before* it is incremented
            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1
            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # termination conditions
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                break

Creating Dataset from Tokens

Process to convert json token ids into transformer input:

1) For each JSON file, take 'ids', convert it to a list, and drop the remainder after dividing by the block size (128 by default; the code below truncates each song to a multiple of block_size + 1).

2) Create a Dataset module where x is the chunk without its last token ([:-1]) and y is the chunk without its first token ([1:]), the same format as the homework.

3) Input into Dataloader

This approach is good for generation, not for sentiment analysis or translation.

Function to create dataset

In [None]:
import json
import torch
from torch.utils.data import Dataset
import math

class MakeDataset(Dataset):
  """
  Next-token-prediction dataset over one flat list of token ids.

  Every item is a random window of block_size + 1 consecutive tokens, returned as
  (x, y) with x = window[:-1] and y = window[1:].

  data:         flat list of token ids (e.g. the concatenation of all songs)
  total_length: number of tokens used to compute the epoch length
  block_size:   length of x and y
  """
  def __init__(self, data, total_length, block_size=128):
    self.block_size = block_size
    self.totlength = total_length
    self.data = data

  def __len__(self):
    # number of (block_size + 1)-token windows needed to cover total_length tokens
    return math.ceil(self.totlength / (self.block_size + 1))

  def __getitem__(self, idx):
    # idx is ignored on purpose: each call draws a fresh random window.
    window = self.block_size + 1
    n_starts = len(self.data) - window + 1  # valid start offsets, including the last one
    if n_starts <= 0:
      raise ValueError(
          f"need at least {window} tokens to draw a sample, got {len(self.data)}")
    # torch's generator (rather than numpy's) is used so the class needs nothing beyond
    # this cell's own imports, and so DataLoader workers do not all draw the same offsets.
    i = int(torch.randint(0, n_starts, (1,)).item())
    chunk = self.data[i:i+window]
    x = torch.tensor(chunk[:-1], dtype=torch.long)
    y = torch.tensor(chunk[1:], dtype=torch.long)
    return x, y

Function to break each song to block size then concatenate all tokens stored in json files in given folderpath

In [None]:
import os
def ConcatTokens(filepath, block_size):
    """
    Concatenate the token ids of every .json file in `filepath` into one flat list.

    For each file the first entry of its 'ids' list is taken and truncated to a multiple of
    (block_size + 1) tokens, so every song contributes a whole number of training windows.
    Files are processed in sorted name order so the result is reproducible between runs
    (os.listdir itself returns entries in arbitrary order).

    Returns (ids, totlength): the concatenated ids and their total count.
    """
    chunk_len = block_size + 1
    ids = []
    json_files = sorted(f for f in os.listdir(filepath) if f.endswith('.json'))
    for file_name in json_files:
        with open(os.path.join(filepath, file_name), 'r') as f:
            # NOTE(review): only element 0 of 'ids' is used -- presumably the single track
            # of each file; confirm against the token file layout.
            data = json.load(f)['ids'][0]
        usable = len(data) // chunk_len * chunk_len
        ids.extend(data[:usable])

    return ids, len(ids)

Save the created dataset

In [None]:
import pickle
#with open('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/train300.pkl', 'wb') as f:
#  pickle.dump(tempx, f)

Load the created dataset

In [None]:
import pickle
# Load the previously pickled dataset from trainfull.pkl (the file the commented-out
# pickle.dump further down in this notebook would write).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load pickles
# you created yourself. Unpickling presumably also needs the MakeDataset class to be
# defined in the kernel first; confirm the cell order.
with open('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/trainfull.pkl', 'rb') as f:
  dataset = pickle.load(f)

Code to choose the folder that holds the token files and to concatenate all tokens, truncating each song to fit the model's block size (192 for GPT-mini).

In [None]:
import json
import os
# Two candidate token folders; only folder_path_full is used below
# (folder_path is not referenced by this cell).
folder_path = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/train_tokenswithBPE_350'
folder_path_full = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE_train'

# Window length used to truncate each song and, later, to cut training samples.
# NOTE(review): the model cell further down sets model_config.block_size = 198, not 192 --
# confirm which value is intended.
block_size = 192
ids, totlength = ConcatTokens(folder_path_full, block_size)

In [None]:
# Sanity check: the number of concatenated ids and the running total returned by
# ConcatTokens should be equal.
print(len(ids))
print(totlength)

14327741
14327741


Make the dataset into a Dataset variable

In [None]:
# Wrap the flat token list in a MakeDataset; this rebinds `dataset`, replacing the object
# loaded from the pickle file in the earlier cell.
dataset = MakeDataset(ids, totlength, block_size)

In [None]:
# Samples per epoch as reported by MakeDataset.__len__: ceil(totlength / (block_size + 1)).
print(len(dataset))

74237


In [None]:
%cd /content/drive/Shareddrives/EC523_Project/CodeWorkspace
import pickle
#with open('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/trainfull.pkl', 'wb') as f:
#  pickle.dump(dataset, f)

/content/drive/Shareddrives/EC523_Project/CodeWorkspace


In [None]:
import pickle
# Load train300 pickle file
#with open('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/trainfull.pkl', 'rb') as f:
#  dataset = pickle.load(f)

Configurations and imports for the GPT model

In [None]:
import math
import logging
from tqdm import tqdm
import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader

In [None]:
logger = logging.getLogger(__name__)
class TrainerConfig:
    """
    Plain container of training hyperparameters.

    The class attributes below are the defaults; any keyword argument passed to the
    constructor is set on the instance and so overrides (or adds to) them.
    """
    # --- optimization ---
    max_epochs = 10
    batch_size = 64
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1  # applied to matmul weights only
    # --- learning-rate schedule: linear warmup, then cosine decay down to 10% of the base LR ---
    lr_decay = False
    warmup_tokens = 375e6  # both token counts are taken from the GPT-3 paper; may not suit other setups
    final_tokens = 260e9  # token count at which the LR has reached 10% of its base value
    # --- checkpointing / data loading ---
    ckpt_path = None
    num_workers = 0  # passed to the DataLoader

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])
# initialize a baby GPT model
#model = classGPT(vocab_size =500, n_embd=128, n_head=4, block_size =128, n_layer=2)

In [None]:
# Hyperparameters for the hand-written training loop near the end of the notebook.
# NOTE(review): final_tokens is 150*len(dataset), i.e. a number of *samples*, while the
# training loop counts individual target tokens per step -- the cosine decay would then
# hit its floor much earlier than after 150 passes; confirm the intended unit.
# NOTE(review): max_epochs is 100 while final_tokens uses a factor of 150 -- confirm.
config = TrainerConfig(max_epochs=100, batch_size=64, learning_rate=6e-4,lr_decay=True, warmup_tokens=1024, final_tokens=150*len(dataset),num_workers=4)
dtype = torch.float
device = 'cpu'

# Model: the 'gpt-mini' preset with a 500-entry vocabulary (the BPE vocab size chosen earlier).
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
model_config.vocab_size = 500
# NOTE(review): the dataset was built with block_size = 192; 198 only sets the maximum
# sequence length the model accepts, so the 192-token samples still fit -- confirm the
# difference is intentional.
model_config.block_size = 198

model = GPT(model_config).to(device)
# File name used by a later cell when trying to load saved weights.
model_name = "bachsformer"

# Prefer the GPU when one is available.
# NOTE(review): `optimizer` is only created inside this branch; the training-loop cell
# creates its own optimizer again, so this one appears to be redundant.
if torch.cuda.is_available():
  device = torch.cuda.current_device()
  model.to(device)
  optimizer = model.configure_optimizers(config)




number of parameters: 5.47M


In [None]:
# Configure the Trainer helper: 500 passes' worth of iterations at the given batch size.
batch_size = 64
steps_per_epoch = len(dataset)//batch_size
train_config = Trainer.get_default_config()
train_config.learning_rate = 6e-4 
train_config.max_iters = steps_per_epoch*500
train_config.num_workers = 0
train_config.device=device
train_config.batch_size = batch_size
trainer = Trainer(train_config, model, dataset)

# Best-effort resume: start from previously saved weights when they can be loaded, otherwise
# train from scratch. Failures are reported instead of being silently swallowed, and
# KeyboardInterrupt / SystemExit are no longer caught.
try:
  model.load_state_dict(torch.load(model_name))
  print("model loaded from pretrained")
except FileNotFoundError:
  print(f"no checkpoint named {model_name!r} found; training from scratch")
except Exception as err:
  print(f"could not load checkpoint {model_name!r} ({type(err).__name__}: {err}); training from scratch")

running on device 0


In [None]:
import math
import logging
from tqdm import tqdm
import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader
import matplotlib.pyplot as plt


Training Loop for GPT model

Saves a checkpoint whenever an epoch's average training loss is the lowest seen so far

In [None]:
# Hand-written training loop: one optimizer step per batch, an optional
# warmup + cosine learning-rate schedule driven by the number of target tokens
# seen so far, and a checkpoint whenever the mean epoch loss improves.
optimizer = model.configure_optimizers(config)
criterion = nn.CrossEntropyLoss()

tokens = 0  # running count of non-negative labels processed; drives the LR schedule
best_loss = float('inf')
best_epoch = 0
loss_history = []  # one {'epoch', 'train_loss'} record per finished epoch


for epoch in range(config.max_epochs):
    model.train()
    data = dataset
    loader = DataLoader(data, shuffle=True, pin_memory=True,
                        batch_size=train_config.batch_size,
                        num_workers=train_config.num_workers)
    losses = []
    pbar = tqdm(enumerate(loader), total=len(loader))
    for step, (x, y) in pbar:
        # place data on the correct device
        x = x.to(device)
        y = y.to(device)

        # forward the model; it returns a tuple whose first element is the logits
        outputs, _ = model(x)
        loss = criterion(outputs.view(-1, outputs.size(-1)), y.view(-1))
        losses.append(loss.item())

        # backward and optimize, clipping the gradient norm before the step
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.grad_norm_clip)
        optimizer.step()

        # decay the learning rate based on our progress
        if config.lr_decay:
            # keep the counter a plain Python int instead of a device tensor
            tokens += int((y >= 0).sum())  # labels that are not negative (e.g. not -100)
            if tokens < config.warmup_tokens:
                # linear warmup
                lr_mult = float(tokens) / float(max(1, config.warmup_tokens))
            else:
                # cosine learning rate decay
                progress = float(tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                # Clamp so the rate stays at the 10% floor once final_tokens is
                # passed; without this the cosine keeps cycling and the LR
                # bounces up and down for the rest of training.
                progress = min(1.0, progress)
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
            lr = config.learning_rate * lr_mult
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        else:
            lr = config.learning_rate
        # report progress
        pbar.set_description(f"epoch {epoch+1} iter {step}: train loss {loss.item():.5f}. lr {lr:e}")

    avg_loss = sum(losses) / len(losses)
    if avg_loss < best_loss:
        best_loss = avg_loss
        best_epoch = epoch
        checkpoint = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_loss': best_loss,
            'best_epoch': best_epoch
        }
        # relative path: the file lands in the notebook's current working directory
        torch.save(checkpoint, 'GPTminiepoch100.pt')

    # keep every epoch's loss instead of overwriting the previous record
    loss_info = {'epoch': epoch, 'train_loss': avg_loss}
    loss_history.append(loss_info)


epoch 1 iter 1159: train loss 3.49309. lr 1.087915e-04: 100%|██████████| 1160/1160 [08:18<00:00,  2.33it/s]
epoch 2 iter 1159: train loss 3.21029. lr 2.436528e-04: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 3 iter 1159: train loss 3.11786. lr 5.630105e-04: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 4 iter 1159: train loss 3.02892. lr 6.000000e-05: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 5 iter 1159: train loss 2.85999. lr 3.922598e-04: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 6 iter 1159: train loss 2.77482. lr 4.612366e-04: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 7 iter 1159: train loss 2.71374. lr 6.000000e-05: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 8 iter 1159: train loss 2.63309. lr 5.181416e-04: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 9 iter 1159: train loss 2.57080. lr 3.197471e-04: 100%|██████████| 1160/1160 [08:16<00:00,  2.34it/s]
epoch 10 iter 1159: train lo

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-31-a68248606223>", line 61, in <cell line: 9>
    torch.save(checkpoint, 'GPTminiepoch100.pt')
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 440, in save
    with _open_zipfile_writer(f) as opened_zipfile:
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 315, in _open_zipfile_writer
    return container(name_or_buffer)
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 288, in __init__
    super().__init__(torch._C.PyTorchFileWriter(str(name)))
RuntimeError: File GPTminiepoch100.pt cannot be opened.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", l

Load the saved model

In [None]:
import torch
import torch.nn as nn
# Read the checkpoint onto the CPU first so a file written on a GPU runtime can
# also be opened on a CPU-only one; load_state_dict copies the weights into the
# model's existing parameters.
checkpoint = torch.load('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/bachsformer/transformer_decoder_only/GPTminiepoch100.pt', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])


<All keys matched successfully>

In [None]:
# Restore the full training state (weights, optimizer, bookkeeping) from the
# best checkpoint. map_location='cpu' keeps the load independent of the device
# the checkpoint was written on.
checkpoint = torch.load('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/bachsformer/transformer_decoder_only/GPTminiepoch100.pt', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
# Epoch index and mean training loss recorded when the checkpoint was written.
epoch = checkpoint['epoch']
loss = checkpoint['best_loss']

Functions for sampling tokens from the trained model to generate music

In [None]:
def top_k_logits(logits, k):
    """Keep the k largest logits in each row and set the rest to -inf.

    Args:
        logits: 2-D tensor with one row of scores per batch element.
        k: number of entries to keep per row. Values larger than the row
            length are clamped, in which case nothing is masked.

    Returns:
        A new tensor of the same shape; the input is left untouched. Entries
        that tie with the k-th largest value are kept as well.
    """
    # torch.topk raises when k exceeds the dimension size, so clamp first.
    k = min(k, logits.size(-1))
    kth_largest = torch.topk(logits, k).values[:, [-1]]  # per-row threshold, shape (rows, 1)
    out = logits.clone()
    out[out < kth_largest] = -float('Inf')
    return out

def sample(model, x, steps, temperature=1.0, sample=False, top_k=None, block_size=None):
    """Autoregressively extend a batch of token sequences.

    Takes a conditioning sequence of indices in x (of shape (b, t)), predicts
    the next token, appends it, and feeds the longer sequence back into the
    model, repeating `steps` times.

    Args:
        model: callable returning a 2-tuple whose first element holds logits
            indexed as (batch, position, vocabulary); it is put in eval mode.
        x: LongTensor of shape (b, t) with the conditioning tokens.
        steps: number of tokens to append.
        temperature: divisor applied to the final-position logits.
        sample: draw from the distribution when True, otherwise take the most
            likely token.
        top_k: when given, restrict the choice to the k highest-scoring tokens.
        block_size: longest context passed to the model; defaults to the
            notebook-level `dataset.block_size`.

    Returns:
        Tensor of shape (b, t + steps): the prompt followed by the new tokens.
    """
    if block_size is None:
        block_size = dataset.block_size
    model.eval()
    # Generation never needs gradients; disabling autograd avoids building a
    # graph for every one of the `steps` forward passes.
    with torch.no_grad():
        for _step in range(steps):
            x_cond = x if x.size(1) <= block_size else x[:, -block_size:]  # crop context if needed
            logits, _ = model(x_cond)
            # pluck the logits at the final step and scale by temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop probabilities to only the top k options
            if top_k is not None:
                logits = top_k_logits(logits, top_k)
            # apply softmax to convert to probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution or take the most likely
            if sample:
                ix = torch.multinomial(probs, num_samples=1)
            else:
                _, ix = torch.topk(probs, k=1, dim=-1)
            # append to the sequence and continue
            x = torch.cat((x, ix), dim=1)
    return x

Grab the test data and concatenate the tokens

In [None]:
import json
# Folder passed to ConcatTokens for the test split.
folder_path = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE_test'

# Window length handed to ConcatTokens; the generation cells below reuse it as the prompt length.
block_size = 192
# ConcatTokens is defined outside this section; it returns the token ids plus a total length.
ids, totlength = ConcatTokens(folder_path, block_size)
print(len(ids))

1851256


Code to generate 1 random song

In [None]:
import random
# Pick a random block_size-token window from the test ids to use as the prompt.
random_start = random.randint(0, len(ids)-block_size)
print(random_start)
testtestids = torch.tensor(ids[random_start:random_start+block_size], dtype=torch.long).to(device)
print(testtestids)
x = testtestids.unsqueeze(0)  # add the batch dimension expected by sample()
y = sample(model, x, 2000, temperature=0.5, sample=True, top_k=10)[0]
print(y.tolist())
# Drop the prompt so only newly generated tokens are decoded. Use the prompt's
# actual length rather than a hard-coded 192 so this stays correct if
# block_size changes.
result = y.tolist()[x.size(1):]
print(result)
midi = tokenizer.tokens_to_midi([result])
print(midi)
midi.dump('/content/drive/Shareddrives/EC523_Project/CodeWorkspace/result/GPTmini/file.mid')

1760516
tensor([193,  60, 115, 131, 198,  49, 254, 200,  50, 247, 202,  53, 270, 204,
         59, 251, 209,  30, 280,  40, 282, 210,  57, 273,  62, 309, 211,  50,
        255, 217,  44, 241, 220, 466,  60, 325, 257,  48, 241, 193,  52, 280,
        195,  39, 294, 196,  30, 268, 197,  51, 105, 135,  59, 110, 141, 203,
         47, 239, 205,  48, 253, 208,  52, 265, 212,  57, 262, 217,  31, 102,
        152, 218,  41,  97, 133, 219,  47, 101, 140,  57, 107, 138, 220,  50,
        101, 147, 300,  37, 312, 198,  38, 297, 202,  40, 110, 132, 208,  41,
        101, 133, 209,  56, 107, 142, 215,  53, 102, 133, 220,  50, 101, 132,
        397,  47, 271, 203,  19,  98, 134, 206,  40, 289, 209,  48, 328, 214,
         55, 105, 135, 220,  43, 289, 274,  50, 256, 196,  48, 301, 200,  31,
        101, 151,  43,  99, 140, 201,  60, 107, 136, 206,  48, 239, 210,  53,
        103, 136, 214,  42, 339,  52, 246, 220,  41, 103, 147, 234,  50, 284,
         60, 107, 134, 194,  49, 268, 198,  52, 106, 138

Function to generate multiple pieces of music and save them with sequentially numbered file names

In [None]:
def generatemusic(model, data, block_size, emblength, temperature, top_k, folderpath, numbergen):
    """Generate several MIDI files, each continued from a random prompt.

    Args:
        model: model handed to sample().
        data: sequence of token ids that prompts are cut from.
        block_size: number of prompt tokens taken from `data`.
        emblength: number of tokens to generate per piece.
        temperature: sampling temperature forwarded to sample().
        top_k: top-k cutoff forwarded to sample().
        folderpath: directory the files are written to, as 0.mid, 1.mid, ...
        numbergen: number of pieces to generate.

    Relies on the notebook-level `device`, `tokenizer`, `random` and `sample`.
    """
    import os  # local import so the function does not depend on cell order

    for i in range(numbergen):
        random_start = random.randint(0, len(data)-block_size)
        testtestids = torch.tensor(data[random_start:random_start+block_size], dtype=torch.long).to(device)
        x = testtestids.unsqueeze(0)
        y = sample(model, x, emblength, temperature, sample=True, top_k=top_k)[0]
        # Strip the prompt using block_size (was a hard-coded 192, which was
        # wrong for any other prompt length).
        result = y.tolist()[block_size:]
        midi = tokenizer.tokens_to_midi([result])
        # os.path.join works with or without a trailing slash on folderpath.
        midi.dump(os.path.join(folderpath, f'{i}.mid'))

In [None]:

# Destination folder for this batch of generated MIDI files.
folder_path = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/result/GPTmini/temp08topk30/'

# Positional arguments after block_size: emblength=2000, temperature=0.8, top_k=30, folderpath, numbergen=30.
generatemusic(model, ids, block_size, 2000, 0.8, 30, folder_path, 30)

128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128


Running Validation on the Model

In [None]:
import json
import os
# Folder passed to ConcatTokens for the validation split.
folder_path_valid = '/content/drive/Shareddrives/EC523_Project/CodeWorkspace/tokens_BPE_valid'

# NOTE: this rebinds `block_size`, `ids` and `totlength`, replacing the test-split values loaded earlier.
block_size = 192
ids, totlength = ConcatTokens(folder_path_valid, block_size)
print(len(ids))
print(totlength)

1632008
1632008


In [None]:
# Wrap the validation ids in a dataset object for the DataLoader in the next cell (MakeDataset is defined outside this section).
dataset_valid = MakeDataset(ids, totlength, block_size)

In [None]:
# Validation mode: average the cross-entropy over the whole validation set and
# report the corresponding perplexity.
model.eval()
data = dataset_valid
# Order does not matter for an average, so no shuffling is needed.
loader = DataLoader(data, shuffle=False, pin_memory=True,
                    batch_size=config.batch_size,
                    num_workers=config.num_workers)
total_loss = 0.0    # summed (not averaged) cross-entropy over all targets
total_targets = 0   # number of targets that contributed to total_loss
# Disable gradient calculation to save memory
with torch.no_grad():
    pbar = tqdm(loader, total=len(loader))
    for x, y in pbar:
        # place data on the correct device
        x = x.to(device)
        y = y.to(device)

        # The model returns a tuple whose first element is the logits (same
        # call convention as the training loop); it has no `.logits` attribute.
        logits, _ = model(x)

        # Sum the per-token losses so batches of different sizes are weighted
        # correctly when averaged at the end.
        batch_loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)), y.view(-1), reduction='sum')
        total_loss += batch_loss.item()
        # -100 is cross_entropy's default ignore_index; such labels add no loss.
        total_targets += int((y != -100).sum())

# Mean loss over every validation target (previously only the last batch was used).
loss = torch.tensor(total_loss / max(1, total_targets))
# Calculate the perplexity score from the validation loss
perplexity = torch.exp(loss)
print(f"validation loss {loss.item():.5f}, perplexity {perplexity.item():.3f}")