Jetzt versuch mit Beispiel aus https://github.com/LukeDitria/pytorch_tutorials/blob/main/section14_transformers/solutions/Pytorch2_Transformer_Text_Generation.ipynb.

Dazugehöriges Video: https://youtu.be/7J4Xn0LnnEA?list=PLyHaDji6oZkV4sRUVoJdvZm2Sk7ohQ9yD

(Eventuelle Idee: ich baue einen Tokenizer für Die einzelnen Töne. -> Jeder snapshot wird getokenized. so wie die Einzelnen wörter bei einem Transformer für text. Sollte gut machbar sein bei 12 Tönen in einer Oktave -> könnten zu viele mögliche Tokens werden bei 88 Tönen. Aber es werden ja meißtens nicht alle töne gleichzeitig gespielt.)

In [1]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

from data_preperation import dataset_snapshot

import math
import numpy as np

from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import random

In [2]:
# Define hyperparameters

#SOS_TOKEN = np.full((1,1), (2 ** 12))   # SOS token index (außerhalb was mit 12 binärstellen angezeigt werden kann)
#PAD_TOKEN = np.full((1,1), ((2 ** 12) + 1)) # padding token
SOS_TOKEN = (2 ** 12)   # SOS token index (außerhalb was mit 12 binärstellen angezeigt werden kann)
PAD_TOKEN = ((2 ** 12) + 1) # padding token


# Learning rate for the optimizer
learning_rate = 1e-4

# Number of epochs for training
nepochs = 20

# Batch size for data loaders
batch_size = 128

# Maximum sequence length for inputs
max_len = 200

# Root directory of the dataset
data_set_root = "/home/falaxdb/Repos/Learn-ml/Transformer-pytorch/piano_data/maestro_v3/hands_split_into_seperate_midis"

In [3]:
# Create snapshots
dataset_as_snapshots = dataset_snapshot.process_dataset_multithreaded("/home/falaxdb/Repos/minus1/datasets/maestro_v3_split/hands_split_into_seperate_midis", 0.05)
# filter snapshots to 88 piano notes
dataset_as_snapshots = dataset_snapshot.filter_piano_range(dataset_as_snapshots)
# compress data into one octave
dataset_as_snapshots =  dataset_snapshot.compress_existing_dataset_to_12keys(dataset_as_snapshots)

for song in dataset_as_snapshots:
    print("song:")
    for track in song:
        print(track.shape)

Processed dataset (1038/1038): 100%|██████████| 1038/1038 [00:14<00:00, 69.77it/s]


Processed 1038 of 1038 files
song:
(2166, 12)
(2166, 12)
song:
(1504, 12)
(1504, 12)
song:
(2864, 12)
(2864, 12)
song:
(5460, 12)
(5460, 12)
song:
(7414, 12)
(7414, 12)
song:
(4439, 12)
(4439, 12)
song:
(5962, 12)
(5962, 12)
song:
(1202, 12)
(1202, 12)
song:
(6928, 12)
(6928, 12)
song:
(6958, 12)
(6958, 12)
song:
(3674, 12)
(3674, 12)
song:
(7610, 12)
(7610, 12)
song:
(11528, 12)
(11528, 12)
song:
(5209, 12)
(5209, 12)
song:
(10557, 12)
(10557, 12)
song:
(15354, 12)
(15354, 12)
song:
(2165, 12)
(2165, 12)
song:
(10709, 12)
(10709, 12)
song:
(14329, 12)
(14329, 12)
song:
(15736, 12)
(15736, 12)
song:
(2657, 12)
(2657, 12)
song:
(7481, 12)
(7481, 12)
song:
(5534, 12)
(5534, 12)
song:
(3555, 12)
(3555, 12)
song:
(1723, 12)
(1723, 12)
song:
(3852, 12)
(3852, 12)
song:
(4350, 12)
(4350, 12)
song:
(5433, 12)
(5433, 12)
song:
(8687, 12)
(8687, 12)
song:
(21709, 12)
(21709, 12)
song:
(1978, 12)
(1978, 12)
song:
(12607, 12)
(12607, 12)
song:
(5751, 12)
(5751, 12)
song:
(2726, 12)
(2726, 12)
son

# Try converting the data into token indices (i think transformers work best with data in this format)

For one octave: i think i can handle the multi one hot encoded Vektors like binary data and just convert it into base 10

In [4]:
def binary_to_base10(single_snapshot):
    return int("".join(map(lambda x: str(int(x)), single_snapshot)), 2)

def base10_to_binary_one_hot(number, vector_length):
    # Convert number to binary string without '0b' prefix
    binary_str = bin(number)[2:]
    
    # Ensure the binary string is of the correct length by padding with leading zeros
    binary_str = binary_str.zfill(vector_length)
    
    # Map each binary digit to an integer and store it in a list
    one_hot_vector = list(map(lambda x: int(x), binary_str))
    
    return one_hot_vector

def convert_snapshots_to_base10(dataset_as_snapshots):
    converted_dataset = []
    for song in dataset_as_snapshots:
        track_1, track_2 = song
        track_1_base10 = np.array([binary_to_base10(snapshot) for snapshot in track_1])
        track_2_base10 = np.array([binary_to_base10(snapshot) for snapshot in track_2])
        converted_dataset.append([track_1_base10, track_2_base10])
    return converted_dataset

dataset_as_snapshots = convert_snapshots_to_base10(dataset_as_snapshots)




In [5]:
print("Shapes after converting:")

for song in dataset_as_snapshots:
    print("song:")
    for track in song:
        print(track.shape)

print("One track:")

print(dataset_as_snapshots[0][0])
print("Shape:", dataset_as_snapshots[0][0].shape)
    

Shapes after converting:
song:
(2166,)
(2166,)
song:
(1504,)
(1504,)
song:
(2864,)
(2864,)
song:
(5460,)
(5460,)
song:
(7414,)
(7414,)
song:
(4439,)
(4439,)
song:
(5962,)
(5962,)
song:
(1202,)
(1202,)
song:
(6928,)
(6928,)
song:
(6958,)
(6958,)
song:
(3674,)
(3674,)
song:
(7610,)
(7610,)
song:
(11528,)
(11528,)
song:
(5209,)
(5209,)
song:
(10557,)
(10557,)
song:
(15354,)
(15354,)
song:
(2165,)
(2165,)
song:
(10709,)
(10709,)
song:
(14329,)
(14329,)
song:
(15736,)
(15736,)
song:
(2657,)
(2657,)
song:
(7481,)
(7481,)
song:
(5534,)
(5534,)
song:
(3555,)
(3555,)
song:
(1723,)
(1723,)
song:
(3852,)
(3852,)
song:
(4350,)
(4350,)
song:
(5433,)
(5433,)
song:
(8687,)
(8687,)
song:
(21709,)
(21709,)
song:
(1978,)
(1978,)
song:
(12607,)
(12607,)
song:
(5751,)
(5751,)
song:
(2726,)
(2726,)
song:
(4922,)
(4922,)
song:
(28675,)
(28675,)
song:
(6727,)
(6727,)
song:
(7060,)
(7060,)
song:
(5374,)
(5374,)
song:
(6820,)
(6820,)
song:
(5829,)
(5829,)
song:
(9477,)
(9477,)
song:
(16635,)
(16635,)
song:
(15

In [6]:
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# Function to add SOS and EOS tokens to each chunk
def add_sos_eos_to_chunks(chunks):
    new_chunks = []
    for chunk in chunks:
        # new_chunk = np.vstack([SOS_TOKEN, chunk, EOS_TOKEN]) eos token probably not neccessary
        print(chunk.shape)
        
        #new_chunk = np.vstack([SOS_TOKEN, chunk])
        new_chunk = np.insert(chunk,0, SOS_TOKEN)
        new_chunks.append(new_chunk)
    return new_chunks

# Function to split sequences into chunks
def split_into_chunks(sequence, chunk_size):
    print("sequence:", sequence.shape)
    return [sequence[i:i + chunk_size] for i in range(0, len(sequence), chunk_size)]

# Function to filter out short chunks while maintaining pairs
def filter_short_chunks(chunks_1, chunks_2, min_length):
    filtered_chunks_1 = []
    filtered_chunks_2 = []
    for chunk_1, chunk_2 in zip(chunks_1, chunks_2):
        if len(chunk_1) >= min_length and len(chunk_2) >= min_length:
            filtered_chunks_1.append(chunk_1)
            filtered_chunks_2.append(chunk_2)
    return filtered_chunks_1, filtered_chunks_2

# Custom Dataset class
class PianoDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Prepare the dataset with paired sequences and SOS/EOS tokens for each chunk
def prepare_dataset(dataset_as_snapshots, chunk_size, min_length):
    data = []
    for song in dataset_as_snapshots:
        track_1, track_2 = song
        assert len(track_1) == len(track_2), "Tracks must have the same length"
        
        chunks_1 = split_into_chunks(track_1, chunk_size)
        chunks_2 = split_into_chunks(track_2, chunk_size)
        chunks_1, chunks_2 = filter_short_chunks(chunks_1, chunks_2, min_length)
        
        print("chunks diemsion:", chunks_1[0].size)
        
        # Add SOS and EOS tokens to each chunk
        chunks_1 = add_sos_eos_to_chunks(chunks_1)
        chunks_2 = add_sos_eos_to_chunks(chunks_2)
        
        for x, y in zip(chunks_1, chunks_2):
            data.append((x, y))
    return data

In [7]:
data = prepare_dataset(dataset_as_snapshots, max_len, max_len)

# Split the dataset using sklearn while maintaining pairs
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, shuffle=True)

# Create custom datasets
train_dataset = PianoDataset(train_data)
val_dataset = PianoDataset(val_data)
test_dataset = PianoDataset(test_data)

# Create DataLoaders for each subset with drop_last=True
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

# Iterate over the DataLoader (example with train_loader)
for batch in train_loader:
    X, y = batch
    print(X.shape, y.shape)
    # X and y should both have shape (batch_size, chunk_size + 2, feature dimension) because of SOS and EOS tokens

sequence: (2166,)
sequence: (2166,)
chunks diemsion: 200
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
sequence: (1504,)
sequence: (1504,)
chunks diemsion: 200
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
sequence: (2864,)
sequence: (2864,)
chunks diemsion: 200
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
sequence: (5460,)
sequence: (5460,)
chunks diemsion: 200
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(200,)
(2

# model definition

In [8]:
# Sinusoidal positional embeddings
class SinusoidalPosEmb(nn.Module):
    """
    Sinusoidal positional embeddings module.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        # Calculate sinusoidal positional embeddings
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

    
# Transformer block with Attention and causal masking
class TransformerBlock(nn.Module):
    """
    Transformer block with self-attention and causal masking.
    """

    def __init__(self, hidden_size=128, num_heads=4):
        super(TransformerBlock, self).__init__()

        # Layer normalization for input
        self.norm1 = nn.LayerNorm(hidden_size)

        # Multi-head self-attention mechanism
        self.multihead_attn = nn.MultiheadAttention(hidden_size, 
                                                    num_heads=num_heads, 
                                                    batch_first=True,
                                                    dropout=0.1)

        # Layer normalization for attention output
        self.norm2 = nn.LayerNorm(hidden_size)

        # Feedforward neural network
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.ELU(),
            nn.Linear(hidden_size * 4, hidden_size)
        )

    def forward(self, x, padding_mask):
        # Create causal mask for Attention
        bs, l, h = x.shape
        mask = torch.triu(torch.ones(l, l, device=x.device), 1).bool()

        # Layer normalization
        norm_x = self.norm1(x)

        # Apply multi-head Attention
        x = self.multihead_attn(norm_x, norm_x, norm_x, attn_mask=mask, key_padding_mask=padding_mask)[0] + x

        # Layer normalization
        norm_x = self.norm2(x)

        # Apply feedforward neural network
        x = self.mlp(norm_x) + x
        return x

    
# "Decoder-Only" Style Transformer with Attention
class Transformer(nn.Module):
    """
    "Decoder-Only" Style Transformer with self-attention.
    """

    def __init__(self, num_emb, hidden_size=128, num_layers=3, num_heads=4):
        super(Transformer, self).__init__()

        # Token embeddings
        self.embedding = nn.Embedding(num_emb, hidden_size)

        # Positional embeddings
        self.pos_emb = SinusoidalPosEmb(hidden_size)

        # List of Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(hidden_size, num_heads) for _ in range(num_layers)
        ])

        # Output layer
        self.fc_out = nn.Linear(hidden_size, num_emb)

    def forward(self, input_seq):
        # Mask for padding tokens
        input_key_mask = input_seq == 0

        # Embedding input sequence
        input_embs = self.embedding(input_seq)
        bs, l, h = input_embs.shape

        # Add positional embeddings to token embeddings
        seq_indx = torch.arange(l, device=input_seq.device)
        pos_emb = self.pos_emb(seq_indx).reshape(1, l, h).expand(bs, l, h)
        embs = input_embs + pos_emb

        # Pass through Transformer blocks
        for block in self.blocks:
            embs = block(embs, padding_mask=input_key_mask)

        # Output predictions
        return self.fc_out(embs)

# initialize model and optimizer

In [9]:
# Check if GPU is available, set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

# Embedding Size
hidden_size = 256

# Number of transformer blocks
num_layers = 8

# MultiheadAttention Heads
num_heads = 8

# Create model
# num emb = wie viele verschiedene Tokens es geben kann bei 12 Tönen 2 ** 12 Möglichkeiten + 2 special tokens
tf_generator = Transformer(num_emb=4098, num_layers=num_layers, 
                           hidden_size=hidden_size, num_heads=num_heads).to(device)

# Initialize the optimizer with above parameters
optimizer = optim.Adam(tf_generator.parameters(), lr=learning_rate)

# Scaler for mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Define the loss function
loss_fn = nn.CrossEntropyLoss(reduction="none")

# Custom transform that will randomly replace a token with <pad>
# td = TokenDrop(prob=0.2)

# Initialize training loss logger and entropy logger
training_loss_logger = []
entropy_logger = []

In [10]:
# Let's see how many Parameters our Model has!
num_model_params = 0
for param in tf_generator.parameters():
    num_model_params += param.flatten().shape[0]

print("-This Model Has %d (Approximately %d Million) Parameters!" % (num_model_params, num_model_params//1e6))

-This Model Has 8420354 (Approximately 8 Million) Parameters!


# Training

In [11]:
def train_loop(model, opt, loss_fn, dataloader, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        # Move data to GPU
        input_melody, expected_harmony = batch
        input_melody, expected_harmony = input_melody.to(device), expected_harmony.to(device)
        
        # shift input und output für das training zeug mit start token
        input_melody = input_melody[:, 0:-1]
        expected_harmony = expected_harmony[:, 1:]
        
        # Generate predictions
        with torch.cuda.amp.autocast():
            pred = tf_generator(input_melody)
        
        #print("Training: prediction (model output) shape:", logits.shape)
        # Debug shapes and ranges
        # debug_shapes_and_ranges(X, y, logits)
        
        # Calculate loss with masked cross-entropy
        # ich glaube 0 steht in vorlage für padding token index -> habe ich hier anders
        mask = (expected_harmony != 4096).float()
        loss = (loss_fn(pred.transpose(1, 2), expected_harmony) * mask).sum()/mask.sum()
        
        # Backpropagation
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    
        total_loss += loss.detach().item()
        
    return total_loss / len(dataloader)

from torch.distributions import Categorical

for epoch in trange(0, nepochs, leave=False, desc="Epoch"):    
    tf_generator.train()
    steps = 0
    for batch in train_loader:
        # Convert text to tokenized input
        # text_tokens = train_tranform(list(text)).to(device)
        # bs = text_tokens.shape[0]
        
        # Randomly drop input tokens
        # input_text = td(text_tokens[:, 0:-1])
        # output_text = text_tokens[:, 1:]
        
        input_melody, expected_harmony = batch
        input_melody, expected_harmony = input_melody.to(device), expected_harmony.to(device)
        
        # print("Input melody:", input_melody.shape)
        
        # shift input und output für das training zeug
        input_melody = input_melody[:, 0:-1]
        expected_harmony = expected_harmony[:, 1:]
        
        
        
        # Generate predictions
        with torch.cuda.amp.autocast():
            pred = tf_generator(input_melody)

        # Calculate loss with masked cross-entropy
        # ich glaube 0 steht in vorlage für padding token index -> habe ich hier anders
        mask = (expected_harmony != 4096).float()
        loss = (loss_fn(pred.transpose(1, 2), expected_harmony) * mask).sum()/mask.sum()
        
        
        # Backpropagation
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        # Log training loss and entropy
        print("loss:", loss)
        training_loss_logger.append(loss.item())
        with torch.no_grad():
            dist = Categorical(logits=pred)
            entropy_logger.append(dist.entropy().mean().item())

training

In [12]:
from timeit import default_timer as timer
NUM_EPOCHS = 10

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_loop(tf_generator, optimizer, loss_fn, train_loader, device)
    end_time = timer()
    # val_loss = validation_loop(model, loss_fn, val_loader)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f} "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 3.307 Epoch time = 11.001s
Epoch: 2, Train loss: 2.700 Epoch time = 10.819s
Epoch: 3, Train loss: 2.635 Epoch time = 10.813s
Epoch: 4, Train loss: 2.596 Epoch time = 10.827s
Epoch: 5, Train loss: 2.569 Epoch time = 10.826s
Epoch: 6, Train loss: 2.543 Epoch time = 10.832s
Epoch: 7, Train loss: 2.523 Epoch time = 10.817s
Epoch: 8, Train loss: 2.507 Epoch time = 10.829s
Epoch: 9, Train loss: 2.489 Epoch time = 10.819s
Epoch: 10, Train loss: 2.473 Epoch time = 10.912s


# Testing

In [41]:

# Set temperature for sampling
temp = 0.75

X, y = next(iter(test_loader))

print(f"melody: {X.shape}, Harmony: {y.shape}")

# get single sequence
X, y = X[1], y[1]

print(f"melody: {X}, Harmony: {y}")

melody: torch.Size([128, 201]), Harmony: torch.Size([128, 201])
melody: tensor([4096,    0,  128,  129,  128,  128,  128,  128,  128,  128,  128,  128,
         128,  128,  128,  130,  130,  128,    0,    0,    0,    0,    0,    4,
           0,    0,    0,    0,  128,    0, 1040, 1552, 1025,    0,    0,    0,
           0, 1024,    0,  128,    0,    2,   34,    2,    2,    0,    0,    0,
           0,  272,  256,    0,    0,    0,  512,  512,    0,    0,    0,    0,
           4,    0,    0,    0,    0,  144, 1042, 1154, 1154,  130,   16,   16,
         208,  192,   64, 1090,    0,    0,   16,   16,  272,  256,  512,  512,
        1024, 1024, 1026, 1026,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    4,    4,    0,  576,  580,  580,  580,   64,   64,    0,
           0,    0,    0,   68,   68,    4, 2244, 2244, 2244, 2244, 2244,   68,
          64,    0,    0,   68,   68,   64,  582,  582,  578,  512,  512,  512,
         512,    0,    0,  576,  576,   64, 2628

In [42]:
log_tokens = [X.unsqueeze(0)]
print(log_tokens)

[tensor([[4096,    0,  128,  129,  128,  128,  128,  128,  128,  128,  128,  128,
          128,  128,  128,  130,  130,  128,    0,    0,    0,    0,    0,    4,
            0,    0,    0,    0,  128,    0, 1040, 1552, 1025,    0,    0,    0,
            0, 1024,    0,  128,    0,    2,   34,    2,    2,    0,    0,    0,
            0,  272,  256,    0,    0,    0,  512,  512,    0,    0,    0,    0,
            4,    0,    0,    0,    0,  144, 1042, 1154, 1154,  130,   16,   16,
          208,  192,   64, 1090,    0,    0,   16,   16,  272,  256,  512,  512,
         1024, 1024, 1026, 1026,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    4,    4,    0,  576,  580,  580,  580,   64,   64,    0,
            0,    0,    0,   68,   68,    4, 2244, 2244, 2244, 2244, 2244,   68,
           64,    0,    0,   68,   68,   64,  582,  582,  578,  512,  512,  512,
          512,    0,    0,  576,  576,   64, 2628, 2628, 2628, 2628, 2628, 2624,
            0,    0,    0, 

In [43]:
from torch.distributions import Categorical

tf_generator.eval()

# Generate tokens
with torch.no_grad():    
    for i in range(200):
        # Concatenate tokens from previous iterations
        input_tokens = torch.cat(log_tokens, 1)
        input_tokens = input_tokens.to(device)
        print(input_tokens)
        print(input_tokens.shape)
        
        
        # Get model predictions for the next token
        data_pred = tf_generator(input_tokens)
        
        # Sample the next token from the distribution of probabilities
        dist = Categorical(logits=data_pred[:, -1] / temp)
        next_tokens = dist.sample().reshape(1, 1)
         
        # Append the sampled token to the list of generated tokens
        log_tokens.append(next_tokens.cpu())
        
        # Check for end-of-sequence token and stop generation
        if next_tokens.item() == -1:
            break


tensor([[4096,    0,  128,  129,  128,  128,  128,  128,  128,  128,  128,  128,
          128,  128,  128,  130,  130,  128,    0,    0,    0,    0,    0,    4,
            0,    0,    0,    0,  128,    0, 1040, 1552, 1025,    0,    0,    0,
            0, 1024,    0,  128,    0,    2,   34,    2,    2,    0,    0,    0,
            0,  272,  256,    0,    0,    0,  512,  512,    0,    0,    0,    0,
            4,    0,    0,    0,    0,  144, 1042, 1154, 1154,  130,   16,   16,
          208,  192,   64, 1090,    0,    0,   16,   16,  272,  256,  512,  512,
         1024, 1024, 1026, 1026,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    4,    4,    0,  576,  580,  580,  580,   64,   64,    0,
            0,    0,    0,   68,   68,    4, 2244, 2244, 2244, 2244, 2244,   68,
           64,    0,    0,   68,   68,   64,  582,  582,  578,  512,  512,  512,
          512,    0,    0,  576,  576,   64, 2628, 2628, 2628, 2628, 2628, 2624,
            0,    0,    0,  

In [44]:
all_tokens = log_tokens
print("=========")
print()
print(len(all_tokens))
print("==================")
all_tokens = np.squeeze(torch.cat(all_tokens, 1).numpy())
print(all_tokens)
print(all_tokens.shape)


201
[4096    0  128  129  128  128  128  128  128  128  128  128  128  128
  128  130  130  128    0    0    0    0    0    4    0    0    0    0
  128    0 1040 1552 1025    0    0    0    0 1024    0  128    0    2
   34    2    2    0    0    0    0  272  256    0    0    0  512  512
    0    0    0    0    4    0    0    0    0  144 1042 1154 1154  130
   16   16  208  192   64 1090    0    0   16   16  272  256  512  512
 1024 1024 1026 1026    0    0    0    0    0    0    0    0    0    0
    4    4    0  576  580  580  580   64   64    0    0    0    0   68
   68    4 2244 2244 2244 2244 2244   68   64    0    0   68   68   64
  582  582  578  512  512  512  512    0    0  576  576   64 2628 2628
 2628 2628 2628 2624    0    0    0  512  576  576  594  594  594  594
  594  512    0   64    0  576  576  576  516  516    4    0    0    0
    0    0    0    0 2120  200  200    8    0    0    0    0    0   72
 1096 1352 1352 1352 1352 1352   64    0    0    0    0   72   72 2376
 

convert generated tokens back to snapshots

In [45]:
harmony_tokens = all_tokens[-200:]
melody_tokens = X.numpy()
# remove start token
melody_tokens = melody_tokens[1:]

# Convert harmony to snapshots
harmony_snapshots = []
for token in harmony_tokens:
    harmony_snapshots.append(base10_to_binary_one_hot(token, 12))

# Convert melody to snapshots
melody_snapshots = []
for token in melody_tokens:
    melody_snapshots.append(base10_to_binary_one_hot(token, 12))

harmony_snapshots = np.array(harmony_snapshots)
print("Harmony_tokens:", harmony_tokens)

print("harmony snapshots:")
for snapshot in harmony_snapshots:
    print(snapshot)
    
print("===================")

melody_snapshots = np.array(melody_snapshots)
print("melody tokens:", melody_tokens)

print("melody snapshots:")
for snapshot in melody_snapshots:
    print(snapshot)

Harmony_tokens: [   0    1   64   16    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    2    0    0    0    0    0    0    0    0    0
 1024  128    0    0    0    0    0    0    0    8    0 1026    0    0
    0    0    0    0    0    0    0    0    0    4    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 1024
    0    0    0    0    0    0   64 1025    0    0   64    1    0    0
    0    0    0    4  512    0    0    0    0    0    0    0    0 2048
    4    0    0 2048    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  512    0    0    0    0    0    0    0    0    0
    0    0    0    0  512    0   64   64  512    0    9    0    0    0
    0    0    0    0    0    0    0    0   64    0    4    0    0    0
    0    0    0   64    0    0    0    0    2   80    0   64   64   64
  512    0    0    0    0    0    0    0   64    0    0    0    0    0
    0    0   64   64   64    0    0    0    0   64    4    0 

blow up to 88 keys and convert to midi

In [46]:
def pad_to_88_keys(one_hot_vector, start_key=21, octaves_higher=0, total_keys=88):
    """
    Pad a one-hot encoded vector to fit 88 keys of a piano and place it a specified number of octaves higher.

    Parameters:
    one_hot_vector (np.ndarray): Input one-hot encoded vector.
    start_key (int): The starting key in the 88-key piano.
    octaves_higher (int): Number of octaves to shift the starting key higher.
    total_keys (int): The total number of keys on the piano (default is 88).

    Returns:
    np.ndarray: Padded one-hot encoded vector with 88 keys.
    """
    # Calculate the new starting key based on the number of octaves higher
    start_key = start_key + (octaves_higher * 12)
    
    # Initialize the full 88-key vector with zeros
    padded_vector = np.zeros(total_keys, dtype=int)
    end_key = start_key + len(one_hot_vector)
    
    if end_key > total_keys:
        raise ValueError("The one-hot vector is too long to fit in the 88 keys starting from the given start_key.")
    
    padded_vector[start_key:end_key] = one_hot_vector
    return padded_vector

def pad_sequence_of_one_hot_vectors(sequence, start_key=21, octaves_higher=0, total_keys=88):
    """
    Pad a sequence of one-hot encoded vectors to fit 88 keys of a piano, placing each one a specified number of octaves higher.

    Parameters:
    sequence (list of np.ndarray): Sequence of one-hot encoded vectors.
    start_key (int): The starting key in the 88-key piano.
    octaves_higher (int): Number of octaves to shift the starting key higher.
    total_keys (int): The total number of keys on the piano (default is 88).

    Returns:
    np.ndarray: 2D array where each row is a padded one-hot encoded vector with 88 keys.
    """
    padded_vectors = [pad_to_88_keys(vector, start_key, octaves_higher, total_keys) for vector in sequence]
    return np.stack(padded_vectors)


In [47]:
from data_visualization import snapshot_to_midi



melody_snapshots = pad_sequence_of_one_hot_vectors(melody_snapshots, 21, 3)
harmony_snapshots = pad_sequence_of_one_hot_vectors(harmony_snapshots, 21, 1)




melody_harmony = [melody_snapshots, harmony_snapshots]
snapshot_to_midi.create_midi_from_snapshots(melody_harmony, ["melody", "Generated harmony"], 0.05, "/home/falaxdb/Repos/minus1/transformer_decoder_training/jupyter_notebooks/piano_tests/outputs", "tokenized.mid")

Processing track 0: melody with snapshot shape (200, 88)
  Time step 0, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 1, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 2, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 3, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 4, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 5, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 6, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 7, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 8, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 9, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 10, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 11, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 12, keys type: <class 'numpy.ndarray'>, keys shape: (88,)
  Time step 13, keys type: <class 'numpy.nd