# now with new dataset preparation

I now want to do proper sequence prediction, not just predict the left hand from the right hand.

## for training

I use a single sequence that starts with an SOS token as the input, and the same sequence shifted by one position as the prediction target, so that at every step the model has to predict the next snapshot.

For this I use the new dataset.

In [1]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

import math
import numpy as np

from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import random

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Data

In [3]:
from data_preperation import dataset_snapshot
from transformer_decoder_training.dataprep_transformer import dataprep_1
from sklearn.model_selection import train_test_split

# Build the snapshot dataset as one pipeline, innermost call first:
#   1. load the dataset directory as snapshots,
#   2. filter the snapshots to the 88 piano notes,
#   3. compress the result to 12 keys.
# NOTE(review): 0.05 is handed to process_dataset_multithreaded unchanged;
# presumably the snapshot interval -- confirm against that helper.
dataset_as_snapshots = dataset_snapshot.compress_existing_dataset_to_12keys(
    dataset_snapshot.filter_piano_range(
        dataset_snapshot.process_dataset_multithreaded(
            "/home/falaxdb/Repos/minus1/datasets/maestro_v3_split/hands_split_into_seperate_midis",
            0.05,
        )
    )
)

Processed dataset (1038/1038): 100%|██████████| 1038/1038 [00:14<00:00, 72.19it/s]


Processed 1038 of 1038 files


In [4]:
# Two-stage split of the songs: 30 % is held out first, then that held-out
# part is halved into validation and test. The fixed random_state keeps the
# split repeatable between runs.
train_data, temp_data = train_test_split(
    dataset_as_snapshots, test_size=0.3, random_state=42, shuffle=True
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, shuffle=True
)

# Sanity check: number of songs that ended up in each subset
for label, subset in (
    ("Train data:", train_data),
    ("test data:", test_data),
    ("val data:", val_data),
):
    print(label, len(subset))

# Show the shape of every track of the first training song only
for track in next(iter(train_data), ()):
    print(track.shape)

Train data: 363
test data: 78
val data: 78
(6088, 12)
(6088, 12)


## Create Dataset

In [5]:
# Special tokens. Their width has to fit the data (24 values per step).
token_shape = (1, 24)
# Start-of-sequence marker: a row of ones, kept as a NumPy array
sos_token = np.full(token_shape, 1)
# Padding marker: a row of twos, turned into a tensor on the active device
pad_token = torch.as_tensor(np.full(token_shape, 2), device=device)

# Batching and windowing parameters handed to the dataset / dataloaders
batch_size, seq_length, stride = 64, 512, 256

In [6]:
# create dataset + dataloader
from torch.utils.data import DataLoader
from transformer_decoder_training.dataset_transformer.dataset_2 import AdvancedPianoDataset

# One dataset per split; all three share sequence length, stride and SOS token
train_dataset, val_dataset, test_dataset = (
    AdvancedPianoDataset(songs, seq_length, stride, sos_token)
    for songs in (train_data, val_data, test_data)
)

print("Check length of datasets. should roughly match split ratio")
print("train dataset:", len(train_dataset))
print("val dataset:", len(val_dataset))
print("test dataset:", len(test_dataset))
print("")

# Every loader drops its last incomplete batch; only the training loader shuffles
loader_options = dict(batch_size=batch_size, drop_last=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Visual check on the first training batch only:
#   - the SOS token should be the first token of every sequence
#   - one token should be twice as wide as a single track snapshot
for batch in train_loader:
    print("Visualize shape of batch:")
    print("shape of one batch:", batch.shape)
    print("==============")

    print("Test for sos token as first token in sequence")
    print("First token in seq:", batch[0, 0])
    print("=============")

    print("Test print one snapshot:")
    print("First half of values should be left hand, second half should be right hand")
    print(batch[0, 1])
    break


Check length of datasets. should roughly match split ratio
train dataset: 13799
val dataset: 2848
test dataset: 3209

Visualize shape of batch:
shape of one batch: torch.Size([64, 513, 24])
Test for sos token as first token in sequence
First token in seq: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1.])
Test print one snapshot:
First half of values should be left hand, second half should be right hand
tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 1.])


# Initialize Model

In [7]:
# Hyperparameters for the optimizer and the transformer
learning_rate = 1e-3   # learning rate for the optimizer
# Number of epochs for training.
# NOTE(review): the training cell further down defines its own NUM_EPOCHS = 15
# and does not read this value.
nepochs = 20
hidden_size = 256      # embedding size
num_layers = 8         # number of transformer blocks
num_heads = 8          # heads per multi-head attention

In [8]:
from transformer_decoder_training.models.transformer_decoder_1 import Transformer

# Build the model from the configured hyperparameters and move it to the device.
# num_emb=24 matches the width of the special tokens defined for the data.
model = Transformer(
    num_emb=24,
    num_layers=num_layers,
    hidden_size=hidden_size,
    num_heads=num_heads,
).to(device)

# Adam over all model parameters, using the configured learning rate
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The loss has to cope with multi-hot encoded target vectors, hence binary
# cross-entropy. nn.BCELoss is a class: do not forget the parentheses.
# NOTE(review): BCELoss expects predictions that are already probabilities;
# presumably the model ends in a sigmoid -- confirm in transformer_decoder_1.
loss_fn = nn.BCELoss()

In [9]:
# Total number of scalar entries across all parameter tensors of the model
num_model_params = sum(param.numel() for param in model.parameters())

print("-This Model Has %d (Approximately %d Million) Parameters!" % (num_model_params, num_model_params//1e6))

-This Model Has 6330648 (Approximately 6 Million) Parameters!


# Training

In [10]:
def train_loop(model, opt, loss_fn, dataloader, pad_token, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Each batch is split along its second axis into a model input (every step
    except the last) and a target (every step except the first), so at each
    position the model is trained to predict the following step.

    Args:
        model: Module called as ``model(input_sequences, pad_token)``.
        opt: Optimizer that updates the parameters of ``model``.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar loss tensor.
        dataloader: Iterable of batch tensors. It does not need to implement
            ``__len__``; the batches are counted while iterating.
        pad_token: Padding token, forwarded to the model unchanged.
        device: Device every batch is moved to before the forward pass.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches (for example because
            the dataset is smaller than one batch and ``drop_last=True``).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        # Move data to the target device
        src_sequence = batch.to(device)

        # Input drops the last step, target drops the first one, so target[t]
        # is the step that follows input[t].
        input_sequences = src_sequence[:, :-1, :]
        expected_sequence = src_sequence[:, 1:, :]

        # Clear gradients left over from the previous batch before the new pass
        opt.zero_grad()

        # Generate predictions
        pred = model(input_sequences, pad_token)

        # NOTE: padding positions are not masked out of the loss here; the
        # loss is computed over every position of the target.
        loss = loss_fn(pred, expected_sequence)

        # Backpropagation
        loss.backward()
        opt.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError(
            "dataloader yielded no batches; cannot compute a mean training loss"
        )

    # Average over the batches actually processed instead of len(dataloader),
    # which is undefined for iterable-style loaders.
    return total_loss / num_batches

def validation_loop(model, loss_fn, dataloader, pad_token, device):
    """Evaluate ``model`` on ``dataloader`` and return the mean batch loss.

    The model is switched to evaluation mode and no gradients are recorded.
    Each batch is split along its second axis into a model input (every step
    except the last) and a target (every step except the first).

    Args:
        model: Module called as ``model(input_sequences, pad_token)``.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar loss tensor.
        dataloader: Iterable of batch tensors. It does not need to implement
            ``__len__``; the batches are counted while iterating.
        pad_token: Padding token, forwarded to the model unchanged.
        device: Device every batch is moved to before the forward pass.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches (for example because
            the dataset is smaller than one batch and ``drop_last=True``).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            # Move data to the target device
            src_sequence = batch.to(device)

            # Input drops the last step, target drops the first one
            input_sequences = src_sequence[:, :-1, :]
            expected_sequence = src_sequence[:, 1:, :]

            # Generate predictions
            pred = model(input_sequences, pad_token)

            # Loss over the unflattened (batch, step, feature) tensors
            loss = loss_fn(pred, expected_sequence)

            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError(
            "dataloader yielded no batches; cannot compute a mean validation loss"
        )

    # Average over the batches actually processed instead of len(dataloader),
    # which is undefined for iterable-style loaders.
    return total_loss / num_batches

In [11]:
from timeit import default_timer as timer
NUM_EPOCHS = 15

# Per epoch: one training pass, then one validation pass.
# Only the training pass is timed; the clock is stopped before validation starts.
for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss = train_loop(
        model=model,
        opt=optimizer,
        loss_fn=loss_fn,
        dataloader=train_loader,
        pad_token=pad_token,
        device=device,
    )
    end_time = timer()

    val_loss = validation_loop(
        model=model,
        loss_fn=loss_fn,
        dataloader=val_loader,
        pad_token=pad_token,
        device=device,
    )

    print(
        f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
        f"Epoch time = {(end_time - start_time):.3f}s"
    )

Epoch: 1, Train loss: 0.180, Val loss: 0.129, Epoch time = 44.243s
Epoch: 2, Train loss: 0.126, Val loss: 0.126, Epoch time = 43.879s
Epoch: 3, Train loss: 0.123, Val loss: 0.122, Epoch time = 43.895s
Epoch: 4, Train loss: 0.120, Val loss: 0.121, Epoch time = 43.887s
Epoch: 5, Train loss: 0.118, Val loss: 0.119, Epoch time = 44.041s
Epoch: 6, Train loss: 0.117, Val loss: 0.118, Epoch time = 43.881s
Epoch: 7, Train loss: 0.115, Val loss: 0.116, Epoch time = 43.871s
Epoch: 8, Train loss: 0.114, Val loss: 0.115, Epoch time = 43.873s
Epoch: 9, Train loss: 0.112, Val loss: 0.113, Epoch time = 43.864s
Epoch: 10, Train loss: 0.111, Val loss: 0.112, Epoch time = 43.849s
Epoch: 11, Train loss: 0.110, Val loss: 0.111, Epoch time = 43.848s
Epoch: 12, Train loss: 0.108, Val loss: 0.109, Epoch time = 43.855s
Epoch: 13, Train loss: 0.107, Val loss: 0.109, Epoch time = 43.855s
Epoch: 14, Train loss: 0.106, Val loss: 0.108, Epoch time = 43.860s
Epoch: 15, Train loss: 0.105, Val loss: 0.107, Epoch time

### Last training output:
Learning rate: 1e-4

Epoch: 1, Train loss: 0.126, Val loss: 0.133, Epoch time = 44.974s  
Epoch: 2, Train loss: 0.124, Val loss: 0.132, Epoch time = 44.998s  
Epoch: 3, Train loss: 0.123, Val loss: 0.130, Epoch time = 44.866s  
Epoch: 4, Train loss: 0.121, Val loss: 0.129, Epoch time = 44.857s  
Epoch: 5, Train loss: 0.120, Val loss: 0.127, Epoch time = 44.856s  
Epoch: 6, Train loss: 0.118, Val loss: 0.126, Epoch time = 44.866s  
Epoch: 7, Train loss: 0.118, Val loss: 0.125, Epoch time = 44.853s  
Epoch: 8, Train loss: 0.117, Val loss: 0.125, Epoch time = 44.867s  
Epoch: 9, Train loss: 0.116, Val loss: 0.124, Epoch time = 44.860s  
Epoch: 10, Train loss: 0.115, Val loss: 0.123, Epoch time = 44.859s 

learning rate: 1e-3

Epoch: 1, Train loss: 0.180, Val loss: 0.129, Epoch time = 44.243s  
Epoch: 2, Train loss: 0.126, Val loss: 0.126, Epoch time = 43.879s  
Epoch: 3, Train loss: 0.123, Val loss: 0.122, Epoch time = 43.895s  
Epoch: 4, Train loss: 0.120, Val loss: 0.121, Epoch time = 43.887s  
Epoch: 5, Train loss: 0.118, Val loss: 0.119, Epoch time = 44.041s  
Epoch: 6, Train loss: 0.117, Val loss: 0.118, Epoch time = 43.881s  
Epoch: 7, Train loss: 0.115, Val loss: 0.116, Epoch time = 43.871s  
Epoch: 8, Train loss: 0.114, Val loss: 0.115, Epoch time = 43.873s  
Epoch: 9, Train loss: 0.112, Val loss: 0.113, Epoch time = 43.864s  
Epoch: 10, Train loss: 0.111, Val loss: 0.112, Epoch time = 43.849s     
Epoch: 11, Train loss: 0.110, Val loss: 0.111, Epoch time = 43.848s     
Epoch: 12, Train loss: 0.108, Val loss: 0.109, Epoch time = 43.855s     
Epoch: 13, Train loss: 0.107, Val loss: 0.109, Epoch time = 43.855s     
Epoch: 14, Train loss: 0.106, Val loss: 0.108, Epoch time = 43.860s     
Epoch: 15, Train loss: 0.105, Val loss: 0.107, Epoch time = 43.850s     



In [12]:
# see: https://pytorch.org/tutorials/beginner/basics/saveloadrun_tutorial.html#save-and-load-the-model

# Store only the model's state_dict (not the model object itself) at a fixed path
checkpoint_path = "/home/falaxdb/Repos/minus1/transformer_decoder_training/saved_files/saved_models/model_1_notebook_v6.pth"
torch.save(model.state_dict(), checkpoint_path)

# Evaluation

In [17]:
from transformer_decoder_training.inference.inference_2 import inference

# NOTE(review): the stored output of this cell ends in an AssertionError that
# is raised after "Next token shape" is printed. Nothing in this cell asserts,
# so the failure presumably comes from `inference` -- investigate there.

# Run inference on exactly one sequence: the first sequence of the first test batch
for batch in test_loader:
    # Take a single sequence and restore the batch dimension (batch size 1)
    sequence = torch.unsqueeze(batch[0], 0)
    print(sequence.shape)

    # Split into context (first 200 steps) and ground-truth continuation (the rest)
    context_seq = sequence[:, :200]
    continuing_seq = sequence[:, 200:]

    # NOTE(review): 0.25 is passed to `inference` unchanged; presumably the
    # threshold for switching a note on -- confirm against inference_2.
    output_tokens, melody_output_tokens = inference(model, context_seq, continuing_seq, 0.25, pad_token, device)

    # Stop after the first batch. Without this the loop ran inference once per
    # test batch and overwrote the results every time.
    break
    
    
    
    

torch.Size([1, 513, 24])
Next token shape: torch.Size([1, 24])


AssertionError: 