# Decoder-only model, another attempt: no tokenization to integer ids, and (hopefully) correct inference


In [1]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

from data_preperation import dataset_snapshot

import math
import numpy as np

from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import random

# prepare data

In [2]:
# Select the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Special marker rows and batching constants.
#
# Each marker is one row of 88 values (one per piano key, matching the
# 88-key filtering done when the data is loaded) filled with a constant:
# 1 marks the start of a sequence, 2 marks padding.
sos_token = np.full(shape=(1, 88), fill_value=1)
# The padding marker is needed as a tensor on the compute device because it is
# handed to the model on every forward pass.
pad_token = torch.tensor(np.full(shape=(1, 88), fill_value=2), device=device)

# Number of snapshots per chunk (before the start token is added).
seq_length = 512
# Number of sequence pairs per batch.
batch_size = 64

In [4]:
from transformer_decoder_training.dataprep_transformer import dataprep_1
from sklearn.model_selection import train_test_split
# Build the sequence pairs and split them into train / validation / test.

# Load the MIDI dataset as snapshots and keep only the 88 piano keys.
# NOTE(review): the second argument (0.05) is presumably the snapshot interval
# in seconds -- confirm against dataset_snapshot.process_dataset_multithreaded.
dataset_as_snapshots = dataset_snapshot.filter_piano_range(
    dataset_snapshot.process_dataset_multithreaded(
        "/home/falaxdb/Repos/minus1/datasets/maestro_v3_split/hands_split_into_seperate_midis",
        0.05,
    )
)

# Cut the snapshots into chunks of seq_length and add the special tokens.
data = dataprep_1.prepare_dataset(
    dataset_as_snapshots,
    seq_length,
    seq_length,
    sos_token,
)

print("Ammount of sequence pairs:", len(data))

# Two-stage split with a fixed seed so the partition is reproducible:
# 70 % train, then the remaining 30 % is halved into validation and test.
# Whole pairs are split, so each input stays with its target.
split_options = dict(random_state=42, shuffle=True)
train_data, temp_data = train_test_split(data, test_size=0.3, **split_options)
val_data, test_data = train_test_split(temp_data, test_size=0.5, **split_options)

Processed dataset (1038/1038): 100%|██████████| 1038/1038 [00:14<00:00, 71.47it/s]


Processed 1038 of 1038 files
Ammount of sequence pairs: 10069


In [5]:
from torch.utils.data import DataLoader
from transformer_decoder_training.dataset_transformer.dataset_1 import PianoDataset

# Wrap each split in the project's dataset class (order: train, val, test).
train_dataset, val_dataset, test_dataset = (
    PianoDataset(split) for split in (train_data, val_data, test_data)
)


def _make_loader(dataset, shuffle):
    """Return a DataLoader over ``dataset`` using the notebook-wide batch size.

    ``drop_last=True`` discards a final incomplete batch so that every batch
    has exactly ``batch_size`` items.
    """
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True)


# Only the training data is shuffled; validation and test keep their order.
train_loader = _make_loader(train_dataset, shuffle=True)
val_loader = _make_loader(val_dataset, shuffle=False)
test_loader = _make_loader(test_dataset, shuffle=False)

# Sanity check: walk the training loader once and print the batch shapes.
# X and y should both be (batch_size, chunk_size + 1, feature dimension);
# the extra time step comes from the SOS (and EOS) tokens.
for batch in train_loader:
    X, y = batch
    print(X.shape, y.shape)

torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([64, 513, 88]) torch.Size([64, 513, 88])
torch.Size([

# Initialize model

In [6]:
# Hyperparameters for the model and the optimizer.

# --- architecture ---
hidden_size = 256  # embedding size
num_layers = 8     # number of transformer blocks
num_heads = 8      # heads per multi-head attention layer

# --- optimization ---
learning_rate = 0.0001  # step size for the optimizer
# NOTE(review): the training cell further down defines its own NUM_EPOCHS = 10
# and does not read this value.
nepochs = 20            # number of epochs for training

In [7]:
from transformer_decoder_training.models.transformer_decoder_1 import Transformer

# Build the model, its optimizer and the loss function.
#
# Author's notes on num_emb (translated): in the integer-token variant this was
# the number of distinct tokens (for 12 pitches: 2 ** 12 combinations + 2
# special tokens). The inputs are no longer integer indices, so it is probably
# 88 here, one per piano key.
tf_generator = Transformer(
    num_emb=88,
    num_layers=num_layers,
    hidden_size=hidden_size,
    num_heads=num_heads,
).to(device)

# Adam over all model parameters, using the learning rate set above.
optimizer = optim.Adam(params=tf_generator.parameters(), lr=learning_rate)

# Binary cross-entropy loss. nn.BCELoss is a class, so it must be
# instantiated -- do not forget the parentheses.
# NOTE(review): BCELoss expects probabilities in [0, 1]; presumably the model
# ends in a sigmoid -- confirm in Transformer.
loss_fn = nn.BCELoss()

In [8]:
# Count the scalar entries of every parameter tensor in the model and report
# the total, plus the total in whole millions (floor division).
num_model_params = sum(weights.numel() for weights in tf_generator.parameters())

print("-This Model Has %d (Approximately %d Million) Parameters!" % (num_model_params, num_model_params//1e6))

-This Model Has 6363480 (Approximately 6 Million) Parameters!


# Training

In [9]:
def train_loop(model, opt, loss_fn, dataloader, pad_token, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Each batch is an ``(input_melody, expected_harmony)`` pair. The input drops
    its last time step and the target drops its first, so position ``t`` of the
    input lines up with position ``t + 1`` of the target.

    Args:
        model: Module called as ``model(input_melody, pad_token)``; put into
            training mode and updated in place.
        opt: Optimizer over ``model``'s parameters.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        dataloader: Sized iterable of ``(input_melody, expected_harmony)``
            tensor pairs.
        pad_token: Padding marker, passed through to the model unchanged.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The batch losses averaged over ``len(dataloader)``, as a float.

    Raises:
        ZeroDivisionError: If ``dataloader`` contains no batches.
    """
    model.train()
    total_loss = 0.0

    for input_melody, expected_harmony in dataloader:
        # Move the batch to the compute device.
        input_melody = input_melody.to(device)
        expected_harmony = expected_harmony.to(device)

        # Shift input and target against each other by one time step
        # (the sequences begin with the start token).
        input_melody = input_melody[:, :-1]
        expected_harmony = expected_harmony[:, 1:]

        # Forward pass through the model that was passed in, not through a
        # notebook-global one, so the function trains whatever it is given.
        pred = model(input_melody, pad_token)

        # NOTE: padding positions are not masked out of the loss; every
        # position of the target contributes.
        loss = loss_fn(pred, expected_harmony)

        # Backpropagation and parameter update.
        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [10]:
from timeit import default_timer as timer
# Number of passes over the training data performed by the loop below.
NUM_EPOCHS = 10

for epoch in range(1, NUM_EPOCHS + 1):
    # Time one full training pass.
    start_time = timer()
    train_loss = train_loop(
        model=tf_generator,
        opt=optimizer,
        loss_fn=loss_fn,
        dataloader=train_loader,
        pad_token=pad_token,
        device=device,
    )
    end_time = timer()
    # TODO: add a validation pass over val_loader; no validation loop is
    # defined in the cells above.
    epoch_seconds = end_time - start_time
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f} Epoch time = {epoch_seconds:.3f}s")


Epoch: 1, Train loss: 0.076 Epoch time = 23.436s
Epoch: 2, Train loss: 0.049 Epoch time = 23.168s
Epoch: 3, Train loss: 0.049 Epoch time = 23.173s
Epoch: 4, Train loss: 0.049 Epoch time = 23.182s
Epoch: 5, Train loss: 0.048 Epoch time = 23.215s
Epoch: 6, Train loss: 0.048 Epoch time = 23.206s
Epoch: 7, Train loss: 0.048 Epoch time = 23.220s
Epoch: 8, Train loss: 0.047 Epoch time = 23.188s
Epoch: 9, Train loss: 0.046 Epoch time = 23.174s
Epoch: 10, Train loss: 0.045 Epoch time = 23.226s


In [11]:
# see: https://pytorch.org/tutorials/beginner/basics/saveloadrun_tutorial.html#saving-and-loading-models-with-shapes
import os

# Persist the whole trained module object (not just its state_dict), as in the
# tutorial section linked above.
model_path = "./saved_models/model_1_notebook_v5.pth"

# torch.save does not create missing parent directories; create the target
# folder first so a finished training run is not lost to a missing directory.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

torch.save(tf_generator, model_path)