In [41]:
num_epochs=50
sequence_length=16
batch_size=16
max_notes=100
dataset_path='smaller-dataset'

In [50]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

# Define scalers for velocity and timing features
pitch_scaler = MinMaxScaler(feature_range=(0, 1))
velocity_scaler = MinMaxScaler(feature_range=(0, 1))
start_scaler = MinMaxScaler(feature_range=(0, 1))
duration_scaler = MinMaxScaler(feature_range=(0, 1))

np.set_printoptions(suppress=True)


In [43]:
# Function to extract MIDI features (limited to first 50 notes)
def extract_midi_features(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end - note.start
            ])
            if len(notes) >= max_notes:  # Stop collecting after 50 notes
                return np.array(notes)  # Return early
    return np.array(notes[:max_notes])  # Ensure no more than 50 notes

# Load all MIDI files (including both .midi and .mid)
midi_files = glob.glob(dataset_path+"/*.midi") + glob.glob(dataset_path+"/*.mid")
data = []

for f in midi_files:
    notes = extract_midi_features(f)
    #print(notes.shape)
    #print(len(notes))
    #print(len(notes) - sequence_length - 1)
    for i in range(0, len(notes) - sequence_length - 1):
        data.append(notes[i:i + sequence_length])  # Create 16-note sequences

data = [d for d in data if d.shape == (sequence_length, 4)]  # Filter inconsistent samples

In [49]:
dataset = torch.tensor(np.array(data), dtype=torch.float32)
print(dataset.shape)
print(dataset[0].numpy())


torch.Size([1245, 16, 4])
[[74.         92.          1.0234375   0.0625    ]
 [57.         79.          2.03125     0.06380209]
 [62.         86.          2.5338542   0.04296875]
 [81.         93.          1.5455729   1.0533854 ]
 [74.         82.          3.0247395   0.4466146 ]
 [78.         97.          3.015625    0.51432294]
 [73.         73.          3.5182292   0.04427083]
 [76.         79.          3.5273438   0.04947917]
 [71.         78.          3.6614583   0.0390625 ]
 [74.         72.          3.6666667   0.04817708]
 [69.         76.          3.796875    0.03125   ]
 [73.         79.          3.7851562   0.07161459]
 [67.         78.          3.9283855   0.03776042]
 [71.         83.          3.9270833   0.05729167]
 [66.         76.          4.0638022   0.03515625]
 [69.         77.          4.0625      0.05078125]]


In [59]:
# Normalize the dataset for only start and end columns


# Compute L2 norm for the last two columns **before** normalization
#original_norms = torch.norm(dataset[:, :, 2:], p=2, dim=2, keepdim=True)

# Apply L2 normalization only to the last two columns along `dim=2`
#normalized_floats = F.normalize(dataset[:, :, 2:], p=2, dim=2)

# Combine integer columns and normalized float columns
#normalized_tensor = torch.cat((dataset[:, :, :2], normalized_floats), dim=2)


# Normalize the dataset for all columns


# Compute L2 norm for all 4 columns along `dim=2`
original_norms = torch.norm(dataset, p=2, dim=2, keepdim=True)  # Shape (n, 16, 1)

# Normalize by dividing each column by its L2 norm
normalized_tensor = dataset / (original_norms + 1e-8)  # Avoid division by zero


print(normalized_tensor.shape)
print(normalized_tensor[0].numpy())

dataset = normalized_tensor

torch.Size([1245, 16, 4])
[[0.62673616 0.7791855  0.00845366 0.00051625]
 [0.5850849  0.8109071  0.01025959 0.00032226]
 [0.58477557 0.8111403  0.00943051 0.00015992]
 [0.65675914 0.7540568  0.0067     0.00456639]
 [0.6699371  0.7423627  0.0089561  0.0013224 ]
 [0.62663233 0.77927357 0.00791939 0.00135067]
 [0.7070736  0.7070736  0.00968517 0.00012187]
 [0.69326216 0.7206278  0.00912097 0.00012794]
 [0.6731143  0.7394777  0.00947994 0.00010114]
 [0.7166923  0.69732225 0.00968419 0.00012724]
 [0.67215586 0.7403456  0.00974106 0.00008017]
 [0.67863685 0.7344152  0.00929473 0.00017585]
 [0.6515611  0.75853384 0.00972434 0.00009347]
 [0.6500098  0.75987065 0.00915409 0.00013355]
 [0.6556543  0.7549959  0.00993378 0.00008594]
 [0.6673289  0.7447004  0.00967068 0.00012088]]


In [60]:
# Create DataLoader for batch training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    #def __init__(self, d_model, max_len=16):
    def __init__(self, d_model, max_len=sequence_length):

        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x


In [61]:
# Model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidiTransformer().to(device)
criterion = nn.MSELoss()
#criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")


Epoch 0, Loss: 0.06967059296006575
Epoch 10, Loss: 0.0008469862409700186
Epoch 20, Loss: 0.001085345895822124
Epoch 30, Loss: 0.0003567144606047525
Epoch 40, Loss: 0.00042770833608348115


In [90]:
# Generate a new MIDI sequence from a random input
random_index = np.random.randint(0, len(dataset))
#random_sequence = dataset[random_index].unsqueeze(0)
random_sequence = dataset[0].unsqueeze(0)

random_sequence = random_sequence.to(device)  # Move input to same device
generated_sequence = model(random_sequence).detach().cpu()

print(generated_sequence[0].numpy())
generated_sequence  = generated_sequence * original_norms  # Restore original values
generated_sequence = generated_sequence.numpy()
print(generated_sequence[0])


[[ 0.636406    0.7917115   0.00350369 -0.00241182]
 [ 0.5794409   0.81270367 -0.00222071  0.00358654]
 [ 0.6022088   0.80662715  0.00383696 -0.00620071]
 [ 0.654855    0.7430473  -0.00030951  0.00388243]
 [ 0.65601045  0.7563926   0.00329684  0.00259059]
 [ 0.6143858   0.77904606  0.00252246  0.00500582]
 [ 0.70056057  0.70913094  0.0074934   0.00213918]
 [ 0.69353664  0.720341    0.00916249 -0.00242757]
 [ 0.67089057  0.74255204  0.00305285  0.00013651]
 [ 0.70257676  0.6913601   0.00609456 -0.00663397]
 [ 0.6643923   0.7486783   0.00866148  0.00852416]
 [ 0.67919576  0.7362311   0.00467242 -0.0046528 ]
 [ 0.6516422   0.75461155  0.00669618  0.00458929]
 [ 0.65871984  0.7699835   0.0026414   0.00095144]
 [ 0.6431219   0.7620988   0.00492819  0.00492529]
 [ 0.67372376  0.7403782   0.00528182 -0.00007175]]
[[75.14174    93.47897     0.4136879  -0.2847681 ]
 [56.450153   79.175026   -0.21634573  0.3494075 ]
 [63.848335   85.5215      0.4068082  -0.6574214 ]
 [80.76516    91.642166   -0.0

In [91]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    # Adjust start times to ensure the sequence starts at 0
    min_start_time = min(note_data[2] for note_data in sequence[0])
    
    for note_data in sequence[0]:  # Adjusted to handle batch dimension
        pitch, velocity, start, duration = note_data
        pitch = int(pitch)  # Ensure pitch is an integer
        velocity = int(velocity)  # Ensure velocity is an integer
        start = float(start) - min_start_time  # Adjust start time
        duration = float(duration)  # Ensure duration is a float
        end = start + duration
        note = pretty_midi.Note(
            velocity=velocity, pitch=pitch, start=start, end=end
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    for noteInformation in midi.instruments[0].notes:
        print(noteInformation)
    print(f"Generated MIDI saved as {output_file}")

sequence_to_midi(generated_sequence, "generated.mid")


Note(start=0.630034, end=0.345266, pitch=75, velocity=93)
Note(start=0.000000, end=0.349407, pitch=56, velocity=79)
Note(start=0.623154, end=-0.034267, pitch=63, velocity=85)
Note(start=0.178173, end=0.657004, pitch=80, velocity=91)
Note(start=0.580509, end=0.866661, pitch=72, velocity=83)
Note(start=0.530329, end=1.153427, pitch=76, velocity=96)
Note(start=0.989983, end=1.210837, pitch=72, velocity=73)
Note(start=1.220798, end=0.954672, pitch=76, velocity=78)
Note(start=0.538360, end=0.552759, pitch=70, velocity=78)
Note(start=0.845622, end=0.160650, pitch=72, velocity=71)
Note(start=1.105488, end=1.980534, pitch=68, velocity=76)
Note(start=0.718951, end=0.218456, pitch=73, velocity=79)
Note(start=0.904914, end=1.376830, pitch=67, velocity=77)
Note(start=0.504864, end=0.608789, pitch=71, velocity=84)
Note(start=0.712432, end=1.208225, pitch=64, velocity=76)
Note(start=0.762472, end=0.755053, pitch=69, velocity=76)
Generated MIDI saved as generated.mid
