In [109]:
num_epochs=50
sequence_length=16
batch_size=16
max_notes=100
dataset_path='dataset'

In [110]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

np.set_printoptions(suppress=True)


In [111]:
# Function to extract MIDI features (limited to first 50 notes)
def extract_midi_features(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end - note.start
            ])
            if len(notes) >= max_notes: # Stop collecting after max_notes notes
                return np.array(notes)  # Return early
    return np.array(notes[:max_notes])  # Ensure no more than max_notes notes

# Load all MIDI files (including both .midi and .mid)
midi_files = glob.glob(dataset_path+"/*.midi") + glob.glob(dataset_path+"/*.mid")
data = []

for f in midi_files:
    notes = extract_midi_features(f)
    #print(notes.shape)
    #print(len(notes))
    #print(len(notes) - sequence_length - 1)
    for i in range(0, len(notes) - sequence_length - 1):
        data.append(notes[i:i + sequence_length])  # Create 16-note sequences

data = [d for d in data if d.shape == (sequence_length, 4)]  # Filter inconsistent samples

In [112]:
dataset = torch.tensor(np.array(data), dtype=torch.float32)
print(dataset.shape)
print(dataset[0].numpy())

torch.Size([37350, 16, 4])
[[74.         92.          1.0234375   0.0625    ]
 [57.         79.          2.03125     0.06380209]
 [62.         86.          2.5338542   0.04296875]
 [81.         93.          1.5455729   1.0533854 ]
 [74.         82.          3.0247395   0.4466146 ]
 [78.         97.          3.015625    0.51432294]
 [73.         73.          3.5182292   0.04427083]
 [76.         79.          3.5273438   0.04947917]
 [71.         78.          3.6614583   0.0390625 ]
 [74.         72.          3.6666667   0.04817708]
 [69.         76.          3.796875    0.03125   ]
 [73.         79.          3.7851562   0.07161459]
 [67.         78.          3.9283855   0.03776042]
 [71.         83.          3.9270833   0.05729167]
 [66.         76.          4.0638022   0.03515625]
 [69.         77.          4.0625      0.05078125]]


In [113]:
# Normalize the dataset for only start and end columns


# Compute L2 norm for the last two columns **before** normalization
#original_norms = torch.norm(dataset[:, :, 2:], p=2, dim=2, keepdim=True)

# Apply L2 normalization only to the last two columns along `dim=2`
#normalized_floats = F.normalize(dataset[:, :, 2:], p=2, dim=2)

# Combine integer columns and normalized float columns
#normalized_tensor = torch.cat((dataset[:, :, :2], normalized_floats), dim=2)


# Normalize the dataset for all columns


# Compute L2 norm for all 4 columns along `dim=2`
original_norms = torch.norm(dataset, p=2, dim=2, keepdim=True)  # Shape (n, 16, 1)

# Normalize by dividing each column by its L2 norm
normalized_tensor = dataset / (original_norms + 1e-8)  # Avoid division by zero


print(normalized_tensor.shape)
print(normalized_tensor[0].numpy())

dataset = normalized_tensor

torch.Size([37350, 16, 4])
[[0.62673503 0.7791841  0.00866789 0.00052934]
 [0.5849884  0.81077343 0.02084663 0.0006548 ]
 [0.58463454 0.8109447  0.02389321 0.00040518]
 [0.65670526 0.7539949  0.01253069 0.00854029]
 [0.669708   0.7421089  0.02737422 0.00404191]
 [0.6264634  0.77906346 0.02422024 0.00413083]
 [0.70669645 0.70669645 0.03405918 0.00042858]
 [0.6929323  0.7202849  0.03216066 0.00045113]
 [0.67273927 0.7390657  0.03469305 0.00037013]
 [0.7162743  0.69691557 0.03549107 0.00046632]
 [0.6717284  0.7398747  0.03696331 0.00030422]
 [0.6782462  0.73399246 0.03516805 0.00066537]
 [0.65111685 0.75801665 0.03817669 0.00036696]
 [0.64961725 0.7594117  0.035931   0.00052419]
 [0.6551529  0.75441855 0.04033957 0.00034898]
 [0.66684544 0.74416083 0.03926173 0.00049077]]


In [114]:
# Create DataLoader for batch training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    #def __init__(self, d_model, max_len=16):
    def __init__(self, d_model, max_len=sequence_length):

        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x

In [115]:
# Model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidiTransformer().to(device)
criterion = nn.MSELoss()
#criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
for epoch in range(num_epochs+1):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

Epoch 0, Loss: 0.008840966022813222
Epoch 10, Loss: 0.009223174200305013
Epoch 20, Loss: 0.009109416207282683
Epoch 30, Loss: 0.009058846050985078
Epoch 40, Loss: 0.005570649229761859
Epoch 50, Loss: 0.0017266576996145917


In [122]:
# Generate a new MIDI sequence from a random input
random_index = np.random.randint(0, len(dataset))
random_sequence = dataset[random_index].unsqueeze(0)

#Debugging, use the first sequence in the dataset
#random_sequence = dataset[0].unsqueeze(0)

# Move input to same device
random_sequence = random_sequence.to(device)
generated_sequence = model(random_sequence).detach().cpu()

print('Normalised generated sequence')
print(generated_sequence[0].numpy())
generated_sequence  = generated_sequence * original_norms  # Restore original values
generated_sequence = generated_sequence.numpy()

print('Denormalised generated sequence')
print(generated_sequence[0])
print(generated_sequence[0].shape)


Normalised generated sequence
[[0.9139707  0.36068946 0.22042212 0.00587455]
 [0.8762852  0.42888236 0.21606359 0.00569246]
 [0.88637334 0.41734454 0.1705006  0.00481143]
 [0.7945842  0.5815809  0.16272038 0.00491119]
 [0.7853111  0.5956868  0.14717865 0.00488988]
 [0.7771217  0.59135723 0.15587364 0.00416913]
 [0.8734889  0.43395472 0.1704607  0.00443151]
 [0.78880155 0.57309914 0.16180132 0.004393  ]
 [0.76443243 0.61712754 0.14709535 0.00476993]
 [0.7882243  0.5831661  0.17466989 0.00553053]
 [0.721988   0.6724297  0.15030624 0.00399992]
 [0.77486783 0.5962827  0.15068053 0.00410053]
 [0.9259299  0.3330135  0.2152053  0.005779  ]
 [0.81618005 0.5494229  0.1094403  0.0033633 ]
 [0.7850682  0.58102465 0.15622258 0.00417258]
 [0.7293182  0.6679263  0.12171569 0.00321227]]
Denormalised generated sequence
[[107.91456     42.58741     26.02573      0.69362104]
 [ 85.38332     41.789364    21.052767     0.5546609 ]
 [ 93.999146    44.259037    18.081446     0.51024824]
 [ 98.00641     71.7

In [127]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    # Adjust start times to ensure the sequence starts at 0
    min_start_time = min(note_data[2] for note_data in sequence[0])
    
    for note_data in sequence[0]:  # Adjusted to handle batch dimension
        pitch, velocity, start, duration = note_data
        pitch = int(pitch)  # Ensure pitch is an integer
        velocity = int(velocity)  # Ensure velocity is an integer
        start = float(start) - min_start_time  # Adjust start time
        duration = float(duration)  # Ensure duration is a float
        end = start + duration
        note = pretty_midi.Note(
            velocity=velocity, pitch=pitch, start=start, end=end
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    for noteInformation in midi.instruments[0].notes:
        print(noteInformation)
    print('Total notes generated')
    print (len(midi.instruments[0].notes))
    print(f"Generated MIDI saved as {output_file}")

sequence_to_midi(generated_sequence, "generated.mid")

Note(start=14.064439, end=14.758060, pitch=107, velocity=42)
Note(start=9.091475, end=9.646136, pitch=85, velocity=41)
Note(start=6.120154, end=6.630403, pitch=93, velocity=44)
Note(start=8.109130, end=8.714891, pitch=98, velocity=71)
Note(start=4.301349, end=4.841660, pitch=86, velocity=65)
Note(start=7.446298, end=7.965390, pitch=96, velocity=73)
Note(start=5.646878, end=6.104642, pitch=90, velocity=44)
Note(start=5.784888, end=6.266708, pitch=86, velocity=62)
Note(start=3.562955, end=4.066367, pitch=80, velocity=65)
Note(start=6.084270, end=6.655642, pitch=81, velocity=60)
Note(start=3.478180, end=3.889052, pitch=74, velocity=69)
Note(start=4.256534, end=4.697876, pitch=83, velocity=64)
Note(start=10.183358, end=10.778018, pitch=95, velocity=34)
Note(start=0.000000, end=0.367592, pitch=89, velocity=60)
Note(start=3.776546, end=4.196892, pitch=79, velocity=58)
Note(start=0.632905, end=0.965286, pitch=75, velocity=69)
Total notes generated
16
Generated MIDI saved as generated.mid


In [128]:
# TODO: Implement autoregressive generation of multiple sequences
# THIS IS STILL INCOMPLETE, DONT'T RUN THIS CELL

# Autoregressive generation of multiple sequences
num_generate = 50  # Number of sequences to generate
generated_sequences_ARG = []

#Use the random sequence from the previous cell
current_sequence = random_sequence

for _ in range(num_generate):
    with torch.no_grad():
        generated_sequence_ARG = model(current_sequence).detach().cpu()
    
    # Denormalize the generated sequence
    denormalized_sequence = generated_sequence_ARG * original_norms[:generated_sequence_ARG.shape[0]]
    generated_sequences_ARG.append(denormalized_sequence.numpy())
    
    # Use the last part of the generated sequence as the input for the next step
    current_sequence = torch.cat((current_sequence[:, 1:], generated_sequence_ARG[:, -1:].to(device)), dim=1)

# Concatenate all generated sequences into one
concatenated_sequence = np.concatenate(generated_sequences_ARG, axis=1)

# Convert the concatenated sequence to MIDI and save
sequence_to_midi(concatenated_sequence, output_file="generated_concatenated.mid")


Note(start=12.885345, end=13.386805, pitch=103, velocity=50)
Note(start=10.877676, end=11.349033, pitch=87, velocity=38)
Note(start=13.113882, end=13.633283, pitch=91, velocity=46)
Note(start=10.897652, end=11.378659, pitch=96, velocity=74)
Note(start=10.179216, end=10.744538, pitch=86, velocity=66)
Note(start=8.588202, end=8.977052, pitch=96, velocity=75)
Note(start=10.735756, end=11.302320, pitch=91, velocity=43)
Note(start=10.839348, end=11.385722, pitch=89, velocity=60)
Note(start=7.434288, end=7.809566, pitch=80, velocity=65)
Note(start=8.546562, end=8.940252, pitch=79, velocity=62)
Note(start=6.808209, end=7.152131, pitch=75, velocity=68)
Note(start=5.667882, end=6.012553, pitch=83, velocity=65)
Note(start=20.993242, end=21.654337, pitch=92, velocity=37)
Note(start=10.112076, end=10.631052, pitch=89, velocity=59)
Note(start=8.277502, end=8.665854, pitch=77, velocity=61)
Note(start=7.916067, end=8.292740, pitch=76, velocity=67)
Note(start=15.094929, end=15.663943, pitch=109, veloc

In [119]:
#Save Model
torch.save(model.state_dict(), 'model.pth')