In [1]:
num_epochs=100
sequence_length=16
batch_size=16
max_notes=100
dataset_path='smaller-dataset'

In [2]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

np.set_printoptions(suppress=True)


In [3]:
# Function to extract MIDI features (limited to first 50 notes)
def extract_midi_features(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end - note.start
            ])
            if len(notes) >= max_notes:  # Stop collecting after 50 notes
                return np.array(notes)  # Return early
    return np.array(notes[:max_notes])  # Ensure no more than 50 notes

# Load all MIDI files (including both .midi and .mid)
midi_files = glob.glob(dataset_path+"/*.midi") + glob.glob(dataset_path+"/*.mid")
data = []

for f in midi_files:
    notes = extract_midi_features(f)
    #print(notes.shape)
    #print(len(notes))
    #print(len(notes) - sequence_length - 1)
    for i in range(0, len(notes) - sequence_length - 1):
        data.append(notes[i:i + sequence_length])  # Create 16-note sequences

data = [d for d in data if d.shape == (sequence_length, 4)]  # Filter inconsistent samples

In [4]:
dataset = torch.tensor(np.array(data), dtype=torch.float32)
print(dataset.shape)
print(dataset[0].numpy())

torch.Size([1245, 16, 4])
[[74.         92.          1.0234375   0.0625    ]
 [57.         79.          2.03125     0.06380209]
 [62.         86.          2.5338542   0.04296875]
 [81.         93.          1.5455729   1.0533854 ]
 [74.         82.          3.0247395   0.4466146 ]
 [78.         97.          3.015625    0.51432294]
 [73.         73.          3.5182292   0.04427083]
 [76.         79.          3.5273438   0.04947917]
 [71.         78.          3.6614583   0.0390625 ]
 [74.         72.          3.6666667   0.04817708]
 [69.         76.          3.796875    0.03125   ]
 [73.         79.          3.7851562   0.07161459]
 [67.         78.          3.9283855   0.03776042]
 [71.         83.          3.9270833   0.05729167]
 [66.         76.          4.0638022   0.03515625]
 [69.         77.          4.0625      0.05078125]]


In [5]:
# Normalize the dataset for only start and end columns


# Compute L2 norm for the last two columns **before** normalization
#original_norms = torch.norm(dataset[:, :, 2:], p=2, dim=2, keepdim=True)

# Apply L2 normalization only to the last two columns along `dim=2`
#normalized_floats = F.normalize(dataset[:, :, 2:], p=2, dim=2)

# Combine integer columns and normalized float columns
#normalized_tensor = torch.cat((dataset[:, :, :2], normalized_floats), dim=2)


# Normalize the dataset for all columns


# Compute L2 norm for all 4 columns along `dim=2`
original_norms = torch.norm(dataset, p=2, dim=2, keepdim=True)  # Shape (n, 16, 1)

# Normalize by dividing each column by its L2 norm
normalized_tensor = dataset / (original_norms + 1e-8)  # Avoid division by zero


print(normalized_tensor.shape)
print(normalized_tensor[0].numpy())

dataset = normalized_tensor

torch.Size([1245, 16, 4])
[[0.62673503 0.7791841  0.00866789 0.00052934]
 [0.5849884  0.81077343 0.02084663 0.0006548 ]
 [0.58463454 0.8109447  0.02389321 0.00040518]
 [0.65670526 0.7539949  0.01253069 0.00854029]
 [0.669708   0.7421089  0.02737422 0.00404191]
 [0.6264634  0.77906346 0.02422024 0.00413083]
 [0.70669645 0.70669645 0.03405918 0.00042858]
 [0.6929323  0.7202849  0.03216066 0.00045113]
 [0.67273927 0.7390657  0.03469305 0.00037013]
 [0.7162743  0.69691557 0.03549107 0.00046632]
 [0.6717284  0.7398747  0.03696331 0.00030422]
 [0.6782462  0.73399246 0.03516805 0.00066537]
 [0.65111685 0.75801665 0.03817669 0.00036696]
 [0.64961725 0.7594117  0.035931   0.00052419]
 [0.6551529  0.75441855 0.04033957 0.00034898]
 [0.66684544 0.74416083 0.03926173 0.00049077]]


In [6]:
# Create DataLoader for batch training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    #def __init__(self, d_model, max_len=16):
    def __init__(self, d_model, max_len=sequence_length):

        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x

In [7]:
# Model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidiTransformer().to(device)
criterion = nn.MSELoss()
#criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")

Epoch 0, Loss: 0.07705881389287803
Epoch 10, Loss: 0.0028131764832860194
Epoch 20, Loss: 0.000850944641196969
Epoch 30, Loss: 0.00043971953429998114
Epoch 40, Loss: 0.0007619886026264001
Epoch 50, Loss: 0.0023877007994227684
Epoch 60, Loss: 0.0027241222858906556
Epoch 70, Loss: 0.0007888156769928547
Epoch 80, Loss: 0.0013064250377236675
Epoch 90, Loss: 0.0005292564881249116


In [12]:
# Generate a new MIDI sequence from a random input
random_index = np.random.randint(0, len(dataset))
random_sequence = dataset[random_index].unsqueeze(0)

#Debugging, use the first sequence in the dataset
#random_sequence = dataset[0].unsqueeze(0)

# Move input to same device
random_sequence = random_sequence.to(device)
generated_sequence = model(random_sequence).detach().cpu()

print('Normalised generated sequence')
print(generated_sequence[0].numpy())
generated_sequence  = generated_sequence * original_norms  # Restore original values
generated_sequence = generated_sequence.numpy()

print('Denormalised generated sequence')
print(generated_sequence[0])
print(generated_sequence[0].shape)


Normalised generated sequence
[[0.7138512  0.67814696 0.1373692  0.00260267]
 [0.72721756 0.674667   0.14440227 0.00597372]
 [0.6889131  0.7056998  0.12626888 0.00262562]
 [0.7254584  0.65772516 0.16151646 0.00875267]
 [0.6718141  0.72244436 0.12044735 0.00186816]
 [0.72960997 0.65365565 0.1517369  0.00216918]
 [0.7086204  0.6819473  0.14244959 0.00290954]
 [0.733914   0.6409203  0.17304909 0.00765897]
 [0.6891918  0.7012614  0.13304697 0.00387315]
 [0.724594   0.6667927  0.1557137  0.00446884]
 [0.7527462  0.6321353  0.1719191  0.00637556]
 [0.72311807 0.6700804  0.15435657 0.00564384]
 [0.70444834 0.66056526 0.1799793  0.00361912]
 [0.752563   0.60318244 0.18904835 0.00936138]
 [0.7791785  0.5646117  0.21260403 0.00985785]
 [0.68622065 0.71236914 0.15532014 0.00341237]]
Denormalised generated sequence
[[84.286      80.07032    16.219488    0.30730352]
 [70.8585     65.73809    14.070243    0.58206624]
 [73.058655   74.83887    13.390708    0.27844498]
 [89.48022    81.1258     19.921

In [13]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    # Adjust start times to ensure the sequence starts at 0
    min_start_time = min(note_data[2] for note_data in sequence[0])
    
    for note_data in sequence[0]:  # Adjusted to handle batch dimension
        pitch, velocity, start, duration = note_data
        pitch = int(pitch)  # Ensure pitch is an integer
        velocity = int(velocity)  # Ensure velocity is an integer
        start = float(start) - min_start_time  # Adjust start time
        duration = float(duration)  # Ensure duration is a float
        end = start + duration
        note = pretty_midi.Note(
            velocity=velocity, pitch=pitch, start=start, end=end
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    for noteInformation in midi.instruments[0].notes:
        print(noteInformation)
    print('Total notes generated')
    print (len(midi.instruments[0].notes))
    print(f"Generated MIDI saved as {output_file}")

sequence_to_midi(generated_sequence, "generated.mid")

Note(start=2.910548, end=3.217852, pitch=84, velocity=80)
Note(start=0.761303, end=1.343369, pitch=70, velocity=65)
Note(start=0.081768, end=0.360213, pitch=73, velocity=74)
Note(start=6.612987, end=7.692568, pitch=89, velocity=81)
Note(start=0.000000, end=0.206424, pitch=74, velocity=79)
Note(start=5.583589, end=5.853671, pitch=90, velocity=81)
Note(start=1.405751, end=1.706300, pitch=73, velocity=70)
Note(start=5.670880, end=6.510907, pitch=80, velocity=70)
Note(start=0.732658, end=1.141426, pitch=72, velocity=74)
Note(start=2.778213, end=3.239899, pitch=74, velocity=68)
Note(start=4.350606, end=5.005504, pitch=77, velocity=64)
Note(start=3.304540, end=3.911990, pitch=77, velocity=72)
Note(start=5.210950, end=5.583358, pitch=72, velocity=67)
Note(start=7.353124, end=8.376277, pitch=82, velocity=65)
Note(start=8.108756, end=9.101834, pitch=78, velocity=56)
Note(start=2.762385, end=3.115471, pitch=71, velocity=73)
Total notes generated
16
Generated MIDI saved as generated.mid


In [14]:
# TODO: Implement autoregressive generation of multiple sequences
# THIS IS STILL INCOMPLETE, DONT'T RUN THIS CELL

# Autoregressive generation of multiple sequences
num_generate = 50  # Number of sequences to generate
generated_sequences_ARG = []

#Use the random sequence from the previous cell
current_sequence = random_sequence

for _ in range(num_generate):
    with torch.no_grad():
        generated_sequence_ARG = model(current_sequence).detach().cpu()
    
    # Denormalize the generated sequence
    denormalized_sequence = generated_sequence_ARG * original_norms[:generated_sequence_ARG.shape[0]]
    generated_sequences_ARG.append(denormalized_sequence.numpy())
    
    # Use the last part of the generated sequence as the input for the next step
    current_sequence = torch.cat((current_sequence[:, 1:], generated_sequence_ARG[:, -1:].to(device)), dim=1)

# Concatenate all generated sequences into one
concatenated_sequence = np.concatenate(generated_sequences_ARG, axis=1)

# Convert the concatenated sequence to MIDI and save
sequence_to_midi(concatenated_sequence, output_file="generated_concatenated.mid")


Note(start=25.672432, end=26.008972, pitch=84, velocity=79)
Note(start=23.153475, end=23.201864, pitch=67, velocity=65)
Note(start=22.263683, end=22.709381, pitch=72, velocity=75)
Note(start=26.985950, end=27.580763, pitch=90, velocity=85)
Note(start=22.453178, end=22.593594, pitch=74, velocity=78)
Note(start=26.280148, end=26.040527, pitch=90, velocity=80)
Note(start=22.251720, end=22.568604, pitch=73, velocity=70)
Note(start=27.922207, end=28.165421, pitch=79, velocity=71)
Note(start=23.319237, end=23.755016, pitch=73, velocity=74)
Note(start=24.521404, end=25.109718, pitch=74, velocity=69)
Note(start=25.096199, end=25.654839, pitch=76, velocity=65)
Note(start=24.592213, end=24.805155, pitch=77, velocity=70)
Note(start=28.981915, end=29.777056, pitch=77, velocity=68)
Note(start=29.483387, end=30.112024, pitch=80, velocity=69)
Note(start=30.379784, end=30.961115, pitch=79, velocity=55)
Note(start=24.614216, end=25.119289, pitch=70, velocity=72)
Note(start=25.859703, end=26.126053, pit

In [15]:
#Save Model
torch.save(model.state_dict(), 'model.pth')