In [86]:
num_epochs=50
sequence_length=16
batch_size=16


In [87]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset

# Define scalers for velocity and timing features
pitch_scaler = MinMaxScaler(feature_range=(0, 1))
velocity_scaler = MinMaxScaler(feature_range=(0, 1))
start_scaler = MinMaxScaler(feature_range=(0, 1))
duration_scaler = MinMaxScaler(feature_range=(0, 1))

np.set_printoptions(suppress=True)


In [88]:
# Function to extract MIDI features (limited to first 50 notes)
def extract_midi_features(midi_file, max_notes=50):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end - note.start
            ])
            if len(notes) >= max_notes:  # Stop collecting after 50 notes
                return np.array(notes)  # Return early

    return np.array(notes[:max_notes])  # Ensure no more than 50 notes

# Load all MIDI files (including both .midi and .mid)
midi_files = glob.glob("medium-dataset/*.midi") + glob.glob("smaller-dataset/*.mid")
data = []

for f in midi_files:
    notes = extract_midi_features(f)
    for i in range(0, len(notes) - sequence_length - 1):
        data.append(notes[i:i + sequence_length])  # Create 16-note sequences

data = [d for d in data if d.shape == (sequence_length, 4)]  # Filter inconsistent samples

In [89]:
print(len(data))
print(data[(16*4)+2])


3300
[[38.         65.          1.02083333  0.19401042]
 [41.         68.          1.0625      0.18489583]
 [45.         72.          1.11067708  0.1484375 ]
 [50.         82.          1.15364583  0.12239583]
 [62.         68.          1.16796875  0.11328125]
 [65.         70.          1.20182292  0.10546875]
 [69.         75.          1.22786458  0.09375   ]
 [74.         91.          1.26041667  0.08072917]
 [74.         87.          1.46223958  0.05598958]
 [62.         84.          1.6171875   0.1328125 ]
 [50.         73.          1.63020833  0.12369792]
 [53.         74.          1.63020833  0.14973958]
 [74.         92.          1.62369792  0.17447917]
 [70.         78.          1.8046875   0.04947917]
 [82.         84.          1.79557292  0.06901042]
 [70.         83.          1.95572917  0.02734375]]


In [90]:
dataset = torch.tensor(np.array(data), dtype=torch.float32)


In [91]:
print(dataset.shape)
print(dataset[0])
print(dataset[50].numpy())

torch.Size([3300, 16, 4])
tensor([[7.4000e+01, 9.2000e+01, 1.0234e+00, 6.2500e-02],
        [5.7000e+01, 7.9000e+01, 2.0312e+00, 6.3802e-02],
        [6.2000e+01, 8.6000e+01, 2.5339e+00, 4.2969e-02],
        [8.1000e+01, 9.3000e+01, 1.5456e+00, 1.0534e+00],
        [7.4000e+01, 8.2000e+01, 3.0247e+00, 4.4661e-01],
        [7.8000e+01, 9.7000e+01, 3.0156e+00, 5.1432e-01],
        [7.3000e+01, 7.3000e+01, 3.5182e+00, 4.4271e-02],
        [7.6000e+01, 7.9000e+01, 3.5273e+00, 4.9479e-02],
        [7.1000e+01, 7.8000e+01, 3.6615e+00, 3.9062e-02],
        [7.4000e+01, 7.2000e+01, 3.6667e+00, 4.8177e-02],
        [6.9000e+01, 7.6000e+01, 3.7969e+00, 3.1250e-02],
        [7.3000e+01, 7.9000e+01, 3.7852e+00, 7.1615e-02],
        [6.7000e+01, 7.8000e+01, 3.9284e+00, 3.7760e-02],
        [7.1000e+01, 8.3000e+01, 3.9271e+00, 5.7292e-02],
        [6.6000e+01, 7.6000e+01, 4.0638e+00, 3.5156e-02],
        [6.9000e+01, 7.7000e+01, 4.0625e+00, 5.0781e-02]])
[[73.         58.         10.618489    0.6510

In [92]:
# Create DataLoader for batch training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    #def __init__(self, d_model, max_len=16):
    def __init__(self, d_model, max_len=sequence_length):

        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x


In [93]:
# Model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidiTransformer().to(device)
criterion = nn.MSELoss()
#criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")


Epoch 0, Loss: 917.625752011359
Epoch 10, Loss: 140.33146879408093
Epoch 20, Loss: 140.26810083066783
Epoch 30, Loss: 140.33954135692062
Epoch 40, Loss: 140.2749450425595


In [98]:
# Generate a new MIDI sequence from a random input
random_index = np.random.randint(0, len(dataset))
random_sequence = dataset[random_index].unsqueeze(0)
random_sequence = dataset[0].unsqueeze(0)

random_sequence = random_sequence.to(device)  # Move input to same device
generated_sequence = model(random_sequence).detach().cpu().numpy()

print(generated_sequence)


[[[60.06596    58.814827    4.5659733   0.39132792]
  [58.48512    57.35673     4.4054675   0.40775374]
  [60.762115   59.542717    4.6201606   0.38315064]
  [59.827644   58.64504     4.5489664   0.45623904]
  [58.999317   57.74767     4.4615703   0.39882952]
  [58.094154   56.928776    4.279761    0.47441226]
  [59.07927    58.011024    4.6278954   0.43505064]
  [57.32234    56.069103    4.20523     0.39189267]
  [58.514164   57.394146    4.4932437   0.41309863]
  [59.92619    58.678883    4.8112893   0.3494756 ]
  [59.071262   57.859863    4.3917255   0.3588261 ]
  [59.930305   58.717457    4.5944724   0.3869324 ]
  [60.78053    59.524166    4.654535    0.38533413]
  [59.574783   58.43304     4.640315    0.37986404]
  [60.04579    58.93596     4.5089564   0.47225046]
  [58.550217   57.371128    4.6780643   0.41523826]]]


In [99]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    # Adjust start times to ensure the sequence starts at 0
    min_start_time = min(note_data[2] for note_data in sequence[0])
    
    for note_data in sequence[0]:  # Adjusted to handle batch dimension
        pitch, velocity, start, duration = note_data
        pitch = int(pitch)  # Ensure pitch is an integer
        velocity = int(velocity)  # Ensure velocity is an integer
        start = float(start) - min_start_time  # Adjust start time
        duration = float(duration)  # Ensure duration is a float
        end = start + duration
        note = pretty_midi.Note(
            velocity=velocity, pitch=pitch, start=start, end=end
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    for noteInformation in midi.instruments[0].notes:
        print(noteInformation)
    print(f"Generated MIDI saved as {output_file}")

sequence_to_midi(generated_sequence, "generated.mid")


Note(start=0.360743, end=0.752071, pitch=60, velocity=58)
Note(start=0.200237, end=0.607991, pitch=58, velocity=57)
Note(start=0.414930, end=0.798081, pitch=60, velocity=59)
Note(start=0.343736, end=0.799975, pitch=59, velocity=58)
Note(start=0.256340, end=0.655170, pitch=58, velocity=57)
Note(start=0.074531, end=0.548943, pitch=58, velocity=56)
Note(start=0.422665, end=0.857716, pitch=59, velocity=58)
Note(start=0.000000, end=0.391893, pitch=57, velocity=56)
Note(start=0.288013, end=0.701112, pitch=58, velocity=57)
Note(start=0.606059, end=0.955535, pitch=59, velocity=58)
Note(start=0.186495, end=0.545321, pitch=59, velocity=57)
Note(start=0.389242, end=0.776175, pitch=59, velocity=58)
Note(start=0.449305, end=0.834639, pitch=60, velocity=59)
Note(start=0.435085, end=0.814949, pitch=59, velocity=58)
Note(start=0.303726, end=0.775977, pitch=60, velocity=58)
Note(start=0.472834, end=0.888072, pitch=58, velocity=57)
Generated MIDI saved as generated.mid
