In [26]:
sequenceLimit=100

In [27]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset

# Define scalers for velocity and timing features
velocity_scaler = MinMaxScaler(feature_range=(0, 1))
time_scaler = MinMaxScaler(feature_range=(0, 1))

In [28]:

# Function to extract MIDI features
def extract_midi_features(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end 
            ])
    
    return np.array(notes)  # Return all notes

# Load all MIDI files
midi_files = glob.glob("smaller-dataset/*.midi")
data = []
for f in midi_files:
    notes = extract_midi_features(f)
    for i in range(0, len(notes) - 15):
        data.append(notes[i:i+16])  # Create 16-note sequences
data = [d for d in data if d.shape == (16, 4)]  # Filter inconsistent samples

In [49]:
print(dataset.shape)
data = data[:sequenceLimit]
dataset = torch.tensor(np.array(data), dtype=torch.float32)
print(dataset.shape)
print(dataset[0])

torch.Size([100, 16, 4])
torch.Size([100, 16, 4])
tensor([[74.0000, 92.0000,  1.0234,  1.0859],
        [57.0000, 79.0000,  2.0312,  2.0951],
        [62.0000, 86.0000,  2.5339,  2.5768],
        [81.0000, 93.0000,  1.5456,  2.5990],
        [74.0000, 82.0000,  3.0247,  3.4714],
        [78.0000, 97.0000,  3.0156,  3.5299],
        [73.0000, 73.0000,  3.5182,  3.5625],
        [76.0000, 79.0000,  3.5273,  3.5768],
        [71.0000, 78.0000,  3.6615,  3.7005],
        [74.0000, 72.0000,  3.6667,  3.7148],
        [69.0000, 76.0000,  3.7969,  3.8281],
        [73.0000, 79.0000,  3.7852,  3.8568],
        [67.0000, 78.0000,  3.9284,  3.9661],
        [71.0000, 83.0000,  3.9271,  3.9844],
        [66.0000, 76.0000,  4.0638,  4.0990],
        [69.0000, 77.0000,  4.0625,  4.1133]])


In [30]:
# Create DataLoader for batch training
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=16):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x


In [36]:
# Model, loss function, and optimizer
model = MidiTransformer()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")


Epoch 0, Loss: 2585.9283447265625
Epoch 10, Loss: 2097.56591796875
Epoch 20, Loss: 1668.2442321777344
Epoch 30, Loss: 1255.4702453613281
Epoch 40, Loss: 819.5365447998047
Epoch 50, Loss: 525.4490509033203
Epoch 60, Loss: 282.96331787109375
Epoch 70, Loss: 148.54865646362305
Epoch 80, Loss: 67.87928676605225
Epoch 90, Loss: 43.35159683227539


In [44]:
# Generate a new MIDI sequence from a random input

random_sequence_rangen = torch.randn(1, 16, 4) 
random_index = np.random.randint(0, len(dataset))
random_sequence = dataset[random_index].unsqueeze(0)

print(random_sequence_rangen)
print('difference')
print(random_sequence) 

tensor([[[-1.0836, -0.4635,  0.2884,  1.1038],
         [-1.6744, -1.4108,  0.7726, -1.4022],
         [-1.2442, -0.4962, -0.4393,  0.1786],
         [ 0.4780,  1.1954, -0.6019, -1.0079],
         [-0.5620, -0.3752,  0.7985, -0.4743],
         [-1.0315, -0.9461,  0.1202,  0.9997],
         [ 0.3500,  0.5563, -0.9470,  1.1756],
         [ 0.2446,  1.6437, -0.0086,  1.3778],
         [-2.4257,  0.0209, -0.0960, -0.8565],
         [-1.8921,  0.4859, -0.8509, -0.0085],
         [-0.3115,  0.5166,  0.2960,  0.7540],
         [ 1.0420, -1.5239, -1.3521,  1.5331],
         [-0.0267,  0.3463, -1.7751, -0.6858],
         [ 0.5072, -0.0839, -0.5785, -1.0536],
         [-0.0783,  2.1100, -0.2625,  1.3125],
         [-1.8182,  1.2010, -0.4186, -0.6343]]])
difference
tensor([[[81.0000, 93.0000,  1.5456,  2.5990],
         [74.0000, 82.0000,  3.0247,  3.4714],
         [78.0000, 97.0000,  3.0156,  3.5299],
         [73.0000, 73.0000,  3.5182,  3.5625],
         [76.0000, 79.0000,  3.5273,  3.5768],


In [45]:
generated_sequence = model(random_sequence).detach().numpy()
generated_sequence

array([[[69.20963  , 72.29453  ,  7.052752 ,  7.263224 ],
        [69.179214 , 72.22012  ,  7.246605 ,  7.5992837],
        [68.3742   , 72.14647  ,  6.9332657,  7.090119 ],
        [69.80344  , 71.97262  ,  7.5167904,  7.6648884],
        [69.0667   , 72.15718  ,  7.045172 ,  7.223279 ],
        [69.10411  , 72.18678  ,  7.329755 ,  7.458869 ],
        [70.00176  , 71.9509   ,  7.459717 ,  7.735195 ],
        [68.80994  , 72.23756  ,  6.9713926,  7.217267 ],
        [69.3366   , 72.33603  ,  7.3750324,  7.54369  ],
        [68.68713  , 72.143814 ,  7.1835613,  7.3560743],
        [68.794395 , 72.33178  ,  7.194614 ,  7.3136854],
        [68.68133  , 72.24837  ,  7.1595764,  7.2297783],
        [68.85937  , 72.2536   ,  7.0949516,  7.151725 ],
        [69.3394   , 72.37222  ,  7.34568  ,  7.423238 ],
        [69.14375  , 72.13284  ,  7.2052336,  7.2926397],
        [67.4187   , 72.11232  ,  6.746956 ,  6.7287765]]], dtype=float32)

In [47]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    for note_data in sequence:
        pitch, velocity, start, duration = map(int, note_data)
        end = start + duration
        note = pretty_midi.Note(
            velocity=max(0, min(127, velocity)),  
            pitch=max(0, min(127, pitch)),  
            start=max(0, start),
            end=max(0, end)
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    print(f"Generated MIDI saved as {output_file}")

In [48]:
sequence_to_midi(generated_sequence[0], "generated.mid")


Generated MIDI saved as generated.mid
