In [1]:
num_epochs=50
sequence_length=16
batch_size=16


In [2]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset

# Define scalers for velocity and timing features
pitch_scaler = MinMaxScaler(feature_range=(0, 1))
velocity_scaler = MinMaxScaler(feature_range=(0, 1))
start_scaler = MinMaxScaler(feature_range=(0, 1))
duration_scaler = MinMaxScaler(feature_range=(0, 1))

np.set_printoptions(suppress=True)


In [3]:
# Function to extract MIDI features (limited to first 50 notes)
def extract_midi_features(midi_file, max_notes=50):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end - note.start
            ])
            if len(notes) >= max_notes:  # Stop collecting after 50 notes
                return np.array(notes)  # Return early

    return np.array(notes[:max_notes])  # Ensure no more than 50 notes

# Load all MIDI files (including both .midi and .mid)
midi_files = glob.glob("smaller-dataset/*.midi") + glob.glob("smaller-dataset/*.mid")
data = []

for f in midi_files:
    notes = extract_midi_features(f)
    for i in range(0, len(notes) - sequence_length - 1):
        data.append(notes[i:i + sequence_length])  # Create 16-note sequences

data = [d for d in data if d.shape == (sequence_length, 4)]  # Filter inconsistent samples

In [4]:
print(len(data))
print(data[(16*4)+2])


495
[[38.         65.          1.02083333  0.19401042]
 [41.         68.          1.0625      0.18489583]
 [45.         72.          1.11067708  0.1484375 ]
 [50.         82.          1.15364583  0.12239583]
 [62.         68.          1.16796875  0.11328125]
 [65.         70.          1.20182292  0.10546875]
 [69.         75.          1.22786458  0.09375   ]
 [74.         91.          1.26041667  0.08072917]
 [74.         87.          1.46223958  0.05598958]
 [62.         84.          1.6171875   0.1328125 ]
 [50.         73.          1.63020833  0.12369792]
 [53.         74.          1.63020833  0.14973958]
 [74.         92.          1.62369792  0.17447917]
 [70.         78.          1.8046875   0.04947917]
 [82.         84.          1.79557292  0.06901042]
 [70.         83.          1.95572917  0.02734375]]


In [5]:
dataset = torch.tensor(np.array(data), dtype=torch.float32)


In [6]:
print(dataset.shape)
print(dataset[0])
print(dataset[50].numpy())

torch.Size([495, 16, 4])
tensor([[7.4000e+01, 9.2000e+01, 1.0234e+00, 6.2500e-02],
        [5.7000e+01, 7.9000e+01, 2.0312e+00, 6.3802e-02],
        [6.2000e+01, 8.6000e+01, 2.5339e+00, 4.2969e-02],
        [8.1000e+01, 9.3000e+01, 1.5456e+00, 1.0534e+00],
        [7.4000e+01, 8.2000e+01, 3.0247e+00, 4.4661e-01],
        [7.8000e+01, 9.7000e+01, 3.0156e+00, 5.1432e-01],
        [7.3000e+01, 7.3000e+01, 3.5182e+00, 4.4271e-02],
        [7.6000e+01, 7.9000e+01, 3.5273e+00, 4.9479e-02],
        [7.1000e+01, 7.8000e+01, 3.6615e+00, 3.9062e-02],
        [7.4000e+01, 7.2000e+01, 3.6667e+00, 4.8177e-02],
        [6.9000e+01, 7.6000e+01, 3.7969e+00, 3.1250e-02],
        [7.3000e+01, 7.9000e+01, 3.7852e+00, 7.1615e-02],
        [6.7000e+01, 7.8000e+01, 3.9284e+00, 3.7760e-02],
        [7.1000e+01, 8.3000e+01, 3.9271e+00, 5.7292e-02],
        [6.6000e+01, 7.6000e+01, 4.0638e+00, 3.5156e-02],
        [6.9000e+01, 7.7000e+01, 4.0625e+00, 5.0781e-02]])
[[73.         58.         10.618489    0.65104

In [7]:
# Create DataLoader for batch training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    #def __init__(self, d_model, max_len=16):
    def __init__(self, d_model, max_len=sequence_length):

        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x


In [8]:
# Model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidiTransformer().to(device)
criterion = nn.MSELoss()
#criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")


Epoch 0, Loss: 1764.2919154013357
Epoch 10, Loss: 138.45356393629504
Epoch 20, Loss: 72.95742650185862
Epoch 30, Loss: 65.25593456145256
Epoch 40, Loss: 73.22924146344585


In [9]:
# Generate a new MIDI sequence from a random input
random_index = np.random.randint(0, len(dataset))
random_sequence = dataset[random_index].unsqueeze(0)
random_sequence = random_sequence.to(device)  # Move input to same device
generated_sequence = model(random_sequence).detach().cpu().numpy()

print(generated_sequence)


[[[66.655304   76.472046    3.10779     0.15729153]
  [67.4954     61.187046    3.6697247  -0.00852437]
  [59.087944   81.47005     2.634015    0.24200407]
  [66.74821    76.41547     3.1049583   0.10729881]
  [50.01523    78.80029     3.897826    0.14262837]
  [68.166725   68.31609     3.369052    0.1114278 ]
  [64.757774   79.75812     2.772558    0.2787254 ]
  [67.25861    75.00051     3.0400655   0.14565119]
  [63.20275    79.9392      3.2804787   0.0386076 ]
  [67.47527    61.77985     3.6242344  -0.02351683]
  [66.02391    77.57994     2.708847    0.1444951 ]
  [66.98171    59.27022     4.3225393   0.13257444]
  [67.9017     64.26326     3.7428174   0.2701029 ]
  [64.32998    49.909325    4.7993164   0.03800549]
  [67.40578    65.8104      4.0641675   0.08196817]
  [66.919304   54.435932    5.1972284   0.11293875]]]


In [11]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    # Adjust start times to ensure the sequence starts at 0
    min_start_time = min(note_data[2] for note_data in sequence[0])
    
    for note_data in sequence[0]:  # Adjusted to handle batch dimension
        pitch, velocity, start, duration = note_data
        pitch = int(pitch)  # Ensure pitch is an integer
        velocity = int(velocity)  # Ensure velocity is an integer
        start = float(start) - min_start_time  # Adjust start time
        duration = float(duration)  # Ensure duration is a float
        end = start + duration
        note = pretty_midi.Note(
            velocity=velocity, pitch=pitch, start=start, end=end
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    for noteInformation in midi.instruments[0].notes:
        print(noteInformation)
    print(f"Generated MIDI saved as {output_file}")

sequence_to_midi(generated_sequence, "generated.mid")


Note(start=0.473775, end=0.631066, pitch=66, velocity=76)
Note(start=1.035710, end=1.027185, pitch=67, velocity=61)
Note(start=0.000000, end=0.242004, pitch=59, velocity=81)
Note(start=0.470943, end=0.578242, pitch=66, velocity=76)
Note(start=1.263811, end=1.406439, pitch=50, velocity=78)
Note(start=0.735037, end=0.846465, pitch=68, velocity=68)
Note(start=0.138543, end=0.417268, pitch=64, velocity=79)
Note(start=0.406050, end=0.551702, pitch=67, velocity=75)
Note(start=0.646464, end=0.685071, pitch=63, velocity=79)
Note(start=0.990219, end=0.966703, pitch=67, velocity=61)
Note(start=0.074832, end=0.219327, pitch=66, velocity=77)
Note(start=1.688524, end=1.821099, pitch=66, velocity=59)
Note(start=1.108802, end=1.378905, pitch=67, velocity=64)
Note(start=2.165301, end=2.203307, pitch=64, velocity=49)
Note(start=1.430152, end=1.512121, pitch=67, velocity=65)
Note(start=2.563213, end=2.676152, pitch=66, velocity=54)
Generated MIDI saved as generated.mid
