In [131]:
num_epochs=50
sequence_length=16
batch_size=16


In [132]:
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pretty_midi
import glob
from torch.utils.data import DataLoader, TensorDataset

# Define scalers for velocity and timing features
pitch_scaler = MinMaxScaler(feature_range=(0, 1))
velocity_scaler = MinMaxScaler(feature_range=(0, 1))
start_scaler = MinMaxScaler(feature_range=(0, 1))
duration_scaler = MinMaxScaler(feature_range=(0, 1))

np.set_printoptions(suppress=True)


In [133]:
# Function to extract MIDI features (limited to first 50 notes)
def extract_midi_features(midi_file, max_notes=50):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []  # Initialize notes inside the function
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append([
                note.pitch, 
                note.velocity,
                note.start, 
                note.end - note.start
            ])
            if len(notes) >= max_notes:  # Stop collecting after 50 notes
                return np.array(notes)  # Return early

    return np.array(notes[:max_notes])  # Ensure no more than 50 notes

# Load all MIDI files (including both .midi and .mid)
midi_files = glob.glob("smaller-dataset/*.midi") + glob.glob("smaller-dataset/*.mid")
data = []

for f in midi_files:
    notes = extract_midi_features(f)
    for i in range(0, len(notes) - sequence_length - 1):
        data.append(notes[i:i + sequence_length])  # Create 16-note sequences

data = [d for d in data if d.shape == (sequence_length, 4)]  # Filter inconsistent samples

In [114]:
print(len(data))
print(data[(16*4)+2])


495
[[38.         65.          1.02083333  0.19401042]
 [41.         68.          1.0625      0.18489583]
 [45.         72.          1.11067708  0.1484375 ]
 [50.         82.          1.15364583  0.12239583]
 [62.         68.          1.16796875  0.11328125]
 [65.         70.          1.20182292  0.10546875]
 [69.         75.          1.22786458  0.09375   ]
 [74.         91.          1.26041667  0.08072917]
 [74.         87.          1.46223958  0.05598958]
 [62.         84.          1.6171875   0.1328125 ]
 [50.         73.          1.63020833  0.12369792]
 [53.         74.          1.63020833  0.14973958]
 [74.         92.          1.62369792  0.17447917]
 [70.         78.          1.8046875   0.04947917]
 [82.         84.          1.79557292  0.06901042]
 [70.         83.          1.95572917  0.02734375]]


In [134]:
# Stack data for scaling
all_data = np.vstack(data)
pitch_scaler.fit(all_data[:, 0].reshape(-1, 1))
velocity_scaler.fit(all_data[:, 1].reshape(-1, 1))
start_scaler.fit(all_data[:, 2].reshape(-1, 1))
duration_scaler.fit(all_data[:, 3].reshape(-1, 1))

# Apply scaling
scaled_data = []
for d in data:
    scaled_d = np.copy(d)
    scaled_d[:, 0] = pitch_scaler.transform(scaled_d[:, 0].reshape(-1, 1)).flatten()
    scaled_d[:, 1] = velocity_scaler.transform(scaled_d[:, 1].reshape(-1, 1)).flatten()
    scaled_d[:, 2] = start_scaler.transform(scaled_d[:, 2].reshape(-1, 1)).flatten()
    scaled_d[:, 3] = duration_scaler.transform(scaled_d[:, 3].reshape(-1, 1)).flatten()
    scaled_data.append(scaled_d)

dataset = torch.tensor(np.array(scaled_data), dtype=torch.float32)
dataset = torch.tensor(np.array(data), dtype=torch.float32)


In [135]:
print(dataset.shape)

#print(dataset[0])

# Print the tensor without scientific notation
#print(dataset[990].numpy())
print(dataset[50].numpy())

torch.Size([495, 16, 4])
[[73.         58.         10.618489    0.6510417 ]
 [50.         34.         11.428386    0.91015625]
 [71.         55.         11.0078125   1.3893229 ]
 [69.         54.         12.186198    0.3216146 ]
 [50.         27.         13.1432295   0.5546875 ]
 [68.         60.         13.0286455   1.046875  ]
 [59.         35.         13.1223955   1.6367188 ]
 [76.         65.         13.8151045   1.6419271 ]
 [49.         35.         14.5963545   0.9401042 ]
 [57.         37.         14.558594    0.98828125]
 [73.         59.         15.286458    0.2903646 ]
 [68.         60.         16.059896    0.85286456]
 [57.         31.         16.125       0.9270833 ]
 [69.         51.         16.799479    0.30859375]
 [49.         37.         16.11849     1.0013021 ]
 [63.         55.         17.5625      0.8828125 ]]


In [136]:
# Create DataLoader for batch training
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Transformer Model with Positional Encoding
class PositionalEncoding(nn.Module):
    #def __init__(self, d_model, max_len=16):
    def __init__(self, d_model, max_len=sequence_length):

        super(PositionalEncoding, self).__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe.to(x.device)

class MidiTransformer(nn.Module):
    def __init__(self, input_dim=4, model_dim=128, num_heads=4, num_layers=3, ff_dim=512):
        super(MidiTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(model_dim, input_dim)  

    def forward(self, x):
        x = self.embedding(x)  
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = self.fc(x)  
        return x


In [137]:
# Model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidiTransformer().to(device)
criterion = nn.MSELoss()
#criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with batches
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(batch)  
        loss = criterion(outputs, batch)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss / len(train_loader)}")


Epoch 0, Loss: 1769.8519838394657
Epoch 10, Loss: 145.14221474432176
Epoch 20, Loss: 115.74394226074219
Epoch 30, Loss: 97.931216455275
Epoch 40, Loss: 79.36224414456275


In [142]:
# Generate a new MIDI sequence from a random input
random_index = np.random.randint(0, len(dataset))
random_sequence = dataset[random_index].unsqueeze(0)
random_sequence = random_sequence.to(device)  # Move input to same device
generated_sequence = model(random_sequence).detach().cpu().numpy()

# Apply inverse scaling to the generated sequence
'''
generated_sequence[:, :, 0] = pitch_scaler.inverse_transform(generated_sequence[:, :, 0])
generated_sequence[:, :, 1] = velocity_scaler.inverse_transform(generated_sequence[:, :, 1])
generated_sequence[:, :, 2] = start_scaler.inverse_transform(generated_sequence[:, :, 2])
generated_sequence[:, :, 3] = duration_scaler.inverse_transform(generated_sequence[:, :, 3])
'''
print(generated_sequence)


[[[63.632946   47.56388     6.095802    0.18727013]
  [62.614407   46.86259     6.022663    0.11177169]
  [65.69594    55.190727    6.0429773   0.3672974 ]
  [62.537354   76.7192      4.59267     0.09683529]
  [62.508804   44.765778    6.0900426   0.35648322]
  [60.97565    74.36569     4.7566695  -0.27469844]
  [63.228558   47.360847    6.1258955   0.1333732 ]
  [64.1821     48.732655    5.8902307   0.3552083 ]
  [63.297733   46.453403    6.0227723   0.14642431]
  [63.13791    48.024395    5.689944    0.20530272]
  [64.73459    49.957947    5.8427353   0.06333633]
  [65.270706   49.259678    6.163333    0.23612222]
  [61.424946   44.84435     5.6456966   0.57355386]
  [63.011677   49.17625     6.0233083   0.11009887]
  [62.417854   46.70301     5.963119    0.20634249]
  [62.45598    46.282383    6.090487   -0.02398095]]]


In [144]:
# Convert generated sequence to MIDI
def sequence_to_midi(sequence, output_file="generated.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)  

    # Adjust start times to ensure the sequence starts at 0
    min_start_time = min(note_data[2] for note_data in sequence[0])
    
    for note_data in sequence[0]:  # Adjusted to handle batch dimension
        pitch, velocity, start, duration = note_data
        pitch = int(pitch)  # Ensure pitch is an integer
        velocity = int(velocity)  # Ensure velocity is an integer
        start = float(start) - min_start_time  # Adjust start time
        duration = float(duration)  # Ensure duration is a float
        end = start + duration
        note = pretty_midi.Note(
            velocity=velocity, pitch=pitch, start=start, end=end
        )
        instrument.notes.append(note)

    midi.instruments.append(instrument)
    midi.write(output_file)
    for noteInformation in midi.instruments[0].notes:
        print(noteInformation)
    print(f"Generated MIDI saved as {output_file}")

sequence_to_midi(generated_sequence, "generated.mid")


Note(start=1.503132, end=1.690402, pitch=63, velocity=47)
Note(start=1.429993, end=1.541765, pitch=62, velocity=46)
Note(start=1.450307, end=1.817605, pitch=65, velocity=55)
Note(start=0.000000, end=0.096835, pitch=62, velocity=76)
Note(start=1.497373, end=1.853856, pitch=62, velocity=44)
Note(start=0.164000, end=-0.110699, pitch=60, velocity=74)
Note(start=1.533226, end=1.666599, pitch=63, velocity=47)
Note(start=1.297561, end=1.652769, pitch=64, velocity=48)
Note(start=1.430102, end=1.576527, pitch=63, velocity=46)
Note(start=1.097274, end=1.302577, pitch=63, velocity=48)
Note(start=1.250065, end=1.313402, pitch=64, velocity=49)
Note(start=1.570663, end=1.806785, pitch=65, velocity=49)
Note(start=1.053027, end=1.626580, pitch=61, velocity=44)
Note(start=1.430638, end=1.540737, pitch=63, velocity=49)
Note(start=1.370449, end=1.576792, pitch=62, velocity=46)
Note(start=1.497817, end=1.473836, pitch=62, velocity=46)
Generated MIDI saved as generated.mid
