## 10 MIDI Generation

Given a 1 s audio prompt from the dataset, autoregressively generate samples and listen to them to judge the audio quality.

In [1]:
import numpy as np 
import torch 
import torch.nn.functional as F
import pandas as pd 
import sys 
import os 
import yaml 
import time
from IPython.display import Audio, display
import soundfile as sf 

from dotenv import load_dotenv

# Location of the project's dotenv configuration.
# NOTE(review): this is a hard-coded absolute path to one user's home directory;
# anyone else running the notebook has to edit it.
dotenv_path = '/home/robbizorg/classes/RT_MusicGen'

# load_dotenv expects the path of the .env *file*. When it is handed a directory it
# silently loads nothing, so resolve "<dir>/.env" in that case. A path that already
# points at a file is passed through unchanged.
dotenv_file = os.path.join(dotenv_path, '.env') if os.path.isdir(dotenv_path) else dotenv_path
load_dotenv(dotenv_path=dotenv_file)

# Data locations read from the environment (each is None when the variable is unset).
music_path = os.getenv("music_path")
sf_path = os.getenv('sf_path')
sample_path = os.getenv('sample_path')

sys.path.append('../../')
from src.spectral_ops import ISTFT, STFT
from src.models import Vocos
from src.encoder import TimbreEncoder
from src.dataset import Midi_Seg, train_collate_fn

2025-12-18 01:46:06.363856: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-18 01:46:06.415791: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-18 01:46:06.415829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-18 01:46:06.417026: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-18 01:46:06.423969: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the base config.
yaml_name = 'midi_vocos_1st.yaml'
with open('../../yamls/' + yaml_name, "r") as stream:
    # Let yaml.YAMLError propagate: printing it and carrying on would leave `config`
    # undefined and surface as an unrelated NameError on the next statement.
    config = yaml.safe_load(stream)

device = 'cpu' # Running all tests on the CPU
sample_rate = config['sample_rate']
buffer_size = config['buffer_size']
prev_ratio = config.get('prev_ratio', 2.0)
comment = config['comment']
ckpt_path = os.path.join('../../ckpt', comment)

# Sub-directory holding the checkpoint pair to evaluate (presumably the snapshot
# saved at epoch 30 -- TODO confirm). Set `ckpt_dir = ckpt_path` to load the
# running model stored directly under `ckpt_path` instead.
ckpt_dir = os.path.join(ckpt_path, '30')

vocos_config = config['vocos_config']

# Load models.
# NOTE(review): torch.load unpickles the checkpoint file; only load checkpoints
# from a trusted source.
model = Vocos(vocos_config).to(device)
model.load_state_dict(torch.load(os.path.join(ckpt_dir, 'VocosSynth.pth'), map_location=device))
model.eval()

timbre_config = config['timbre_config']
tmbr_encoder = TimbreEncoder(timbre_config).to(device)
tmbr_encoder.load_state_dict(torch.load(os.path.join(ckpt_dir, 'VocosTimbre.pth'), map_location=device))
tmbr_encoder.eval()

# Init the STFT transform; the window length is set equal to the FFT size.
stft_transform = STFT(
    n_fft=vocos_config['head']['n_fft'],
    hop_length=vocos_config['head']['hop_length'],
    win_length=vocos_config['head']['n_fft']
).to(device)


# Load the dataset.
train_dataset = Midi_Seg(sf_path = sample_path,
    sr = sample_rate,
    buffer_size = buffer_size,
    prev_ratio = prev_ratio)

In [17]:
# Pick one dataset example, save it as the ground-truth reference, and compute
# the timbre embedding from its first second of audio.
idx = 50000
ex_path = train_dataset.files[idx]
print(f'Loading {ex_path}')

display(Audio(ex_path, rate=sample_rate))

# Add a leading batch dimension to the waveform and to the pitch label.
x_raw, pitch, _, _ = train_dataset[idx]
x_raw = x_raw.unsqueeze(0)
pitch = pitch.unsqueeze(0)

gen_path = '../../samples/ground_truth'
# sf.write does not create missing directories, so make sure the target exists.
os.makedirs(gen_path, exist_ok=True)

# The file is named after the example's parent directory.
filename = os.path.join(gen_path, ex_path.split('/')[-2] + '.wav')
sf.write(filename, x_raw[0].detach().cpu().numpy(), samplerate=sample_rate)

# Where idx sampling can begin and end
start_idx = int(buffer_size * prev_ratio)
end_idx = int(x_raw.shape[-1] - buffer_size)  # not used in the cells shown here

# Left-pad with zeros by the context length (buffer_size * prev_ratio)
pad_len = int(buffer_size * prev_ratio)
x = F.pad(x_raw, pad=(pad_len, 0))

# Initial context: start_idx == pad_len, so this slice is exactly the zero padding
prev_x = x[:, start_idx - pad_len : start_idx].float() # Previous Info

# One second of audio starting right after the padding
timbre_x = x[:, start_idx : start_idx + sample_rate].float()

# Process timbre; inference only, so skip autograd bookkeeping
with torch.no_grad():
    timbre_spec = stft_transform(timbre_x)
    timbre_emb = tmbr_encoder(timbre_spec)
print(f'Timbre Emb Dimensions: {timbre_emb.shape}')

Loading /data/robbizorg/music/samples/DSoundfont_Ultimate/Synth-Bass-8/80_80.wav


Timbre Emb Dimensions: torch.Size([1, 128])


In [18]:
# Autoregressive generation: predict the next `buffer_size` samples from the
# spectrogram of the previous context, slide the context forward, repeat.

# Re-create the context here (same slice the later sweep cells use) so that
# re-running this cell starts from the original context instead of continuing
# from whatever the previous run left in `prev_x`.
prev_x = x[:, start_idx - pad_len : start_idx].float()

# Generate at least 4s of audio
target_len = sample_rate * 4
chunks = []      # generated buffers, concatenated once after the loop
n_generated = 0  # samples produced so far

with torch.no_grad():
    while n_generated < target_len:
        prev_spec = stft_transform(prev_x)

        if pitch.dim() != 3:
            # Broadcast the per-example pitch to one value per spectrogram frame
            pitch_review = pitch[:, None, None].repeat(1, 1, prev_spec.shape[-1]).float().to(device)
        else:
            # NOTE(review): assumes a 3-D pitch is already laid out per frame like
            # the branch above produces -- confirm against the dataset. Previously
            # this case left `pitch_review` undefined (or stale).
            pitch_review = pitch.float().to(device)

        # Pitch is stacked in front of the spectrogram along dim 1
        in_feats = torch.cat([pitch_review, prev_spec], dim = 1)

        out = model(in_feats, timbre_emb=timbre_emb)

        # Match time: keep only one buffer of new samples
        x_hat = out[:, :buffer_size]

        # Move buffer: drop the oldest samples, append the new ones
        prev_x = torch.cat([prev_x[:, buffer_size:], x_hat], dim = 1)

        chunks.append(x_hat)
        n_generated += x_hat.shape[-1]

all_out = torch.cat(chunks, dim = 1)
     

In [19]:
# Shapes of the last generated buffer, the rolling context, and the full output
tuple(t.shape for t in (x_hat, prev_x, all_out))

(torch.Size([1, 1024]), torch.Size([1, 2048]), torch.Size([1, 192512]))

In [20]:
# Listen to the generated audio and write it next to the other generated samples.
np_out = all_out.detach().cpu().numpy()  # .cpu() keeps this working if `device` is changed
display(Audio(np_out, rate=sample_rate))

gen_path = '../../samples/generated'
# sf.write does not create missing directories, so make sure the target exists.
os.makedirs(gen_path, exist_ok=True)

# Same naming as the ground-truth file: the example's parent directory name.
filename = os.path.join(gen_path, ex_path.split('/')[-2] + '.wav')
sf.write(filename, np_out[0], samplerate=sample_rate)

In [17]:
# How does timbre affect the sound?
# NOTE: It seems to affect it quite a bit!
num_sounds = 20

# Seeded local generator so the sampled embeddings (and the gen_{i}.wav files
# written below) are the same on every run, without touching the global RNG.
timbre_seed = 0
timbre_rng = torch.Generator(device=timbre_emb.device).manual_seed(timbre_seed)

gen_path = '../../samples/generated'
# sf.write does not create missing directories, so make sure the target exists.
os.makedirs(gen_path, exist_ok=True)

for i in range(num_sounds):
    # Uniform [0, 1) noise with the shape/dtype/device of the real embedding
    rand_timbre_emb = torch.rand(
        timbre_emb.shape,
        generator=timbre_rng,
        dtype=timbre_emb.dtype,
        device=timbre_emb.device,
    )

    # Reset the context to the start of the padded clip (should be just 0s)
    prev_x = x[:, start_idx - pad_len : start_idx].float() # Previous Info

    # Generate at least 2s of audio
    target_len = sample_rate * 2
    chunks = []      # generated buffers, concatenated once after the loop
    n_generated = 0  # samples produced so far

    with torch.no_grad():
        while n_generated < target_len:
            prev_spec = stft_transform(prev_x)

            if pitch.dim() != 3:
                # Broadcast the per-example pitch to one value per spectrogram frame
                pitch_review = pitch[:, None, None].repeat(1, 1, prev_spec.shape[-1]).float().to(device)
            else:
                # NOTE(review): assumes a 3-D pitch is already laid out per frame --
                # confirm. Previously this case left `pitch_review` undefined/stale.
                pitch_review = pitch.float().to(device)

            in_feats = torch.cat([pitch_review, prev_spec], dim = 1)

            out = model(in_feats, timbre_emb=rand_timbre_emb)

            # Match time: keep only one buffer of new samples
            x_hat = out[:, :buffer_size]

            # Move buffer: drop the oldest samples, append the new ones
            prev_x = torch.cat([prev_x[:, buffer_size:], x_hat], dim = 1)

            chunks.append(x_hat)
            n_generated += x_hat.shape[-1]

    all_out = torch.cat(chunks, dim = 1)

    np_out = all_out.detach().cpu().numpy()
    display(Audio(np_out, rate=sample_rate))

    filename = os.path.join(gen_path, f'gen_{i}.wav')
    sf.write(filename, np_out[0], samplerate=sample_rate)

In [None]:
# How does pitch affect the sound?
# NOTE: It seems to affect it quite a bit!
num_sounds = 10  # not used by this cell
pitches = [30, 60, 90]

for midi_pitch in pitches:
    # Reset the context to the start of the padded clip (should be just 0s)
    prev_x = x[:, start_idx - pad_len : start_idx].float() # Previous Info

    # Generate at least 2s of audio
    target_len = sample_rate * 2
    chunks = []      # generated buffers, concatenated once after the loop
    n_generated = 0  # samples produced so far

    with torch.no_grad():
        while n_generated < target_len:
            prev_spec = stft_transform(prev_x)

            # Constant pitch track at the requested value, one entry per
            # spectrogram frame. Built directly instead of rescaling the dataset
            # pitch (pitch / pitch[0] * midi_pitch), which produced NaN/inf for a
            # dataset pitch of 0 and gives the same values otherwise.
            pitch_review = torch.full(
                (pitch.shape[0], 1, prev_spec.shape[-1]),
                float(midi_pitch),
                device=device,
            )

            in_feats = torch.cat([pitch_review, prev_spec], dim = 1)

            out = model(in_feats, timbre_emb=timbre_emb)

            # Match time: keep only one buffer of new samples
            x_hat = out[:, :buffer_size]

            # Move buffer: drop the oldest samples, append the new ones
            prev_x = torch.cat([prev_x[:, buffer_size:], x_hat], dim = 1)

            chunks.append(x_hat)
            n_generated += x_hat.shape[-1]

    all_out = torch.cat(chunks, dim = 1)

    np_out = all_out.detach().cpu().numpy()
    display(Audio(np_out, rate=sample_rate))

In [None]:
# Debug check: compare the pitch conditioning left over from the last iteration of
# the pitch sweep with the pitch label loaded from the dataset example.
pitch_review, pitch

(tensor([[[90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90.,
           90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90.,
           90., 90., 90., 90., 90., 90., 90.]]]),
 tensor([21]))