In [2]:
# image_to_music.py
# Generate varied music from images using RNN trained on MAESTRO dataset

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import pretty_midi
import subprocess
from pydub import AudioSegment, effects
import tensorflow as tf
from tensorflow.keras import layers, models as tf_models
import glob
import random
from sklearn.preprocessing import StandardScaler

# Suppress TensorFlow warnings for cleaner output (only ERROR and above are logged)
tf.get_logger().setLevel('ERROR')

# --- Configuration ---
# All paths below are relative, so they resolve against the current working directory.
IMAGE_PATH = 'Images/Burger King.jpg'  # Replace with your image
MIDI_DIR = 'Music Genre/maestro-v3.0.0'  # MAESTRO dataset directory
SOUNDFONT_PATH = 'FluidR3_GM.sf2'  # Update with your SoundFont path
OUTPUT_MIDI = 'generated_music.mid'
OUTPUT_WAV = 'generated_music.wav'
OUTPUT_MP3 = 'generated_music.mp3'
SAMPLE_RATE = 44100  # Passed to FluidSynth as the audio sample rate (-r)
MIDI_RESOLUTION = 480  # Ticks per quarter note for MIDI
SEQUENCE_LENGTH = 50  # Notes kept per MIDI file (default max_length of midi_to_sequence)
INPUT_SEQUENCE_LENGTH = SEQUENCE_LENGTH - 1  # Model expects 49 steps (last note is only a target)
FEATURE_DIM = 512  # Number of VGG16 feature values kept by extract_image_features

# Ensure SoundFont exists: fail at start-up rather than after feature
# extraction and training, when the WAV rendering step would need it.
if not os.path.exists(SOUNDFONT_PATH):
    raise FileNotFoundError(f"SoundFont not found at {SOUNDFONT_PATH}")

# --- Image Feature Extraction ---
def extract_image_features(img_path):
    """
    Extract low-level and high-level features from an image.

    The returned vector concatenates, in this order:
      * 256 brightness values: a 16x16 grid of mean intensity covering the
        whole image, scaled to 0..1;
      * 256 edge-density values: the Canny edge map of a 64x64 thumbnail
        averaged onto the same 16x16 grid, scaled to 0..1;
      * 96 colour-histogram values: 32 bins for each of R, G and B, each
        channel normalised so its bins sum to 1;
      * up to FEATURE_DIM VGG16 values: the output of ``vgg.features``
        averaged over its spatial positions (one value per channel).
    The concatenated vector is finally standardised as a whole to zero mean
    and unit variance.

    Args:
        img_path: Path of an image file readable by OpenCV.

    Returns:
        1-D NumPy array of standardised features.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Image not found: {img_path}")
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_small = cv2.resize(img_rgb, (64, 64))

    # Summarise on a 16x16 grid so that 256 values describe the whole image.
    # Slicing the flattened 64x64 map with [:256] kept only the top four
    # pixel rows and ignored everything below them.
    grid = (16, 16)
    brightness = np.mean(img_small, axis=2)
    brightness_flat = cv2.resize(
        brightness, grid, interpolation=cv2.INTER_AREA
    ).flatten() / 255.0
    edges = cv2.Canny(cv2.cvtColor(img_small, cv2.COLOR_RGB2GRAY), 100, 200)
    # Convert to float before averaging so cell densities are not rounded.
    edges_flat = cv2.resize(
        edges.astype(np.float32), grid, interpolation=cv2.INTER_AREA
    ).flatten() / 255.0

    # Normalise by the pixel count (not ``.size``, which also counts the
    # three channels) so every channel histogram sums to 1.
    pixel_count = img_small.shape[0] * img_small.shape[1]
    hist = np.concatenate([
        cv2.calcHist([img_small], [channel], None, [32], [0, 256]) / pixel_count
        for channel in range(3)
    ]).flatten()

    # NOTE: the network is re-created on every call; cache it outside this
    # function if features are needed for many images.
    vgg = models.vgg16(pretrained=True).eval()
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # Reuse the pixels OpenCV already decoded: no second file read, no
    # unclosed PIL file handle, and both feature families see the same image.
    image = Image.fromarray(img_rgb)
    input_tensor = transform(image).unsqueeze(0)
    with torch.no_grad():
        feature_maps = vgg.features(input_tensor)
        # Average over the spatial axes to get one descriptor per channel;
        # truncating the flattened maps kept only the first few channels.
        vgg_features = feature_maps.mean(dim=(2, 3)).view(-1).numpy()
    vgg_features = vgg_features[:FEATURE_DIM]

    combined_features = np.concatenate([brightness_flat, edges_flat, hist, vgg_features])
    # Treat the vector as a single column so it is standardised jointly.
    scaler = StandardScaler()
    combined_features = scaler.fit_transform(combined_features.reshape(-1, 1)).flatten()
    return combined_features

# --- MIDI Data Processing ---
def midi_to_sequence(midi_file, max_length=SEQUENCE_LENGTH):
    """
    Convert a MIDI file to a fixed-length sequence of notes.

    Each row is ``[pitch, velocity, duration, instrument_program]``, where
    ``duration`` is ``note.end - note.start`` as reported by pretty_midi.
    Notes from all instruments are merged and kept in onset order so that
    consecutive rows are musically consecutive. If the file holds more than
    ``max_length`` notes, a random contiguous excerpt is taken, so repeated
    calls still yield varied material. Shorter files are padded at the end
    with all-zero rows.

    Args:
        midi_file: Path to the MIDI file.
        max_length: Number of rows in the returned array.

    Returns:
        Float array of shape ``(max_length, 4)``. If the file cannot be
        parsed, the error is printed and an all-zero array is returned.
    """
    try:
        midi = pretty_midi.PrettyMIDI(midi_file)
    except Exception as e:  # best effort: one corrupt file must not stop loading
        print(f"Error processing {midi_file}: {e}")
        return np.zeros((max_length, 4))

    notes = []
    for instrument in midi.instruments:
        for note in instrument.notes:
            duration = note.end - note.start
            if duration <= 0:
                continue
            notes.append(
                (note.start, note.pitch, note.velocity, duration, instrument.program)
            )

    # Order by onset time. Sorting with a random key shuffled the notes and
    # removed the temporal structure the next-note model is trained on.
    notes.sort(key=lambda entry: entry[0])
    if len(notes) > max_length:
        offset = random.randrange(len(notes) - max_length + 1)
        notes = notes[offset:offset + max_length]

    # Start from a float array so the result has the same dtype whether or
    # not any notes were found (an all-padding list used to become int).
    sequence = np.zeros((max_length, 4))
    if notes:
        sequence[:len(notes)] = [entry[1:] for entry in notes]
    return sequence

def prepare_training_data(midi_dir, max_files=100):
    """
    Load MIDI files and prepare normalised sequences for RNN training.

    Args:
        midi_dir: Directory searched recursively for ``.midi`` / ``.mid`` files.
        max_files: Maximum number of files to load (in sorted path order);
            ``None`` loads every file found.

    Returns:
        Float array of shape ``(num_sequences, SEQUENCE_LENGTH, 4)`` with
        columns pitch, velocity, duration and instrument program. Pitch,
        velocity and program are divided by 127; duration is divided by the
        largest duration in the loaded data.

    Raises:
        FileNotFoundError: If no MIDI files are found under ``midi_dir``.
        ValueError: If none of the files yields a usable note sequence.
    """
    # Sort so the selected subset does not depend on directory listing order.
    midi_files = sorted(
        glob.glob(os.path.join(midi_dir, '**/*.midi'), recursive=True)
        + glob.glob(os.path.join(midi_dir, '**/*.mid'), recursive=True)
    )
    if not midi_files:
        raise FileNotFoundError(f"No MIDI files found in {midi_dir}")
    if max_files is not None:
        midi_files = midi_files[:max_files]

    sequences = []
    for midi_file in midi_files:
        seq = np.asarray(midi_to_sequence(midi_file), dtype=np.float64)
        # An all-zero array means the file failed to parse or had no notes;
        # training on it would only teach the model to emit padding.
        if not seq.any():
            continue
        sequences.append(seq)
    if not sequences:
        raise ValueError(f"No usable note sequences could be extracted from {midi_dir}")

    # Explicit float array: the in-place divisions below need a float dtype.
    sequences = np.stack(sequences)
    max_duration = np.max(sequences[:, :, 2])
    # Per-column scale factors; the epsilon guards the duration division.
    sequences /= np.array([127.0, 127.0, max_duration + 1e-6, 127.0])
    return sequences

# --- RNN Model ---
def build_rnn_model(input_shape, image_feature_dim):
    """
    Build and compile the image-conditioned LSTM note model.

    The image vector is repeated along the time axis and concatenated to
    every note step before two stacked LSTMs. Four per-timestep heads are
    produced, in this order: ``pitch`` (128-way softmax), ``velocity``
    (sigmoid scalar), ``duration`` (ReLU scalar) and ``instrument``
    (128-way softmax).

    Args:
        input_shape: ``(timesteps, note_features)`` of the note input.
        image_feature_dim: Length of the image feature vector.

    Returns:
        A compiled Keras model taking ``[notes, image_features]``.
    """
    timesteps = input_shape[0]

    notes_in = layers.Input(shape=input_shape)
    image_in = layers.Input(shape=(image_feature_dim,))

    # Broadcast the single image vector to every timestep.
    tiled_image = layers.RepeatVector(timesteps)(image_in)
    merged = layers.Concatenate(axis=-1)([notes_in, tiled_image])

    hidden = layers.LSTM(256, return_sequences=True)(merged)
    hidden = layers.LSTM(128, return_sequences=True)(hidden)

    # (head name, units, activation, loss, metric) - the order fixes the
    # order of the model outputs.
    head_specs = [
        ('pitch', 128, 'softmax', 'sparse_categorical_crossentropy', 'accuracy'),
        ('velocity', 1, 'sigmoid', 'mse', 'mae'),
        ('duration', 1, 'relu', 'mse', 'mae'),
        ('instrument', 128, 'softmax', 'sparse_categorical_crossentropy', 'accuracy'),
    ]
    heads = [
        layers.TimeDistributed(
            layers.Dense(units, activation=activation), name=head_name
        )(hidden)
        for head_name, units, activation, _, _ in head_specs
    ]
    losses = {head_name: loss for head_name, _, _, loss, _ in head_specs}
    metrics = {head_name: metric for head_name, _, _, _, metric in head_specs}

    model = tf_models.Model(inputs=[notes_in, image_in], outputs=heads)
    model.compile(optimizer='adam', loss=losses, metrics=metrics)
    return model

# --- Train RNN ---
def train_rnn(midi_dir, image_feature_dim, epochs=10, batch_size=32):
    """
    Train the RNN model on MIDI sequences (next-note prediction).

    The input is every note of a sequence except the last; the targets are
    the same sequence shifted by one step.

    NOTE: no image/music pairs exist, so the image input is fed uniform
    random vectors during training. The model therefore does not learn a
    real mapping from image content to music.

    Args:
        midi_dir: Directory with the training MIDI files.
        image_feature_dim: Length of the image feature vector the model
            will receive at generation time.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Returns:
        The trained Keras model.
    """
    sequences = prepare_training_data(midi_dir)
    X = sequences[:, :-1, :]
    targets = sequences[:, 1:, :]

    # Class ids must be exact integers. ``value / 127 * 127`` is not
    # guaranteed to round-trip in floating point, and a result just below an
    # integer would be truncated to the wrong class when the sparse
    # cross-entropy loss casts the labels. Round explicitly instead.
    y_pitch = np.rint(targets[..., 0:1] * 127).astype(np.int32)
    y_velocity = targets[..., 1:2]
    y_duration = targets[..., 2:3]
    y_instrument = np.rint(targets[..., 3:4] * 127).astype(np.int32)

    dummy_features = np.random.rand(len(sequences), image_feature_dim)
    model = build_rnn_model((X.shape[1], X.shape[2]), image_feature_dim)
    model.fit(
        [X, dummy_features],
        {
            'pitch': y_pitch,
            'velocity': y_velocity,
            'duration': y_duration,
            'instrument': y_instrument
        },
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )
    return model

# --- Music Generation ---
def _sample_class(probs, temperature):
    """Pick a class index: argmax when temperature is None/<=0, else sample."""
    if temperature is None or temperature <= 0:
        return int(np.argmax(probs))
    logits = np.log(np.asarray(probs, dtype=np.float64) + 1e-12) / temperature
    weights = np.exp(logits - logits.max())  # shift for numerical stability
    return int(np.random.choice(len(weights), p=weights / weights.sum()))


def generate_music(model, image_features, seed_sequence, num_notes=100, temperature=None):
    """
    Generate a MIDI sequence using the trained RNN conditioned on image features.

    Starting from ``seed_sequence``, the model is queried once per note;
    the prediction for the last timestep becomes the next note and is fed
    back (normalised) as the newest row of the input window.

    Args:
        model: Object with a Keras-style ``predict`` that returns
            ``(pitch_probs, velocity, duration, instrument_probs)``.
        image_features: 1-D image feature vector.
        seed_sequence: Array of shape ``(steps, 4)`` of normalised notes;
            truncated to the last INPUT_SEQUENCE_LENGTH rows or zero-padded
            at the front to that length.
        num_notes: Number of notes to generate.
        temperature: ``None`` (default) picks the most likely pitch and
            instrument; a positive value samples from the predicted
            distributions, with larger values giving more varied output.

    Returns:
        Float array of shape ``(num_notes, 4)`` with columns pitch (0-127),
        velocity (0-127), duration (model output times 2) and instrument
        program (0-127).
    """
    window = np.asarray(seed_sequence, dtype=np.float64)
    if window.shape[0] > INPUT_SEQUENCE_LENGTH:
        window = window[-INPUT_SEQUENCE_LENGTH:]
    elif window.shape[0] < INPUT_SEQUENCE_LENGTH:
        padding = np.zeros((INPUT_SEQUENCE_LENGTH - window.shape[0], 4))
        window = np.vstack([padding, window])

    # The image vector never changes, so shape it once outside the loop.
    image_input = np.asarray(image_features).reshape(1, -1)
    generated_notes = []
    for _ in range(num_notes):
        input_seq = window.reshape(1, INPUT_SEQUENCE_LENGTH, 4)
        pitch_probs, velocity, duration, instrument_probs = model.predict(
            [input_seq, image_input], verbose=0
        )
        pitch = _sample_class(pitch_probs[0, -1, :], temperature)
        velocity_val = velocity[0, -1, 0] * 127
        duration_val = duration[0, -1, 0] * 2
        instrument = _sample_class(instrument_probs[0, -1, :], temperature)
        generated_notes.append([pitch, velocity_val, duration_val, instrument])
        new_note = np.array([[pitch / 127, velocity_val / 127, duration_val / 2, instrument / 127]])
        # Slide the fixed-size window instead of growing the full history:
        # only the last INPUT_SEQUENCE_LENGTH rows are ever read.
        window = np.vstack([window[1:], new_note])
    # reshape keeps the (n, 4) layout even when num_notes is 0.
    return np.array(generated_notes, dtype=np.float64).reshape(-1, 4)

def notes_to_midi(notes, output_midi, min_duration=0.05):
    """
    Convert generated notes to a MIDI file.

    Notes are laid out one after another; each note starts half of the
    previous note's duration after the previous start, so neighbours
    overlap.

    Args:
        notes: Iterable of ``[pitch, velocity, duration, program]`` rows.
            Pitch, velocity and program are clipped to 0-127.
        output_midi: Path of the MIDI file to write.
        min_duration: Lower bound applied to every note's duration. The
            duration head of the model can output 0, which would produce
            zero-length notes and stop the timeline from advancing.
    """
    midi = pretty_midi.PrettyMIDI(resolution=MIDI_RESOLUTION)
    # One track per program. Creating a new Instrument on every program
    # change produced an unbounded number of duplicate tracks.
    instruments = {}
    start_time = 0.0
    for pitch, velocity, duration, program in notes:
        program = int(np.clip(program, 0, 127))
        instr = instruments.get(program)
        if instr is None:
            instr = pretty_midi.Instrument(program=program)
            midi.instruments.append(instr)
            instruments[program] = instr
        duration = max(float(duration), min_duration)
        instr.notes.append(pretty_midi.Note(
            velocity=int(np.clip(velocity, 0, 127)),
            pitch=int(np.clip(pitch, 0, 127)),
            start=start_time,
            end=start_time + duration
        ))
        start_time += duration / 2
    midi.write(output_midi)
    print(f"✅ Music saved as {output_midi}")

# --- Audio Conversion ---
def midi_to_wav(midi_file, soundfont_path, output_wav):
    """
    Convert MIDI to WAV using FluidSynth.

    Args:
        midi_file: Path of the MIDI file to render.
        soundfont_path: Path of the SoundFont (.sf2) used for synthesis.
        output_wav: Path of the WAV file to write.

    Raises:
        FileNotFoundError: If the ``fluidsynth`` executable is not on PATH.
        subprocess.CalledProcessError: If FluidSynth exits with an error.
    """
    # FluidSynth's usage is ``fluidsynth [options] [soundfonts] [midifiles]``.
    # Keep every option ahead of the positional files, because option
    # parsing after the first positional argument is platform-dependent.
    command = [
        'fluidsynth',
        '-ni',
        '-F', output_wav,
        '-r', str(SAMPLE_RATE),
        soundfont_path, midi_file,
    ]
    subprocess.run(command, check=True)
    print(f"✅ Converted to {output_wav}")

def apply_audio_effects(input_wav, output_wav, output_mp3):
    """
    Add a single delayed echo to a WAV file, normalise it and export WAV/MP3.

    The echo is a copy of the signal, 6 dB quieter and delayed by 120 ms.
    The output is 120 ms longer than the input so the echo tail is kept.
    ``input_wav`` and ``output_wav`` may be the same path, because the input
    is loaded before anything is written.

    Args:
        input_wav: Path of the WAV file to process.
        output_wav: Path of the processed WAV file to write.
        output_mp3: Path of the processed MP3 file to write.
    """
    delay_ms = 120
    echo_gain_db = -6

    dry = AudioSegment.from_wav(input_wav)
    # Leave headroom before mixing: with the peak at -4 dBFS (~0.63 of full
    # scale), the dry signal plus a -6 dB echo (~0.32) cannot exceed full
    # scale. Mixing a fully normalised signal with its echo could clip.
    dry = effects.normalize(dry, headroom=4.0)

    gap = AudioSegment.silent(duration=delay_ms, frame_rate=dry.frame_rate)
    echo = gap + (dry + echo_gain_db)
    # overlay() keeps the length of the segment it is called on, so extend
    # the dry signal by the delay; otherwise the echo tail is cut off.
    mixed = (dry + gap).overlay(echo)
    # Normalise last so the exported files use the full available level.
    mixed = effects.normalize(mixed)

    mixed.export(output_wav, format='wav')
    mixed.export(output_mp3, format='mp3')
    print(f"✅ Exported: {output_wav}, {output_mp3}")

# --- Main Execution ---
def main():
    """
    Run the full pipeline: image features -> train RNN -> generate notes ->
    write MIDI -> render WAV -> apply effects and export WAV/MP3.

    When IPython is installed, the resulting MP3 is also shown as an audio
    widget; otherwise that step is skipped with a message.
    """
    print("Extracting image features...")
    image_features = extract_image_features(IMAGE_PATH)
    print("Training RNN...")
    rnn_model = train_rnn(MIDI_DIR, image_features.shape[0])
    # The generation seed is random noise in the normalised 0..1 note range.
    seed_sequence = np.random.rand(INPUT_SEQUENCE_LENGTH, 4)
    print("Generating music...")
    generated_notes = generate_music(rnn_model, image_features, seed_sequence)
    notes_to_midi(generated_notes, OUTPUT_MIDI)
    midi_to_wav(OUTPUT_MIDI, SOUNDFONT_PATH, OUTPUT_WAV)
    apply_audio_effects(OUTPUT_WAV, OUTPUT_WAV, OUTPUT_MP3)
    try:
        # ``display`` is only a builtin inside notebooks; import it
        # explicitly so running this as a script does not raise NameError.
        from IPython.display import Audio, display
    except ImportError:
        print("IPython not available, skipping audio display.")
    else:
        display(Audio(filename=OUTPUT_MP3))

if __name__ == '__main__':
    main()

Extracting image features...




Training RNN...
Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 104ms/step - duration_loss: 0.0061 - duration_mae: 0.0400 - instrument_accuracy: 0.5053 - instrument_loss: 3.2268 - loss: 8.2728 - pitch_accuracy: 0.0016 - pitch_loss: 4.8394 - velocity_loss: 0.0279 - velocity_mae: 0.1345  
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - duration_loss: 0.0013 - duration_mae: 0.0195 - instrument_accuracy: 1.0000 - instrument_loss: 0.4298 - loss: 5.2080 - pitch_accuracy: 0.0247 - pitch_loss: 4.7127 - velocity_loss: 0.0331 - velocity_mae: 0.1435
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - duration_loss: 0.0017 - duration_mae: 0.0201 - instrument_accuracy: 1.0000 - instrument_loss: 0.0993 - loss: 4.6163 - pitch_accuracy: 0.0240 - pitch_loss: 4.4718 - velocity_loss: 0.0218 - velocity_mae: 0.1203
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - duration_los

In [5]:
# image_to_music.py
# Generate varied music from images using RNN trained on MAESTRO dataset

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import pretty_midi
import subprocess
from pydub import AudioSegment, effects
import tensorflow as tf
from tensorflow.keras import layers, models as tf_models
import glob
import random
from sklearn.preprocessing import StandardScaler

# Suppress TensorFlow warnings for cleaner output (only ERROR and above are logged)
tf.get_logger().setLevel('ERROR')

# --- Configuration ---
# All paths below are relative, so they resolve against the current working directory.
IMAGE_PATH = 'Images/Burger King.jpg'  # Replace with your image
MIDI_DIR = 'Music Genre/maestro-v3.0.0'  # MAESTRO dataset directory
SOUNDFONT_PATH = 'FluidR3_GM.sf2'  # Update with your SoundFont path
OUTPUT_MIDI = 'generated_music.mid'
OUTPUT_WAV = 'generated_music.wav'
OUTPUT_MP3 = 'generated_music.mp3'
SAMPLE_RATE = 44100  # Passed to FluidSynth as the audio sample rate (-r)
MIDI_RESOLUTION = 480  # Ticks per quarter note for MIDI
SEQUENCE_LENGTH = 50  # Notes kept per MIDI file (default max_length of midi_to_sequence)
INPUT_SEQUENCE_LENGTH = SEQUENCE_LENGTH - 1  # Model expects 49 steps (last note is only a target)
FEATURE_DIM = 512  # Number of VGG16 feature values kept by extract_image_features

# Ensure SoundFont exists: fail at start-up rather than after feature
# extraction and training, when the WAV rendering step would need it.
if not os.path.exists(SOUNDFONT_PATH):
    raise FileNotFoundError(f"SoundFont not found at {SOUNDFONT_PATH}")

# --- Image Feature Extraction ---
def extract_image_features(img_path):
    """
    Extract low-level and high-level features from an image.

    The returned vector concatenates, in this order:
      * 256 brightness values: a 16x16 grid of mean intensity covering the
        whole image, scaled to 0..1;
      * 256 edge-density values: the Canny edge map of a 64x64 thumbnail
        averaged onto the same 16x16 grid, scaled to 0..1;
      * 96 colour-histogram values: 32 bins for each of R, G and B, each
        channel normalised so its bins sum to 1;
      * up to FEATURE_DIM VGG16 values: the output of ``vgg.features``
        averaged over its spatial positions (one value per channel).
    The concatenated vector is finally standardised as a whole to zero mean
    and unit variance.

    Args:
        img_path: Path of an image file readable by OpenCV.

    Returns:
        1-D NumPy array of standardised features.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Image not found: {img_path}")
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_small = cv2.resize(img_rgb, (64, 64))

    # Summarise on a 16x16 grid so that 256 values describe the whole image.
    # Slicing the flattened 64x64 map with [:256] kept only the top four
    # pixel rows and ignored everything below them.
    grid = (16, 16)
    brightness = np.mean(img_small, axis=2)
    brightness_flat = cv2.resize(
        brightness, grid, interpolation=cv2.INTER_AREA
    ).flatten() / 255.0
    edges = cv2.Canny(cv2.cvtColor(img_small, cv2.COLOR_RGB2GRAY), 100, 200)
    # Convert to float before averaging so cell densities are not rounded.
    edges_flat = cv2.resize(
        edges.astype(np.float32), grid, interpolation=cv2.INTER_AREA
    ).flatten() / 255.0

    # Normalise by the pixel count (not ``.size``, which also counts the
    # three channels) so every channel histogram sums to 1.
    pixel_count = img_small.shape[0] * img_small.shape[1]
    hist = np.concatenate([
        cv2.calcHist([img_small], [channel], None, [32], [0, 256]) / pixel_count
        for channel in range(3)
    ]).flatten()

    # NOTE: the network is re-created on every call; cache it outside this
    # function if features are needed for many images.
    vgg = models.vgg16(pretrained=True).eval()
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # Reuse the pixels OpenCV already decoded: no second file read, no
    # unclosed PIL file handle, and both feature families see the same image.
    image = Image.fromarray(img_rgb)
    input_tensor = transform(image).unsqueeze(0)
    with torch.no_grad():
        feature_maps = vgg.features(input_tensor)
        # Average over the spatial axes to get one descriptor per channel;
        # truncating the flattened maps kept only the first few channels.
        vgg_features = feature_maps.mean(dim=(2, 3)).view(-1).numpy()
    vgg_features = vgg_features[:FEATURE_DIM]

    combined_features = np.concatenate([brightness_flat, edges_flat, hist, vgg_features])
    # Treat the vector as a single column so it is standardised jointly.
    scaler = StandardScaler()
    combined_features = scaler.fit_transform(combined_features.reshape(-1, 1)).flatten()
    return combined_features

# --- MIDI Data Processing ---
def midi_to_sequence(midi_file, max_length=SEQUENCE_LENGTH):
    """
    Convert a MIDI file to a fixed-length sequence of notes.

    Each row is ``[pitch, velocity, duration, instrument_program]``, where
    ``duration`` is ``note.end - note.start`` as reported by pretty_midi.
    Notes from all instruments are merged and kept in onset order so that
    consecutive rows are musically consecutive. If the file holds more than
    ``max_length`` notes, a random contiguous excerpt is taken, so repeated
    calls still yield varied material. Shorter files are padded at the end
    with all-zero rows.

    Args:
        midi_file: Path to the MIDI file.
        max_length: Number of rows in the returned array.

    Returns:
        Float array of shape ``(max_length, 4)``. If the file cannot be
        parsed, the error is printed and an all-zero array is returned.
    """
    try:
        midi = pretty_midi.PrettyMIDI(midi_file)
    except Exception as e:  # best effort: one corrupt file must not stop loading
        print(f"Error processing {midi_file}: {e}")
        return np.zeros((max_length, 4))

    notes = []
    for instrument in midi.instruments:
        for note in instrument.notes:
            duration = note.end - note.start
            if duration <= 0:
                continue
            notes.append(
                (note.start, note.pitch, note.velocity, duration, instrument.program)
            )

    # Order by onset time. Sorting with a random key shuffled the notes and
    # removed the temporal structure the next-note model is trained on.
    notes.sort(key=lambda entry: entry[0])
    if len(notes) > max_length:
        offset = random.randrange(len(notes) - max_length + 1)
        notes = notes[offset:offset + max_length]

    # Start from a float array so the result has the same dtype whether or
    # not any notes were found (an all-padding list used to become int).
    sequence = np.zeros((max_length, 4))
    if notes:
        sequence[:len(notes)] = [entry[1:] for entry in notes]
    return sequence

def prepare_training_data(midi_dir, max_files=100):
    """
    Load MIDI files and prepare normalised sequences for RNN training.

    Args:
        midi_dir: Directory searched recursively for ``.midi`` / ``.mid`` files.
        max_files: Maximum number of files to load (in sorted path order);
            ``None`` loads every file found.

    Returns:
        Float array of shape ``(num_sequences, SEQUENCE_LENGTH, 4)`` with
        columns pitch, velocity, duration and instrument program. Pitch,
        velocity and program are divided by 127; duration is divided by the
        largest duration in the loaded data.

    Raises:
        FileNotFoundError: If no MIDI files are found under ``midi_dir``.
        ValueError: If none of the files yields a usable note sequence.
    """
    # Sort so the selected subset does not depend on directory listing order.
    midi_files = sorted(
        glob.glob(os.path.join(midi_dir, '**/*.midi'), recursive=True)
        + glob.glob(os.path.join(midi_dir, '**/*.mid'), recursive=True)
    )
    if not midi_files:
        raise FileNotFoundError(f"No MIDI files found in {midi_dir}")
    if max_files is not None:
        midi_files = midi_files[:max_files]

    sequences = []
    for midi_file in midi_files:
        seq = np.asarray(midi_to_sequence(midi_file), dtype=np.float64)
        # An all-zero array means the file failed to parse or had no notes;
        # training on it would only teach the model to emit padding.
        if not seq.any():
            continue
        sequences.append(seq)
    if not sequences:
        raise ValueError(f"No usable note sequences could be extracted from {midi_dir}")

    # Explicit float array: the in-place divisions below need a float dtype.
    sequences = np.stack(sequences)
    max_duration = np.max(sequences[:, :, 2])
    # Per-column scale factors; the epsilon guards the duration division.
    sequences /= np.array([127.0, 127.0, max_duration + 1e-6, 127.0])
    return sequences

# --- RNN Model ---
def build_rnn_model(input_shape, image_feature_dim):
    """
    Build and compile the image-conditioned LSTM note model.

    The image vector is repeated along the time axis and concatenated to
    every note step before two stacked LSTMs. Four per-timestep heads are
    produced, in this order: ``pitch`` (128-way softmax), ``velocity``
    (sigmoid scalar), ``duration`` (ReLU scalar) and ``instrument``
    (128-way softmax).

    Args:
        input_shape: ``(timesteps, note_features)`` of the note input.
        image_feature_dim: Length of the image feature vector.

    Returns:
        A compiled Keras model taking ``[notes, image_features]``.
    """
    timesteps = input_shape[0]

    notes_in = layers.Input(shape=input_shape)
    image_in = layers.Input(shape=(image_feature_dim,))

    # Broadcast the single image vector to every timestep.
    tiled_image = layers.RepeatVector(timesteps)(image_in)
    merged = layers.Concatenate(axis=-1)([notes_in, tiled_image])

    hidden = layers.LSTM(256, return_sequences=True)(merged)
    hidden = layers.LSTM(128, return_sequences=True)(hidden)

    # (head name, units, activation, loss, metric) - the order fixes the
    # order of the model outputs.
    head_specs = [
        ('pitch', 128, 'softmax', 'sparse_categorical_crossentropy', 'accuracy'),
        ('velocity', 1, 'sigmoid', 'mse', 'mae'),
        ('duration', 1, 'relu', 'mse', 'mae'),
        ('instrument', 128, 'softmax', 'sparse_categorical_crossentropy', 'accuracy'),
    ]
    heads = [
        layers.TimeDistributed(
            layers.Dense(units, activation=activation), name=head_name
        )(hidden)
        for head_name, units, activation, _, _ in head_specs
    ]
    losses = {head_name: loss for head_name, _, _, loss, _ in head_specs}
    metrics = {head_name: metric for head_name, _, _, _, metric in head_specs}

    model = tf_models.Model(inputs=[notes_in, image_in], outputs=heads)
    model.compile(optimizer='adam', loss=losses, metrics=metrics)
    return model

# --- Train RNN ---
def train_rnn(midi_dir, image_feature_dim, epochs=10, batch_size=32):
    """
    Train the RNN model on MIDI sequences (next-note prediction).

    The input is every note of a sequence except the last; the targets are
    the same sequence shifted by one step.

    NOTE: no image/music pairs exist, so the image input is fed uniform
    random vectors during training. The model therefore does not learn a
    real mapping from image content to music.

    Args:
        midi_dir: Directory with the training MIDI files.
        image_feature_dim: Length of the image feature vector the model
            will receive at generation time.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Returns:
        The trained Keras model.
    """
    sequences = prepare_training_data(midi_dir)
    X = sequences[:, :-1, :]
    targets = sequences[:, 1:, :]

    # Class ids must be exact integers. ``value / 127 * 127`` is not
    # guaranteed to round-trip in floating point, and a result just below an
    # integer would be truncated to the wrong class when the sparse
    # cross-entropy loss casts the labels. Round explicitly instead.
    y_pitch = np.rint(targets[..., 0:1] * 127).astype(np.int32)
    y_velocity = targets[..., 1:2]
    y_duration = targets[..., 2:3]
    y_instrument = np.rint(targets[..., 3:4] * 127).astype(np.int32)

    dummy_features = np.random.rand(len(sequences), image_feature_dim)
    model = build_rnn_model((X.shape[1], X.shape[2]), image_feature_dim)
    model.fit(
        [X, dummy_features],
        {
            'pitch': y_pitch,
            'velocity': y_velocity,
            'duration': y_duration,
            'instrument': y_instrument
        },
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )
    return model

# --- Music Generation ---
def _sample_class(probs, temperature):
    """Pick a class index: argmax when temperature is None/<=0, else sample."""
    if temperature is None or temperature <= 0:
        return int(np.argmax(probs))
    logits = np.log(np.asarray(probs, dtype=np.float64) + 1e-12) / temperature
    weights = np.exp(logits - logits.max())  # shift for numerical stability
    return int(np.random.choice(len(weights), p=weights / weights.sum()))


def generate_music(model, image_features, seed_sequence, num_notes=100, temperature=None):
    """
    Generate a MIDI sequence using the trained RNN conditioned on image features.

    Starting from ``seed_sequence``, the model is queried once per note;
    the prediction for the last timestep becomes the next note and is fed
    back (normalised) as the newest row of the input window.

    Args:
        model: Object with a Keras-style ``predict`` that returns
            ``(pitch_probs, velocity, duration, instrument_probs)``.
        image_features: 1-D image feature vector.
        seed_sequence: Array of shape ``(steps, 4)`` of normalised notes;
            truncated to the last INPUT_SEQUENCE_LENGTH rows or zero-padded
            at the front to that length.
        num_notes: Number of notes to generate.
        temperature: ``None`` (default) picks the most likely pitch and
            instrument; a positive value samples from the predicted
            distributions, with larger values giving more varied output.

    Returns:
        Float array of shape ``(num_notes, 4)`` with columns pitch (0-127),
        velocity (0-127), duration (model output times 2) and instrument
        program (0-127).
    """
    window = np.asarray(seed_sequence, dtype=np.float64)
    if window.shape[0] > INPUT_SEQUENCE_LENGTH:
        window = window[-INPUT_SEQUENCE_LENGTH:]
    elif window.shape[0] < INPUT_SEQUENCE_LENGTH:
        padding = np.zeros((INPUT_SEQUENCE_LENGTH - window.shape[0], 4))
        window = np.vstack([padding, window])

    # The image vector never changes, so shape it once outside the loop.
    image_input = np.asarray(image_features).reshape(1, -1)
    generated_notes = []
    for _ in range(num_notes):
        input_seq = window.reshape(1, INPUT_SEQUENCE_LENGTH, 4)
        pitch_probs, velocity, duration, instrument_probs = model.predict(
            [input_seq, image_input], verbose=0
        )
        pitch = _sample_class(pitch_probs[0, -1, :], temperature)
        velocity_val = velocity[0, -1, 0] * 127
        duration_val = duration[0, -1, 0] * 2
        instrument = _sample_class(instrument_probs[0, -1, :], temperature)
        generated_notes.append([pitch, velocity_val, duration_val, instrument])
        new_note = np.array([[pitch / 127, velocity_val / 127, duration_val / 2, instrument / 127]])
        # Slide the fixed-size window instead of growing the full history:
        # only the last INPUT_SEQUENCE_LENGTH rows are ever read.
        window = np.vstack([window[1:], new_note])
    # reshape keeps the (n, 4) layout even when num_notes is 0.
    return np.array(generated_notes, dtype=np.float64).reshape(-1, 4)

def notes_to_midi(notes, output_midi, min_duration=0.05):
    """
    Convert generated notes to a MIDI file.

    Notes are laid out one after another; each note starts half of the
    previous note's duration after the previous start, so neighbours
    overlap.

    Args:
        notes: Iterable of ``[pitch, velocity, duration, program]`` rows.
            Pitch, velocity and program are clipped to 0-127.
        output_midi: Path of the MIDI file to write.
        min_duration: Lower bound applied to every note's duration. The
            duration head of the model can output 0, which would produce
            zero-length notes and stop the timeline from advancing.
    """
    midi = pretty_midi.PrettyMIDI(resolution=MIDI_RESOLUTION)
    # One track per program. Creating a new Instrument on every program
    # change produced an unbounded number of duplicate tracks.
    instruments = {}
    start_time = 0.0
    for pitch, velocity, duration, program in notes:
        program = int(np.clip(program, 0, 127))
        instr = instruments.get(program)
        if instr is None:
            instr = pretty_midi.Instrument(program=program)
            midi.instruments.append(instr)
            instruments[program] = instr
        duration = max(float(duration), min_duration)
        instr.notes.append(pretty_midi.Note(
            velocity=int(np.clip(velocity, 0, 127)),
            pitch=int(np.clip(pitch, 0, 127)),
            start=start_time,
            end=start_time + duration
        ))
        start_time += duration / 2
    midi.write(output_midi)
    print(f"✅ Music saved as {output_midi}")

# --- Audio Conversion ---
def midi_to_wav(midi_file, soundfont_path, output_wav):
    """
    Convert MIDI to WAV using FluidSynth.

    Args:
        midi_file: Path of the MIDI file to render.
        soundfont_path: Path of the SoundFont (.sf2) used for synthesis.
        output_wav: Path of the WAV file to write.

    Raises:
        FileNotFoundError: If the ``fluidsynth`` executable is not on PATH.
        subprocess.CalledProcessError: If FluidSynth exits with an error.
    """
    # FluidSynth's usage is ``fluidsynth [options] [soundfonts] [midifiles]``.
    # Keep every option ahead of the positional files, because option
    # parsing after the first positional argument is platform-dependent.
    command = [
        'fluidsynth',
        '-ni',
        '-F', output_wav,
        '-r', str(SAMPLE_RATE),
        soundfont_path, midi_file,
    ]
    subprocess.run(command, check=True)
    print(f"✅ Converted to {output_wav}")

def apply_audio_effects(input_wav, output_wav, output_mp3):
    """
    Add a single delayed echo to a WAV file, normalise it and export WAV/MP3.

    The echo is a copy of the signal, 6 dB quieter and delayed by 120 ms.
    The output is 120 ms longer than the input so the echo tail is kept.
    ``input_wav`` and ``output_wav`` may be the same path, because the input
    is loaded before anything is written.

    Args:
        input_wav: Path of the WAV file to process.
        output_wav: Path of the processed WAV file to write.
        output_mp3: Path of the processed MP3 file to write.
    """
    delay_ms = 120
    echo_gain_db = -6

    dry = AudioSegment.from_wav(input_wav)
    # Leave headroom before mixing: with the peak at -4 dBFS (~0.63 of full
    # scale), the dry signal plus a -6 dB echo (~0.32) cannot exceed full
    # scale. Mixing a fully normalised signal with its echo could clip.
    dry = effects.normalize(dry, headroom=4.0)

    gap = AudioSegment.silent(duration=delay_ms, frame_rate=dry.frame_rate)
    echo = gap + (dry + echo_gain_db)
    # overlay() keeps the length of the segment it is called on, so extend
    # the dry signal by the delay; otherwise the echo tail is cut off.
    mixed = (dry + gap).overlay(echo)
    # Normalise last so the exported files use the full available level.
    mixed = effects.normalize(mixed)

    mixed.export(output_wav, format='wav')
    mixed.export(output_mp3, format='mp3')
    print(f"✅ Exported: {output_wav}, {output_mp3}")

# --- Main Execution ---
def main():
    """
    Run the full pipeline: image features -> train RNN -> generate notes ->
    write MIDI -> render WAV -> apply effects and export WAV/MP3.

    When IPython is installed, the resulting MP3 is also shown as an audio
    widget; otherwise that step is skipped with a message.
    """
    print("Extracting image features...")
    image_features = extract_image_features(IMAGE_PATH)
    print("Training RNN...")
    rnn_model = train_rnn(MIDI_DIR, image_features.shape[0])
    # The generation seed is random noise in the normalised 0..1 note range.
    seed_sequence = np.random.rand(INPUT_SEQUENCE_LENGTH, 4)
    print("Generating music...")
    generated_notes = generate_music(rnn_model, image_features, seed_sequence)
    notes_to_midi(generated_notes, OUTPUT_MIDI)
    midi_to_wav(OUTPUT_MIDI, SOUNDFONT_PATH, OUTPUT_WAV)
    apply_audio_effects(OUTPUT_WAV, OUTPUT_WAV, OUTPUT_MP3)
    try:
        # ``display`` is only a builtin inside notebooks; import it
        # explicitly so running this as a script does not raise NameError.
        from IPython.display import Audio, display
    except ImportError:
        print("IPython not available, skipping audio display.")
    else:
        display(Audio(filename=OUTPUT_MP3))

if __name__ == '__main__':
    main()

Extracting image features...




Training RNN...
Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 91ms/step - duration_loss: 0.0246 - duration_mae: 0.1074 - instrument_accuracy: 0.5053 - instrument_loss: 3.5504 - loss: 8.6265 - pitch_accuracy: 0.0031 - pitch_loss: 4.8631 - velocity_loss: 0.0241 - velocity_mae: 0.1266
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - duration_loss: 0.0033 - duration_mae: 0.0269 - instrument_accuracy: 1.0000 - instrument_loss: 0.6530 - loss: 5.4352 - pitch_accuracy: 0.0162 - pitch_loss: 4.7176 - velocity_loss: 0.0208 - velocity_mae: 0.1159
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - duration_loss: 0.0038 - duration_mae: 0.0284 - instrument_accuracy: 1.0000 - instrument_loss: 0.1271 - loss: 4.6423 - pitch_accuracy: 0.0232 - pitch_loss: 4.4682 - velocity_loss: 0.0204 - velocity_mae: 0.1164
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - duration_loss: 



FluidSynth runtime version 2.4.4
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file 'generated_music.wav'..
✅ Converted to generated_music.wav
✅ Exported: generated_music.wav, generated_music.mp3
