In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
!apt install ffmpeg
!pip install "audio-separator[cpu]"

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Processing a song to extract its notes

# Beat tracking example
import librosa
import librosa.display as disp
import matplotlib.pyplot as plt
import numpy as np
from audio_separator.separator import Separator

# Song to analyse. `filename` is reused by later cells (audio loading and final mixing).
filename = "/kaggle/input/songstotest/Treat You Better.wav"

# Create the separator; the separated stems are written to /kaggle/working.
separator = Separator(output_dir="/kaggle/working")

# Load a machine learning model (if unspecified, defaults to 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt')
separator.load_model()

# Run the separation on this one file using the model loaded above.
# NOTE(review): within the visible notebook `output_files` is only printed; the analysis
# below loads the original `filename`, not the separated stems -- confirm that is intended.
output_files = separator.separate(filename)

print(f"Separation complete! Output file(s): {' '.join(output_files)}")

# Load the audio samples and sample rate. No `sr` is passed, so librosa's default
# loading settings apply.
y, sr = librosa.load(filename)

# Run the default beat tracker. `beat_frames` holds the frame index of each detected beat;
# `tempo` is not used anywhere else in the visible notebook.
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

# (disabled) tempo printout:
#print('Estimated tempo: {:.2f} beats per minute'.format,tempo)

# Convert the frame indices of beat events into timestamps.
# `beat_times` is consumed by the later note-sampling and chord-labelling cells.
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
print(beat_times)

# Split the signal into harmonic and percussive components; only the harmonic part is
# used below (`y_percussive` is not referenced again in the visible notebook).
y_harmonic, y_percussive = librosa.effects.hpss(y)

# Compute chroma features from the harmonic signal.
# Later cells index `chroma` as chroma[pitch_class_row, frame].
chroma = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)

print(chroma)

# Plot the chromagram with a colorbar and save it to test.png in the current directory.
fig, ax = plt.subplots(nrows=1)
img = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', ax=ax)
ax.set(title='chroma')
fig.colorbar(img, ax=ax)
plt.savefig('test.png')


In [None]:
# Pitch-class names in ascending order from C. A name's position in this list is the
# chroma row index it is looked up by, and its MIDI number is 60 (C) plus that position.
note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
note_to_midi = {name: 60 + offset for offset, name in enumerate(note_names)}

# Collect, for every chroma frame, the MIDI pitches whose chroma energy exceeds a threshold.
# (A disabled earlier variant instead kept the three strongest pitch classes per frame via
#  np.argsort(chroma[:, i])[-3:][::-1].)

# apparently the best threshold to capture harmonics while not capturing background noise
threshold = 0.75

# MIDI pitch for each chroma row, in row order.
midi_by_row = [note_to_midi[name] for name in note_names]

# chroma.T yields one vector of per-pitch-class energies per time frame.
dominant_notes = [
    [pitch for pitch, strength in zip(midi_by_row, frame) if strength > threshold]
    for frame in chroma.T
]

# Timestamp (in seconds) of every chroma frame.
time_values = librosa.frames_to_time(np.arange(chroma.shape[1]), sr=sr)


def _last_frame_at_or_before(frame_times, beat_times):
    """Return, for each beat, the index of the last frame whose time is <= the beat time.

    Args:
        frame_times: ascending sequence of frame timestamps.
        beat_times: sequence of beat timestamps.

    Returns:
        Integer numpy array with one frame index per beat. It is empty when there are
        no frames or no beats, so callers never index into an empty sequence.
    """
    frame_times = np.asarray(frame_times, dtype=float)
    beat_times = np.asarray(beat_times, dtype=float)
    if frame_times.size == 0 or beat_times.size == 0:
        return np.empty(0, dtype=int)
    # side='right' lands just past any frame whose time equals the beat; stepping back one
    # selects that frame itself (or the closest earlier frame when there is no exact match).
    frame_idx = np.searchsorted(frame_times, beat_times, side='right') - 1
    # Clamp so a beat earlier than the first frame maps to frame 0 instead of wrapping to -1.
    return np.clip(frame_idx, 0, frame_times.size - 1)


# Sample the per-frame notes once per beat. Every beat yields exactly one entry -- including
# a beat that falls on the very last frame -- so the result stays aligned with `beat_times`.
beat_frame_idx = _last_frame_at_or_before(time_values, beat_times)
new_time_values = [time_values[i] for i in beat_frame_idx]
new_dominant_notes = [dominant_notes[i] for i in beat_frame_idx]

# Combine time and note information
note_timeline = list(zip(new_time_values, new_dominant_notes))
print(new_dominant_notes)
#print(note_timeline)

In [None]:
import tensorflow as tf

# Chord label for each class index; the dict below maps index -> label by list position.
# 'N.C.' sits last (presumably "no chord" -- confirm against the model's training labels).
_chord_labels = [
    'A#maj', 'A#min', 'Amaj', 'Amin', 'Bmaj', 'Bmin',
    'C#maj', 'C#min', 'Cmaj', 'Cmin', 'D#maj', 'D#min',
    'Dmaj', 'Dmin', 'Emaj', 'Emin', 'F#maj', 'F#min',
    'Fmaj', 'Fmin', 'G#maj', 'G#min', 'Gmaj', 'Gmin',
    'N.C.',
]
chord_encodings = dict(enumerate(_chord_labels))

# Load the trained chord-prediction model. The path points at a `.keras` file
# (not an H5 file, as an earlier version of this comment said).
model = tf.keras.models.load_model('/kaggle/input/test-model/BestChordPredictor.keras')

# Every note sequence is padded or truncated to this many entries before prediction.
max_sequence_length = 16

def predict_chords(model, note_sequences, maxlen=None):
    """Predict one chord class index per note sequence.

    Args:
        model: object exposing ``predict(batch)`` that returns one row of class scores
            per input sequence.
        note_sequences: sequence of note lists (one list per beat); lists may be empty
            or of differing lengths.
        maxlen: length every sequence is padded/truncated to. Defaults to the
            notebook-level ``max_sequence_length`` when omitted.

    Returns:
        1-D numpy array with the arg-max class index for each input sequence; an empty
        integer array when ``note_sequences`` is empty.
    """
    # Nothing to predict (e.g. no beats were detected): return early rather than handing
    # the model an empty batch, which `predict` is not guaranteed to accept.
    if len(note_sequences) == 0:
        return np.empty(0, dtype=np.int64)

    if maxlen is None:
        maxlen = max_sequence_length

    # Pad (with zeros) and truncate at the end so each sequence keeps its leading notes.
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        note_sequences, maxlen=maxlen, padding='post', truncating='post'
    )
    predictions = model.predict(padded_sequences)
    return np.argmax(predictions, axis=1)  # Highest-scoring class for each sequence

predicted_chords = predict_chords(model, new_dominant_notes)

# Decode class indices into chord labels, and label each beat by its time rounded to
# three decimals (as a string; these strings are the dictionary keys below).
chords = [chord_encodings[chord] for chord in predicted_chords]
str_beats = [str(round(beat, 3)) for beat in beat_times]

# Keep a beat only if it is the first one or its chord differs from the previous beat's
# chord, so each remaining entry marks the moment a new chord starts.
chords_per_beat = {
    beat_label: chord
    for position, (beat_label, chord) in enumerate(zip(str_beats, chords))
    if position == 0 or chord != chords[position - 1]
}

print(chords_per_beat)

In [None]:
!pip install pretty_midi
import pretty_midi

# MIDI voicing for every major and minor chord: four pitches given as semitone offsets
# from the root pitch (the last offset, 12, is the root one octave higher).
_chord_roots = ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")
_chord_offsets = {"maj": (0, 4, 7, 12), "min": (0, 3, 7, 12)}

# Roots start at MIDI 60 for C and rise one semitone per entry.
chord_to_notes = {
    f"{root}{quality}": [60 + semitone + offset for offset in offsets]
    for semitone, root in enumerate(_chord_roots)
    for quality, offsets in _chord_offsets.items()
}

# Build a single-instrument MIDI file (program 0, piano) that plays each predicted chord.
midi = pretty_midi.PrettyMIDI()
instrument = pretty_midi.Instrument(program=0)

# Chord changes as (start_time_seconds, chord_label) pairs in chronological order.
# Iterating the pairs directly avoids turning each float back into a string to look the
# chord up again, which would raise KeyError if the text form ever differed.
chord_changes = sorted((float(beat), chord) for beat, chord in chords_per_beat.items())

# Each chord sounds until the next change. The final chord has no following change, so it
# is held until the end of the analysed audio instead of being dropped.
track_end = y.shape[-1] / sr
segment_ends = [start for start, _ in chord_changes[1:]] + [track_end]

for (start_time, chord), end_time in zip(chord_changes, segment_ends):
    # Skip labels with no voicing (e.g. 'N.C.') and degenerate zero/negative-length spans.
    if chord not in chord_to_notes or end_time <= start_time:
        continue
    for note in chord_to_notes[chord]:
        instrument.notes.append(
            pretty_midi.Note(velocity=80, pitch=note, start=start_time, end=end_time)
        )

# Add instrument to MIDI
midi.instruments.append(instrument)

# Save MIDI file (relative path: written to the current working directory)
midi.write("chords.mid")
print("MIDI file saved as chords.mid")

In [None]:
!apt-get install fluidsynth
!pip install midi2audio
from midi2audio import FluidSynth

# Convert MIDI to WAV using a soundfont.
# NOTE(review): the MIDI-writing cell saves to the relative path "chords.mid"; the absolute
# path below only matches if the working directory is /kaggle/working -- confirm.
midi_file = "/kaggle/working/chords.mid"
output_wav = "/kaggle/working/midi_audio.wav"
soundfont = "/kaggle/input/songstotest/FluidR3_GM.sf2"  # Use a GM-compatible soundfont (.sf2)

# Render `midi_file` to `output_wav` with the chosen soundfont.
fs = FluidSynth(soundfont)
fs.midi_to_audio(midi_file, output_wav)

print(f"Converted {midi_file} to {output_wav}")

In [None]:
from pydub import AudioSegment
import re

# Load the synthesized chord track and the original recording.
midi_audio = AudioSegment.from_wav("/kaggle/working/midi_audio.wav")
original_audio = AudioSegment.from_wav(filename)

# Rebalance levels: raise the chord track by 2 and lower the original by 16
# (adding/subtracting a number on an AudioSegment is a gain change).
midi_audio = midi_audio.apply_gain(2)
original_audio = original_audio.apply_gain(-16)

# Trim both tracks to the length of the shorter one (no looping is performed).
min_length = min(len(midi_audio), len(original_audio))
midi_audio, original_audio = midi_audio[:min_length], original_audio[:min_length]

# Lay the chord track over the original from the very start, then export the mix.
combined_audio = original_audio.overlay(midi_audio, position=0)
combined_audio.export("final_output.wav", format="wav")

print("Final mixed WAV file saved as final_output.wav")