In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import librosa
import numpy as np
import scipy

from keyfinder import Tonal_Fragment
from audio_utils import detect_sound_start, remove_noise_from_audio

from keycnn.classifier import KeyClassifier
from keycnn.feature import read_features

import IPython.display as ipd

In [116]:
# --- Notebook configuration -------------------------------------------------
# Recording to analyse. NOTE(review): absolute, machine-specific path; the
# notebook will only run where this file exists.
input_file = "/home/sake/userdata/sake/a2m/hummings/psycho_hum_1.wav"

# Model name handed to KeyClassifier.
model = "deepspec"

# NOTE(review): `version` is not read by any cell visible here; confirm it is
# still needed.
version = 'key 0.0.1'

In [117]:
# Render an inline audio player for the input recording so it can be auditioned.
ipd.Audio(input_file)

In [118]:
# Instantiate the key classifier from the keycnn package for the configured model name.
classifier = KeyClassifier(model)

In [119]:
# Read the classifier's input features directly from the audio file.
features = read_features(input_file)

In [120]:
# Single best key estimate from the classifier, unpacked as (tonic, mode).
tonic, mode = classifier.estimate_key(features)

In [121]:
# Display the estimated key as the cell output.
tonic, mode

('C', 'major')

In [122]:
# NOTE(review): `out` is not read by any later cell visible here. Presumably
# this is the classifier's raw output — confirm, or drop the cell.
out = classifier.estimate(features)

In [123]:
# Per-key scores from the classifier, returned as a dict keyed by
# "<tonic>:<maj|min>" labels (24 entries in the output shown below).
out_cnn = classifier.estimate_key_with_confidence(features)
out_cnn

{'C:maj': 0.15783069,
 'C#:maj': 0.028304048,
 'D:maj': 0.043494377,
 'D#:maj': 0.045109916,
 'E:maj': 0.13363402,
 'F:maj': 0.0459136,
 'F#:maj': 0.022212362,
 'G:maj': 0.122032,
 'G#:maj': 0.08895303,
 'A:maj': 0.05479422,
 'A#:maj': 0.032633368,
 'B:maj': 0.080406986,
 'C:min': 0.01348045,
 'C#:min': 0.008077531,
 'D:min': 0.0076666037,
 'D#:min': 0.0076666037,
 'E:min': 0.0374534,
 'F:min': 0.008038643,
 'F#:min': 0.0076666037,
 'G:min': 0.0076666037,
 'G#:min': 0.011406765,
 'A:min': 0.018556057,
 'A#:min': 0.0076666037,
 'B:min': 0.009335503}

In [124]:
# Load the waveform `y` and its sample rate `sr` for the rule-based analysis.
# NOTE(review): with no extra arguments librosa.load uses its defaults
# (resampling and mono down-mix) — confirm that is what the key finder expects.
y, sr = librosa.load(input_file)

In [125]:
# Pre-process the signal with the project helper `detect_sound_start`
# (presumably drops audio before the first detected sound — confirm).
# NOTE(review): this overwrites `y`, so the cell is not idempotent; re-running
# it applies the helper to already-processed audio. Re-run the load cell first.
y = detect_sound_start(y, sr)
# Optional denoising step, currently disabled:
# y = remove_noise_from_audio(y, sr)

In [126]:
# Harmonic/percussive separation. Only `y_harmonic` is used by the cells
# visible here; `y_percussive` is not read afterwards.
y_harmonic, y_percussive = librosa.effects.hpss(y)

In [127]:
# Build the rule-based key analyser on the harmonic component of the signal.
fragment = Tonal_Fragment(y_harmonic, sr)

In [128]:
# Per-key scores from the rule-based analyser, as a dict using the same
# "<tonic>:<maj|min>" labels as the classifier output (see output below).
out_rulebased = fragment.get_key_softmax()
out_rulebased

{'C:maj': 0.043940141946761824,
 'C#:maj': 0.015376314368474496,
 'D:maj': 0.012035111037638633,
 'D#:maj': 0.036155471673696676,
 'E:maj': 0.012589061442464916,
 'F:maj': 0.09209570401814164,
 'F#:maj': 0.039560387320591084,
 'G:maj': 0.011737963087370545,
 'G#:maj': 0.016408972640085877,
 'A:maj': 0.026254260940799946,
 'A#:maj': 0.1612294164199697,
 'B:maj': 0.02547833026661637,
 'C:min': 0.0668752067913972,
 'C#:min': 0.023874914314029917,
 'D:min': 0.07390853368614723,
 'D#:min': 0.08716721604349106,
 'E:min': 0.01840879054275703,
 'F:min': 0.015223317484183114,
 'F#:min': 0.011277710971505418,
 'G:min': 0.01768697154104812,
 'G#:min': 0.004253866344147167,
 'A:min': 0.03995797582307743,
 'A#:min': 0.12556559576201803,
 'B:min': 0.022938765533586584}

In [129]:
# Combine CNN and rule-based predictions with controllable weighting
alpha = 0.5  # Weight in [0, 1]: 0 = geometric mean only, 1 = arithmetic mean only


def combine_key_predictions(cnn_scores, rule_scores, alpha=0.5):
  """Blend two per-key score dicts into one normalized distribution.

  For every key label in ``cnn_scores`` the two scores are merged as
  ``(1 - alpha) * sqrt(a * b) + alpha * (a + b) / 2``, i.e. a weighted mix of
  their geometric and arithmetic means. The blended scores are then rescaled
  so that they sum to 1.

  Args:
    cnn_scores: Mapping of key label -> non-negative score. Its keys define
      which labels appear in the result.
    rule_scores: Mapping of key label -> non-negative score. Must contain
      every label of ``cnn_scores``; extra labels are ignored.
    alpha: Mixing weight in [0, 1]. 0 uses only the geometric mean, 1 uses
      only the arithmetic mean.

  Returns:
    Dict of key label -> normalized blended score, in the iteration order of
    ``cnn_scores``.

  Raises:
    ValueError: If ``alpha`` is outside [0, 1], a score is negative, or the
      blended scores do not sum to a positive number (including empty input).
    KeyError: If ``rule_scores`` lacks a label present in ``cnn_scores``.
  """
  if not 0.0 <= alpha <= 1.0:
    raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")

  # Fail with the full list of missing labels instead of a bare KeyError on
  # the first one encountered inside the loop.
  missing = [key for key in cnn_scores if key not in rule_scores]
  if missing:
    raise KeyError(f"rule-based scores are missing keys: {missing}")

  blended = {}
  for key, cnn_score in cnn_scores.items():
    rule_score = rule_scores[key]
    # A negative product would make np.sqrt silently return NaN.
    if cnn_score < 0 or rule_score < 0:
      raise ValueError(f"scores must be non-negative, got {key!r}: {cnn_score!r}, {rule_score!r}")
    mult = np.sqrt(cnn_score * rule_score)
    avg = (cnn_score + rule_score) / 2
    blended[key] = (1 - alpha) * mult + alpha * avg

  # Normalize to ensure probabilities sum to 1. `not total > 0` also rejects
  # NaN, which would otherwise propagate into every entry.
  total = sum(blended.values())
  if not total > 0:
    raise ValueError("combined scores do not sum to a positive value; cannot normalize")
  return {key: score / total for key, score in blended.items()}


combined_predictions = combine_key_predictions(out_cnn, out_rulebased, alpha)

# Get most likely key
predicted_key = max(combined_predictions.items(), key=lambda x: x[1])
print(f"Predicted key: {predicted_key[0]} (confidence: {predicted_key[1]:.3f})")


Predicted key: C:maj (confidence: 0.104)


In [130]:
# Get top 5 most confident keys
n = 5
top_keys = sorted(combined_predictions.items(), key=lambda x: x[1], reverse=True)[:n]

print(f"\nTop {n} most confident keys:")
for key, confidence in top_keys:
  print(f"{key}: {confidence:.3f}")



Top 5 most confident keys:
C:maj: 0.104
A#:maj: 0.096
F:maj: 0.076
E:maj: 0.065
G:maj: 0.059
