# In [1]:
import librosa
import numpy as np

def slice_spectrogram(mel_spec, slice_width=128, n_mels=128):
    """Cut a spectrogram into consecutive, non-overlapping slices along axis 1.

    Args:
        mel_spec: Array indexed as (rows, frames); only 2-D slices can pass
            the shape check below.
        slice_width: Number of frames per slice. Must be at least 1.
        n_mels: Required number of rows. A slice is kept only when its shape
            equals ``(n_mels, slice_width)``. Pass ``None`` to accept the row
            count ``mel_spec`` already has.

    Returns:
        A list of slices in frame order. Trailing frames that do not fill a
        complete slice are discarded, so the list is empty when the input has
        fewer than ``slice_width`` frames or the wrong number of rows.

    Raises:
        ValueError: If ``slice_width`` is smaller than 1.
    """
    if slice_width < 1:
        raise ValueError(f"slice_width must be >= 1, got {slice_width}")

    total_frames = mel_spec.shape[1]
    expected_rows = mel_spec.shape[0] if n_mels is None else n_mels
    # Compare against the requested width rather than a fixed 128 so that a
    # non-default slice_width actually produces slices.
    expected_shape = (expected_rows, slice_width)

    slices = []
    for start in range(0, total_frames - slice_width + 1, slice_width):
        slice_img = mel_spec[:, start:start + slice_width]
        if slice_img.shape == expected_shape:
            slices.append(slice_img)

    return slices


def predict_genre_for_song(file_path, model, encoder, slice_width=128):
    """Predict a genre label for an audio file by averaging per-slice outputs.

    The first 30 seconds of the file are loaded at the file's native sampling
    rate, converted to a 128-band mel spectrogram in dB, cut into slices with
    ``slice_spectrogram``, and fed to ``model``. The per-slice outputs are
    averaged and the highest-scoring class index is mapped back to a label.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        model: Object exposing ``predict(batch, verbose=0)``; presumably a
            Keras model taking input of shape (batch, 128, slice_width, 1)
            -- TODO confirm against the training code.
        encoder: Object exposing ``inverse_transform`` that maps class
            indices to labels (e.g. a fitted label encoder).
        slice_width: Number of spectrogram frames per slice.

    Returns:
        The decoded label of the class with the highest mean score, or the
        string ``"Unknown"`` when no slice could be extracted.
    """
    y_audio, sr = librosa.load(file_path, sr=None, duration=30)

    mel_spec = librosa.feature.melspectrogram(y=y_audio, sr=sr, n_mels=128)
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)

    slices = slice_spectrogram(mel_db, slice_width=slice_width)
    if not slices:
        return "Unknown"

    # Stack every slice into one (n_slices, rows, frames, 1) batch so the
    # model is invoked once instead of once per slice.
    # NOTE(review): mel_db holds dB values, not 0-255 pixel intensities; the
    # division by 255 presumably mirrors the training preprocessing -- confirm.
    batch = np.stack(slices)[..., np.newaxis] / 255.0
    predictions = np.asarray(model.predict(batch, verbose=0))  # softmax outputs

    avg_pred = np.mean(predictions, axis=0)
    genre_index = np.argmax(avg_pred)
    return encoder.inverse_transform([genre_index])[0]