In [None]:
import  music21 as ms21
import numpy as np
import os

# Directory holding the generated (predicted) MIDI files that the metric cells evaluate.
midi_predict_path = "./caption+visual_att_4_midi/"
# Directory holding the ground-truth MIDI files (not read by the cells shown here).
midi_gt_path = "./gt_midi/"

# os.listdir returns entries in arbitrary order and also lists non-MIDI entries
# (e.g. ".ipynb_checkpoints"), which would later be handed to the MIDI parser.
# Keep only MIDI files and sort them so every run processes the same sequence.
midi_predict_files = sorted(
    name
    for name in os.listdir(midi_predict_path)
    if name.lower().endswith((".mid", ".midi"))
)

In [None]:
def process_midi(path):
    """Parse a MIDI file into pitch-class segments and per-bar onset patterns.

    The parsed score is flattened and its notes/chords are visited in the
    order music21 yields them (rests are skipped). Two things are collected:

    * pitch classes (0-11), grouped into segments spanning 32 offset units;
    * a 16-slot onset grid for every span of 4 offset units (each slot covers
      0.25 offset units), with a 1 in the slot where a note or chord starts.

    Args:
        path: Path of the MIDI file, passed to ``music21.converter.parse``.

    Returns:
        A tuple ``(notes, bar, pattern, pattern_bar)``:
            notes: list of completed 32-unit segments, each a list of pitch
                classes. A segment that was crossed without any note in it is
                present as an empty list.
            bar: pitch classes of the last, still open segment; it is NOT
                part of ``notes``.
            pattern: list of ``np.ndarray`` onset grids (length 16), one per
                completed 4-unit bar.
            pattern_bar: onset grid (plain list) of the last, still open bar;
                it is NOT part of ``pattern``.
    """
    score = ms21.converter.parse(path)
    # ``Stream.flat`` is deprecated in recent music21 releases in favour of
    # ``Stream.flatten()``; use whichever the installed version provides.
    flat_score = score.flatten() if hasattr(score, "flatten") else score.flat

    notes, bar, pattern = [], [], []
    pattern_bar = [0] * 16
    lastoffset = 0
    segment_idx, bar_idx = 1, 0

    for element in flat_score.notesAndRests:
        if isinstance(element, ms21.note.Rest):
            continue

        offset = element.offset

        # Close every 32-unit segment boundary crossed since the previous
        # onset. This must be a loop: with a single `if`, a gap spanning two
        # or more boundaries left the counter behind and no later segment was
        # ever closed.
        while offset >= 32 * segment_idx and lastoffset < 32 * segment_idx:
            notes.append(bar)
            bar = []
            segment_idx += 1
        # Same for the 4-unit bars of the onset grid (silent bars stay all-zero).
        while offset >= 4 * (bar_idx + 1) and lastoffset < 4 * (bar_idx + 1):
            pattern.append(np.array(pattern_bar))
            pattern_bar = [0] * 16
            bar_idx += 1

        # Use music21's own pitch class instead of a name lookup table: the
        # table only knew one spelling per pitch (e.g. "E-" but not "D#"), so
        # other spellings raised KeyError or, inside chords, were silently
        # dropped by a bare `except`.
        if isinstance(element, ms21.note.Note):
            bar.append(element.pitch.pitchClass)
        else:
            # Chord-like elements expose their members via `.notes`; elements
            # without it (or members without a pitch) contribute no pitch
            # class but still count as an onset below.
            for member in getattr(element, "notes", ()):
                member_pitch = getattr(member, "pitch", None)
                if member_pitch is not None:
                    bar.append(member_pitch.pitchClass)

        # Mark the onset slot inside the current bar (4 slots per offset unit).
        pattern_bar[int(4 * (offset - 4 * bar_idx))] = 1

        lastoffset = offset

    return notes, bar, pattern, pattern_bar

Calculate the pitch-class histogram entropy (PCHE) of the generated MIDI files.

In [None]:
def pitch_class_histogram_entropy(notes):
    """Average pitch-class histogram entropy (PCHE) over a list of segments.

    For every non-empty segment a 12-bin pitch-class histogram is built
    (values are reduced modulo 12), normalised to a probability distribution,
    and its base-2 Shannon entropy is computed. Empty segments carry no
    distribution and are skipped.

    Args:
        notes: iterable of segments, each a sequence of integer pitch values.

    Returns:
        The mean entropy (in bits) over the non-empty segments, or ``nan``
        when there is no non-empty segment (the metric is undefined then;
        previously this raised ZeroDivisionError or propagated a 0/0 NaN).
    """
    entropies = []

    for bar in notes:
        if len(bar) == 0:
            continue

        # 12-dimensional pitch-class histogram, normalised to probabilities.
        histogram = np.bincount(np.asarray(bar, dtype=int) % 12, minlength=12)
        probabilities = histogram / histogram.sum()

        # Sum only over occupied bins (0 * log 0 is taken as 0). This avoids
        # log(0) without adding an epsilon inside the log, which biased the
        # result and made single-pitch-class segments slightly negative.
        occupied = probabilities[probabilities > 0]
        entropies.append(float(-np.sum(occupied * np.log2(occupied))))

    if not entropies:
        return float("nan")
    return sum(entropies) / len(entropies)

pche = []

# To score the ground-truth files instead, iterate over os.listdir(midi_gt_path)
# and build the paths from midi_gt_path.
for item in midi_predict_files:

    # The path must be built from the directory (midi_predict_path); the
    # previous code interpolated the file *list* into the path by mistake.
    file_path = os.path.join(midi_predict_path, item)

    notes, bar, pattern, pattern_bar = process_midi(file_path)

    # process_midi reports only completed segments in `notes`; a file without
    # any has no PCHE value and must not crash or skew the average.
    if not notes:
        continue

    value = pitch_class_histogram_entropy(notes)
    if np.isnan(value):
        continue
    pche.append(value)

# Mean PCHE over all scored files (nan when no file could be scored).
sum(pche) / len(pche) if pche else float("nan")

Calculate the grooving pattern similarity (GPS) of the generated MIDI files.

In [None]:
def grooving_pattern_similarity(g_a, g_b):
    """Similarity of two equally long integer onset patterns.

    The result is ``1 - (sum of element-wise XOR) / length``; for 0/1
    patterns that is 1.0 when they are identical and 0.0 when every
    position differs.

    Args:
        g_a: first pattern (sequence or array of integers).
        g_b: second pattern, same length as ``g_a``.

    Returns:
        The similarity value as a NumPy float.

    Raises:
        AssertionError: if the two patterns differ in length.
    """
    n_positions = len(g_a)
    assert n_positions == len(g_b), "Grooving patterns must have the same length"

    # Element-wise XOR is non-zero exactly where the two patterns disagree.
    mismatch_total = np.sum(np.bitwise_xor(g_a, g_b))
    return 1 - (1 / n_positions) * mismatch_total

def cal_cps(pattern):
    """Mean grooving pattern similarity over every unordered pair of patterns.

    Args:
        pattern: sequence of equally long onset patterns (as returned in the
            third position by ``process_midi``).

    Returns:
        The average of ``grooving_pattern_similarity`` over all pairs
        ``(i, j)`` with ``i < j``, or ``nan`` when fewer than two patterns
        are given (no pair exists; previously this raised ZeroDivisionError).
    """
    n_patterns = len(pattern)
    if n_patterns < 2:
        return float("nan")

    results = []
    for first in range(n_patterns):
        for second in range(first + 1, n_patterns):
            results.append(grooving_pattern_similarity(pattern[first], pattern[second]))

    return sum(results) / len(results)

cps = []

for item in midi_predict_files:

    # Join with os.path so the result does not depend on a trailing slash.
    file_path = os.path.join(midi_predict_path, item)

    notes, bar, pattern, pattern_bar = process_midi(file_path)

    # cal_cps averages over pairs of completed bars, so a file with fewer
    # than two of them has no GPS value; skip it instead of dividing by zero.
    if len(pattern) < 2:
        continue

    cps.append(cal_cps(pattern))

# Mean GPS over all scored files (nan when no file could be scored).
sum(cps) / len(cps) if cps else float("nan")

To calculate the video-music correspondence, first convert the MIDI output to audio, then extract audio features from it and compute the recall ratio.

1. Convert the MIDI files into audio files.

In [None]:
import os
from midi2audio import FluidSynth

# You can download the sound file at https://github.com/vyshor/MusicAids/blob/master/default_sound_font.sf2
fs = FluidSynth(sound_font="./default_sound_font.sf2")

# Output directory sits next to the MIDI directory: "<midi dir>_mp3/".
# rstrip keeps this correct whether or not midi_predict_path ends with "/".
mp3_path = midi_predict_path.rstrip("/") + "_mp3/"
# exist_ok makes the cell re-runnable (os.mkdir raised FileExistsError on a second run).
os.makedirs(mp3_path, exist_ok=True)

ls = sorted(os.listdir(midi_predict_path))
for i in ls:
    # Derive the output name from the real file stem instead of the first
    # three characters, and skip anything that is not a MIDI file.
    idx, ext = os.path.splitext(i)
    if ext.lower() not in (".mid", ".midi"):
        continue
    # NOTE(review): the output keeps the ".mp3" extension used so far; whether
    # the file is really MP3-encoded depends on the FluidSynth build - confirm.
    fs.midi_to_audio(os.path.join(midi_predict_path, i), os.path.join(mp3_path, f"{idx}.mp3"))

2. Use [Musicnn](https://github.com/jordipons/musicnn) to extract the audio features.

3. Process the features to calculate the recall ratio.

In [12]:
import os
import warnings
import numpy as np
warnings.filterwarnings('ignore')

length = 1e9

# Ground-truth features.
# You can get our processed feature at https://drive.google.com/drive/folders/1sOLV2HtmXVwRLerw6Bt5W0UHhBreJ1-0?usp=sharing
gt_path = "./gt_feats/"
gt_files = os.listdir(gt_path)
gt_feats = []

# Features of the generated (predicted) music.
# You can get our processed feature at https://drive.google.com/drive/folders/1vRWGxsg3KxJ5vSjX-jaDf1FlyhWez2SV?usp=sharing
predict_path_feats = "./caption+visual_att_4_feats/"
predict_files_feats = os.listdir(predict_path_feats)
predict_feats = []

# Pair the rows of every feature file present in both directories.
# gt_feats[k] and predict_feats[k] always come from the same file and the
# same row index, so the two lists stay aligned.
for feat_name in gt_files:
    if feat_name not in predict_files_feats:
        continue

    feat_gt = np.load(f'{gt_path}{feat_name}')
    feat_predict = np.load(f'{predict_path_feats}{feat_name}')

    # Only the rows available in both arrays are used; any surplus rows of
    # the longer one are ignored.
    n_shared_rows = min(feat_gt.shape[0], feat_predict.shape[0])
    for row in range(n_shared_rows):
        gt_feats.append(feat_gt[row, :])
        predict_feats.append(feat_predict[row, :])

In [16]:
from scipy import spatial
from random import sample
import random

# Retrieval protocol: every predicted feature is ranked against a small
# candidate set made of its own ground-truth feature plus random distractors.
N_CANDIDATES = 60    # size of the candidate set per query
GT_POOL_SIZE = 500   # distractors are drawn from the first GT_POOL_SIZE ground-truth features
RECALL_SEED = 0      # fixed seed so the sampled distractors (and the scores) are reproducible

print("Calculating Recall...")
acc = {
    "1": 0,
    "2": 0,
    "3": 0,
    "5": 0,
    "10": 0,
    "20": 0,
}
idx_record = []

rng = random.Random(RECALL_SEED)
# NOTE(review): distractors come only from the first GT_POOL_SIZE ground-truth
# features, as before; kept to preserve the existing protocol - confirm intent.
# The pool and the candidate count are clamped so fewer features than the
# configured sizes no longer raise IndexError / ValueError.
pool = range(min(GT_POOL_SIZE, len(gt_feats)))
n_candidates = min(N_CANDIDATES, len(pool))

for i, item in enumerate(predict_feats):
    idx_ls = rng.sample(pool, n_candidates)
    # Make sure the true match (same index as the query) is among the candidates.
    if i not in idx_ls:
        idx_ls[0] = i
    ans = idx_ls.index(i)

    # Cosine similarity between the query and every candidate.
    sim = [1 - spatial.distance.cosine(item, gt_feats[j]) for j in idx_ls]

    # Candidate positions ordered from most to least similar.
    max_index = sorted(range(len(sim)), key=lambda x: -sim[x])
    rank = max_index.index(ans) + 1  # 1-based rank of the true match
    for k in acc:
        if rank <= int(k):
            acc[k] += 1
    idx_record.append(rank)

n_queries = len(predict_feats)
for k in acc:
    print(f"Recall@{k}: {acc[k] / n_queries if n_queries else float('nan')}")
# "AP" here is the mean 1-based rank of the true match over all queries.
print(f"AP: {sum(idx_record) / len(idx_record) if idx_record else float('nan')}")

Calculating Recall...
Recall@1: 0.038788522848034
Recall@2: 0.06854410201912858
Recall@3: 0.10361317747077577
Recall@5: 0.15993623804463336
Recall@10: 0.2688629117959617
Recall@20: 0.4691817215727949
AP: 24.44155154091392
