In [None]:
!pip install pretty_midi > /dev/null 2>&1
!pip install librosa > /dev/null 2>&1
!pip install noisereduce > /dev/null 2>&1
!pip install tqdm > /dev/null 2>&1
!pip install mir_eval > /dev/null 2>&1

In [None]:
from torch.utils.data import Dataset, DataLoader
import json
import torch
import librosa
import pretty_midi
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
from sklearn.neighbors import NearestNeighbors
import noisereduce as nr
from scipy import stats
import scipy
from sklearn.metrics import classification_report, confusion_matrix
from multiprocessing import Pool
from tqdm import tqdm
import mir_eval

In [None]:
#Data loader (wav and midi files)
class AudioDataset(Dataset):
    """Dataset yielding ``(audio, sample_rate, midi)`` for one split of the corpus.

    The file keys of the split are read from ``data[subset]`` in the JSON file
    at ``directory + json_file``. Each item loads
    ``<directory>wav_data_sync_with_midi/<key>.wav`` with librosa (``sr=None``)
    and ``<directory>midi_data/<key>.mid`` with pretty_midi.

    Args:
        directory: Dataset root. Paths are built by plain string concatenation,
            so it must end with a path separator.
        json_file: Name of the JSON file (relative to ``directory``) that maps
            split names to lists of file keys.
        subset: Split name used as the key into the JSON file (e.g. "TRAIN").
        transform: Optional callable applied to the audio before it is returned.
    """

    def __init__(self, directory, json_file, subset, transform=None):
        # Keep the root on the instance: __getitem__ must not depend on a
        # module-level `directory` variable happening to exist (and to match).
        self.directory = directory
        with open(directory + json_file, 'r') as f:
            data = json.load(f)
        self.file_list = data[subset]
        self.transform = transform

    def __len__(self):
        """Return the number of file keys in the selected split."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load item ``idx`` and return ``(audio, sr, midi)``.

        ``audio`` and ``sr`` are what ``librosa.load(..., sr=None)`` returns
        (``audio`` after ``transform`` if one was given); ``midi`` is the
        ``pretty_midi.PrettyMIDI`` object for the same key.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_file = self.file_list[idx]
        wav_path = self.directory + "wav_data_sync_with_midi/" + audio_file + ".wav"
        midi_path = self.directory + "midi_data/" + audio_file + ".mid"

        audio, sr = librosa.load(wav_path, sr=None)
        midi = pretty_midi.PrettyMIDI(midi_path)

        if self.transform:
            audio = self.transform(audio)

        return audio, sr, midi

def get_notes(midi):
    """Return the pitch of every note of the first instrument as a NumPy array.

    Only ``midi.instruments[0]`` is read; the notes are kept in stored order.
    """
    first_track = midi.instruments[0]
    pitch_numbers = []
    for midi_note in first_track.notes:
        pitch_numbers.append(midi_note.pitch)
    return np.array(pitch_numbers)

In [None]:
def get_onset_offset(midi):
    """Return the onset/offset times of all notes as an ``(n_notes, 2)`` array.

    Notes are gathered from every instrument, in instrument order. Column 0
    holds ``note.start`` (onset time in seconds) and column 1 holds
    ``note.end`` (offset time in seconds).
    """
    every_note = [note for track in midi.instruments for note in track.notes]
    starts = [note.start for note in every_note]
    ends = [note.end for note in every_note]
    return np.vstack([starts, ends]).T

In [None]:
EVAL_TOLERANCE = 0.05 # Onset tolerance, passed to mir_eval as `onset_tolerance` in mir_eval_onset_prf
OCTAVE_INVARIANT_RADIUS = 16 # evaluate() shifts the reference by -16..+16 octaves (12 semitones each) and keeps the best F1


def trim_midi(ref_midi_data, est_midi_data):
    """Drop estimated notes whose onset falls outside the reference's time span.

    The span runs from the earliest start to the latest end over all non-drum
    reference notes. Taking min/max (rather than the first and last note in
    storage order) keeps the span correct when the notes are not sorted or come
    from several instruments.

    Args:
        ref_midi_data: Reference MIDI object; only read.
        est_midi_data: Estimated MIDI object; the ``notes`` list of each
            non-drum instrument is replaced in place.

    Returns:
        ``est_midi_data`` (the same object that was passed in). If the
        reference has no non-drum notes there is no span to trim against and
        the estimate is returned untouched.
    """
    ref_notes = [
        note
        for inst in ref_midi_data.instruments
        if not inst.is_drum
        for note in inst.notes
    ]
    if not ref_notes:
        return est_midi_data

    segment_start = min(note.start for note in ref_notes)
    segment_end = max(note.end for note in ref_notes)

    for inst in est_midi_data.instruments:
        if inst.is_drum:
            continue
        inst.notes = [
            n for n in inst.notes if segment_start <= n.start <= segment_end
        ]

    return est_midi_data


def midi_to_mir_eval(midi_data, dummy_offsets = True):
    """Convert the non-drum notes of a MIDI object to interval/pitch arrays.

    Notes from all non-drum instruments are pooled and sorted as
    ``(start, end, pitch)`` tuples.

    Args:
        midi_data: Object with an ``instruments`` list; each instrument has
            ``is_drum`` and ``notes`` (with ``start``, ``end``, ``pitch``).
        dummy_offsets: When true (and there is at least one note) the real
            offsets are discarded: each note ends at the next note's onset and
            the last note ends one unit after its own onset.

    Returns:
        ``(intervals, pitches)``: a float64 array of shape ``(n, 2)`` holding
        onset/offset pairs and an int64 array of the ``n`` pitches.
    """
    triples = sorted(
        (note.start, note.end, note.pitch)
        for inst in midi_data.instruments
        if not inst.is_drum
        for note in inst.notes
    )

    starts = [triple[0] for triple in triples]
    if dummy_offsets and starts:
        ends = starts[1:] + [starts[-1] + 1]
    else:
        ends = [triple[1] for triple in triples]

    intervals = np.stack([starts, ends], axis = 1).astype(np.float64)
    pitches = np.array([triple[2] for triple in triples], dtype = np.int64)
    return intervals, pitches


def extract_notes(ref_midi_data, est_midi_data):
    """Build the interval/pitch arrays for a reference/estimate MIDI pair.

    Both inputs are deep-copied first, so the caller's objects are not
    modified by the trimming step. The estimate is trimmed to the reference's
    time span with ``trim_midi`` and both are converted with
    ``midi_to_mir_eval`` using the real note offsets (``dummy_offsets=False``).

    Args:
        ref_midi_data: Reference MIDI object.
        est_midi_data: Estimated MIDI object.

    Returns:
        ``(ref_intervals, ref_pitches, est_intervals, est_pitches)``.
    """
    # Local import: `copy` is not among the notebook's top-level imports.
    import copy

    ref_midi_data = copy.deepcopy(ref_midi_data)
    est_midi_data = copy.deepcopy(est_midi_data)

    est_midi_data = trim_midi(ref_midi_data, est_midi_data)

    ref_intervals, ref_pitches = midi_to_mir_eval(ref_midi_data, dummy_offsets = False)
    est_intervals, est_pitches = midi_to_mir_eval(est_midi_data, dummy_offsets = False)

    return ref_intervals, ref_pitches, est_intervals, est_pitches

def mir_eval_onset_prf(ref_intervals, ref_pitches, est_intervals, est_pitches):
    """Score an estimated transcription against a reference with mir_eval.

    Pitches are given as MIDI numbers and converted to Hz
    (``440 * 2 ** ((m - 69) / 12)``) before being passed to
    ``mir_eval.transcription.precision_recall_f1_overlap`` with
    ``onset_tolerance=EVAL_TOLERANCE``, ``pitch_tolerance=1.0`` and
    ``offset_ratio=None`` (per mir_eval's documentation, ``None`` means note
    offsets are not considered when matching).

    Returns:
        ``(precision, recall, f1)``; the fourth value mir_eval returns is
        dropped.
    """
    def midi_to_hz(midi_numbers):
        semitones_from_a4 = midi_numbers.astype(np.float32) - 69
        return 440.0 * np.power(2, semitones_from_a4 / 12)

    scores = mir_eval.transcription.precision_recall_f1_overlap(
        ref_intervals,
        midi_to_hz(ref_pitches),
        est_intervals,
        midi_to_hz(est_pitches),
        onset_tolerance = EVAL_TOLERANCE,
        pitch_tolerance = 1.0,
        offset_ratio = None,
    )
    precision, recall, f1, _ = scores
    return precision, recall, f1


def evaluate(ref_intervals, ref_pitches, est_intervals, est_pitches):
    """Octave-invariant precision/recall/F1.

    The reference pitches are shifted by every whole number of octaves from
    ``-OCTAVE_INVARIANT_RADIUS`` to ``+OCTAVE_INVARIANT_RADIUS`` (12 semitones
    per octave), each shifted reference is scored with ``mir_eval_onset_prf``,
    and the scores of the shift with the highest F1 are returned (the first
    such shift when several tie, as ``np.argmax`` does).

    Returns:
        ``(precision, recall, f1)`` for the best-scoring octave shift.
    """
    scores_per_shift = [
        mir_eval_onset_prf(
            ref_intervals,
            (shift * 12) + ref_pitches,
            est_intervals,
            est_pitches
        )
        for shift in range(-OCTAVE_INVARIANT_RADIUS, OCTAVE_INVARIANT_RADIUS + 1)
    ]

    best = np.argmax([f1 for _, _, f1 in scores_per_shift])
    best_p, best_r, best_f1 = scores_per_shift[best]
    return (best_p, best_r, best_f1)

In [None]:
# Dataset root and the JSON file that lists the file keys of each split.
directory = "/content/drive/MyDrive/MLSP_PROJECT/"
json_file = "train_valid_test_keys.json"

# One AudioDataset per split named in the JSON file.
train_dataset = AudioDataset(directory, json_file, subset="TRAIN")
validation_dataset = AudioDataset(directory, json_file, subset="VALID")
test_dataset = AudioDataset(directory, json_file, subset="TEST")

In [None]:
# Sanity check: the third element of a dataset item is the loaded PrettyMIDI object.
train_dataset[0][2]

<pretty_midi.pretty_midi.PrettyMIDI at 0x7a80c13c3040>

In [None]:
# Pitch classes (MIDI pitch modulo 12) of the first training file's notes.
get_notes(train_dataset[0][2])%12

array([ 0,  2,  4,  5,  7,  9, 11,  0,  0, 11,  9,  7,  5,  4,  2,  0])

In [None]:
# Reference pitch classes (MIDI pitch modulo 12), one array per test file.
ground_truth_pitches = []

for i in range(len(test_dataset)):
    reference_midi = test_dataset[i][2]
    ground_truth_pitches.append(get_notes(reference_midi) % 12)
    if i % 50 == 0:
        print("Processing files:", i)

Processing files: 0
Processing files: 50
Processing files: 100
Processing files: 150
Processing files: 200
Processing files: 250
Processing files: 300
Processing files: 350
Processing files: 400
Processing files: 450
Processing files: 500
Processing files: 550
Processing files: 600
Processing files: 650
Processing files: 700
Processing files: 750


In [None]:
# Number of test files for which reference pitches were collected.
print(len(ground_truth_pitches))

769


In [None]:
# Folder holding the corrected onset files, plus the split index used to find them.
onsets_dir = "/content/drive/MyDrive/MLSP_PROJECT/corrected_onsets/"
directory = "/content/drive/MyDrive/MLSP_PROJECT/"
json_file = "train_valid_test_keys.json"

with open(directory + json_file, 'r') as f:
    data = json.load(f)

# File keys of the test split, in the order used by the cells below.
file_list = data["TEST"]

In [None]:
# Peek at the corrected onset file of the first test key.
np.loadtxt(onsets_dir+"TEST/"+file_list[0]+".txt")

array([ 0.20897959,  0.63854875,  1.05650794,  1.4860771 ,  1.91564626,
        2.75156463,  3.18113379,  3.68036281,  4.56272109,  5.46829932,
        7.09369615,  7.54648526,  7.99927438,  8.45206349,  8.91646259,
        9.76399093, 10.19356009, 10.65795918])

In [None]:
# Corrected onsets: one array per test file, loaded in file_list order.
test_onsets = []
for i, onset_key in enumerate(file_list):
    onset_path = onsets_dir + "TEST/" + onset_key + ".txt"
    test_onsets.append(np.loadtxt(onset_path))
    if i % 50 == 0:
        print("Processing files:", i)

Processing files: 0
Processing files: 50
Processing files: 100
Processing files: 150
Processing files: 200
Processing files: 250
Processing files: 300
Processing files: 350
Processing files: 400
Processing files: 450
Processing files: 500
Processing files: 550
Processing files: 600
Processing files: 650
Processing files: 700
Processing files: 750


In [None]:
# Spot check on file 8: the number of corrected onsets and of reference pitches.
print(len(test_onsets[8]))
print(len(ground_truth_pitches[8]))

43
43


In [None]:
# Onset times detected by librosa on the noise-reduced audio, one array per test file.
librosa_onsets = []
for k in range(len(test_dataset)):
    y, sr = test_dataset[k][:-1]
    # nan_to_num replaces any NaNs in the denoised signal before onset detection.
    denoised = np.nan_to_num(nr.reduce_noise(y=y, sr=sr))
    detected = librosa.onset.onset_detect(y=denoised, sr=sr, units='time')
    librosa_onsets.append(detected)
    if k % 50 == 0:
        print("Processing files:", k)

Processing files: 0
Processing files: 50
Processing files: 100
Processing files: 150


  sig_mult_above_thresh = (abs_sig_stft - sig_stft_smooth) / sig_stft_smooth


Processing files: 200
Processing files: 250
Processing files: 300
Processing files: 350
Processing files: 400
Processing files: 450
Processing files: 500
Processing files: 550
Processing files: 600
Processing files: 650
Processing files: 700
Processing files: 750


In [None]:
def _dominant_pitch_classes(chroma, onset_frames):
    """Pick one chroma bin per inter-onset segment.

    Args:
        chroma: 2-D array; rows are chroma bins, columns are frames.
        onset_frames: 1-D integer array of segment start frames, in
            non-decreasing order.

    Returns:
        1-D array with one entry per onset: the chroma bin that is the
        per-frame argmax most often between that onset and the next one (or
        the end of the chromagram for the last onset).
    """
    n_frames = chroma.shape[1]
    strongest_bin = np.argmax(chroma, axis=0)  # winning chroma bin of every frame
    # Segment boundaries; clamp so an onset past the last frame cannot index out of range.
    bounds = np.minimum(np.hstack([onset_frames, n_frames]), n_frames)

    picks = []
    for start, end in zip(bounds[:-1], bounds[1:]):
        if end > start:
            # Most frequent winning bin within the segment.
            picks.append(np.argmax(np.bincount(strongest_bin[start:end])))
        else:
            # Two onsets fall in the same frame: use that frame's winning bin.
            # (Taking argmax of this scalar again would always give 0.)
            picks.append(strongest_bin[min(start, n_frames - 1)])
    return np.array(picks)


est_pitches = []

for k in range(len(test_dataset)):
    y, sr = test_dataset[k][:-1]
    chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr)
    # Map onset times to chroma frame indices: frames per second = n_frames / duration.
    time_bins = (librosa_onsets[k] * chroma_cq.shape[1] / (y.shape[0] / sr)).astype(int)
    est_pitches.append(_dominant_pitch_classes(chroma_cq, time_bins))
    if k % 50 == 0:
        print("Processing data:", k)

Processing data: 0
Processing data: 50
Processing data: 100
Processing data: 150
Processing data: 200
Processing data: 250
Processing data: 300
Processing data: 350
Processing data: 400
Processing data: 450
Processing data: 500
Processing data: 550
Processing data: 600
Processing data: 650
Processing data: 700
Processing data: 750


**Mean precision, recall and F1-score, when comparing Librosa's onsets and chroma features for pitch estimation vs corrected onsets + ground truth pitches**

In [None]:
# Some corrected onsets are repeated, which mir_eval cannot handle. Keep only the first
# occurrence of each onset time, together with the ground-truth pitch at the same index.
precision, recall, f1_score = [], [], []
for k in range(len(test_dataset)):
    y, sr = test_dataset[k][:-1]
    duration = y.shape[0] / sr  # number of samples divided by the sample rate
    ground_truth_onset, ixs = np.unique(test_onsets[k], return_index=True)
    detected_onset = librosa_onsets[k]

    # Each note is taken to last until the next onset (up to floating-point rounding);
    # the last note lasts until the end of the recording.
    ref_offset = np.append(ground_truth_onset[:-1] + np.diff(ground_truth_onset), duration)
    est_offset = np.append(detected_onset[:-1] + np.diff(detected_onset), duration)
    ref_intervals = np.vstack([ground_truth_onset, ref_offset]).T
    est_intervals = np.vstack([detected_onset, est_offset]).T

    pr, rc, f1 = evaluate(ref_intervals, ground_truth_pitches[k][ixs], est_intervals, est_pitches[k])
    precision.append(pr)
    recall.append(rc)
    f1_score.append(f1)
    if k % 50 == 0:
        print("Processing files:", k)
print(np.mean(precision), np.mean(recall), np.mean(f1_score))

Processing files: 0
Processing files: 50
Processing files: 100
Processing files: 150
Processing files: 200
Processing files: 250
Processing files: 300
Processing files: 350
Processing files: 400
Processing files: 450
Processing files: 500
Processing files: 550
Processing files: 600
Processing files: 650
Processing files: 700
Processing files: 750
0.45183269476589943 0.6302134874139907 0.5185815176993933


**Mean precision, recall and F1-score, when comparing Librosa's onsets and chroma features for pitch estimation vs original (non-corrected) MIDI onsets + ground truth pitches**

In [None]:
# Same evaluation, but the reference intervals are the original (non-corrected)
# onsets/offsets read from the MIDI file.
precision, recall, f1_score = [], [], []
for k in range(len(test_dataset)):
    # Index the dataset once per file: every access re-reads the WAV and the MIDI from disk.
    y, sr, reference_midi = test_dataset[k]
    duration = y.shape[0] / sr  # number of samples divided by the sample rate
    detected_onset = librosa_onsets[k]

    ref_intervals = get_onset_offset(reference_midi)
    # Each estimated note is taken to last until the next detected onset
    # (up to floating-point rounding); the last one until the end of the recording.
    est_offset = np.append(detected_onset[:-1] + np.diff(detected_onset), duration)
    est_intervals = np.vstack([detected_onset, est_offset]).T

    pr, rc, f1 = evaluate(ref_intervals, ground_truth_pitches[k], est_intervals, est_pitches[k])
    precision.append(pr)
    recall.append(rc)
    f1_score.append(f1)
    if k % 50 == 0:
        print("Processing files:", k)
print(np.mean(precision), np.mean(recall), np.mean(f1_score))

Processing files: 0
Processing files: 50
Processing files: 100
Processing files: 150
Processing files: 200
Processing files: 250
Processing files: 300
Processing files: 350
Processing files: 400
Processing files: 450
Processing files: 500
Processing files: 550
Processing files: 600
Processing files: 650
Processing files: 700
Processing files: 750
0.031050961668413646 0.04442319376700016 0.0359395952273152
