# Load Data

In [None]:
import pandas as pd
from torch.utils.data import DataLoader
from data_files.dataset import CQT_Dataset_test
import torch
from utils.model import Spice_model
import numpy as np
from utils.calibration import Calibrator
import matplotlib.pyplot as plt

In [None]:
# Load the pickled MedleyDB and MIR-1k test sets (presumably DataFrames; `.to_numpy()` is called on them later).
# NOTE: read_pickle can execute arbitrary code while unpickling -- only load files from a trusted source.
mdb_test_data = pd.read_pickle("./data_test/MedleyDB.pkl") 
mir_test_data = pd.read_pickle("./data_test/MIR1k.pkl") 

In [None]:
# mir_test_data
# Convert both test sets to NumPy arrays so that rows can be sliced by position in the evaluation loops.
data_MIR = mir_test_data.to_numpy()
data_MDB = mdb_test_data.to_numpy()

# Evaluation 

## Load Model and Calibration (Plots Calibration Estimation)

In [None]:
# Load the model to be evaluated.
# The constructor arguments must match the architecture the checkpoint was trained with
# (see utils.model.Spice_model for their meaning).
model = Spice_model([1, 64, 128, 256, 512, 512, 512], [512, 512, 512, 256, 128, 64, 1], [True, True, True, True, True, True])
# Put the model in inference mode: layers such as batch norm / dropout behave differently
# in training mode, which distorts results when frames are evaluated one at a time.
model.eval()
# The model is never moved to the GPU and every cell below feeds CPU tensors and calls
# `.numpy()` on the outputs, so the checkpoint is mapped straight to the CPU.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
checkpoint = torch.load('./rev_1k_checkpoints/MIR_minispice.ckp', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])

In [None]:
# Estimate the linear mapping from pitch-head output to CQT bin for the loaded model.
# NOTE(review): 1000, 110 and 440 are passed positionally; presumably sample count and a
# frequency range in Hz -- confirm against utils.calibration.Calibrator.
cal1 = Calibrator(model, 1000, 110, 440)
# PT_OFFSET / PT_SLOPE become notebook-level globals that output2hz() reads.
PT_OFFSET, PT_SLOPE = cal1.get_values()
# Raw calibration data; A and B are only used by the optional scatter plot below.
data, A, B= cal1.get_data()


# Uncomment For Plots
# x = np.linspace(0, 1, 5)
# y = PT_OFFSET + PT_SLOPE*x
# plt.scatter(B.squeeze(),A[:,1])
# plt.xlim(30, 70)
# plt.ylim(0.3, 0.55)
# plt.plot(y, x, 'r',linestyle='--',label='Line: Estimated Parameters')
# plt.xlabel('Pith Diff from fmin(10Hz) [Semitones]')
# plt.ylabel('Pitch Head Output')
# plt.legend()

In [None]:
def output2hz(pitch_output, pt_slope=None, pt_offset=None, fmin=10.0, bins_per_octave=12.0):
  """Convert pitch-head output to a frequency in Hz.

  The output is mapped linearly to a CQT bin (`pitch_output * pt_slope + pt_offset`)
  and the bin is converted to Hz as `fmin * 2 ** (bin / bins_per_octave)`.

  Args:
    pitch_output: scalar or NumPy array of pitch-head outputs.
    pt_slope: slope of the linear mapping. When None, the notebook-level
      `PT_SLOPE` (set by the calibration cell) is used.
    pt_offset: offset of the linear mapping. When None, the notebook-level
      `PT_OFFSET` (set by the calibration cell) is used.
    fmin: frequency in Hz that corresponds to bin 0.
    bins_per_octave: number of bins per octave.

  Returns:
    Frequency in Hz, with the same shape as `pitch_output`.

  Raises:
    NameError: if a calibration value is omitted and the corresponding
      notebook-level constant has not been defined yet.
  """
  # Earlier hard-coded calibration values, kept for reference (the approach follows
  # https://tfhub.dev/google/spice/2):
  # PT_OFFSET = 75.06398400431725
  # PT_SLOPE = -43.583755096345676
  if pt_slope is None:
    pt_slope = PT_SLOPE
  if pt_offset is None:
    pt_offset = PT_OFFSET
  cqt_bin = pitch_output * pt_slope + pt_offset
  return fmin * 2 ** (cqt_bin / bins_per_octave)

## Whole Batch RPA 

In [None]:
import mir_eval
from tqdm import tqdm

label = []
yt_hat = []
voice = []
# data_part
# Inference only: disabling autograd avoids building a graph for every single frame.
with torch.no_grad():
    for row in tqdm(data_MDB):
        # Columns 1..128 of each row are fed to the model as one 128-bin frame.
        frame = torch.from_numpy(row[1:129].reshape(1, 128)).float()
        pitch_h1, conf_h1, x_hat1 = model(frame)
        yt_hat.append(pitch_h1.detach().numpy())
        voice.append(row[-2])  # second-to-last column, used as the voicing flag
        label.append(row[-1])  # last column, used as the reference f0
# Pitch-head output -> Hz -> cents for the predictions; Hz -> cents for the labels.
y_hat = np.apply_along_axis(output2hz, 0, yt_hat)
y_hat_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, y_hat)
label_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, label)
# voice = self.test_data[:,-2:-1]
# Bring everything to (N, 1) columns before scoring.
voice = np.array(voice).reshape(len(voice), 1)
label_cent = label_cent.reshape(len(label_cent), 1)
y_hat_cent = y_hat_cent.reshape(len(y_hat_cent), 1)
# The ground-truth voicing is passed as both reference and estimated voicing,
# so only the pitch estimate is being scored.
rpa = mir_eval.melody.raw_pitch_accuracy(voice, label_cent, voice, y_hat_cent, cent_tolerance=50)
rpa

#  Results - RPA vs M Samples on MIR
0.23760423855905596, M=2
<br>
0.3555553249233292, M=3
<br>
0.02949555542870783, M=5
<br>
0.23164700815227263 , M = 10
<br>
0.06791138879231165 , M = 20
<br>
0.12203500651247749, M= 50
<br>
## Results - RPA 
RPA - MIR_noconf.ckp = 0.02232404635015853
<br>
RPA - MIR_minispice.ckp = MIR 0.07354688724332796 | MDB 0.08407682146123362
<br>
RPA - wo-recon.ckp = MIR 0.4413020803603398, 0.29013476417566564| MDB 0.06933577210988043


In [None]:
# Mean of the two MIR RPA values 0.4413... and 0.2901... recorded for wo-recon.ckp.
(0.4413020803603398+0.29013476417566564)/2

# Calibration on SPICE (Plots Calibration Estimation)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa import display as librosadisplay
import pandas as pd

import logging
import math
import statistics
import sys

from IPython.display import Audio, Javascript
from scipy.io import wavfile

from base64 import b64decode

import music21
from pydub import AudioSegment

# Raise the root logger to ERROR so that only errors are emitted.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

# Record the TensorFlow version used for this run.
print("tensorflow: %s" % tf.__version__)
#print("librosa: %s" % librosa.__version__)

In [None]:
# Set the HTTP(S) proxy environment variables for this kernel.
# NOTE(review): environment-specific -- adjust or remove when running outside this network.
%env http_proxy=http://proxy:80
%env https_proxy=http://proxy:80

In [None]:
# Loading the SPICE model is easy:
# Loads the published SPICE model (version 2) from TensorFlow Hub by URL.
model_hub = hub.load("https://tfhub.dev/google/spice/2")

In [None]:
from utils.calibration2 import Calibrator_SPICE
import torch

In [1]:
# Estimate offset/slope of the pitch-head mapping for the TF-Hub SPICE model.
# NOTE(review): `False` and `1000` are passed positionally; their meaning is defined in
# utils.calibration2.Calibrator_SPICE -- confirm there.
cal = Calibrator_SPICE(model_hub, False,1000)
# NOTE: this overwrites the notebook-level PT_OFFSET / PT_SLOPE set by the earlier
# calibration cell, which changes what output2hz() returns from here on.
PT_OFFSET, PT_SLOPE = cal.get_values()
A, B= cal.get_data()

# for plotting
# x = np.linspace(0, 1, 5)
# # Define the equation
# y = PT_OFFSET + PT_SLOPE*x
# plt.scatter(B.squeeze(),A[:,1])
# plt.xlim(40, 70)
# plt.ylim(0.21, 0.64)
# plt.plot(y, x, 'r',linestyle='--',label='Line: Estimated Parameters')
# plt.xlabel('Pith Diff from fmin(10Hz) [Semitones]')
# plt.ylabel('Pitch Head Output')
# plt.legend()

NameError: name 'Calibrator' is not defined

# This part is extra; for SPICE evaluation, refer to the Mini_SPICE notebook

In [None]:
from tqdm import tqdm
# Collect per-row predictions, voicing flags and f0 labels for the MedleyDB test rows.
label = []
yt_hat = []
voice = []
# pitch_h1,conf_h1,x_hat1 = model_mdb(torch.from_numpy(data_np[:1,1:129].reshape(1,128)).float())
# model_mdb(torch.from_numpy(row[1:129].reshape(1,128)).float())
# NOTE(review): `model_mdb` is not defined anywhere in the visible notebook; this cell
# fails on a fresh kernel unless it is created elsewhere -- confirm the intended model.
for row in tqdm(data_MDB):
    # Columns 1..128 of the row are fed to the model as a single 128-bin frame.
    pitch_h1,conf_h1,x_hat1 = model_mdb(torch.from_numpy(row[1:129].reshape(1,128)).float())
    yt_hat.append(pitch_h1.detach().numpy())
    voice.append(row[-2])
    label.append(row[-1])
# y_hat = np.apply_along_axis(output2hz, 0, pitch_h1.detach().numpy())

In [None]:
import mir_eval
# Pitch-head output -> Hz (via output2hz), then Hz -> cents for predictions and labels.
# apply_along_axis with axis 0 hands each function a 1-D slice along the sample axis.
y_hat = np.apply_along_axis(output2hz,0,yt_hat)
y_hat_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, y_hat)
label_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, label)

In [None]:
def _as_column(values):
    """Return `values` as an array reshaped to (len(values), 1)."""
    column = np.array(values)
    return column.reshape(len(column), 1)


# Bring voicing flags, reference cents and predicted cents to column shape.
voice = _as_column(voice)
label_cent = _as_column(label_cent)
y_hat_cent = _as_column(y_hat_cent)
# y_hat_cent
# Ground-truth voicing is passed as both reference and estimated voicing.
rpa = mir_eval.melody.raw_pitch_accuracy(
    voice, label_cent, voice, y_hat_cent, cent_tolerance=50
)
rpa

In [None]:
# Smoke test of the RPA pipeline on random inputs; the resulting number is not meaningful.
# NOTE(review): `model_mdb` is not defined in the visible notebook -- confirm where it comes from.
pitch_h,conf_h,x_hat = model_mdb(torch.randn((10,128)))
y_hat = np.apply_along_axis(output2hz, 0, pitch_h.detach().numpy())
y_hat_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, y_hat)
# Random 0/1 voicing decisions for the estimate.
y_hat_voice = np.random.randint(2, size=y_hat.shape)
# Random reference values drawn from [40, 65), passed to hz2cents below (i.e. treated as Hz).
y = np.random.uniform(low=40, high=65, size=(10,1)) 
y_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, y)
y_voice = np.random.randint(2, size=y.shape)
raw_pitch_accuracy = mir_eval.melody.raw_pitch_accuracy(y_voice, y_cent, y_hat_voice, y_hat_cent)
# Only the last expression of a cell is displayed, so the RPA value on the next line is not shown.
raw_pitch_accuracy
y_voice.shape

## Skip this

In [None]:
# Build a batched loader over the MIR-1k test set.
batch_size = 64
# NOTE(review): shuffle=True makes the batch order (and therefore any "first k batches"
# evaluation) differ between runs -- confirm this is intended for a test loader.
mir_test_batches = DataLoader(CQT_Dataset_test(data=mir_test_data, mode='test'), batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

In [None]:
# Step through the loader and keep the contents of the fifth batch (index 4) in
# notebook-level variables for the inspection cells further down.
# NOTE: because of enumerate(), `inputs` is the batch index and `targets` is the batch itself.
for inputs, targets in enumerate(mir_test_batches):
    int_shift = targets[1]
    inp1 = targets[2]
    inp2 = targets[3]
    # Stack batch elements 4 and 5 (named label_voice / label_f0 in rpa_on_dataset) row-wise.
    label = np.vstack((targets[4].detach().numpy(),targets[5].detach().numpy()))
    if inputs == 4:
        break

In [None]:
def output2hz(pitch_output, pt_offset=10.806732248081362, pt_slope=-56.44811563764634,
              fmin=10.0, bins_per_octave=12.0):
  """Convert pitch-head output to a frequency in Hz using fixed calibration constants.

  NOTE: this definition shadows any `output2hz` defined earlier in the notebook once
  the cell has been executed.

  Args:
    pitch_output: scalar or NumPy array of pitch-head outputs.
    pt_offset: offset of the linear output -> CQT-bin mapping.
    pt_slope: slope of the linear output -> CQT-bin mapping.
    fmin: frequency in Hz that corresponds to bin 0.
      (Open question from the original author: why 10 and not 110?)
    bins_per_octave: number of bins per octave.

  Returns:
    Frequency in Hz, with the same shape as `pitch_output`.
  """
  # The approach follows https://tfhub.dev/google/spice/2; the default constants are the
  # values that were hard-coded in this notebook.
  cqt_bin = pitch_output * pt_slope + pt_offset
  return fmin * 2.0 ** (cqt_bin / bins_per_octave)

In [None]:
# Evaluates to ~0.01257, i.e. the value used as the default `sigma` of rpa_on_dataset.
# Presumably 1 / (Q * log2(fmax / fmin)) with Q = 24 bins per octave -- confirm.
1 / (24 * np.log2(666.664939769901 /  66.9456331525636256))

In [None]:
import mir_eval
from tqdm import tqdm

def rpa_on_dataset(model, batches, sigma=0.0125657, max_batches=6, verbose=True):
    """Compute raw pitch accuracy (RPA) of `model` over batches of paired inputs.

    For every batch the model is run on both inputs, the two pitch-head outputs are
    converted to Hz with the notebook-level `output2hz`, averaged, converted to cents
    and scored against the f0 labels with `mir_eval.melody.raw_pitch_accuracy`.
    The ground-truth voicing is used as the estimated voicing as well (there is no
    confidence-head based voicing yet).

    Args:
        model: callable returning `(pitch, confidence, reconstruction)` for a float
            tensor batch.
        batches: iterable of batches; elements 2 and 3 of a batch are the two model
            inputs, element 4 the voicing labels and element 5 the f0 labels
            (as named in the original code).
        sigma: kept for backward compatibility; currently unused.
        max_batches: positive number of batches to evaluate, or None to evaluate all
            of `batches`. The default of 6 reproduces the previous hard-coded early stop.
        verbose: when True, print the running RPA after each batch except the one
            that triggers the early stop.

    Returns:
        The RPA over all evaluated batches, as returned by mir_eval.
    """
    pred_cent_parts = []
    ref_cent_parts = []
    voicing_parts = []

    def _joined(parts):
        # np.concatenate rejects an empty list; fall back to an empty float array.
        return np.concatenate(parts) if parts else np.array([])

    def _current_rpa():
        voicing = _joined(voicing_parts)
        return mir_eval.melody.raw_pitch_accuracy(
            voicing, _joined(ref_cent_parts), voicing, _joined(pred_cent_parts)
        )

    for batch_idx, batch in enumerate(tqdm(batches)):
        inp1 = batch[2]
        inp2 = batch[3]
        label_voice = batch[4].detach().numpy()
        label_f0 = batch[5].detach().numpy()

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            pitch_h1, conf_h1, x_hat1 = model(inp1.float())
            pitch_h2, conf_h2, x_hat2 = model(inp2.float())
        abs_pitch1 = output2hz(pitch_h1.detach().cpu().numpy())
        abs_pitch2 = output2hz(pitch_h2.detach().cpu().numpy())

        # The prediction is the average of the two absolute pitch estimates.
        pred_pitch_batch = np.mean([abs_pitch1, abs_pitch2], axis=0)
        pred_cent_parts.append(mir_eval.melody.hz2cents(np.ravel(pred_pitch_batch)))
        ref_cent_parts.append(mir_eval.melody.hz2cents(np.ravel(label_f0)))
        # Cast to float so the accumulated voicing has the same dtype as before.
        voicing_parts.append(np.ravel(label_voice).astype(np.float64))

        if max_batches is not None and batch_idx + 1 >= max_batches:
            break
        if verbose:
            print("RPA : {} after batch {}".format(_current_rpa(), batch_idx))

    return _current_rpa()

In [None]:
# Evaluate the loaded model on the MIR-1k loader; rpa_on_dataset stops after its first six batches.
rpa_on_dataset(model, mir_test_batches)

In [None]:
# Smoke test of the RPA pipeline against random references; the value is not meaningful.
# NOTE: `pitch_h` comes from the earlier random-input cell, so that cell has to run first.
y_hat = np.apply_along_axis(output2hz, 0, pitch_h.detach().numpy())
y_hat_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, y_hat)
y_hat_voice = np.random.randint(2, size=y_hat.shape)
y = np.random.uniform(low=40, high=65, size=(10,1)) #extrapolate back to time
y_cent = np.apply_along_axis(mir_eval.melody.hz2cents, 0, y)
y_voice = np.random.randint(2, size=y.shape)
raw_pitch_accuracy = mir_eval.melody.raw_pitch_accuracy(y_voice, y_cent, y_hat_voice, y_hat_cent)
raw_pitch_accuracy

In [None]:
# Run the model on the two inputs kept from the batch-peek cell (`inp1`, `inp2`).
pitch_h1,conf_h1,x_hat1 = model(inp1.float())
pitch_h2,conf_h2,x_hat2 = model(inp2.float())

In [None]:
# Convert both pitch-head outputs to Hz with the currently defined output2hz.
y_hat1 = np.apply_along_axis(output2hz, 0, pitch_h1.detach().numpy())
y_hat2 = np.apply_along_axis(output2hz, 0, pitch_h2.detach().numpy())

In [None]:
# Print both Hz predictions, the stacked labels and the shift values, one per line, for manual comparison.
print("{}\n{}\n{}\n{}".format(y_hat1,y_hat2,label,int_shift))