Here is an example of how to make the mel spectrograms produced by librosa and torchaudio match. <br>
I do not know which set of parameters is more accurate, but matching them reduces the risk of training on an image that looks different from the one you analyzed with librosa.

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import typing as tp
import yaml
import random
import os
import sys
import soundfile as sf
import librosa
import librosa.display
import cv2
import matplotlib.pyplot as plt
import time
import pickle
import glob
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data

import torchaudio

In [None]:
class CFG:
    """Spectrogram settings passed to both the librosa and torchaudio calls."""

    # Mel filter bank: number of bands and the frequency range it covers.
    n_mels: int = 128
    fmin: int = 20
    fmax: int = 16000

    # Short-time Fourier transform: FFT size and hop between frames.
    n_fft: int = 2048
    hop_length: int = 512

In [None]:
# Root of the competition data, plus the audio folder and the metadata
# table located directly beneath it.
DATA_DIR = Path("../input/birdclef-2022")
train_wav_dir = DATA_DIR.joinpath("train_audio")
train_csv_path = DATA_DIR.joinpath("train_metadata.csv")

In [None]:
# Load the metadata table; the "filename" and "primary_label" columns of
# one of its rows are read when a sample recording is picked.
train = pd.read_csv(train_csv_path)

In [None]:
# Work on one fixed row of the metadata table; enable the commented line
# instead to draw a random row.
# idx = np.random.randint(len(train))
idx = 100
raw = train.iloc[idx]

wav_name, ebird_code = raw["filename"], raw["primary_label"]

# always_2d=True returns a two-dimensional array even for mono files, so
# taking column 0 selects the first channel in every case.
y, sr = sf.read(train_wav_dir / wav_name, always_2d=True)
y = y[:, 0]

In [None]:
def compare_mels(mels, titles):
    """Plot mel spectrograms one above the other and print their max difference.

    Args:
        mels: Sequence of spectrogram arrays; each one gets its own subplot.
        titles: Sequence of subplot titles, indexed in step with ``mels``.

    The plots read the module-level sample rate ``sr`` together with
    ``CFG.fmax`` and ``CFG.hop_length``. When at least two spectrograms are
    given, the largest absolute element-wise difference between the first
    two is printed; with fewer than two there is nothing to compare, so
    only the plot is shown.
    """
    # squeeze=False keeps `axes` a 2-D array even for a single row, so a
    # lone spectrogram can be plotted without special-casing.
    _, axes = plt.subplots(len(mels), 1, figsize=(12, len(mels) * 4), squeeze=False)
    for i, (axis, mel) in enumerate(zip(axes[:, 0], mels)):
        # Pass hop_length explicitly so the time axis follows CFG instead of
        # specshow's own default.
        librosa.display.specshow(
            mel,
            y_axis='mel',
            fmax=CFG.fmax,
            x_axis='time',
            sr=sr,
            hop_length=CFG.hop_length,
            ax=axis,
        )
        axis.set_title(titles[i])
    plt.tight_layout()
    plt.show()

    if len(mels) < 2:
        return

    diff = np.max(np.abs(mels[1] - mels[0]))
    print(f'max diff = {diff}')

### common case

In [None]:
# Baseline librosa pipeline: only the shared CFG values are passed, every
# other option stays at the library default, and decibels are measured
# relative to the largest value in the spectrogram (ref=np.max).
melspec = librosa.feature.melspectrogram(
    y=y,
    sr=sr,
    n_fft=CFG.n_fft,
    hop_length=CFG.hop_length,
    n_mels=CFG.n_mels,
    fmin=CFG.fmin,
    fmax=CFG.fmax,
)
melspec = librosa.power_to_db(melspec, ref=np.max).astype(np.float32)

In [None]:
# Baseline torchaudio pipeline: mel spectrogram followed by a decibel
# conversion, with only the shared CFG values passed in.
logmelspec_extractor = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=CFG.n_fft,
        win_length=CFG.n_fft,
        hop_length=CFG.hop_length,
        f_min=CFG.fmin,
        f_max=CFG.fmax,
        n_mels=CFG.n_mels,
    ),
    torchaudio.transforms.AmplitudeToDB(),
)
# reshape(1, -1) puts a leading axis of length 1 in front of the samples;
# the trailing [0] strips that axis from the NumPy result again.
torch_melspec = logmelspec_extractor(
    torch.tensor(y.reshape(1, -1)).float()
).numpy()[0]

In [None]:
# Pair each baseline spectrogram with the label used as its subplot title.
mels = [melspec, torch_melspec]
titles = ['librosa', 'torch_audio']

In [None]:
# Plot the two baseline spectrograms and print their maximum absolute difference.
compare_mels(mels, titles)

## after correction

The fixes are as follows.

1. Compute the mel spectrogram
Set the parameters to the values listed [here](https://github.com/pytorch/audio/issues/1058).

2. Take the log (convert to decibels)
`ref` is not specified because it cannot be set in torchaudio. Also, `top_db` should be set to `None`.

In [None]:
# librosa side of the matched setup: padding, filter-bank normalisation
# and mel scale are spelled out instead of being left to the defaults.
melspec = librosa.feature.melspectrogram(
    y=y,
    sr=sr,
    n_fft=CFG.n_fft,
    hop_length=CFG.hop_length,
    n_mels=CFG.n_mels,
    fmin=CFG.fmin,
    fmax=CFG.fmax,
    center=True,
    pad_mode="reflect",
    # power=1.0,  (kept disabled, so the default exponent is used)
    norm="slaney",
    htk=True,
)
# `ref` is left at its default and top_db=None disables clipping of the
# lowest values.
melspec = librosa.power_to_db(melspec, top_db=None).astype(np.float32)

In [None]:
# torchaudio side of the matched setup: padding, filter-bank normalisation
# and mel scale are set explicitly to the same choices as the librosa call.
# The `onesided` argument is intentionally not passed: MelSpectrogram
# documents it as deprecated and unused, so supplying it only triggers a
# warning without affecting the output.
logmelspec_extractor = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=CFG.n_fft,
        win_length=CFG.n_fft,
        hop_length=CFG.hop_length,
        f_min=CFG.fmin,
        f_max=CFG.fmax,
        n_mels=CFG.n_mels,
        center=True,
        pad_mode="reflect",
        norm="slaney",
        mel_scale="htk",
    ),
    torchaudio.transforms.AmplitudeToDB(),
)
# reshape(1, -1) puts a leading axis of length 1 in front of the samples;
# the trailing [0] strips that axis from the NumPy result again.
torch_melspec = logmelspec_extractor(
    torch.tensor(y.reshape(1, -1)).float()
).numpy()[0]

In [None]:
# Pair each corrected spectrogram with the label used as its subplot title.
mels = [melspec, torch_melspec]
titles = ['librosa', 'torch_audio']

In [None]:
# Plot the two corrected spectrograms and print their maximum absolute difference.
compare_mels(mels, titles)

If you know of a way to make the difference even smaller, please let me know.