## Diffusion Model

In [1]:
import torch
import librosa
from IPython.display import Audio
import torch.nn.functional as F


# Definizione della funzione per caricare il checkpoint
def load_model_from_checkpoint(checkpoint_path, model_class, params, device):
    model = model_class(params).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Imposta il modello in modalità valutazione
    return model

def interpolate_audio_signal(audio_signal, scale_factor = 2):
    """
    Effettua un'interpolazione lineare su un segnale audio PyTorch Tensor per raddoppiarne la lunghezza.
    
    :param audio_signal: Tensor di PyTorch contenente il segnale audio. Dimensioni previste [batch_size, channels, length].
    :param scale_factor: Fattore di scala per la lunghezza del segnale. Es: 2 per raddoppiare la lunghezza.
    :return: Tensor di PyTorch contenente il segnale audio interpolato.
    """
    # Interpolazione lungo l'ultimo asse
    # mode='linear' quando lavori con 3D assume 'linear' lungo l'asse W di [N, C, L]
    # align_corners=False per evitare artefatti agli estremi
    interpolated_signal = F.interpolate(audio_signal, scale_factor=scale_factor, mode='linear', align_corners=False)
    return interpolated_signal

# Percorsi ai file WAV a 24kHz
file_path_24 = '/media/nvme_4tb/simone_data/VoiceBank/clean_testset_wav_24khz/p257_085.wav'

# Carica gli audio con librosa
conditioning_audio, sr = librosa.load(file_path_24, sr=24000)

conditioning_audio_tensor = torch.from_numpy(conditioning_audio).unsqueeze(0).unsqueeze(0)  # [1, 1, L]
conditioned_audio_interpolated = interpolate_audio_signal(conditioning_audio_tensor, scale_factor=2)

print("L'audio conditioning ha shape:", conditioning_audio.shape)
print("L'audio conditioning interpolato ha shape:", conditioned_audio_interpolated.shape)

# Audio conditioning a 24KhZ
Audio(data=conditioning_audio, rate=sr)

L'audio conditioning ha shape: (60627,)
L'audio conditioning interpolato ha shape: torch.Size([1, 1, 121254])


In [2]:
import sys
sys.path.append('/home/simone')

import numpy as np

# Aggiungi le funzioni necessarie per il caricamento del modello e l'interpolazione
from VideoMamba.Train_AudioMamba3 import Params
from VideoMamba.DiffWave_simone3 import DiffWave

def load_audio(file_path, target_sample_rate=48000):
    # Carica il file audio con Librosa
    audio, sr = librosa.load(file_path, sr=target_sample_rate)
    return audio

# Funzione di caricamento del modello dal checkpoint
def load_model_from_checkpoint(checkpoint_path, model_class, params, device):
    model = model_class(params).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    # for k, v in checkpoint.items():
    #     if "norm_layer" in k:
    #         checkpoint.pop(k)

    model.eval()  # Imposta il modello in modalità valutazione
    return model

conditioning_audio, sr = librosa.load(file_path_24, sr=24000)

conditioning_audio_tensor = torch.from_numpy(conditioning_audio).unsqueeze(0).unsqueeze(0)  # [1, 1, L]
conditioned_audio_interpolated = interpolate_audio_signal(conditioning_audio_tensor, scale_factor=2)

# Carica il modello dal checkpoint
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
params = Params(
        residual_channels=32,
        noise_schedule_params=(1e-6, 0.006, 300),
        unconditional=False,
        n_mels=10,
        residual_layers=11,
        dilation_cycle_length=5000, #150
        device=device
    )

model = load_model_from_checkpoint("/media/nvme_4tb/simone_data/VoiceBank/checkpoints_NEW2/checkpoint_epoch_5.pt", DiffWave, params, device)

input_audio = torch.randn(2 * conditioning_audio.shape[-1])

# Converti l'audio di input e il conditioning in tensori PyTorch
input_audio = input_audio.unsqueeze(0).unsqueeze(0).to(device)  # [1, 1, L]
conditioned_audio_interpolated = conditioned_audio_interpolated.squeeze(0).to(device)  # [1, 1, L]

print(f"Input: {input_audio.shape}, Conditioning: {conditioned_audio_interpolated.shape}")

# Esegui il sampling (processo di denoising)
sampled_audio = model.sample(steps=len(params.noise_schedule), conditioning=conditioning_audio_tensor, audio_length=input_audio.size(-1))

# Converti l'audio generato in formato NumPy per poterlo salvare e ascoltare
sampled_audio_np = sampled_audio.squeeze().cpu().numpy()

# Stampa finale per confermare la fine del processo
print(f"Sampled audio shape: {sampled_audio_np.shape}")

display(Audio(data=sampled_audio_np, rate=48000))


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(




Input: torch.Size([1, 1, 121254]), Conditioning: torch.Size([1, 121254])
Sampled audio shape: (121254,)


In [None]:
#conditioning_audio
import numpy as np
import matplotlib.pyplot as plt

# Assumendo che `sampled_audio_np` sia il tuo array numpy
# Esempio: sampled_audio_np = np.array([...])

X = conditioning_audio
X = sampled_audio_np

# Creare l'istogramma
plt.figure(figsize=(10, 6))
plt.hist(X, bins=100, color='blue', edgecolor='black', alpha=0.7)
plt.title('Istogramma dei Valori del Campione Audio')
plt.xlabel('Valore')
plt.ylabel('Frequenza')
plt.grid(True)
plt.show()


In [21]:
# Qui calcolo il SNR:

file_path_test = '/media/nvme_4tb/simone_data/VoiceBank/clean_testset_wav/p257_085.wav'

# Carica gli audio con librosa
test_audio, sr = librosa.load(file_path_24, sr=48000)

test_audio_tensor = torch.from_numpy(test_audio).unsqueeze(0).unsqueeze(0)  # [1, 1, L]


# Da scipy < 0.16.0
def signaltonoise(a, axis=0, ddof=0):
    """
    The signal-to-noise ratio of the input data.

    Returns the signal-to-noise ratio of a, here defined as the mean
    divided by the standard deviation.

    Parameters
    ----------
    a : array_like
        An array_like object containing the sample data.
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array a.
    ddof : int, optional
        Degrees of freedom correction for standard deviation. Default is 0.

    Returns
    -------
    s2n : ndarray
        The mean to standard deviation ratio(s) along axis, or 0 where the
        standard deviation is 0.

    """
    a = np.asanyarray(a)
    m = a.mean(axis)
    sd = a.std(axis=axis, ddof=ddof)
    return np.where(sd == 0, 0, m/sd)

#snr_value = signaltonoise(sampled_audio_np) #conditioning_audio
#print(f"Signal-to-Noise Ratio (SciPy): {snr_value}")


# NU-Wave
def snr(pred, target):
    pred = torch.tensor(pred)
    target = torch.tensor(target)
    return (20 *torch.log10(torch.norm(target, dim=-1) \
                /torch.norm(pred -target, dim =-1).clamp(min =1e-8))).mean()

snr_value = snr(sampled_audio_np, test_audio_tensor)
print(f"Signal-to-Noise Ratio (NU-Wave): {snr_value}")


Signal-to-Noise Ratio (NU-Wave): 0.9276686906814575


  target = torch.tensor(target)


In [None]:
# Log spectral distance

import torch.nn as nn

class STFTMag(nn.Module):
    def __init__(self,
                 nfft=1024,
                 hop=256):
        super().__init__()
        self.nfft = nfft
        self.hop = hop
        self.register_buffer('window', torch.hann_window(nfft), False)

    #x: [B,T] or [T]
    @torch.no_grad()
    def forward(self, x):
        T = x.shape[-1]
        stft = torch.stft(x,
                          self.nfft,
                          self.hop,
                          window=self.window,
                          )#return_complex=False)  #[B, F, TT,2]
        mag = torch.norm(stft, p=2, dim =-1) #[B, F, TT]
        return mag

stft = STFTMag()

def lsd(pred, target):
    sp = torch.log10(stft(pred).square().clamp(1e-8))
    st = torch.log10(stft(target).square().clamp(1e-8))
    return (sp - st).square().mean(dim=1).sqrt().mean()

lsd_value = snr(sampled_audio_np, test_audio_tensor)
print(f"Signal-to-Noise Ratio (NU-Wave): {lsd_value}")