# Single-file inference: convert one utterance with the CycleGAN generator and vocode it with HiFi-GAN

In [None]:

import torch
import torchaudio
import torch.nn.functional as F
import matplotlib.pyplot as plt
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram

# Pretrained SpeechBrain HiFi-GAN vocoder (LibriTTS, 16 kHz); the downloaded
# files are cached under `savedir`.
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-libritts-16kHz",
    savedir="pretrained_models/tts-hifigan-libritts-16kHz",
)

# CycleGAN generator, restored on CPU and switched to inference mode.
# `Generator` is not defined in this cell; it must already be in scope.
# NOTE(review): the variable is named G_y2m but the checkpoint loaded here is
# 'best_G_m2y.pth' -- confirm that this direction is the intended one.
# Alternative checkpoint previously tried here: 'best_G_o2m.pth'.
G_y2m = Generator()
G_y2m.load_state_dict(
    torch.load('./best_G_m2y.pth', map_location=torch.device('cpu'))
)
G_y2m.eval()


def load_audio(file_path, target_sr=16000, duration=5):
    """Load an audio file as a mono waveform of exactly ``duration`` seconds.

    The waveform is DC-corrected, down-mixed to a single channel, resampled to
    ``target_sr`` when the file uses a different rate, and then cropped or
    padded to ``duration * target_sr`` samples. Short clips are padded by
    repeating the clip from its start (cyclic padding), not with silence.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sr: Sample rate (Hz) of the returned waveform.
        duration: Length of the returned waveform in seconds.

    Returns:
        Tensor of shape ``(1, int(duration * target_sr))``.

    Raises:
        ValueError: If the file contains no samples.
    """
    audio, sr = torchaudio.load(file_path)

    # An empty clip cannot be tiled; fail with a clear message instead of a
    # ZeroDivisionError further down.
    if audio.size(1) == 0:
        raise ValueError(f"audio file contains no samples: {file_path}")

    # Remove the DC component (mean over all channels and samples).
    audio = audio - audio.mean()

    # Down-mix multi-channel audio to mono, keeping the channel dimension.
    if audio.size(0) > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Bring the signal to the requested rate so that `duration` really is in
    # seconds; files already at `target_sr` are left untouched.
    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, sr, target_sr)

    # int() keeps the slice bounds valid when `duration` is a float.
    target_length = int(duration * target_sr)
    num_samples = audio.size(1)

    if num_samples > target_length:
        audio = audio[:, :target_length]
    elif num_samples < target_length:
        # Tile the clip often enough to cover the target, then truncate.
        repeat_times = -(-target_length // num_samples)  # ceiling division
        audio = audio.repeat(1, repeat_times)[:, :target_length]

    return audio


# Sample rate shared by the input audio, the mel extraction and the vocoder.
SAMPLE_RATE = 16000

# Load one utterance and keep a copy of the (cropped/padded) input waveform.
# Alternative input used previously:
#   './02_audio_16k/Batch1/Billy_Connoly/74/74_000459_000570_000.wav'
signal = load_audio('./02_audio_16k/Batch2/Bob_Balaban/69/69_000758_002723_000.wav')[0]
torchaudio.save('waveform.wav', signal.unsqueeze(0), SAMPLE_RATE)

# Compute the mel spectrogram of the 1-D waveform; the second return value is
# not needed here.
spectrogram, _ = mel_spectogram(
    audio=signal,
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    n_mels=80,
    n_fft=1024,
    f_min=0.0,
    f_max=8000.0,
    power=1,
    normalized=False,
    min_max_energy_norm=True,
    norm="slaney",
    mel_scale="slaney",
    compression=True
)

# Standardise with the statistics of this utterance.
# NOTE(review): fixed values (mean = -4.9434876, std = 2.0440395) were noted
# here as the CycleGAN training statistics but are not used in this cell --
# confirm which normalisation the generator expects.
mean = spectrogram.mean()
std = spectrogram.std()

# Add the leading batch dimension the generator is fed with: (1, 80, T).
std_spectrogram = ((spectrogram - mean) / std).unsqueeze(0)

with torch.no_grad():
    generated_spectrogram = G_y2m(std_spectrogram).squeeze()

# Undo the standardisation so the result is on the original mel scale again.
generated_spectrogram = generated_spectrogram * std + mean


# Plot the original and the generated mel spectrogram side by side.
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.title("Original Mel Spectrogram")
plt.imshow(spectrogram.squeeze().detach().cpu().numpy(), aspect='auto', origin='lower')

plt.subplot(1, 2, 2)
plt.title("Generated Mel Spectrogram (Denormalized)")
plt.imshow(generated_spectrogram.detach().cpu().numpy(), aspect='auto', origin='lower')

plt.show()

# Vocode both spectrograms with HiFi-GAN. decode_batch is given a batch of
# mels, so each 2-D spectrogram gets a leading batch dimension first (the
# original mel was previously passed unbatched, unlike the generated one).
with torch.no_grad():
    original_waveform = hifi_gan.decode_batch(spectrogram.unsqueeze(0))
    generated_waveform = hifi_gan.decode_batch(generated_spectrogram.unsqueeze(0))

# Optional gain for the generated audio, kept disabled (a factor of 30.0 was
# tried previously):
#   generated_waveform = generated_waveform * 30.0

# Drop the singleton dimension 1 before writing the waveforms to disk.
torchaudio.save('original_waveform.wav', original_waveform.squeeze(1), SAMPLE_RATE)
torchaudio.save('generated_waveform.wav', generated_waveform.squeeze(1), SAMPLE_RATE)

print('done')


# Batch inference: convert every utterance listed in new_audio_samples.csv

In [None]:
import torch
import torchaudio
import torch.nn.functional as F
from tqdm import tqdm
import pandas as pd
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram
import os


# Pretrained SpeechBrain HiFi-GAN vocoder (LibriTTS, 16 kHz); the downloaded
# files are cached under `savedir`.
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-16kHz", savedir="pretrained_models/tts-hifigan-libritts-16kHz")

# CycleGAN generators, switched to inference mode. `Generator` is not defined
# in this cell; it must already be in scope.
# The checkpoints are mapped to CPU: nothing in this cell moves the generators
# or their inputs to a GPU, so mapping to 'cuda' only made loading fail on
# machines without CUDA.
# Used for speakers younger than 35 (presumably young -> middle-aged).
G_y2m = Generator()
G_y2m.load_state_dict(torch.load('./best_G_y2m.pth', map_location=torch.device('cpu')))
G_y2m.eval()

# Used for speakers older than 55 (presumably old -> middle-aged).
G_o2m = Generator()
G_o2m.load_state_dict(torch.load('./best_G_o2m.pth', map_location=torch.device('cpu')))
G_o2m.eval()

def load_audio(file_path, target_sr=16000, duration=3):
    """Load an audio file as a mono waveform of exactly ``duration`` seconds.

    The waveform is DC-corrected, down-mixed to a single channel, resampled to
    ``target_sr`` when the file uses a different rate, and then cropped or
    padded to ``duration * target_sr`` samples. Short clips are padded by
    repeating the clip from its start (cyclic padding), not with silence.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sr: Sample rate (Hz) of the returned waveform.
        duration: Length of the returned waveform in seconds (default 3).

    Returns:
        Tensor of shape ``(1, int(duration * target_sr))``.

    Raises:
        ValueError: If the file contains no samples.
    """
    audio, sr = torchaudio.load(file_path)

    # An empty clip cannot be tiled; fail with a clear message instead of a
    # ZeroDivisionError further down.
    if audio.size(1) == 0:
        raise ValueError(f"audio file contains no samples: {file_path}")

    # Remove the DC component (mean over all channels and samples).
    audio = audio - audio.mean()

    # Down-mix multi-channel audio to mono, keeping the channel dimension.
    if audio.size(0) > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Bring the signal to the requested rate so that `duration` really is in
    # seconds; files already at `target_sr` are left untouched.
    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, sr, target_sr)

    # int() keeps the slice bounds valid when `duration` is a float.
    target_length = int(duration * target_sr)
    num_samples = audio.size(1)

    if num_samples > target_length:
        audio = audio[:, :target_length]
    elif num_samples < target_length:
        # Tile the clip often enough to cover the target, then truncate.
        repeat_times = -(-target_length // num_samples)  # ceiling division
        audio = audio.repeat(1, repeat_times)[:, :target_length]

    return audio

def process_and_save(file_path, age, target_dir, source_dir='02_audio_16k'):
    """Convert one utterance according to the speaker's age and save it.

    The audio is loaded with ``load_audio``, turned into an 80-band mel
    spectrogram, optionally passed through a CycleGAN generator, vocoded with
    HiFi-GAN and written as 16-bit PCM WAV to ``target_dir/file_path``.

    Generator selection:
        * ``age < 35``  -> ``G_y2m``
        * ``age > 55``  -> ``G_o2m``
        * otherwise     -> no conversion; the mel spectrogram is vocoded as is.

    Args:
        file_path: Audio path relative to ``source_dir``; the same relative
            path is used below ``target_dir`` for the output file.
        age: Speaker age used to pick the generator.
        target_dir: Root directory for the generated audio; missing
            sub-directories are created.
        source_dir: Root directory of the input audio.
    """
    signal = load_audio(os.path.join(source_dir, file_path))

    # Compute the mel spectrogram; the second return value is not needed.
    spectrogram, _ = mel_spectogram(
        audio=signal.squeeze(),
        sample_rate=16000,
        hop_length=256,
        win_length=1024,
        n_mels=80,
        n_fft=1024,
        f_min=0.0,
        f_max=8000.0,
        power=1,
        normalized=False,
        min_max_energy_norm=True,
        norm="slaney",
        mel_scale="slaney",
        compression=True
    )

    # Mean and standard deviation from the CycleGAN training stage.
    mean = -4.9434876
    std = 2.0440395

    # Pick the generator for this speaker's age group (None = pass-through).
    if age < 35:
        generator = G_y2m
    elif age > 55:
        generator = G_o2m
    else:
        generator = None

    if generator is None:
        # No conversion: standardising and immediately de-standardising would
        # only add rounding error, so the mel spectrogram is used directly.
        generated_spectrogram = spectrogram
    else:
        # Standardise and add a leading batch dimension for the generator.
        std_spectrogram = ((spectrogram - mean) / std).unsqueeze(0)
        with torch.no_grad():
            generated_spectrogram = generator(std_spectrogram).squeeze()
        # Undo the standardisation to return to the original mel scale.
        generated_spectrogram = generated_spectrogram * std + mean

    # Vocode the (batched) mel spectrogram back to a waveform with HiFi-GAN.
    with torch.no_grad():
        generated_waveform = hifi_gan.decode_batch(generated_spectrogram.unsqueeze(0))

    save_path = os.path.join(target_dir, file_path)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Save as 16-bit signed PCM WAV so the files stay Kaldi-compatible.
    torchaudio.save(save_path, generated_waveform.squeeze(1), 16000, encoding="PCM_S", bits_per_sample=16)

# Read the manifest that lists every utterance to convert.
df = pd.read_csv('new_audio_samples.csv')

# Relative audio paths and the matching speaker ages, row-aligned.
audio_files, ages = df['fname'].tolist(), df['age'].tolist()

# Convert each utterance and write the result below the output root; tqdm
# takes the progress-bar total from the length of the materialised list.
target_dir = './generated_audio'
for file_path, age in tqdm(list(zip(audio_files, ages))):
    process_and_save(file_path, age, target_dir)

print("all audios are saved in ", target_dir)
