In [None]:
import torch
import torchaudio
import pandas as pd
import numpy as np
from torch.nn import functional as F
from tqdm import tqdm
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram

def preprocess_audio(audio, sr=16000, duration=3):
    """Crop or zero-pad a waveform to a fixed number of samples.

    The target length is ``round(sr * duration)`` samples, measured along
    the last dimension of ``audio``.

    Args:
        audio: Waveform tensor whose last dimension is time, e.g.
            ``(channels, samples)`` or ``(samples,)``.
        sr: Sample rate in Hz.
        duration: Target duration in seconds; may be fractional.

    Returns:
        A tensor with the same leading dimensions as ``audio`` and exactly
        ``round(sr * duration)`` samples in the last dimension. Longer input
        is truncated at the end, shorter input is right-padded with zeros,
        and input that already has the target length is returned unchanged.
    """
    # Slice bounds and pad widths must be integers, so a fractional
    # duration (e.g. 1.5 s) has to be rounded to a whole sample count.
    target_length = int(round(sr * duration))
    num_samples = audio.shape[-1]

    if num_samples > target_length:
        return audio[..., :target_length]
    if num_samples < target_length:
        # F.pad's (left, right) pair applies to the last dimension.
        return F.pad(audio, (0, target_length - num_samples))
    return audio

def load_and_process_audio(fname, root_dir='02_audio_16k', sr=16000, duration=3):
    """Load one audio file and convert it to a fixed-length mel spectrogram.

    Steps: load the file, resample to ``sr`` if the file was stored at a
    different rate, subtract the mean (DC offset), mix down to mono, crop or
    pad to ``duration`` seconds, then compute the mel spectrogram.

    Args:
        fname: File name relative to ``root_dir``.
        root_dir: Directory that contains the audio files.
        sr: Sample rate in Hz that the mel parameters below are meant for.
        duration: Length in seconds that every clip is cropped/padded to.

    Returns:
        A NumPy array holding the mel spectrogram with a leading batch
        dimension of size 1. With ``n_mels=80`` the original author notes
        the shape as ``(1, 80, T)``.
    """
    # Load the audio file.
    full_path = f"{root_dir}/{fname}"
    audio, file_sr = torchaudio.load(full_path)

    # The crop length and mel parameters are all expressed for `sr`. If the
    # file was stored at another rate, resample instead of silently
    # interpreting its samples at the wrong rate.
    if file_sr != sr:
        audio = torchaudio.functional.resample(audio, orig_freq=file_sr, new_freq=sr)

    # Remove the DC component.
    audio = audio - audio.mean()

    # Multi-channel (e.g. stereo) input is averaged down to a single channel.
    if audio.size(0) > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Crop or zero-pad to the fixed duration.
    audio = preprocess_audio(audio, sr, duration)

    # Use the same mel_spectogram function as the vocoder so the features
    # match what it expects. The second return value is not needed here.
    mel_spec, _ = mel_spectogram(
        audio=audio.squeeze(),
        sample_rate=sr,
        hop_length=256,
        win_length=1024,
        n_mels=80,
        n_fft=1024,
        f_min=0.0,
        f_max=8000.0,
        power=1,
        normalized=False,
        min_max_energy_norm=True,
        norm="slaney",
        mel_scale="slaney",
        compression=True
    )

    # Add a batch dimension in front and hand back a NumPy array.
    return mel_spec.unsqueeze(0).numpy()

# Load the sample metadata table.
data = pd.read_csv('new_audio_samples.csv')

# Keep only the rows whose file name starts with one of these batch prefixes.
# NOTE(review): a prefix such as 'Batch1' also matches 'Batch10', 'Batch11',
# ... -- confirm that is intended.
# .copy() detaches the filtered frame from the original one, so adding the
# 'mel_spec' column below is a plain column insert rather than a write into a
# slice (which pandas reports as SettingWithCopyWarning).
batch_prefixes = ('Batch1', 'Batch2', 'Batch3', 'Batch4', 'Batch5',
                  'Batch7', 'Batch6', 'Batch8', 'Batch9')
data = data[data['fname'].str.startswith(batch_prefixes)].copy()
# data = data[data['fname'].str.startswith(('Batch1','Batch3'))]

# Compute one mel spectrogram per row and store them in a new column.
mel_specs = [
    load_and_process_audio(fname)
    for fname in tqdm(data['fname'], total=len(data))
]
data['mel_spec'] = mel_specs

# Compute the global mean and standard deviation over all spectrograms.
# Earlier variant that concatenated along the time axis instead of stacking:
# all_specs = np.concatenate(data['mel_spec'].values, axis=2)
# global_mean = all_specs.mean()
# global_std = all_specs.std()
# print('Global mean:', global_mean, 'Global std:', global_std)
# Values recorded from a previous run: middle mean -4.855521 std 2.1557724
all_specs = np.stack(data['mel_spec'].values)
global_mean = all_specs.mean()
global_std = all_specs.std()
print('middle mean',global_mean, 'std',global_std)


# Age-group boundaries. Note that both boundary ages (35 and 55) fall into
# the middle-aged group: youth is strictly below 35, old is strictly above 55.
youth_age_max = 35
old_age_min = 55

# Split into youth / middle-aged / old. Each group is an explicit copy so it
# can be modified later without writing into a slice of `data`.
youth_data = data[data['age'] < youth_age_max].copy()
middle_age_data = data[(data['age'] >= youth_age_max) & (data['age'] <= old_age_min)].copy()
old_age_data = data[data['age'] > old_age_min].copy()

def normalize(mel_spec, mean, std):
    """Standardize ``mel_spec`` by subtracting ``mean`` and dividing by ``std``.

    Works on anything that supports ``-`` and ``/`` with the given statistics
    (plain numbers, NumPy arrays, ...). No guard is applied for ``std == 0``.
    """
    centered = mel_spec - mean
    return centered / std

# Standardize every age group with the same global mean and standard
# deviation, so all three sets share one scale.
# .assign() returns a new frame with the replaced column instead of writing
# into a frame that was produced by boolean filtering, which avoids pandas'
# chained-assignment warning.
youth_data = youth_data.assign(
    mel_spec=youth_data['mel_spec'].apply(lambda spec: normalize(spec, global_mean, global_std))
)
middle_age_data = middle_age_data.assign(
    mel_spec=middle_age_data['mel_spec'].apply(lambda spec: normalize(spec, global_mean, global_std))
)
old_age_data = old_age_data.assign(
    mel_spec=old_age_data['mel_spec'].apply(lambda spec: normalize(spec, global_mean, global_std))
)

# Save each processed group to its own pickle file.
youth_data.to_pickle('youth_data_16k.pkl')
middle_age_data.to_pickle('middle_age_data_16k.pkl')
old_age_data.to_pickle('old_age_data_16k.pkl')

