In [1]:
import platform

import librosa
import numpy as np
import scipy

import os
from glob import glob
from tqdm import tqdm

# Report the interpreter and library versions this notebook was executed with,
# one "<expression>: <value>" line per component.
print(
    "\n".join(
        [
            f"platform.python_version(): {platform.python_version()}",
            f"np.version.version: {np.version.version}",
            f"scipy.__version__: {scipy.__version__}",
            f"librosa.version.version: {librosa.version.version}",
        ]
    )
)

platform.python_version(): 3.9.13
np.version.version: 1.22.4
scipy.__version__: 1.7.3
librosa.version.version: 0.10.2.post1


In [2]:
# Earlier version of get_mfcc, kept commented out for reference only.
# The active definition in the following cell supersedes it and additionally
# exposes n_fft, sr, fmax and n_mels.
# def get_mfcc(file_path, n_mfcc, max_pad_len):
#     # Load the audio file and convert it to mono
#     audio, sample_rate = librosa.load(file_path, mono=True, sr=None)
#     # Compute the MFCCs
#     mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
#     # Work out how many columns must be padded (>= 0) or truncated (< 0)
#     pad_width = max_pad_len - mfccs.shape[1]
#     if pad_width < 0:
#         # Truncate
#         mfccs = mfccs[:, :max_pad_len]
#     else:
#         # Pad
#         mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
#     return mfccs

wav轉mfcc的function

In [3]:
def get_mfcc(file_path, n_mfcc, max_pad_len, n_fft=2048, sr=22050, fmax=None, n_mels=128):
    """Load an audio file and return its MFCCs with a fixed number of columns.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        max_pad_len: Exact number of columns in the returned matrix. Longer
            results are truncated; shorter ones are zero-padded on the right.
        n_fft: Upper bound for the FFT window size. It is reduced to the number
            of loaded samples when the clip is shorter than this.
        sr: Sampling rate passed to ``librosa.load``.
        fmax: Forwarded unchanged to ``librosa.feature.mfcc``.
        n_mels: Forwarded unchanged to ``librosa.feature.mfcc``.

    Returns:
        The MFCC array with its second axis cut or zero-padded to exactly
        ``max_pad_len`` columns.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    # Load the audio file, down-mixed to mono.
    audio, sample_rate = librosa.load(file_path, mono=True, sr=sr)

    # An empty clip would clamp n_fft to 0 below, which is not a usable FFT
    # size; fail early with a message that names the offending file.
    if len(audio) == 0:
        raise ValueError(f"Audio file contains no samples: {file_path}")

    # Keep n_fft no larger than the number of available samples.
    n_fft = min(n_fft, len(audio))

    # Compute the Mel-frequency cepstral coefficients (MFCC).
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft, n_mels=n_mels, fmax=fmax)

    # Truncate to at most max_pad_len columns, then zero-pad any shortfall.
    mfccs = mfccs[:, :max_pad_len]
    missing_cols = max_pad_len - mfccs.shape[1]
    if missing_cols > 0:
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, missing_cols)), mode='constant')

    return mfccs

取得資料夾 'audio' 底下的所有wav檔案放入 'wav_path_list'

In [4]:
# Collect every .wav file directly under the 'audio' folder of the current
# working directory. The pattern is assembled with os.path.join so the lookup
# uses the host OS path separator instead of a hard-coded Windows backslash.
wav_path_list: list = glob(os.path.join(os.getcwd(), "audio", "*.wav"))
print(f"type(wav_list): {type(wav_path_list)}")
print(f"len(wav_list): {len(wav_path_list)}")
# wav_path_list

type(wav_list): <class 'list'>
len(wav_list): 34215


取得資料夾 'record' 底下的所有wav檔案放入 'record_wav_path_list'

In [5]:
# Collect every .wav file directly under the 'record' folder of the current
# working directory. The pattern is assembled with os.path.join so the lookup
# uses the host OS path separator instead of a hard-coded Windows backslash.
record_wav_path_list: list = glob(os.path.join(os.getcwd(), "record", "*.wav"))
print(f"type(record_wav_path_list): {type(record_wav_path_list)}")
print(f"len(record_wav_path_list): {len(record_wav_path_list)}")
# record_wav_path_list

type(record_wav_path_list): <class 'list'>
len(record_wav_path_list): 3043


取得 'samplePinyin\\Male' 資料夾底下 1467 個拼音範例音檔（來源：教育部）的路徑，並將檔名中的拼音標籤放入 'sample_pinyin_list'

In [6]:
# Reference recordings live under samplePinyin/Male; build the pattern with
# os.path.join so it is not tied to Windows path separators.
sample_list: list = glob(os.path.join(os.getcwd(), "samplePinyin", "Male", "*.wav"))

# The label is the part of the file name (extension removed) after the first
# underscore. Only the base name is inspected, so an underscore or ".wav" in a
# parent directory name cannot corrupt the label. Files whose name has no
# underscore carry no label and are left out.
sample_pinyin_list: list = []
for sample_path in sample_list:
    stem = os.path.splitext(os.path.basename(sample_path))[0]
    _, separator, label = stem.partition("_")
    if separator:
        sample_pinyin_list.append(label)

print(f"type(sample_pinyin_list): {type(sample_pinyin_list)}")
print(f"len(sample_pinyin_list): {len(sample_pinyin_list)}")
# sample_pinyin_list

type(sample_pinyin_list): <class 'list'>
len(sample_pinyin_list): 1467


mfcc 設定

In [7]:
n_mfcc: int = 13  # row
max_pad_len: int = 44  # column

# Make sure the 'mfcc' output folder exists under the current working
# directory. makedirs(exist_ok=True) is a no-op when the folder is already
# there, and os.path.join keeps the path valid on any OS.
mfcc_dir = os.path.join(os.getcwd(), "mfcc")
os.makedirs(mfcc_dir, exist_ok=True)

'record'下的wav轉成mfcc放入'mfcc'

In [8]:
# Convert the wav files from 'record' into MFCC arrays saved as .npy in 'mfcc'.
mfcc_dir = os.path.join(os.getcwd(), "mfcc")
os.makedirs(mfcc_dir, exist_ok=True)
valid_labels = set(sample_pinyin_list)  # set gives O(1) membership checks

for wav_path in tqdm(record_wav_path_list):
    # Parse the label from the base name only (text after the first
    # underscore), so underscores elsewhere in the path cannot interfere.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    _, separator, label = stem.partition("_")

    # Skip files whose label is missing or is not a known pinyin label,
    # before paying for the MFCC computation.
    if not separator or label not in valid_labels:
        continue

    mfcc = get_mfcc(file_path=wav_path, n_mfcc=n_mfcc, max_pad_len=max_pad_len)
    # Build the output path explicitly instead of string-replacing "record"
    # and ".wav", which would also rewrite matching text in directory or
    # file names.
    np.save(file=os.path.join(mfcc_dir, f"{stem}.npy"), arr=mfcc)

100%|██████████| 3043/3043 [00:40<00:00, 74.94it/s] 


'audio'下的wav轉成mfcc放入'mfcc'

In [9]:
# Convert the wav files from 'audio' into MFCC arrays and save each one as an
# .npy file in the 'mfcc' folder.
mfcc_dir = os.path.join(os.getcwd(), "mfcc")
os.makedirs(mfcc_dir, exist_ok=True)
valid_labels = set(sample_pinyin_list)  # set gives O(1) membership checks

for wav_path in tqdm(wav_path_list):
    # Parse "<prefix>_<label>" from the base name only, so underscores
    # elsewhere in the path cannot interfere.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    prefix, separator, label = stem.partition("_")

    # No underscore or an empty label: nothing to match, skip the file
    # (also avoids indexing label[-1] on an empty string).
    if not separator or not label:
        continue

    # Reconcile first-tone and neutral-tone labels with the reference list:
    # a trailing "1" is dropped, and a label not ending in 2/3/4 gets "5".
    if label[-1] == "1":
        label = label[:-1]  # strip only the trailing tone digit
    elif label[-1] not in ("2", "3", "4"):
        label = f"{label}5"

    # Skip files whose normalised label is not a known pinyin label, before
    # paying for the MFCC computation.
    if label not in valid_labels:
        continue

    mfcc = get_mfcc(file_path=wav_path, n_mfcc=n_mfcc, max_pad_len=max_pad_len)
    # The output file keeps the original prefix and uses the normalised label.
    np.save(file=os.path.join(mfcc_dir, f"{prefix}_{label}.npy"), arr=mfcc)

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
100%|██████████| 34215/34215 [03:55<00:00, 145.55it/s]
