In [3]:
import platform
print(f"platform.python_version(): {platform.python_version()}")

# conda install -c conda-forge librosa
import librosa
import numpy as np
import scipy

import os
from glob import glob
from tqdm import tqdm

# Report the versions of the numerical/audio libraries this notebook was run with.
numpy_version = np.version.version
scipy_version = scipy.__version__
librosa_version = librosa.version.version

print(f"np.version.version: {numpy_version}")
print(f"scipy.__version__: {scipy_version}")
print(f"librosa.version.version: {librosa_version}")

platform.python_version(): 3.9.13
np.version.version: 2.0.2
scipy.__version__: 1.13.1
librosa.version.version: 0.10.2.post1


In [4]:
# def get_mfcc(file_path, n_mfcc, max_pad_len):
#     # 讀取音檔，轉為單聲道
#     audio, sample_rate = librosa.load(file_path, mono=True, sr=None)
#     # 計算 MFCC
#     mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
#     # 計算填充或截斷的長度
#     pad_width = max_pad_len - mfccs.shape[1]
#     if pad_width < 0:
#         # 截斷
#         mfccs = mfccs[:, :max_pad_len]
#     else:
#         # 填充
#         mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
#     return mfccs

wav轉mfcc的function

In [5]:
def get_mfcc(file_path, n_mfcc, max_pad_len, n_fft=2048, sr=22050, fmax=None, n_mels=128):
    """Load an audio file and return its MFCC matrix with a fixed number of frames.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute (rows of the result).
        max_pad_len: Number of frames (columns) of the result. Shorter MFCC
            matrices are zero-padded on the right, longer ones are truncated.
        n_fft: Requested FFT window size. It is reduced to the signal length
            when the signal has fewer samples than the window.
        sr: Sample rate forwarded to ``librosa.load``.
        fmax: Upper frequency bound forwarded to ``librosa.feature.mfcc``.
        n_mels: Number of mel bands forwarded to ``librosa.feature.mfcc``.

    Returns:
        The MFCC array padded or truncated to ``max_pad_len`` columns.

    Raises:
        ValueError: If the file decodes to zero samples.
    """
    # Load the file as a mono signal.
    audio, sample_rate = librosa.load(file_path, mono=True, sr=sr)

    # An empty signal would clamp n_fft to 0 below and fail deep inside librosa
    # with an error that does not mention the file; fail early and name it.
    if len(audio) == 0:
        raise ValueError(f"audio file contains no samples: {file_path!r}")

    # Keep n_fft no larger than the signal length.
    # NOTE(review): for very short clips this can leave fewer FFT bins than mel
    # bands, which appears to be what triggers librosa's mel-filter warning.
    n_fft = min(n_fft, len(audio))

    # Compute the Mel-frequency cepstral coefficients.
    mfccs = librosa.feature.mfcc(
        y=audio, sr=sample_rate, n_mfcc=n_mfcc, n_fft=n_fft, n_mels=n_mels, fmax=fmax
    )

    # Force the time axis to exactly max_pad_len frames.
    n_frames = mfccs.shape[1]
    if n_frames > max_pad_len:
        return mfccs[:, :max_pad_len]
    return np.pad(mfccs, pad_width=((0, 0), (0, max_pad_len - n_frames)), mode="constant")

取得資料夾 'audio' 底下的所有wav檔案放入 'wav_path_list'

In [6]:
# Collect every .wav file directly under ./audio.
# os.path.join keeps the pattern valid on any OS (a hard-coded "\\" only works
# on Windows) and sorting makes the processing order reproducible.
wav_path_list: list = sorted(glob(os.path.join(os.getcwd(), "audio", "*.wav")))
print(f"type(wav_list): {type(wav_path_list)}")
print(f"len(wav_list): {len(wav_path_list)}")
wav_path_list

type(wav_list): <class 'list'>
len(wav_list): 22496


['D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊也是一樣輸入email-4_shi4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊也是一樣輸入email-7_shu1.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊也是一樣輸入email-8_ru4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊去執行這個app-1_zhe4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊去執行這個app-4_zhi2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊去執行這個app-5_xing2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊去執行這個app-6_zhe4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\android這邊去執行這個app-7_ge4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\audi有極大機會在未來與sauber正式結盟-11_zheng4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\audi有極大機會在未來與sauber正式結盟-13_jie2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\audi有極大機會在未來與sauber正式結盟-14_meng2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\audio\\audi有極大機會在未來與sauber正式結盟-1_you3.wav',
 'D:\\Coding\\SchoolProjec

取得資料夾 'record' 底下的所有wav檔案放入 'record_wav_path_list'

In [7]:
# Collect every .wav file directly under ./record.
# os.path.join keeps the pattern valid on any OS (a hard-coded "\\" only works
# on Windows) and sorting makes the processing order reproducible.
record_wav_path_list: list = sorted(glob(os.path.join(os.getcwd(), "record", "*.wav")))
print(f"type(record_wav_path_list): {type(record_wav_path_list)}")
print(f"len(record_wav_path_list): {len(record_wav_path_list)}")
record_wav_path_list

type(record_wav_path_list): <class 'list'>
len(record_wav_path_list): 3043


['D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_a.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_a2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_a4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_a5.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ai.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ai2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ai3.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ai4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_an.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_an2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_an3.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_an4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ang.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ang2.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ang3.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\record\\F_ang4.wav',
 'D:\\Coding\\SchoolProject\\中文單音CNN分類\\reco

取得 'samplePinyin\\Male' 資料夾底下 1467 種拼音(來源: 教育部)的 wav 檔路徑放入 'sample_list'，再從檔名擷取拼音標籤放入 'sample_pinyin_list'

In [8]:
def pinyin_label_from_path(wav_path: str) -> str:
    """Return the pinyin label encoded in a wav file name.

    The label is the part of the file name (directories and extension removed)
    that follows the first underscore; a name without an underscore is
    returned whole. Working on the base name keeps an underscore in a parent
    directory from corrupting the label.
    """
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    return stem.split("_", 1)[-1]


# Reference recordings: one wav file per pinyin syllable under ./samplePinyin/Male.
sample_list: list = sorted(glob(os.path.join(os.getcwd(), "samplePinyin", "Male", "*.wav")))

# The set of valid labels is derived from the reference file names.
sample_pinyin_list: list = [pinyin_label_from_path(sample_path) for sample_path in sample_list]

print(f"type(sample_pinyin_list): {type(sample_pinyin_list)}")
print(f"len(sample_pinyin_list): {len(sample_pinyin_list)}")
sample_pinyin_list

type(sample_pinyin_list): <class 'list'>
len(sample_pinyin_list): 1467


['a',
 'a2',
 'a4',
 'a5',
 'ai',
 'ai2',
 'ai3',
 'ai4',
 'an',
 'an2',
 'an3',
 'an4',
 'ang',
 'ang2',
 'ang3',
 'ang4',
 'ao',
 'ao2',
 'ao3',
 'ao4',
 'ba',
 'ba2',
 'ba3',
 'ba4',
 'ba5',
 'bai',
 'bai2',
 'bai3',
 'bai4',
 'bai5',
 'ban',
 'ban3',
 'ban4',
 'bang',
 'bang3',
 'bang4',
 'bao',
 'bao2',
 'bao3',
 'bao4',
 'bei',
 'bei3',
 'bei4',
 'bei5',
 'ben',
 'ben3',
 'ben4',
 'beng',
 'beng2',
 'beng3',
 'beng4',
 'bi',
 'bi2',
 'bi3',
 'bi4',
 'bian',
 'bian2',
 'bian3',
 'bian4',
 'biao',
 'biao3',
 'biao4',
 'bie',
 'bie2',
 'bie3',
 'bie4',
 'bin',
 'bin3',
 'bin4',
 'bing',
 'bing3',
 'bing4',
 'bo',
 'bo2',
 'bo3',
 'bo4',
 'bo5',
 'bu',
 'bu2',
 'bu3',
 'bu4',
 'ca',
 'ca3',
 'ca4',
 'cai',
 'cai2',
 'cai3',
 'cai4',
 'can',
 'can2',
 'can3',
 'can4',
 'cang',
 'cang2',
 'cang3',
 'cang4',
 'cao',
 'cao2',
 'cao3',
 'cao4',
 'ce4',
 'cen',
 'cen2',
 'ceng',
 'ceng2',
 'ceng3',
 'ceng4',
 'cha',
 'cha2',
 'cha3',
 'cha4',
 'chai',
 'chai2',
 'chai3',
 'chai4',
 'chan',

mfcc 設定

In [9]:
n_mfcc: int = 13  # number of MFCC coefficients -> rows of each saved matrix
max_pad_len: int = 44  # number of frames -> columns of each saved matrix

# Create the 'mfcc' output folder in the current working directory if missing.
# makedirs(exist_ok=True) avoids the check-then-create race, and os.path.join
# avoids the Windows-only "\\" separator.
mfcc_dir: str = os.path.join(os.getcwd(), "mfcc")
os.makedirs(mfcc_dir, exist_ok=True)

'record'下的wav轉成mfcc放入'mfcc'

In [10]:
# Convert every correctly labelled recording under 'record' into an MFCC .npy
# file stored in the sibling 'mfcc' folder.
valid_labels = set(sample_pinyin_list)  # set gives constant-time membership tests

for wav_path in tqdm(record_wav_path_list):
    # Derive the label from the file name only, so an underscore in a parent
    # directory cannot shift where the label starts.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    label = stem.split("_", 1)[-1]

    # Skip files whose label is not a known pinyin syllable *before* doing the
    # expensive MFCC computation.
    if label not in valid_labels:
        continue

    mfcc = get_mfcc(file_path=wav_path, n_mfcc=n_mfcc, max_pad_len=max_pad_len)

    # Build the output path explicitly instead of str.replace("record", "mfcc"),
    # which would also rewrite "record" inside file or parent-directory names.
    project_dir = os.path.dirname(os.path.dirname(wav_path))
    path = os.path.join(project_dir, "mfcc", f"{stem}.npy")
    np.save(file=path, arr=mfcc)

100%|██████████| 3043/3043 [00:30<00:00, 101.16it/s]


'audio'下的wav轉成mfcc放入'mfcc'

In [11]:
# Convert every correctly labelled clip under 'audio' into an MFCC .npy file
# stored in the sibling 'mfcc' folder.
valid_labels = set(sample_pinyin_list)  # set gives constant-time membership tests

for wav_path in tqdm(wav_path_list):
    # Split the file name (not the full path) at the first underscore so that
    # an underscore in a parent directory cannot corrupt the label.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    prefix, sep, label = stem.partition("_")
    if not sep:
        # No underscore: the whole file name is treated as the label.
        prefix, label = "", stem
    if not label:
        # Nothing after the underscore; there is no tone digit to inspect.
        continue

    # Align tone notation with the reference labels: first tone is written
    # without a digit and the neutral tone with a trailing "5".
    if label[-1] == "1":
        # Drop only the trailing tone digit, not every "1" in the label.
        label = label[:-1]
    elif label[-1] not in ("2", "3", "4", "5"):
        # No tone digit means neutral tone; a label already ending in "5" is
        # left untouched instead of becoming "...55".
        label = f"{label}5"

    # Skip files whose label is not a known pinyin syllable *before* doing the
    # expensive MFCC computation.
    if label not in valid_labels:
        continue

    mfcc = get_mfcc(file_path=wav_path, n_mfcc=n_mfcc, max_pad_len=max_pad_len)

    # Build the output path explicitly instead of str.replace("audio", "mfcc"),
    # which would also rewrite "audio" inside file or parent-directory names.
    project_dir = os.path.dirname(os.path.dirname(wav_path))
    path = os.path.join(project_dir, "mfcc", f"{prefix}{sep}{label}.npy")
    np.save(file=path, arr=mfcc)

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
100%|██████████| 22496/22496 [01:59<00:00, 188.86it/s]
