# 음성 데이터(MFCC) 전처리
- 음성 데이터 용량이 매우 크므로, 정상/에러 그룹별로 256개씩 나누어서 저장

# 필요 모듈 설치

In [None]:
!pip install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 30.0 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


# 필요 모듈 import

In [None]:
# 그 외 기본 라이브러리
import os
import pickle5 as pickle
import numpy as np
import operator
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import time
from typing import List
import IPython
import matplotlib.pyplot as plt

# 경고 제거용
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive; the data paths used in this notebook live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 음성 데이터 및 텍스트 데이터 동시 로드
- 텍스트 데이터를 로드하는 이유
 - 해당 음성 데이터에 에러가 포함되었는지 여부를 확인하기 위해 (전사 텍스트에 '*'가 있으면 에러 데이터로 분류)

## padding function
- 길이가 다른 n개의 음성 데이터가 같은 길이가 되도록 해주는 function

In [None]:
def paddingFunction(array2d, m=20, n=3500):
    """Right-pad a 2-D array with zeros along axis 1 up to ``n`` columns.

    Args:
        array2d: 2-D array whose row count must equal ``m``.
        m: Number of rows of the zero block that is appended.
        n: Target number of columns; must be >= ``array2d.shape[1]``.

    Returns:
        A new array of shape ``(m, n)``: the input followed by zero columns.
    """
    missingCols = n - array2d.shape[1]
    zeroBlock = np.zeros((m, missingCols))
    return np.concatenate((array2d, zeroBlock), axis=1)

## 음성 데이터 로드

In [None]:
# Load the pickled MFCC dictionary (KsponSpeech_01) from Drive.
audioPath1 = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/01 sound(mfcc)/KsponSpeech_01_SpeechData.pickle'
with open(audioPath1, 'rb') as mfccFile:
    mfcc_01_dict = pickle.load(mfccFile)

# Rebuild the dict so its entries are in ascending key order.
mfcc_01_dict = {name: mfcc_01_dict[name] for name in sorted(mfcc_01_dict)}

## 음성 데이터 길이순으로 정렬

In [None]:
# Length of each utterance = size of axis 1 of its MFCC array.
mfcc01_len_dict = {name: feat.shape[1] for name, feat in mfcc_01_dict.items()}

# Order the names from shortest to longest; sorted() is stable, so entries
# with equal length keep their existing relative order.
mfcc01_len_dict = dict(sorted(mfcc01_len_dict.items(), key=lambda pair: pair[1]))

# Rebuild the MFCC dict in that same length order.
newMfcc01Dict = {name: mfcc_01_dict[name] for name in mfcc01_len_dict}

# The key-ordered dict is no longer needed; drop the name to release it.
del mfcc_01_dict

## 텍스트 데이터 로드

In [None]:
# Load the pickled label (transcript) dictionary for KsponSpeech_01.
labelPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/02 text/KsponSpeech_01_labelDict.pickle'
with open(labelPath, 'rb') as labelFile:
    labelDataDict01 = pickle.load(labelFile)

In [None]:
# Spot-check: show the transcript of the 80000th entry (length order).
# The label key is the audio key with its last three characters replaced by 'txt'.
sampleName = list(newMfcc01Dict)[80000]
labelDataDict01[sampleName[:-3] + 'txt']

'o/ 고작 (1시간)/(한 시간)도 안 봐주고 (1시간)/(한 시간)? (1시간)/(한 시간) 보주나? b/\n'

# 256개 단위 저장
- 정상(normal)/에러(error) 각 그룹에서 256개씩 묶어, 묶음 내 최대 길이로 padding한 뒤 저장

In [21]:
# Number of utterances currently buffered in each group's open batch.
normalNum = 0
errorNum = 0

In [22]:
# Running batch index per group; used as the numeric suffix of the saved files.
normalListNum = 0
errorListNum = 0

In [23]:
# Per-group buffers: utterance name -> MFCC array for the batch being filled.
tempNormal256Dict = {}
tempError256Dict = {}

In [24]:
# Per-group buffers: utterance name -> unpadded length for the batch being filled.
tempNormalLen256Dict = {}
tempErrorLen256Dict = {}

In [25]:
# Padded batches are written below this Drive folder, one sub-folder per group.
outputRoot = '/content/drive/MyDrive/Data enhancement code for speech recognition/02 Preprocessed Data/mfcc data/'
batchSize = 256  # utterances per saved batch file


def _padAndSaveBatch(mfccBatch, lenBatch, folder, filePrefix, batchIndex):
    """Pad every MFCC of a batch to the batch's longest length and pickle it.

    Args:
        mfccBatch: dict of utterance name -> 2-D MFCC array with 20 rows.
            Updated in place: each value becomes a zero-padded array of
            shape (1, 20, maxLen).
        lenBatch: dict of the same names -> original (unpadded) length.
        folder: Output sub-folder under ``outputRoot`` ('normal' or 'error').
        filePrefix: File-name stem, e.g. 'mfccNormal01'.
        batchIndex: Batch number appended to both file names.

    Returns:
        True if the batch was written, False if it was empty and skipped.
    """
    if not mfccBatch:
        # max() on an empty dict would raise ValueError; this happens when a
        # group never received any utterance, so there is nothing to save.
        return False

    tempMaxLen = max(lenBatch.values())
    for name in list(mfccBatch):
        padded = paddingFunction(mfccBatch[name], m=20, n=tempMaxLen)
        mfccBatch[name] = padded.reshape(1, 20, tempMaxLen)

    stem = outputRoot + folder + '/' + filePrefix
    # Save the length dict first, then the padded MFCC dict.
    with open(stem + 'Len' + str(batchIndex) + '.pickle', 'wb') as f:
        pickle.dump(lenBatch, f, pickle.HIGHEST_PROTOCOL)
    with open(stem + 'Dict' + str(batchIndex) + '.pickle', 'wb') as f:
        pickle.dump(mfccBatch, f, pickle.HIGHEST_PROTOCOL)
    return True


for tName in tqdm_notebook(list(newMfcc01Dict)):
    # A full batch is flushed at the start of the next iteration, before the
    # current utterance is assigned to a group.
    if normalNum == batchSize:
        _padAndSaveBatch(tempNormal256Dict, tempNormalLen256Dict,
                         'normal', 'mfccNormal01', normalListNum)
        normalNum = 0
        normalListNum += 1
        tempNormal256Dict, tempNormalLen256Dict = {}, {}

    if errorNum == batchSize:
        _padAndSaveBatch(tempError256Dict, tempErrorLen256Dict,
                         'error', 'mfccError01', errorListNum)
        errorNum = 0
        errorListNum += 1
        tempError256Dict, tempErrorLen256Dict = {}, {}

    # The label key is the audio key with its last three characters replaced
    # by 'txt'; a '*' in the transcript puts the utterance in the error group.
    if '*' in labelDataDict01[tName[:-3] + 'txt']:
        errorNum += 1
        tempError256Dict[tName] = newMfcc01Dict[tName]
        tempErrorLen256Dict[tName] = mfcc01_len_dict[tName]
    else:
        normalNum += 1
        tempNormal256Dict[tName] = newMfcc01Dict[tName]
        tempNormalLen256Dict[tName] = mfcc01_len_dict[tName]

# Flush whatever is left in each group. The batch counters are intentionally
# not incremented here, so they keep holding the index of the last file.
_padAndSaveBatch(tempNormal256Dict, tempNormalLen256Dict,
                 'normal', 'mfccNormal01', normalListNum)
_padAndSaveBatch(tempError256Dict, tempErrorLen256Dict,
                 'error', 'mfccError01', errorListNum)

  0%|          | 0/124000 [00:00<?, ?it/s]

In [26]:
# Index of the last batch file written per group (normal, error). Indices
# start at 0 and the final flush does not increment them, so a non-empty
# group has (index + 1) batch files.
normalListNum, errorListNum

(443, 40)