# 필요 라이브러리 설치

In [1]:
!pip3 install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 19.9 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


# 필요 라이브러리 로드

In [2]:
# pytorch 계열 라이브러리
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio

# 정규표현식 사용용도
import re

# 그 외 기본 라이브러리
import os
import pickle5 as pickle
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import random
import time
from typing import List
import IPython
import matplotlib.pyplot as plt

# 경고 제거용
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Mount Google Drive so the '/content/drive/...' paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 텍스트 전처리 함수

## Input Text 전처리 함수

In [4]:
def InputDataPreProcessing(inputText):
    """Clean a raw transcript line into the plain text used as model input.

    Steps, in order:
      1. drop newline characters;
      2. drop noise tags of the form ``<lowercase letter>/`` (``b/``, ``o/``, ...);
      3. drop every remaining ``+``, ``*`` and ``/``;
      4. for dual transcriptions written as ``(a)/(b)`` keep only ``b``;
      5. drop every character that is not whitespace, an ASCII letter or
         digit, a Hangul syllable or ``*``;
      6. collapse whitespace runs into one space and trim both ends.

    Args:
        inputText: raw transcript string.

    Returns:
        The cleaned string. An empty string is returned when nothing survives
        the cleaning (previously this case raised ``IndexError``).
    """
    text = inputText.replace('\n', '')
    text = re.sub(r"[a-z][/]", "", text)
    text = re.sub(r"[+*/]", "", text)

    # Small state machine over the parentheses of a dual transcription:
    #   a_num == 0 -> outside any group, characters are kept
    #   a_num == 1 -> inside the first group "(a)", characters are dropped
    #   a_num == 2 -> first group closed; b_num counts the parentheses of the
    #                 second group, whose content (b_num == 1) is kept
    kept = []
    a_num = 0
    b_num = 0
    for ch in text:
        if a_num == 0:
            if ch != '(':
                kept.append(ch)
            else:
                a_num = 1
        elif a_num == 1:
            if ch == ')':
                a_num = 2
        else:
            if ch == '(' or ch == ')':
                b_num += 1
            elif b_num == 1:
                kept.append(ch)

        # Second group closed: go back to the "outside" state.
        if b_num == 2:
            a_num = 0
            b_num = 0

    cleaned = re.sub(r'[^\sA-Za-z0-9가-힣*]', '', ''.join(kept))

    # Collapse repeated whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # After collapsing, at most one space remains on each side; strip() also
    # copes with an empty string, unlike indexing [0] / [-1].
    return cleaned.strip(' ')

## Label Text 전처리 함수

In [5]:
def LabelDataPreProcessing(inputText):
    """Clean a raw transcript line into the label text, masking starred words.

    Steps, in order:
      1. drop newline characters;
      2. drop noise tags of the form ``<lowercase letter>/`` (``b/``, ``o/``, ...);
      3. drop every ``+``;
      4. for dual transcriptions written as ``(a)/(b)`` keep only ``b``;
      5. replace every token matching ``\\S*[*]`` (non-space characters ending
         in ``*``) by as many ``*`` as the token has characters before its
         final ``*``, e.g. ``근데*`` -> ``**``;
      6. drop every character that is not whitespace, an ASCII letter or
         digit, a Hangul syllable or ``*``;
      7. collapse whitespace runs into one space and trim both ends.

    Args:
        inputText: raw transcript string.

    Returns:
        The cleaned, masked string. An empty string is returned when nothing
        survives the cleaning (previously this case raised ``IndexError``).
    """
    text = inputText.replace('\n', '')
    text = re.sub(r"[a-z][/]", "", text)
    text = re.sub(r"[+]", "", text)

    # Small state machine over the parentheses of a dual transcription:
    #   a_num == 0 -> outside any group, characters are kept
    #   a_num == 1 -> inside the first group "(a)", characters are dropped
    #   a_num == 2 -> first group closed; b_num counts the parentheses of the
    #                 second group, whose content (b_num == 1) is kept
    kept = []
    a_num = 0
    b_num = 0
    for ch in text:
        if a_num == 0:
            if ch != '(':
                kept.append(ch)
            else:
                a_num = 1
        elif a_num == 1:
            if ch == ')':
                a_num = 2
        else:
            if ch == '(' or ch == ')':
                b_num += 1
            elif b_num == 1:
                kept.append(ch)

        # Second group closed: go back to the "outside" state.
        if b_num == 2:
            a_num = 0
            b_num = 0

    # Mask each starred token where it stands. A per-match substitution is
    # used instead of str.replace() so that stars written for one token are
    # never rewritten while handling another one (a bare "*" token used to
    # delete every star in the sentence).
    masked = re.sub(
        r'\S*[*]',
        lambda match: '*' * (len(match.group()) - 1),
        ''.join(kept),
    )

    masked = re.sub(r'[^\sA-Za-z0-9가-힣*]', '', masked)

    # Collapse repeated whitespace into a single space.
    masked = re.sub(r'\s+', ' ', masked)

    # After collapsing, at most one space remains on each side; strip() also
    # copes with an empty string, unlike indexing [0] / [-1].
    return masked.strip(' ')

## Label string to int codes: 1 (space), 2 (other character), 3 ('*' mask); 0 is used for padding

In [6]:
def LabelToInt(labelText):
    """Encode a label string character by character as integer codes.

    ``'*'`` becomes 3, a space becomes 1 and every other character becomes 2.

    Args:
        labelText: label string (any iterable of characters works).

    Returns:
        A NumPy array holding one code per character.
    """
    codes = [
        3 if ch == '*' else (1 if ch == ' ' else 2)
        for ch in list(labelText)
    ]
    return np.array(codes)

## TextToVector Class

In [7]:
class TextToVector():
    """Character vocabulary plus helpers that stretch text to MFCC length.

    Each text is stretched so that it has one character per MFCC frame and is
    then turned into vocabulary ids (``TextToId``) or one-hot rows
    (``TextToVector``).
    """

    def __init__(self, vocab = None, textList = None):
        """Create the converter.

        Args:
            vocab: optional existing ``{char: id}`` dict; it is used (and
                extended) in place.
            textList: optional texts whose characters are added to the vocab.
        """
        # Identity checks are required here: `array != None` on a NumPy array
        # is evaluated element-wise and cannot be used as a condition.
        # self.vocab maps each character to its integer id.
        self.vocab = {} if vocab is None else vocab

        if textList is not None:
            self.vocab = self.madeVocab(textList, self.vocab)

    def IncreaseString(self, simpleString, mfccLen):
        """Stretch ``simpleString`` to exactly ``mfccLen`` characters.

        Every character is repeated ``mfccLen // len(simpleString)`` times;
        the remaining ``mfccLen % len(simpleString)`` repeats are handed out
        to randomly drawn positions (``np.random.randint``, so the same
        position may be drawn more than once).

        Args:
            simpleString: text to stretch.
            mfccLen: target length.

        Returns:
            The stretched string; ``''`` when ``simpleString`` is empty
            (previously a ``ZeroDivisionError``).
        """
        simpleCharList = list(simpleString)
        lenSimpleChar = len(simpleCharList)
        if lenSimpleChar == 0:
            return ''

        increaseRate, randomIndexNum = mfccLen // lenSimpleChar, mfccLen % lenSimpleChar
        increaseRates = np.full((lenSimpleChar), increaseRate)

        for i in np.random.randint(0, lenSimpleChar, size = randomIndexNum):
            increaseRates[i] += 1

        return ''.join(
            char * int(rate) for char, rate in zip(simpleCharList, increaseRates)
        )

    def madeVocab(self, strList, vocab = None):
        """Extend the vocabulary with the characters found in ``strList``.

        Id 0 is reserved for ``"'"``. Characters not yet in the vocabulary
        receive the next free ids in order of decreasing frequency; ties keep
        first-seen order.

        Args:
            strList: iterable of strings (or of iterables of strings).
            vocab: optional dict that replaces the current vocabulary first.

        Returns:
            The updated vocabulary dict (the same object as ``self.vocab``).
        """
        if vocab is not None:
            self.vocab = vocab
        self.vocab.setdefault("'", 0)

        # Count only unseen characters; the dict lookup is O(1), whereas the
        # former list-of-keys membership test was O(len(vocab)) per character.
        vocabFrequency = {}
        for tempText in strList:
            for tempStr in tempText:
                for tempChar in tempStr:
                    if tempChar not in self.vocab:
                        vocabFrequency[tempChar] = vocabFrequency.get(tempChar, 0) + 1

        # sorted() is stable, so equally frequent characters keep their order.
        ranked = sorted(vocabFrequency.items(), reverse = True, key = lambda item: item[1])
        for char, _count in ranked:
            self.vocab.setdefault(char, len(self.vocab))

        return self.vocab

    def TextToId(self, TextList, mfccLens, padLn=None):
        """Convert texts to zero-padded sequences of vocabulary ids.

        Args:
            TextList: texts to convert.
            mfccLens: per-text target length passed to ``IncreaseString``.
            padLn: total length of every output row; ``None`` means "no
                padding" (each row keeps its stretched length). This default
                used to raise ``TypeError``.

        Returns:
            ``np.array`` of the rows (float values, because the padding is
            built with ``np.zeros``).

        Raises:
            KeyError: if a character is missing from the vocabulary.
        """
        idList = []

        for index, tempText in enumerate(TextList):
            strT = self.IncreaseString(tempText, mfccLens[index])
            padLen = len(strT) if padLn is None else padLn

            tempIdList = [self.vocab[s] for s in strT]
            padding = np.zeros(padLen - len(tempIdList))
            idList.append(np.concatenate((tempIdList, padding), axis=0))

        return np.array(idList)

    def TextToVector(self, TextList, mfccLens, padLn=None, overlapPercent=0):
        """Convert texts to one-hot matrices with optional neighbour overlap.

        Args:
            TextList: texts to convert.
            mfccLens: per-text target length passed to ``IncreaseString``.
            padLn: number of rows of every matrix; ``None`` uses the stretched
                text length.
            overlapPercent: fraction of the per-character repeat count by
                which each character is additionally marked on the rows
                before and after its own row.

        Returns:
            ``np.array`` of matrices with ``padLn`` rows; the column count is
            the vocabulary size rounded up to an even number.

        Raises:
            KeyError: if a character is missing from the vocabulary.
        """
        vocabSize = len(self.vocab)
        width = vocabSize if vocabSize % 2 == 0 else vocabSize + 1

        textVector = []
        for index, strT in enumerate(TextList):
            lenSimpleChar = len(strT)
            # An empty text has no repeat rate; avoid dividing by zero.
            increaseRate = mfccLens[index] // lenSimpleChar if lenSimpleChar else 0
            overlapNum = int(increaseRate * overlapPercent)
            strT = self.IncreaseString(strT, mfccLens[index])

            padLen = len(strT) if padLn is None else padLn
            oneHotVector = np.zeros((padLen, width))

            for i, s in enumerate(strT):
                column = self.vocab[s]
                oneHotVector[i][column] = 1
                if overlapNum + i < len(strT):
                    oneHotVector[overlapNum + i][column] = 1
                if i - overlapNum >= 0:
                    oneHotVector[i - overlapNum][column] = 1

            textVector.append(oneHotVector)

        return np.array(textVector)

## label padding 함수

In [8]:
def intPadding(output, padLen = 512):
    """Right-pad a 1-D sequence with zeros up to ``padLen`` entries.

    Args:
        output: sequence of label codes.
        padLen: total length of the result.

    Returns:
        The concatenation of ``output`` and the zero padding (the padding
        comes from ``np.zeros``, i.e. it is floating point).
    """
    missing = padLen - len(output)
    return np.concatenate([output, np.zeros(missing)])

## 전처리 예시

In [9]:
# Walk one raw transcript line through every preprocessing helper.
sample = 'o/ 근데* 키가 크다니까? 백팔십팔? 그리고 몸\n'

print(sample)
# Cleaned model input, then the label text with the starred word masked.
print(InputDataPreProcessing(sample))
print(LabelDataPreProcessing(sample))
# Label codes, first raw and then zero-padded to intPadding's default length.
print(LabelToInt(LabelDataPreProcessing(sample)))
print(intPadding(LabelToInt(LabelDataPreProcessing(sample))))
# Build a vocabulary from the single label text and convert it to ids
# stretched to 1240 entries and zero-padded to 3500.
textToVector = TextToVector()
print(textToVector.madeVocab([LabelDataPreProcessing(sample)]))
print(textToVector.TextToId([LabelDataPreProcessing(sample), LabelDataPreProcessing(sample)], [1240, 1240], padLn=3500))
print(textToVector.TextToId([LabelDataPreProcessing(sample), LabelDataPreProcessing(sample)], [1240, 1240], padLn=3500)[0])

o/ 근데* 키가 크다니까? 백팔십팔? 그리고 몸

근데 키가 크다니까 백팔십팔 그리고 몸
** 키가 크다니까 백팔십팔 그리고 몸
[3 3 1 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2]
[3. 3. 1. 2. 2. 1. 2. 2. 2. 2. 1. 2. 2. 2. 2. 1. 2. 2. 2. 1. 2. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 

# Text Data Load

In [11]:
# Load four pickled transcript dicts (files 01, 03, 04, 05) and preprocess
# every entry into two dicts that share the same keys.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that come from a trusted source.
labelPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/02 text/KsponSpeech_01_labelDict.pickle'
with open(labelPath, 'rb') as f:
    labelDataDict01 = pickle.load(f)

# Input Data
inputTextDict1 = {}

# Label Data
labelTextDict1 = {}

# setdefault keeps the first value stored for a key, so a key that shows up
# again in a later file does not overwrite the earlier entry.
for key in tqdm_notebook(list(labelDataDict01)):
    inputTextDict1.setdefault(key, InputDataPreProcessing(labelDataDict01[key]))
    labelTextDict1.setdefault(key, LabelDataPreProcessing(labelDataDict01[key]))

# The two arrays are rebuilt after every file; the values after the last
# file cover all entries accumulated so far.
inputTextList = np.array(list(inputTextDict1.values()))
labelTextList = np.array(list(labelTextDict1.values()))

# File 03 (named '..._labelData.pickle', unlike the other '..._labelDict.pickle' files).
labelPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/02 text/KsponSpeech_03_labelData.pickle'
with open(labelPath, 'rb') as f:
    labelDataDict03 = pickle.load(f)

for key in tqdm_notebook(list(labelDataDict03)):
    inputTextDict1.setdefault(key, InputDataPreProcessing(labelDataDict03[key]))
    labelTextDict1.setdefault(key, LabelDataPreProcessing(labelDataDict03[key]))

inputTextList = np.array(list(inputTextDict1.values()))
labelTextList = np.array(list(labelTextDict1.values()))

# File 04.
labelPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/02 text/KsponSpeech_04_labelDict.pickle'
with open(labelPath, 'rb') as f:
    labelDataDict04 = pickle.load(f)

for key in tqdm_notebook(list(labelDataDict04)):
    inputTextDict1.setdefault(key, InputDataPreProcessing(labelDataDict04[key]))
    labelTextDict1.setdefault(key, LabelDataPreProcessing(labelDataDict04[key]))

inputTextList = np.array(list(inputTextDict1.values()))
labelTextList = np.array(list(labelTextDict1.values()))

# File 05.
labelPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/02 text/KsponSpeech_05_labelDict.pickle'
with open(labelPath, 'rb') as f:
    labelDataDict05 = pickle.load(f)

for key in tqdm_notebook(list(labelDataDict05)):
    inputTextDict1.setdefault(key, InputDataPreProcessing(labelDataDict05[key]))
    labelTextDict1.setdefault(key, LabelDataPreProcessing(labelDataDict05[key]))

inputTextList = np.array(list(inputTextDict1.values()))
labelTextList = np.array(list(labelTextDict1.values()))

  0%|          | 0/124000 [00:00<?, ?it/s]

  0%|          | 0/124000 [00:00<?, ?it/s]

  0%|          | 0/124000 [00:00<?, ?it/s]

  0%|          | 0/126545 [00:00<?, ?it/s]

In [12]:
len(inputTextList)

498545

In [13]:
# Build the character vocabulary from every preprocessed input text and
# print its size.
textToVector = TextToVector()
vocab = textToVector.madeVocab(inputTextList)
print(len(vocab))

2258


In [14]:
LabelToInt(labelTextList[14]), intPadding(LabelToInt(labelTextList[14]))

(array([2, 2, 1, 2, 2, 2, 1, 2, 1, 3, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2,
        2, 2, 2, 1, 2, 2, 1, 3, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2,
        2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2]),
 array([2., 2., 1., 2., 2., 2., 1., 2., 1., 3., 1., 2., 2., 2., 1., 2., 2.,
        1., 2., 2., 1., 2., 2., 2., 2., 1., 2., 2., 1., 3., 1., 2., 1., 2.,
        1., 2., 2., 1., 2., 2., 1., 2., 2., 2., 2., 1., 2., 2., 1., 2., 2.,
        2., 1., 2., 1., 2., 1., 2., 2., 1., 2., 1., 2., 2., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.

In [15]:
# Shape check: one text stretched to 1240 ids and zero-padded to 3500.
print(textToVector.TextToId([inputTextList[14]], [1240], padLn=3500).shape)

# 20-dimensional embedding with one row per vocabulary entry.
# NOTE(review): TextToId pads with id 0, while padding_idx=1 is the id of
# ' ' in the vocabulary printed at the end of this notebook — confirm that
# 1 (and not 0) is the intended padding index.
embedding_layer = nn.Embedding(num_embeddings=len(textToVector.vocab), 
                               embedding_dim=20,
                               padding_idx=1)


# Embed the id sequence (cast to long first) and print the per-sample shape.
print(embedding_layer(torch.tensor(textToVector.TextToId([inputTextList[14]], [1240], padLn=3500)).long())[0].shape)

(1, 3500)
torch.Size([3500, 20])


In [16]:
# Embed a batch of two id sequences (stretched to 1240 and 1200 entries,
# both padded to 3500).
et = embedding_layer(torch.tensor(textToVector.TextToId([inputTextList[14], inputTextList[14]], [1240, 1200], padLn=3500)).long())

# Insert a singleton axis at position 1: (batch, 1, length, embedding_dim).
et = et.reshape(et.size()[0], 1, et.size()[1], et.size()[2])
# Displays the shape with the last two axes swapped; the transposed tensor
# itself is not stored.
et.transpose(2, 3).shape

torch.Size([2, 1, 20, 3500])

In [17]:
# Number of label codes per label text.
lenLabelList = [len(LabelToInt(i)) for i in labelTextList]

# Longest label; intPadding needs padLen to be at least this value.
np.max(lenLabelList)

316

# Dataset 프레임워크

In [18]:
class sPED_Dataset(Dataset):
    """Dataset bundling MFCC features, text ids and padded label codes.

    Item ``i`` is the tuple
    ``(mfcc[i], mfccLens[i], textVector[i], labels[i], lenLabels[i])``.
    """

    def __init__(
        self, mfcc, inputText,labels, mfccLens, textToVector
    ):
        """Precompute label codes and text ids for every sample.

        Args:
            mfcc: indexable feature array with at least four axes; the size
                of axis 3 is used as the padded length of the text ids.
            inputText: input texts, one per sample.
            labels: label texts, one per sample.
            mfccLens: per-sample lengths handed to ``TextToId``.
            textToVector: object providing ``TextToId``.
        """
        self.mfcc = mfcc
        self.mfccLens = mfccLens
        self.realLens = self.mfcc.shape[3]

        encoded = [LabelToInt(text) for text in labels]
        unpaddedSizes = [len(codes) for codes in encoded]
        # Labels are zero-padded to intPadding's default length.
        self.labels = [intPadding(codes) for codes in encoded]
        self.lenLabels = unpaddedSizes

        self.textVector = textToVector.TextToId(
            inputText, mfccLens, padLn=self.realLens
        )

    def __getitem__(self, i):
        """Return the five-field tuple describing sample ``i``."""
        return (
            self.mfcc[i],
            self.mfccLens[i],
            self.textVector[i],
            self.labels[i],
            self.lenLabels[i],
        )

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# 음성 데이터 파일명 리스트 로드

In [19]:
# Collect the "normal" MFCC pickles: files whose name contains 'Len' go to
# the length list, everything else to the MFCC list. Both lists hold full
# paths in sorted order.
# NOTE(review): later cells pair the two lists by position, which presumably
# relies on both sorted lists lining up file by file — verify.
normalFilePath = '/content/drive/MyDrive/Data enhancement code for speech recognition/02 Preprocessed Data/mfcc data/normal'
normalFileList = os.listdir(normalFilePath)

normalLenPickleList = []
normalMfccPickleList = []

for fName in normalFileList:
    (normalLenPickleList if 'Len' in fName else normalMfccPickleList).append(normalFilePath + '/' + fName)

normalLenPickleList.sort()
normalMfccPickleList.sort()

In [20]:
# Collect the "error" MFCC pickles: files whose name contains 'Len' go to
# the length list, everything else to the MFCC list. Both lists hold full
# paths in sorted order.
# NOTE(review): later cells pair the two lists by position, which presumably
# relies on both sorted lists lining up file by file — verify.
errorFilePath = '/content/drive/MyDrive/Data enhancement code for speech recognition/02 Preprocessed Data/mfcc data/error'
errorFileList = os.listdir(errorFilePath)

errorLenPickleList = []
errorMfccPickleList = []

for fName in errorFileList:
    (errorLenPickleList if 'Len' in fName else errorMfccPickleList).append(errorFilePath + '/' + fName)

errorLenPickleList.sort()
errorMfccPickleList.sort()

In [21]:
len(normalMfccPickleList), len(errorMfccPickleList)

(1781, 169)

# Train, valid, test split
- 7 : 1 : 2

In [22]:
# Split sizes counted in pickle files: 70% train, 10% validation (both
# truncated by int()), and whatever remains goes to test.
normalTrainNum = int(len(normalMfccPickleList) * 0.7)
normalValidNum = int(len(normalMfccPickleList) * 0.1)
normalTestNum = len(normalMfccPickleList) - normalTrainNum - normalValidNum

errorTrainNum = int(len(errorMfccPickleList) * 0.7)
errorValidNum = int(len(errorMfccPickleList) * 0.1)
errorTestNum = len(errorMfccPickleList) - errorTrainNum - errorValidNum

print(normalTrainNum, normalValidNum, normalTestNum)
print(errorTrainNum, errorValidNum, errorTestNum)

1246 178 357
118 16 35


In [23]:
# Index arrays over the pickle lists, shuffled in place.
# NOTE(review): no random seed is set in the visible code, so the split
# differs from run to run — seed `random` if it must be reproducible.
normalIndex = np.arange(len(normalMfccPickleList))
errorIndex = np.arange(len(errorMfccPickleList))

random.shuffle(normalIndex)
random.shuffle(errorIndex)

In [24]:
# Cut the shuffled indices into consecutive train / validation / test slices.
trainNormalIndex = normalIndex[:normalTrainNum]
validNormalIndex = normalIndex[normalTrainNum:normalTrainNum+normalValidNum]
testNormalIndex = normalIndex[normalTrainNum+normalValidNum:]

trainErrorIndex = errorIndex[:errorTrainNum]
validErrorIndex = errorIndex[errorTrainNum:errorTrainNum+errorValidNum]
testErrorIndex = errorIndex[errorTrainNum+errorValidNum:]

In [25]:
# Sanity check: the slice lengths should match the split sizes computed above.
print(len(trainNormalIndex), len(validNormalIndex), len(testNormalIndex))
print(len(trainErrorIndex), len(validErrorIndex), len(testErrorIndex))

1246 178 357
118 16 35


# Train dataloader 생성

## normal

In [33]:
# Write one pickled DataLoader per "normal" pickle file of the train split.
# NOTE(review): assumes the i-th MFCC pickle and the i-th Len pickle describe
# the same utterances in the same order — verify.
new_i = 0  # running number used in the output file name
for i in tqdm_notebook(trainNormalIndex):
    # mfcc load
    with open(normalMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # len load
    with open(normalLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # tempInputText: the text key is the MFCC key with its last three
    # characters replaced by 'txt'
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    # tempLabelText
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    trainset = sPED_Dataset(tempMfcc, tempInputText, tempLabelText, tempLen, textToVector)
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/03 Dataloader/train/normal/trainDataloader' + str(new_i) +'.pickle'
    new_i += 1
    train_dataloader = torch.utils.data.DataLoader(trainset, batch_size=16, num_workers=2, shuffle = True)
    # The DataLoader object (including its dataset) is pickled as a whole.
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(train_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/1246 [00:00<?, ?it/s]

## error

In [28]:
# Write one pickled DataLoader per "error" pickle file of the train split.
# NOTE(review): assumes the i-th MFCC pickle and the i-th Len pickle describe
# the same utterances in the same order — verify.
new_i = 0  # running number used in the output file name
for i in tqdm_notebook(trainErrorIndex):
    # mfcc load
    with open(errorMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # len load
    with open(errorLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # tempInputText: the text key is the MFCC key with its last three
    # characters replaced by 'txt'
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    # tempLabelText
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    trainset = sPED_Dataset(tempMfcc, tempInputText, tempLabelText, tempLen, textToVector)
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/03 Dataloader/train/error/trainDataloader' + str(new_i) +'.pickle'
    new_i += 1
    train_dataloader = torch.utils.data.DataLoader(trainset, batch_size=16, num_workers=2, shuffle = True)
    # The DataLoader object (including its dataset) is pickled as a whole.
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(train_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/118 [00:00<?, ?it/s]

# Valid dataloader 생성

## normal

In [34]:
# Write one pickled DataLoader per "normal" pickle file of the validation split.
# NOTE(review): assumes the i-th MFCC pickle and the i-th Len pickle describe
# the same utterances in the same order — verify.
new_i = 0  # running number used in the output file name
for i in tqdm_notebook(validNormalIndex):
    # mfcc load
    with open(normalMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # len load
    with open(normalLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # tempInputText: the text key is the MFCC key with its last three
    # characters replaced by 'txt'
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    # tempLabelText
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    validset = sPED_Dataset(tempMfcc, tempInputText, tempLabelText, tempLen, textToVector)
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/03 Dataloader/validation/normal/validDataloader' + str(new_i) +'.pickle'
    new_i += 1
    valid_dataloader = torch.utils.data.DataLoader(validset, batch_size=16, num_workers=2, shuffle = True)
    # The DataLoader object (including its dataset) is pickled as a whole.
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(valid_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/178 [00:00<?, ?it/s]

## error

In [29]:
# Write one pickled DataLoader per "error" pickle file of the validation split.
# NOTE(review): assumes the i-th MFCC pickle and the i-th Len pickle describe
# the same utterances in the same order — verify.
new_i = 0  # running number used in the output file name
for i in tqdm_notebook(validErrorIndex):
    # mfcc load
    with open(errorMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # len load
    with open(errorLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # tempInputText: the text key is the MFCC key with its last three
    # characters replaced by 'txt'
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    # tempLabelText
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    validset = sPED_Dataset(tempMfcc, tempInputText, tempLabelText, tempLen, textToVector)
    # NOTE(review): files in validation/error are named 'trainDataloader<N>',
    # unlike validation/normal ('validDataloader<N>') — confirm what the
    # consumers of these files expect before renaming.
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/03 Dataloader/validation/error/trainDataloader' + str(new_i) +'.pickle'
    new_i += 1
    valid_dataloader = torch.utils.data.DataLoader(validset, batch_size=16, num_workers=2, shuffle = True)
    # The DataLoader object (including its dataset) is pickled as a whole.
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(valid_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/16 [00:00<?, ?it/s]

# Test dataloader 생성

## normal

In [35]:
# Write one pickled DataLoader per "normal" pickle file of the test split.
# NOTE(review): assumes the i-th MFCC pickle and the i-th Len pickle describe
# the same utterances in the same order — verify.
new_i = 0  # running number used in the output file name
for i in tqdm_notebook(testNormalIndex):
    # mfcc load
    with open(normalMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # len load
    with open(normalLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # tempInputText: the text key is the MFCC key with its last three
    # characters replaced by 'txt'
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    # tempLabelText
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    testset = sPED_Dataset(tempMfcc, tempInputText, tempLabelText, tempLen, textToVector)
    # NOTE(review): files in test/normal are named 'validDataloader<N>' —
    # confirm what the consumers of these files expect before renaming.
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/03 Dataloader/test/normal/validDataloader' + str(new_i) +'.pickle'
    new_i += 1
    test_dataloader = torch.utils.data.DataLoader(testset, batch_size=16, num_workers=2, shuffle = True)
    # The DataLoader object (including its dataset) is pickled as a whole.
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(test_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/357 [00:00<?, ?it/s]

## error

In [30]:
# Write one pickled DataLoader per "error" pickle file of the test split.
# NOTE(review): assumes the i-th MFCC pickle and the i-th Len pickle describe
# the same utterances in the same order — verify.
new_i = 0  # running number used in the output file name
for i in tqdm_notebook(testErrorIndex):
    # mfcc load
    with open(errorMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # len load
    with open(errorLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # tempInputText: the text key is the MFCC key with its last three
    # characters replaced by 'txt'
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    # tempLabelText
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'txt'] for fName in tempMfccDict])
    testset = sPED_Dataset(tempMfcc, tempInputText, tempLabelText, tempLen, textToVector)
    # NOTE(review): files in test/error are named 'trainDataloader<N>' —
    # confirm what the consumers of these files expect before renaming.
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/03 Dataloader/test/error/trainDataloader' + str(new_i) +'.pickle'
    new_i += 1
    test_dataloader = torch.utils.data.DataLoader(testset, batch_size=16, num_workers=2, shuffle = True)
    # The DataLoader object (including its dataset) is pickled as a whole.
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(test_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/35 [00:00<?, ?it/s]

In [31]:
# Persist the TextToVector object (vocabulary included), then read it back
# from the same file.
textToVectorPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/03 textToVector/textToVector.p'
with open(textToVectorPath, 'wb') as f:
    pickle.dump(textToVector, f, pickle.HIGHEST_PROTOCOL)

# NOTE(review): unpickling needs the TextToVector class to be defined in the
# loading session, and pickle files must come from a trusted source.
with open(textToVectorPath, 'rb') as f:
    textToVector = pickle.load(f)

In [32]:
textToVector.vocab

{"'": 0,
 ' ': 1,
 '그': 2,
 '이': 3,
 '는': 4,
 '아': 5,
 '가': 6,
 '고': 7,
 '어': 8,
 '거': 9,
 '지': 10,
 '데': 11,
 '나': 12,
 '하': 13,
 '다': 14,
 '서': 15,
 '에': 16,
 '도': 17,
 '게': 18,
 '니': 19,
 '기': 20,
 '은': 21,
 '면': 22,
 '야': 23,
 '있': 24,
 '한': 25,
 '을': 26,
 '까': 27,
 '해': 28,
 '리': 29,
 '라': 30,
 '래': 31,
 '사': 32,
 '근': 33,
 '들': 34,
 '안': 35,
 '로': 36,
 '일': 37,
 '뭐': 38,
 '내': 39,
 '보': 40,
 '제': 41,
 '같': 42,
 '자': 43,
 '만': 44,
 '시': 45,
 '런': 46,
 '너': 47,
 '대': 48,
 '때': 49,
 '되': 50,
 '으': 51,
 '진': 52,
 '를': 53,
 '잖': 54,
 '오': 55,
 '러': 56,
 '막': 57,
 '인': 58,
 '무': 59,
 '었': 60,
 '구': 61,
 '했': 62,
 '수': 63,
 '간': 64,
 '애': 65,
 '우': 66,
 '요': 67,
 '마': 68,
 '생': 69,
 '렇': 70,
 '냥': 71,
 '짜': 72,
 '주': 73,
 '없': 74,
 '말': 75,
 '학': 76,
 '스': 77,
 '더': 78,
 '많': 79,
 '원': 80,
 '음': 81,
 '정': 82,
 '겠': 83,
 '먹': 84,
 '여': 85,
 '금': 86,
 '든': 87,
 '부': 88,
 '할': 89,
 '번': 90,
 '전': 91,
 '좋': 92,
 '랑': 93,
 '네': 94,
 '람': 95,
 '약': 96,
 '건': 97,
 '각': 98,
 '좀': 99,
 '알': 100,