# 필요 라이브러리 설치

In [None]:
!pip3 install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 17.0 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


# 필요 라이브러리 로드

In [None]:
# pytorch 계열 라이브러리
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio

# 정규표현식 사용용도
import re

# 그 외 기본 라이브러리
import os
import pickle5 as pickle
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import random
import time
from typing import List
import IPython
import matplotlib.pyplot as plt

# 경고 제거용
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the data paths used in the cells below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 텍스트 전처리 함수

## Input Text 전처리 함수

In [None]:
def InputDataPreProcessing(inputText):
    """Clean one raw transcript line into plain text for the model input.

    Steps, in order:
      1. Drop newline characters.
      2. Remove noise tags: one lowercase ASCII letter followed by ``/``
         (``b/``, ``o/``, ``n/``, ``l/`` ...).
      3. Remove every remaining ``+``, ``*`` and ``/``.
      4. For dual transcriptions written as ``(a)/(b)`` keep only ``b``.
      5. Remove every character that is not whitespace, an ASCII letter, a
         digit, a Hangul syllable or ``*``.
      6. Collapse whitespace runs into one space and trim both ends.

    Args:
        inputText: Raw transcript string.

    Returns:
        The cleaned string. An empty string is returned when nothing survives
        the cleaning (indexing the empty result used to raise IndexError).
    """
    text = inputText.replace('\n', '')
    text = re.sub(r"[a-z]/", "", text)
    text = re.sub(r"[+*/]", "", text)

    # Four-state scanner for the "(a)/(b)" notation:
    #   OUTSIDE   - copy characters until a '(' opens the first group
    #   IN_FIRST  - drop everything up to the ')' closing the first group
    #   BETWEEN   - drop everything until the next parenthesis
    #   IN_SECOND - copy characters until the next parenthesis
    OUTSIDE, IN_FIRST, BETWEEN, IN_SECOND = range(4)
    state = OUTSIDE
    kept = []
    for ch in text:
        if state == OUTSIDE:
            if ch == '(':
                state = IN_FIRST
            else:
                kept.append(ch)
        elif state == IN_FIRST:
            if ch == ')':
                state = BETWEEN
        elif state == BETWEEN:
            if ch in '()':
                state = IN_SECOND
        else:  # IN_SECOND
            if ch in '()':
                state = OUTSIDE
            else:
                kept.append(ch)

    text = re.sub(r'[^\sA-Za-z0-9가-힣*]', '', ''.join(kept))

    # After collapsing, at most one space can remain at either end.
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

## Label Text 전처리 함수

In [None]:
def LabelDataPreProcessing(inputText):
    """Clean one raw transcript line into label text with '*'-masked words.

    Steps, in order:
      1. Drop newline characters.
      2. Remove noise tags: one lowercase ASCII letter followed by ``/``
         (``b/``, ``o/``, ``n/``, ``l/`` ...).
      3. Remove every ``+``.
      4. For dual transcriptions written as ``(a)/(b)`` keep only ``b``.
      5. Replace every run of non-space characters that ends in ``*`` with
         asterisks, one per character that preceded the final ``*``
         (``근데*`` becomes ``**``).
      6. Remove every character that is not whitespace, an ASCII letter, a
         digit, a Hangul syllable or ``*``.
      7. Collapse whitespace runs into one space and trim both ends.

    Args:
        inputText: Raw transcript string.

    Returns:
        The cleaned, masked string. An empty string is returned when nothing
        survives the cleaning (indexing the empty result used to raise
        IndexError).
    """
    text = inputText.replace('\n', '')
    text = re.sub(r"[a-z]/", "", text)
    text = re.sub(r"[+]", "", text)

    # Four-state scanner for the "(a)/(b)" notation:
    #   OUTSIDE   - copy characters until a '(' opens the first group
    #   IN_FIRST  - drop everything up to the ')' closing the first group
    #   BETWEEN   - drop everything (including the '/') until the next parenthesis
    #   IN_SECOND - copy characters until the next parenthesis
    OUTSIDE, IN_FIRST, BETWEEN, IN_SECOND = range(4)
    state = OUTSIDE
    kept = []
    for ch in text:
        if state == OUTSIDE:
            if ch == '(':
                state = IN_FIRST
            else:
                kept.append(ch)
        elif state == IN_FIRST:
            if ch == ')':
                state = BETWEEN
        elif state == BETWEEN:
            if ch in '()':
                state = IN_SECOND
        else:  # IN_SECOND
            if ch in '()':
                state = OUTSIDE
            else:
                kept.append(ch)
    text = ''.join(kept)

    # Mask each starred token where it stands. A global str.replace per token
    # would let an asterisk-only token ("*", "**") rewrite masks that were
    # already produced for other words.
    text = re.sub(r'\S*\*', lambda match: '*' * (len(match.group()) - 1), text)

    text = re.sub(r'[^\sA-Za-z0-9가-힣*]', '', text)

    # After collapsing, at most one space can remain at either end.
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

## Label string to integer classes: 1 (space), 2 (normal character), 3 (error, '*')

In [None]:
def LabelToInt(labelText):
    """Convert a label string into one integer class per character.

    Mapping: ``'*'`` -> 3, ``' '`` -> 1, any other character -> 2.

    Args:
        labelText: Label string, e.g. the output of the label preprocessing.

    Returns:
        NumPy array holding one class id per character of ``labelText``.
    """
    classByChar = {'*': 3, ' ': 1}
    return np.array([classByChar.get(ch, 2) for ch in labelText])

## TextToVector Class

In [None]:
class TextToVector():
    """Character-level encoder that stretches a text to a given frame count.

    ``vocab`` maps a character to an integer id. ``madeVocab`` always reserves
    id 0 for the apostrophe and numbers new characters by descending
    frequency.

    NOTE(review): ``TextToId`` pads with 0 (also the apostrophe id) and maps
    unknown characters to 1 (also the id of the first character added to an
    otherwise empty vocab) - confirm these collisions are intended.
    """

    def __init__(self, vocab = None, textList = None):
        """Create the encoder.

        Args:
            vocab: Optional existing character -> id dict. It is used as is
                and extended in place, not copied.
            textList: Optional texts from which to extend the vocab.
        """
        # Identity checks on purpose: ``textList != None`` is evaluated
        # element-wise for a NumPy array, which makes the ``if`` raise for
        # arrays holding more than one text.
        self.vocab = {} if vocab is None else vocab

        if textList is not None:
            self.vocab = self.madeVocab(textList, self.vocab)

    def IncreaseString(self, simpleString, mfccLen):
        """Repeat the characters of ``simpleString`` up to ``mfccLen`` characters.

        Every character is repeated ``mfccLen // len(simpleString)`` times; the
        remaining ``mfccLen % len(simpleString)`` repeats are handed out to
        randomly drawn positions (``np.random.randint``, so a position can be
        drawn more than once).

        Args:
            simpleString: Text to stretch; must not be empty.
            mfccLen: Target length (integer).

        Returns:
            The stretched string.

        Raises:
            ValueError: If ``simpleString`` is empty.
        """
        chars = list(simpleString)
        charCount = len(chars)
        if charCount == 0:
            raise ValueError('cannot stretch an empty string to mfccLen=%s' % (mfccLen,))

        baseRepeat, extraCount = divmod(mfccLen, charCount)
        repeats = np.full(charCount, baseRepeat)
        for position in np.random.randint(0, charCount, size=extraCount):
            repeats[position] += 1

        return ''.join(ch * int(count) for ch, count in zip(chars, repeats))

    def madeVocab(self, strList, vocab = None):
        """Extend the vocab with the characters found in ``strList``.

        Args:
            strList: Iterable of texts. Iteration is nested three levels deep,
                so both a list of strings and a list of lists of strings work.
            vocab: Optional dict that replaces ``self.vocab`` before extending.

        Returns:
            ``self.vocab`` with the apostrophe at id 0 (unless already present)
            and every new character appended in descending frequency order;
            ties keep first-seen order.
        """
        if vocab is not None:
            self.vocab = vocab
        self.vocab.setdefault("'", 0)

        # Only characters missing from the vocab at this point are counted.
        knownChars = set(self.vocab)
        frequency = {}
        for text in strList:
            for token in text:
                for ch in token:
                    if ch not in knownChars:
                        frequency[ch] = frequency.get(ch, 0) + 1

        for ch, _ in sorted(frequency.items(), key=lambda item: item[1], reverse=True):
            self.vocab.setdefault(ch, len(self.vocab))

        return self.vocab

    def TextToId(self, TextList, mfccLens, padLn=None):
        """Encode texts as zero-padded id sequences.

        Args:
            TextList: Sequence of texts.
            mfccLens: Per-text target length passed to ``IncreaseString``.
            padLn: Total row length; ``None`` means "no padding", i.e. each
                row keeps its stretched length.

        Returns:
            ``np.array`` of the rows; each row is a float array holding the
            ids (1 for characters missing from the vocab) followed by zeros.
        """
        rows = []

        for index, text in enumerate(TextList):
            stretched = self.IncreaseString(text, mfccLens[index])
            # Fall back to the stretched length so that padLn=None works
            # (the padding size used to be computed from padLn directly).
            padLen = len(stretched) if padLn is None else padLn

            ids = [self.vocab.get(ch, 1) for ch in stretched]
            rows.append(np.concatenate((ids, np.zeros(padLen - len(ids))), axis=0))

        return np.array(rows)

    def TextToVector(self, TextList, mfccLens, padLn=None, overlapPercent=0):
        """Encode texts as (optionally smeared) one-hot matrices.

        Args:
            TextList: Sequence of texts; every character must be in the vocab.
            mfccLens: Per-text target length passed to ``IncreaseString``.
            padLn: Number of rows per matrix; ``None`` uses the stretched
                length of each text.
            overlapPercent: Fraction of the per-character repeat count by
                which each one-hot mark is also copied forwards and backwards.

        Returns:
            ``np.array`` of matrices with ``padLn`` rows and an even number of
            columns (the vocab size, plus one when the vocab size is odd).

        Raises:
            KeyError: If a character is missing from the vocab.
        """
        vocabSize = len(self.vocab)
        width = vocabSize if vocabSize % 2 == 0 else vocabSize + 1

        vectors = []
        for index, text in enumerate(TextList):
            stretched = self.IncreaseString(text, mfccLens[index])
            overlapNum = int((mfccLens[index] // len(text)) * overlapPercent)
            padLen = len(stretched) if padLn is None else padLn

            oneHot = np.zeros((padLen, width))
            for i, ch in enumerate(stretched):
                column = self.vocab[ch]
                oneHot[i][column] = 1
                if overlapNum + i < len(stretched):
                    oneHot[overlapNum + i][column] = 1
                if i - overlapNum >= 0:
                    oneHot[i - overlapNum][column] = 1

            vectors.append(oneHot)

        return np.array(vectors)

## label padding 함수

In [None]:
def intPadding(output, padLen = 512):
    """Right-pad a 1-D sequence with zeros up to ``padLen`` entries.

    Args:
        output: Sequence to pad (e.g. the class array of a label text).
        padLen: Length of the returned array.

    Returns:
        ``output`` followed by ``padLen - len(output)`` zeros, as one array.
    """
    missing = padLen - len(output)
    tail = np.zeros(missing)
    return np.concatenate((output, tail))

## 전처리 예시

In [None]:
# Walk one raw transcript line through every preprocessing step.
sample = 'o/ 근데* 키가 크다니까? 백팔십팔? 그리고 몸\n'
sampleLabel = LabelDataPreProcessing(sample)
sampleClasses = LabelToInt(sampleLabel)

print(sample)
print(InputDataPreProcessing(sample))
print(sampleLabel)
print(sampleClasses)
print(intPadding(sampleClasses))

# Build a vocab from the label text, then encode it twice as a batch of two.
textToVector = TextToVector()
print(textToVector.madeVocab([sampleLabel]))
samplePair = [sampleLabel, sampleLabel]
sampleLens = [1240, 1240]
print(textToVector.TextToId(samplePair, sampleLens, padLn=3500))
print(textToVector.TextToId(samplePair, sampleLens, padLn=3500)[0])

o/ 근데* 키가 크다니까? 백팔십팔? 그리고 몸

근데 키가 크다니까 백팔십팔 그리고 몸
** 키가 크다니까 백팔십팔 그리고 몸
[3 3 1 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2]
[3. 3. 1. 2. 2. 1. 2. 2. 2. 2. 1. 2. 2. 2. 2. 1. 2. 2. 2. 1. 2. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 

# Text Data Load

In [None]:
# Load the pickled transcript dictionary from Drive.
# NOTE(review): keys look like '<name>.csv' judging by the lookup in the DataLoader cell - confirm.
labelPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/05  Data for improvement/외국인 음성 데이터셋/음성 데이터/영어/englishTxtDict.pickle'
with open(labelPath, 'rb') as f:
    labelDataDict01 = pickle.load(f)

# Cleaned input text and '*'-masked label text, keyed like the raw dictionary.
inputTextDict1 = {}
labelTextDict1 = {}

for key in tqdm_notebook(list(labelDataDict01)):
    rawText = labelDataDict01[key]
    inputTextDict1[key] = InputDataPreProcessing(rawText)
    labelTextDict1[key] = LabelDataPreProcessing(rawText)

# Same contents as flat arrays, in dictionary order.
inputTextList = np.array(list(inputTextDict1.values()))
labelTextList = np.array(list(labelTextDict1.values()))

  0%|          | 0/25152 [00:00<?, ?it/s]

In [None]:
# Display the cleaned input texts.
inputTextList

array(['저는 미국에서 왔습니다 저는 한국에서 있 오는 이유는 한국어를 제대로 공부하고 싶어서 왔습니다',
       '제가 스웨덴에서 왔고 어 우리나라가 큰 나라이지만 인구가 좀 적어서 어 학생이라도 재밌게 할 수 있는 게 많이 없고 카페나 술집이나 이런 게 많이 없어서 그런 거 한국에 많이 있다고 들었고 그것 때문에 한국에 공부하러 왔어요',
       '저는 미국에서 왔고요 한국에 처음에 온 이유는 케이팝을 좋아해서 온 건데 나중에 또 대학교를 여기서 다니게 돼 가지고 대학교를 위해서 온 것도 있어요',
       ..., '나도 찬성이야 어떤 영화를 보든 즐겁고 의미 있는 시간을 보낼 수 있으면 참 좋을 것 같아 너도 그렇지',
       '나도 찬성이야 어떤 영화를 보는 즐겁고 의미 있는 시간을 보낼 수 있으면 참 좋을 것 같아 너도 그렇지',
       '나도 찬성이야 어떤 영화를 보든 즐겁고 의미 있는 시간을 보낼 수 있으면 참 좋을 것 같아 너도 그렇지'],
      dtype='<U349')

In [None]:
# Number of cleaned input texts.
len(inputTextList)

25152

In [None]:
# Load a previously pickled text encoder from Drive and bind it to ``textToVector``.
# NOTE(review): presumably a TextToVector instance carrying a prebuilt vocab - confirm.
textToVectorPath = '/content/drive/MyDrive/Data enhancement code for speech recognition/00 Data/03 textToVector/textToVector.p'
with open(textToVectorPath, 'rb') as f:
    textToVector = pickle.load(f)

In [None]:
# Show the class sequence of one label text next to its zero-padded version (default padLen=512).
LabelToInt(labelTextList[14]), intPadding(LabelToInt(labelTextList[14]))

(array([2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2,
        2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1,
        2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
        1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1,
        2, 1, 2, 1, 2, 2, 2]),
 array([2., 2., 1., 2., 2., 1., 2., 2., 2., 2., 2., 1., 2., 1., 2., 2., 2.,
        1., 2., 2., 1., 2., 2., 1., 2., 2., 1., 2., 2., 2., 1., 2., 2., 2.,
        1., 2., 2., 2., 1., 2., 1., 2., 2., 1., 2., 2., 2., 1., 2., 2., 2.,
        1., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 1., 2.,
        1., 2., 2., 2., 1., 2., 2., 1., 2., 2., 2., 1., 2., 2., 2., 1., 2.,
        2., 2., 1., 2., 1., 2., 1., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [None]:
# Shape check: encode one text stretched to 1240 ids and zero-padded to 3500.
print(textToVector.TextToId([inputTextList[14]], [1240], padLn=3500).shape)

# Embedding table with one 20-dimensional row per vocab entry.
# NOTE(review): padding_idx=1, whereas TextToId pads with 0 and uses 1 for
# characters missing from the vocab - confirm which id is meant as padding.
embedding_layer = nn.Embedding(num_embeddings=len(textToVector.vocab), 
                               embedding_dim=20,
                               padding_idx=1)


# Embed the encoded text (cast to long for the embedding lookup) and show the per-sample shape.
print(embedding_layer(torch.tensor(textToVector.TextToId([inputTextList[14]], [1240], padLn=3500)).long())[0].shape)

(1, 3500)
torch.Size([3500, 20])


In [None]:
# Embed a batch of two encodings of the same text (stretched to 1240 and 1200 ids, both padded to 3500).
et = embedding_layer(torch.tensor(textToVector.TextToId([inputTextList[14], inputTextList[14]], [1240, 1200], padLn=3500)).long())

# Insert a singleton axis at position 1, then show the shape with the last two axes swapped.
et = et.reshape(et.size()[0], 1, et.size()[1], et.size()[2])
et.transpose(2, 3).shape

torch.Size([2, 1, 20, 3500])

In [None]:
# Length of the class sequence of every label text.
lenLabelList = [len(LabelToInt(i)) for i in labelTextList]

# Longest label sequence in the data set.
np.max(lenLabelList)

349

# Dataset 프레임워크

In [None]:
class sPED_Dataset2(Dataset):
    """Dataset yielding MFCC features together with their encoded input text.

    Each item is the tuple ``(mfcc, mfccLen, textIds, fileName)``.

    Args:
        mfcc: Array indexed by sample on axis 0; the size of axis 3 is used
            as the padded id length (assumes a 4-D array - TODO confirm the
            meaning of the middle axes).
        inputText: Cleaned input text per sample.
        mfccLens: Per-sample length passed to ``textToVector.TextToId``.
        textToVector: Encoder object providing
            ``TextToId(texts, lens, padLn=...)``.
        fileNames: Per-sample file name.
    """

    def __init__(self, mfcc, inputText, mfccLens, textToVector, fileNames):
        self.mfcc = mfcc
        self.mfccLens = mfccLens
        # Every id row is padded to the size of axis 3 of the MFCC array.
        self.realLens = self.mfcc.shape[3]
        self.textVector = textToVector.TextToId(inputText, mfccLens, padLn=self.realLens)
        self.fileNames = fileNames

    def __getitem__(self, i):
        return (self.mfcc[i], self.mfccLens[i], self.textVector[i], self.fileNames[i])

    def __len__(self):
        # ``self.inputText`` was never stored, so the former
        # ``len(self.inputText)`` raised AttributeError. ``textVector`` has one
        # row per input text.
        return len(self.textVector)

# 음성 데이터 파일명 리스트 로드

In [None]:
# Directory holding the preprocessed pickle files.
normalFilePath = '/content/drive/MyDrive/Data enhancement code for speech recognition/05  Data for improvement/외국인 음성 데이터셋/Preprocessed Data/영어'
normalFileList = os.listdir(normalFilePath)

# File names containing 'Len' go to the length list, all others to the MFCC list.
normalLenPickleList = []
normalMfccPickleList = []

for fName in normalFileList:
    bucket = normalLenPickleList if 'Len' in fName else normalMfccPickleList
    bucket.append(normalFilePath + '/' + fName)

# Sort both lists by path.
normalLenPickleList.sort()
normalMfccPickleList.sort()

In [None]:
# Number of MFCC pickle files found.
len(normalMfccPickleList)

99

# Dataloader 생성

In [None]:
# Build one Dataset/DataLoader per MFCC pickle file and pickle each DataLoader to Drive.
# new_i numbers the output files; it is incremented once per iteration, so it always equals i.
new_i = 0
for i in tqdm_notebook(range(len(normalMfccPickleList))):
    # Load the MFCC dictionary of this file; its key order defines the sample order.
    with open(normalMfccPickleList[i], 'rb') as f:
        tempMfccDict = pickle.load(f)
    tempMfcc = np.array(list(tempMfccDict.values()))
    # Load the length dictionary at the same index of the sorted length list.
    # NOTE(review): assumes both sorted lists line up pairwise and share key order - confirm.
    with open(normalLenPickleList[i], 'rb') as f:
        tempLenDict = pickle.load(f)
    tempLen = np.array(list(tempLenDict.values()))
    # Cleaned input text per sample: the last three characters of each MFCC key are replaced by 'csv' to form the text key.
    tempInputText = np.array([inputTextDict1[fName[:-3] + 'csv'] for fName in tempMfccDict])
    # Label text per sample, looked up the same way.
    # NOTE(review): tempLabelText is not used afterwards in this cell.
    tempLabelText = np.array([labelTextDict1[fName[:-3] + 'csv'] for fName in tempMfccDict])
    trainset = sPED_Dataset2(tempMfcc, tempInputText, tempLen, textToVector, list(tempMfccDict))
    tempDataLoaderName = '/content/drive/MyDrive/Data enhancement code for speech recognition/05  Data for improvement/외국인 음성 데이터셋/Dataloader/영어/영어Dataloader' + str(new_i) +'.pickle'
    new_i += 1
    # Unshuffled, batch size 1; the DataLoader object itself (including its dataset) is what gets pickled.
    train_dataloader = torch.utils.data.DataLoader(trainset, batch_size=1, num_workers=2, shuffle = False)
    with open(tempDataLoaderName, 'wb') as f:
        pickle.dump(train_dataloader, f, pickle.HIGHEST_PROTOCOL)

  0%|          | 0/99 [00:00<?, ?it/s]