# Data prepare

## Define path

In [None]:
# Local working directory; the archives are extracted beneath it.
data_path = '/content/Data'
# Directory holding the source zip archives and the processed CSV outputs
# (presumably a mounted Google Drive folder -- confirm for your environment).
drive_path = '/content/drive/MyDrive/foreigner_speech'

## Data load

In [None]:
from datetime import datetime
import zipfile
import os

print('='*20,'Unzip Start','='*20,'\n')

start_time = datetime.now()

# Each archive is guarded by its own target directory, so a run that was
# interrupted between the two extractions still extracts the missing
# archive when the cell is executed again.
if not os.path.exists(data_path+'/train/wav/'):
    with zipfile.ZipFile(drive_path+'/Data/train/[train] 음성데이터_wav.zip') as zip_file:
        zip_file.extractall(data_path+'/train/wav/')
else:
    print('Wav file exists')

if not os.path.exists(data_path+'/train/json/'):
    with zipfile.ZipFile(drive_path+'/Data/train/[train] 메타데이터_json.zip') as zip_file:
        zip_file.extractall(data_path+'/train/json/')
else:
    print('Json file exists')

end_time = datetime.now()

# Report how long the extraction step took.
print('\nElapse time => ', end_time-start_time)



Elapse time =>  0:02:57.852460


In [None]:
from glob import glob
# glob() returns matches in arbitrary, filesystem-dependent order; sorting
# keeps the row order of everything derived from these lists reproducible.
train_list = sorted(glob(data_path+"/train/wav/*.wav"))
json_list = sorted(glob(data_path+"/train/json/*.json"))

In [None]:
from datetime import datetime
import json
import pandas as pd

# Keep only the fileName, Reading and ReadingLabelText fields of every
# metadata JSON; the resulting table is saved as CSV later on.
fileName_list = []
Reading_list = []
ReadingLabelText_list = []

start_time = datetime.now()

for json_path in json_list:
    with open(json_path, encoding='utf-8') as handle:
        meta = json.load(handle)
    fileName_list.append(meta['fileName'])
    transcription = meta['transcription']
    Reading_list.append(transcription['Reading'])
    ReadingLabelText_list.append(transcription['ReadingLabelText'])

# One row per JSON file, columns in the same order as the lists above.
df = pd.DataFrame(
    list(zip(fileName_list, Reading_list, ReadingLabelText_list)),
    columns=['fileName', 'Reading', 'ReadingLabelText'],
)

end_time = datetime.now()

print('\nElapse time => ', end_time-start_time)


Elapse time =>  0:00:14.460325


## Save json as csv

In [None]:
if not os.path.exists(drive_path+'/Data/processing/'):
    # makedirs also creates a missing parent directory, and exist_ok avoids
    # an error if the directory appears between the check and the call.
    os.makedirs(drive_path+'/Data/processing/', exist_ok=True)
    print('processing dir created !')

In [None]:
# Write the collected metadata table as UTF-8 CSV, without the index column.
df.to_csv(drive_path+'/Data/processing/meta.csv',encoding='utf-8',index=False)

# Load Data

In [None]:
# Read the metadata CSV from the processing directory into json_df.
json_df = pd.read_csv(drive_path+'/Data/processing/meta.csv')

In [None]:
# Preview the first five rows of the metadata table.
json_df.head(5)

Unnamed: 0,fileName,Reading,ReadingLabelText
0,TH28RB191_TH0084_20210825.wav,"치킨이 맛있다고 많이 먹으면, 높은 열량으로 인해 살이 찔 수 있기에, 적당히 섭취...",치킨이 맛있다고 많이 먹으면 높은 열량으로 인해 살이 찔 수 있기에 적당히 섭취하는...
1,JP55RA123_JP0176_20210825.wav,"출발할 때는 비즈니스석을 예매하셨으니, 전용 라운지 이용이 가능하세요. 간단한 음료...",출발할 때는 비즈니스석을 예매하셨으니 전용 라운지 이용이 가능하세요 간단한 음료나 ...
2,CN41RB004_CN0225_20210805.wav,"광화문은 문이 세 개나 있는데, 정문 앞에는 노란색, 빨간색 깃발을 들고 한국의 전...",광화문은 문이 세 개나 있는데 정문 앞에는 노란색 빨간색 깃발을 들고 한국의 전통 ...
3,JP15RC121_JP0110_20210730.wav,"의사 선생님, 저는 운동을 하다가 발목을 삐었는데, 걸을 때마다 아직도 아파요. 사...",의사 선생님 저는 운동을 하다가 발목을 삐었는데 걸을 때마다 아직도 아파요 사진을 ...
4,JP38RA198_JP0019_20210731.wav,"교육 문제로 충돌이 있을 때는, 아이가 없는 곳에서 서로 대화를 통해 이해하려고 노...",교육 문제로 충돌이 있을 때는 아이가 없는 곳에서 서로 대화를 통해 이해하려고 노력...


# Label preprocessing

Reference:  
https://github.com/sooftware/KoSpeech/wiki/Preparation-before-Training

## Transcriptional text processing

Remove `/`, `+`, `un/`, `sn/`

In [None]:
def text_filter(sen):
    """Strip transcription markers from a sentence and normalise spaces.

    Removes every ``un/``, ``sn/``, ``/`` and ``+`` and then collapses runs
    of spaces into a single space.

    Args:
        sen: Transcription text to clean.

    Returns:
        The cleaned text.
    """
    # "un/" and "sn/" must go before the bare "/" so the whole marker is
    # removed rather than only its slash.
    for marker in ("un/", "sn/", "/", "+"):
        sen = sen.replace(marker, "")
    # A single replace("  ", " ") leaves double spaces behind whenever three
    # or more spaces are adjacent (e.g. after dropping "+" tokens), so repeat
    # until none remain.
    while "  " in sen:
        sen = sen.replace("  ", " ")
    return sen

In [None]:
# Apply text_filter to every ReadingLabelText value and store the result in
# a new column.
json_df['new_ReadingLabelText'] = json_df['ReadingLabelText'].apply(text_filter)

# Create character labels

## Create label data

**Execute only one time.**
    
ReadingLabelText에서 등장한 글자와 그 빈도를 label.csv에 저장

In [None]:
from collections import Counter
from tqdm import tqdm 

start_time = datetime.now()

# Count every character in a single pass. A Counter lookup is O(1), whereas
# searching a growing list for each character scales with the vocabulary.
char_counter = Counter()
for sentence in tqdm(json_df['new_ReadingLabelText']):
    char_counter.update(sentence)

# Most frequent first; ties are broken by descending character order, which
# is the ordering produced by sorting (freq, char) pairs in reverse.
ranked = sorted(((freq, ch) for ch, freq in char_counter.items()), reverse=True)
label_freq = tuple(freq for freq, _ in ranked)    # frequency of each character
label_list = tuple(ch for _, ch in ranked)        # characters, same order

# The first three entries are fixed tokens with frequency 0 (presumably the
# pad / start / end tokens of the referenced KoSpeech recipe -- confirm).
# An empty corpus simply yields these three entries.
label = {
    'char': ['_', '<s>', '</s>', *label_list],
    'freq': [0, 0, 0, *label_freq],
}

end_time = datetime.now()

print('\nElapse time => ', end_time-start_time)

100%|██████████| 300000/300000 [01:18<00:00, 3845.30it/s]


Elapse time =>  0:01:18.037558





In [None]:
# Turn the label dict into a table and expose the row position as an
# explicit 'id' column.
label_df = pd.DataFrame(label)
label_df = label_df.reset_index()
label_df = label_df.rename(columns={'index':'id'})
label_df

Unnamed: 0,id,char,freq
0,0,_,0
1,1,<s>,0
2,2,</s>,0
3,3,,4163793
4,4,이,388630
...,...,...,...
1033,1033,뎐,1
1034,1034,뎌,1
1035,1035,논,1
1036,1036,꾼,1


### train test split  


Characters that occur only once (freq = 1) are left out of the train dictionary.  
The validation dictionary keeps every character.

In [None]:
# Train dictionary: every entry whose frequency is not exactly 1.
train_labels = label_df.loc[label_df['freq'].ne(1)]
# Validation dictionary: the complete table.
test_labels = label_df

Dictionary to CSV

In [None]:
# Write both dictionaries as UTF-8 CSV without the index column.
train_labels.to_csv(drive_path+'/Data/processing/train_dict.csv', encoding='utf-8', index=False)
test_labels.to_csv(drive_path+'/Data/processing/validation_dict.csv', encoding='utf-8', index=False)

## Convert sentence and number

In [None]:
# id to char, char to id
def load_label(filepath):
    """Load a character dictionary CSV and build both lookup tables.

    The CSV must provide ``id`` and ``char`` columns; a ``freq`` column may
    be present but is not used here.

    Args:
        filepath: Path of the dictionary CSV, read as UTF-8.

    Returns:
        A ``(char2id, id2char)`` tuple of dicts mapping each character to its
        id and each id back to its character.
    """
    # Force the char column to str: with type inference, a column (or parser
    # chunk) whose characters are all digits comes back as integers, which
    # no longer match the characters of a sentence during lookup.
    ch_labels = pd.read_csv(filepath, encoding='utf-8', dtype={'char': str})

    char2id = dict()
    id2char = dict()
    for idx, char in zip(ch_labels["id"], ch_labels["char"]):
        char2id[char] = idx
        id2char[idx] = char
    return char2id, id2char

In [None]:
# text -> number
def sentence_to_target(sentence, char2id):
    """Encode a sentence as space-separated character ids.

    Args:
        sentence: Text to encode, one id per character.
        char2id: Mapping from character to id.

    Returns:
        The ids joined by single spaces; an empty string for empty input.

    Raises:
        KeyError: If a character is missing from ``char2id``.
    """
    return ' '.join(str(char2id[ch]) for ch in sentence)

In [None]:
# number -> text
def target_to_sentence(target, id2char):
    """Decode a whitespace-separated id string back into text.

    Args:
        target: String of integer ids separated by whitespace.
        id2char: Mapping from integer id to character.

    Returns:
        The decoded characters concatenated in order.

    Raises:
        KeyError: If an id is missing from ``id2char``.
        ValueError: If a token is not an integer literal.
    """
    return "".join(id2char[int(token)] for token in target.split())

In [None]:
# Add the numerically labelled text to text.csv
def target(sen, char2id) :
  """Return ``sen`` encoded as ids; delegates to ``sentence_to_target``."""
  return sentence_to_target(sen, char2id)

In [None]:
# Build the lookup tables from the validation dictionary CSV.
char2id, id2char = load_label(drive_path+'/Data/processing/validation_dict.csv')

In [None]:
# Encode every cleaned sentence as an id string; char2id is forwarded to
# sentence_to_target as its second positional argument.
json_df['ReadingTarget'] = json_df['new_ReadingLabelText'].apply(
    sentence_to_target, args=(char2id,)
)

In [None]:
# Write the table, including the added columns, as UTF-8 CSV without the
# index column.
json_df.to_csv(drive_path+'/Data/processing/processed_meta.csv',encoding='utf-8',index=False)

## Add audio data

In [None]:
# import pandas as pd
# json_df = pd.read_csv(drive_path+'/Data/processing/json_add_num.csv', encoding='utf-8')

In [None]:
# from datetime import datetime
# import librosa

# print('='*30,'Start adding audio data','='*30)
# start_time = datetime.now()

# file_name_list = list(json_df['fileName'])

# for file_path in train_list:
#     file_name = file_path.split('/')[-1]
#     index = file_name_list.index(file_name)
#     y, _ = librosa.load(file_path, sr=16000)
#     json_df.loc[index,'audio'] = str(y)

# end_time = datetime.now()

# print('\nElapse time => ', end_time-start_time)

In [None]:
# json_df.to_csv(drive_path+'/Data/processing/processed_data.csv', encoding='utf-8',index=False)