### 실행 모듈

데이터 다운로드 : 다운로드할 디렉토리(object prefix) 목록을 run_pipe.yaml의 `key_names`에 설정  
데이터 검증  
데이터 리샘플링  
토크나이저 실행(방언별로 각각 실행)  
- 데이터 준비  
- 토큰화  

언어모델 실행(방언별로 각각 실행)  
음성인식 실행(방언별로 각각 실행)  
추론 준비 : 추론에 필요한 모델 파일을 한 디렉토리에 복사

In [18]:
import hyperpyyaml
from hyperpyyaml import load_hyperpyyaml
import speechbrain as sb
import logging
import os
from pathlib import Path
import boto3
import datetime
from tqdm import tqdm

In [None]:
# 설정값 yaml에서 읽어오기
# log file 설정

In [20]:
# Pipeline settings come from a fixed YAML file in this notebook.
# (Command-line alternative, kept for reference:
#  hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]))
hparams_file = 'run_pipe.yaml'
with open(hparams_file) as yaml_stream:
    # load_hyperpyyaml parses the stream into the hparams mapping used below.
    hparams = load_hyperpyyaml(yaml_stream)

In [6]:
# Module-level logger; created before the logging configuration is applied.
logger = logging.getLogger(__name__)

# Logging configuration and the log file destination both come from hparams.
log_config = hparams["log_config"]
log_file = hparams["log_file"]

# Override only the file handler's target so records go to the configured file.
logger_overrides = {"handlers": {"file_handler": {"filename": log_file}}}

# Signature for reference:
# setup_logging(config_path="log-config.yaml", overrides={}, default_level=logging.INFO)
sb.utils.logger.setup_logging(log_config, logger_overrides)

In [9]:
### download

# Read the object-storage connection settings from the YAML configuration.
service_name, endpoint_url, region_name, access_key, secret_key = (
    hparams[setting]
    for setting in (
        "service_name",
        "endpoint_url",
        "region_name",
        "access_key",
        "secret_key",
    )
)


In [10]:
# Object-storage client built from the connection settings read from hparams.
# NOTE(review): region_name is loaded from the config but is not passed to the
# client here — confirm whether that is intentional.
s3 = boto3.client(
    service_name,
    endpoint_url=endpoint_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)

In [11]:
def get_s3_object_list(s3, bucket_name, prefix, max_keys):
    """Return every object key under ``prefix`` in ``bucket_name``.

    The listing is fetched page by page with ``s3.list_objects`` until the
    service reports that the result is no longer truncated.

    Args:
        s3: Client object exposing ``list_objects(Bucket=, MaxKeys=, Prefix=,
            Marker=)`` and returning a dict-like response.
        bucket_name: Name of the bucket to list.
        prefix: Only keys starting with this prefix are returned.
        max_keys: Maximum number of keys requested per page.

    Returns:
        A list of object keys, in the order the service returned them.
        Empty when nothing matches the prefix.
    """
    obj_list = []
    request = {'Bucket': bucket_name, 'MaxKeys': max_keys, 'Prefix': prefix}
    response = s3.list_objects(**request)

    while True:
        contents = response.get('Contents')
        if not contents:
            print(f'{prefix} : there is no data.')
            break

        obj_list.extend(content.get('Key') for content in contents)

        if not response.get('IsTruncated'):
            break

        # NextMarker is only present in some responses (S3 documents it as
        # returned only when a delimiter is requested).  Without it, the last
        # key of the current page is the marker for the next page; passing
        # None as Marker would be rejected.
        marker = response.get('NextMarker') or contents[-1].get('Key')
        response = s3.list_objects(**request, Marker=marker)

    return obj_list

In [21]:
# Local directory that receives the downloaded files; created if missing.
# Deliberately bound to data_save_path only: secret_key holds the storage
# credential used to build the client and must not be overwritten with a path.
data_save_path = hparams["data_save_path"]
os.makedirs(data_save_path, exist_ok=True)


# Bucket to read from, listing page size, and the key prefixes to download.
bucket_name = hparams["bucket_name"]
max_keys = hparams["max_keys"]
key_names = hparams["key_names"]

In [22]:
# Display the configured key prefixes as a quick visual check before downloading.
key_names

['1214Dataset', '1215Dataset']

In [23]:
# Download every .json/.wav object under each configured key prefix into
# data_save_path, and record each key that was skipped or failed in a
# timestamped error file (one key per line).
error_file = 'error_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S') + '.txt'
error_file_path = os.path.join(data_save_path, error_file)
# Explicit UTF-8 so keys containing non-ASCII characters can always be written,
# whatever the platform's default encoding is.
with open(error_file_path, 'w', encoding='utf-8') as lf:

    for date in key_names:
        logger.info(date)
        logger.info('----- date download start -----\n')

        for key in tqdm(get_s3_object_list(s3, bucket_name, date, max_keys)):
            # Only label (.json) and audio (.wav) objects are downloaded.
            if Path(key).suffix not in ['.json', '.wav']:
                lf.write(f'{key}\n')
                print(f'{key} is not file')
                continue

            # The local path is built from key components 1-4 (directories)
            # and 5 (file name); a shallower key cannot be mapped, so record
            # it and move on instead of aborting the whole run.
            key_2 = key.split('/')
            if len(key_2) < 6:
                lf.write(f'{key}\n')
                logger.warning('unexpected key layout, skipped: %s', key)
                continue

            save_dir = os.path.join(data_save_path, *key_2[1:5])
            os.makedirs(save_dir, exist_ok=True)
            save_file = os.path.join(save_dir, key_2[5])
            try:
                s3.download_file(bucket_name, key, save_file)
            except Exception as err:
                # Best-effort download: note the failure and keep going.
                # Exception (not a bare except) so Ctrl-C still interrupts.
                lf.write(f'{key}\n')
                print(f'{key} is not exist.')
                logger.warning('download failed for %s: %s', key, err)

        logger.info('----- date download end -----\n')

    lf.write('----- end time : ' + datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

__main__ - 1214Dataset
__main__ - ----- date download start -----



100%|██████████| 7542/7542 [11:58<00:00, 10.50it/s]

__main__ - ----- date download end -----

__main__ - 1215Dataset
__main__ - ----- date download start -----




100%|██████████| 8210/8210 [13:53<00:00,  9.85it/s]

__main__ - ----- date download end -----




