In [None]:
from sklearn.model_selection import KFold
# coding:utf-8
import os
import shutil
import json
import numpy as np
import pandas as pd
from collections import OrderedDict
import natsort #listdir 순서정렬

def make_meta(label_path, wav_path, num):
    """Build a metadata table for the labelled clips that have a matching wav file.

    Every entry of ``label_path`` is parsed as JSON and only its first
    annotation (``data['annotations'][0]``) is used.

    Parameters
    ----------
    label_path : str
        Directory containing the JSON label files.
    wav_path : str
        Directory containing the wav files.
    num : int
        When positive, only the first ``num`` label files (in natural sort
        order) are read; otherwise every label file is read.

    Returns
    -------
    pandas.DataFrame
        One row per label whose wav file exists in ``wav_path``, with columns
        ``filename`` (``audio_id + '.wav'``), ``target`` (the part of
        ``audio_id`` before its first ``'.'``) and ``length``
        (``area['end'] - area['start']``). Labels without a matching wav file
        are skipped.
    """
    # A set keeps the per-label existence check O(1) instead of scanning the
    # whole directory listing for every label.
    wav_names = set(os.listdir(wav_path))

    # Natural sort so that the ``num`` cut-off keeps the numerically first labels.
    label_names = natsort.natsorted(os.listdir(label_path))
    if num > 0:
        label_names = label_names[:num]

    meta = {'filename': [], 'target': [], 'length': []}
    for label_name in label_names:
        # Binary mode lets ``json`` detect the UTF-8/16/32 encoding (including
        # a BOM) itself rather than depending on the platform locale encoding.
        with open(os.path.join(label_path, label_name), 'rb') as file:
            data = json.load(file)

        annotation = data['annotations'][0]
        audio_id = annotation['audio_id']
        filename = audio_id + '.wav'
        area = annotation['area']
        length = area['end'] - area['start']

        if filename in wav_names:
            meta['filename'].append(filename)
            meta['target'].append(audio_id.split('.')[0])
            meta['length'].append(length)

    return pd.DataFrame(meta)

def hubdataprocess(train_path, label_num, kfold=5, save=False, random_state=None):
    """Collect metadata from every source/label folder pair and assign CV folds.

    Sub-folders of ``train_path`` whose name contains ``'원천'`` are treated as
    wav folders and those containing ``'라벨'`` as label folders. Both groups
    are naturally sorted and paired by position, and ``make_meta`` is run on
    each pair.

    Parameters
    ----------
    train_path : str
        Directory holding the source and label sub-folders.
    label_num : int
        Forwarded to ``make_meta`` as the per-folder label limit.
    kfold : int, default 5
        Number of folds for the shuffled ``KFold`` split.
    save : bool, default False
        When truthy, the result is also written to ``custom_meta.csv`` in the
        current working directory.
    random_state : int or None, default None
        Seed forwarded to ``KFold``. ``None`` keeps the original unseeded
        shuffle; pass an integer for reproducible folds.

    Returns
    -------
    pandas.DataFrame
        The concatenated ``make_meta`` tables with an extra ``fold`` column
        (1-based fold in which the row is a test sample) and an integer
        ``target`` column.

    Raises
    ------
    ValueError
        If no source folder is found, or if there are fewer label folders than
        source folders.
    """
    folder_list = os.listdir(train_path)
    wav_dirs = natsort.natsorted([s for s in folder_list if '원천' in s])
    label_dirs = natsort.natsorted([s for s in folder_list if '라벨' in s])

    if not wav_dirs:
        raise ValueError("no folder containing '원천' found in %r" % (train_path,))
    if len(label_dirs) < len(wav_dirs):
        raise ValueError(
            "found %d source folders but only %d label folders in %r"
            % (len(wav_dirs), len(label_dirs), train_path)
        )

    # Folders are paired by position; zip stops at the last source folder, so
    # any surplus label folders are ignored.
    frames = [
        make_meta(os.path.join(train_path, label_dir),
                  os.path.join(train_path, wav_dir),
                  label_num)
        for wav_dir, label_dir in zip(wav_dirs, label_dirs)
    ]
    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames, ignore_index=True)

    # Every row is a test sample in exactly one fold; store that fold (1-based).
    kf = KFold(n_splits=kfold, shuffle=True, random_state=random_state)
    fold_ids = np.zeros(len(df), dtype='int64')
    for fold, (_, test_index) in enumerate(kf.split(df.index), start=1):
        fold_ids[test_index] = fold
    df['fold'] = fold_ids

    classes = df['target'].unique()
    # NOTE(review): 16 is presumably the number of classes in the full
    # dataset -- confirm. With any other class count the labels are replaced
    # by dense codes 0..k-1 in order of first appearance.
    if len(classes) != 16:
        codes, _ = pd.factorize(df['target'])
        df['target'] = codes

    df['target'] = df['target'].astype(int)

    if save:
        df.to_csv('custom_meta.csv', index=False)

    return df


In [None]:
# Build the training metadata table; label_num caps how many label files are
# read from each label folder.
train_path = r'C:\DataScience\deepdaiv_emergencyclf\위급상황 음성_음향\Training'
df = hubdataprocess(
    train_path,
    label_num=10000,
    kfold=5,
)

In [None]:
# Build the metadata table for the validation split the same way, with a
# smaller per-folder label limit.
test_path = r'C:\DataScience\deepdaiv_emergencyclf\위급상황 음성_음향\Validation'
test = hubdataprocess(
    test_path,
    label_num=1000,
    kfold=5,
)

In [None]:
# Write both metadata tables to the current working directory.
df.to_csv('custom_meta.csv',index=False) # after saving, move this file into the folder that holds the train wav files
test.to_csv('custom_meta_test.csv',index=False)# after saving, move this file into the folder that holds the test wav files