In [2]:
import sys
# Make modules under ./models/ importable. The membership check keeps this
# cell idempotent: re-running it does not add the same entry to sys.path again.
if './models/' not in sys.path:
    sys.path.append('./models/')

In [13]:
import os
import torch
import torchaudio
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from collections import Counter

In [10]:
# Root directory of the dataset. Built with os.path.join so the path uses the
# host OS separator (".\\Dataset" on Windows, "./Dataset" elsewhere) instead of
# a hard-coded backslash that only resolves on Windows.
DATASET = os.path.join(".", "Dataset")
# Directory names that are accepted as class labels when scanning DATASET.
CLASS_NAME = ["danger", "fire", "gas", "non", "tsunami"]
# Number of samples per batch for every DataLoader.
BATCH_SIZE = 16

In [6]:
# Data (file paths) and labels: one entry per .wav file found under DATASET.
filepaths = []
labels = []

for root, dirs, files in os.walk(DATASET):
    # Sort in place so os.walk descends in a fixed order. Together with
    # sorted(files) below this makes the order of `filepaths`/`labels` -- and
    # therefore any seeded split built from them -- the same on every machine;
    # the underlying directory listing order is otherwise arbitrary.
    dirs.sort()

    # The name of the directory that directly contains a file is its label;
    # directories whose name is not a known class are skipped.
    class_label = os.path.basename(root)
    if class_label not in CLASS_NAME:
        continue

    for file in sorted(files):
        # Keep only .wav files (extension matched case-insensitively).
        if not file.lower().endswith(".wav"):
            continue

        full_path = os.path.join(root, file)
        filepaths.append(full_path)
        labels.append(class_label)

In [8]:

class AudioDataset(Dataset):
    """Dataset of mono, fixed-length, peak-normalized waveforms read from audio files.

    Indexing returns a ``(waveform, label)`` pair: ``waveform`` is a 1-D tensor
    of exactly ``int(sample_rate * duration)`` samples, and ``label`` is the
    element of ``labels`` at the same index, returned unchanged.

    Args:
        filepaths: Sequence of audio file paths.
        labels: Sequence of labels, one per entry of ``filepaths``.
        sample_rate: Target sampling rate; audio loaded at any other rate is
            resampled to it.
        duration: Target clip length in seconds. Shorter clips are zero-padded
            at the end, longer clips are truncated.

    Raises:
        ValueError: If ``filepaths`` and ``labels`` have different lengths.
    """

    def __init__(self, filepaths, labels, sample_rate=16000, duration=10.0):
        if len(filepaths) != len(labels):
            raise ValueError(
                f"filepaths and labels must have the same length, "
                f"got {len(filepaths)} and {len(labels)}"
            )

        self.filepaths = filepaths  # paths of the audio files
        self.labels = labels  # label of each file
        self.sample_rate = sample_rate  # target sampling rate
        self.num_samples = int(sample_rate * duration)  # target length in samples
        # Resample transforms keyed by source sampling rate, so a transform is
        # built once per distinct rate instead of once per loaded item.
        self._resamplers = {}

    def __len__(self):
        """Return the number of items, i.e. the number of file paths."""
        return len(self.filepaths)

    def _resample(self, waveform, sr):
        """Resample ``waveform`` from rate ``sr`` to ``self.sample_rate``."""
        resampler = self._resamplers.get(sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            self._resamplers[sr] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        """Load, preprocess and return the ``(waveform, label)`` pair at ``idx``."""
        filepath = self.filepaths[idx]
        label = self.labels[idx]

        waveform, sr = torchaudio.load(filepath)

        # Convert to mono (1 channel) by averaging over the channel dimension.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample when the file is not at the target sampling rate.
        if sr != self.sample_rate:
            waveform = self._resample(waveform, sr)

        # Force the clip to exactly num_samples samples.
        if waveform.shape[1] < self.num_samples:
            # Too short: append zeros (silence) at the end.
            waveform = F.pad(waveform, (0, self.num_samples - waveform.shape[1]))
        else:
            # Long enough: keep only the first num_samples samples.
            waveform = waveform[:, :self.num_samples]

        # Peak normalization into [-1, 1]; the epsilon avoids dividing by zero
        # for an all-zero clip.
        waveform = waveform / (waveform.abs().max() + 1e-9)

        # After the steps above the sample is mono with a fixed length, i.e.
        # shape (1, num_samples) -- (1, 160000) with the default arguments.
        # The model takes (batch_size, data_length) input, so the channel
        # dimension carries no information and is dropped here.
        waveform = waveform.squeeze(0)
        return waveform, label


In [9]:
# First cut: keep 80% of the files for training and hold out the remaining
# 20%, stratified on the class labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    filepaths, labels, test_size=0.2, random_state=42, stratify=labels
)

# Second cut: divide the held-out part in half to obtain the validation and
# test sets, again stratified on the class labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [14]:
# Print the per-class sample count of the train, validation and test labels
# (in that order) to check that the classes are balanced in every split.
for split_labels in (y_train, y_val, y_test):
    print(Counter(split_labels))

Counter({'danger': 480, 'fire': 480, 'tsunami': 480, 'non': 480, 'gas': 480})
Counter({'gas': 60, 'fire': 60, 'danger': 60, 'non': 60, 'tsunami': 60})
Counter({'danger': 60, 'fire': 60, 'gas': 60, 'non': 60, 'tsunami': 60})


In [11]:
# Wrap each split in an AudioDataset.
train_dataset = AudioDataset(X_train, y_train)
val_dataset   = AudioDataset(X_val,   y_val)
test_dataset  = AudioDataset(X_test,  y_test)

# Only the training loader shuffles. Its shuffling is driven by an explicitly
# seeded generator so the sequence of training batches is reproducible after
# this cell is (re-)run; the seed matches the random_state used for the split.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
# Validation and test data are iterated in their stored order.
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

In [15]:
# Sanity check: pull a single batch from the training loader and inspect it.
train_iter = iter(train_loader)
# Bound to `batch_labels` rather than `labels` so the module-level `labels`
# list (one label per file, used to stratify the train/val/test split) is not
# overwritten; otherwise re-running the split cell after this one would
# stratify on a single batch instead of the full label list.
inputs, batch_labels = next(train_iter)

print("Inputs:", inputs.shape)
print("Labels:", batch_labels)

Inputs: torch.Size([16, 160000])
Labels: ('fire', 'tsunami', 'danger', 'danger', 'tsunami', 'non', 'tsunami', 'fire', 'non', 'non', 'danger', 'danger', 'gas', 'tsunami', 'fire', 'fire')
