In [1]:
import sys
sys.path.append('./models/')

import torch
import torchaudio
import torch.nn.functional as F
import numpy as np

In [2]:
# --- Files -----------------------------------------------------------------
# Serialized model loaded for inference.
MODEL_PATH = "./result/best.pt"
# Example clip pushed through the pipeline further down.
sample_audio_path = "./sample/sample.wav"

# --- Audio format ----------------------------------------------------------
# preprocess() resamples every clip to this rate and pads/trims it to
# SAMPLE_RATE * DURATION samples.
SAMPLE_RATE = 16_000
DURATION = 1.0

# --- Inference -------------------------------------------------------------
# Label for each score index; predict() looks names up by position in this list.
CLASS_NAME = ["danger", "fire", "gas", "non", "tsunami"]
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def preprocess(audio_path, sample_rate=SAMPLE_RATE, duration=DURATION):
    '''
    Load an audio file and turn it into a fixed-length, peak-normalised
    mono waveform.

    Steps: load -> average channels to mono -> resample to `sample_rate`
    -> zero-pad or truncate to `sample_rate * duration` samples -> divide
    by the absolute peak.

    Args:
        audio_path: path passed to torchaudio.load
        sample_rate: target sample rate
        duration: target clip length; multiplied by `sample_rate` to get
            the number of samples kept

    Returns:
        waveform: [1, data_length] tensor, or None when the file could not
        be loaded (the error is printed instead of raised).
    '''
    target_len = int(sample_rate * duration)

    try:
        signal, native_sr = torchaudio.load(audio_path)
    except Exception as err:
        # Best-effort: report the problem and let the caller check for None.
        print(f"파일로드 오류 {audio_path}: {err}")
        return None

    # Collapse multi-channel audio to a single channel by averaging.
    n_channels = signal.shape[0]
    if n_channels > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Bring the clip to the target sample rate when it differs.
    if native_sr != sample_rate:
        signal = torchaudio.transforms.Resample(orig_freq=native_sr, new_freq=sample_rate)(signal)

    # Right-pad short clips with zeros; cut long clips down to the target length.
    missing = target_len - signal.shape[1]
    if missing > 0:
        signal = F.pad(signal, (0, missing))
    else:
        signal = signal[:, :target_len]

    # Peak normalisation; the epsilon keeps an all-zero clip from dividing by zero.
    peak = signal.abs().max()
    return signal / (peak + 1e-9)

def predict(model, audio_tensor, device=DEVICE, class_names=CLASS_NAME, threshold=0.5):
    '''
    Run inference on a preprocessed audio tensor and decode class labels.

    Args:
        model: trained PyTorch model; calling it must return a mapping with
            a "clipwise_output" entry holding per-class scores
        audio_tensor: audio tensor of shape [1, data_length]; it is moved to
            `device` before the forward pass
        device: device to run on (CPU or CUDA)
        class_names: list of class names, indexed by score position
        threshold: a class is reported when its score is strictly greater
            than this value (default 0.5, the previously hard-coded cutoff)

    Returns:
        pred_labels: list of predicted class names; when no score exceeds
            `threshold`, a one-element list with the highest-scoring class
        prob_np: per-class scores of the first batch item (NumPy array)
    '''
    audio_tensor = audio_tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output_dict = model(audio_tensor)
        # Only the first item of the batch is decoded.
        prob_np = output_dict["clipwise_output"].cpu().numpy()[0]

    pred_labels = [class_names[i] for i, p in enumerate(prob_np) if p > threshold]

    # Nothing cleared the threshold: fall back to the single most likely class.
    if not pred_labels:
        pred_labels = [class_names[int(np.argmax(prob_np))]]

    return pred_labels, prob_np

In [4]:
# NOTE(security): weights_only=False unpickles a full Python object and can
# execute arbitrary code; only load checkpoints from a trusted source.
# map_location remaps stored tensors onto DEVICE so a checkpoint saved on a GPU
# still loads on a CPU-only machine.
model = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)
model.to(DEVICE)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

Cnn14(
  (spectrogram_extractor): Spectrogram(
    (stft): STFT(
      (conv_real): Conv1d(1, 257, kernel_size=(512,), stride=(160,), bias=False)
      (conv_imag): Conv1d(1, 257, kernel_size=(512,), stride=(160,), bias=False)
    )
  )
  (logmel_extractor): LogmelFilterBank()
  (spec_augmenter): SpecAugmentation(
    (time_dropper): DropStripes()
    (freq_dropper): DropStripes()
  )
  (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_block1): ConvBlock(
    (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv_block2): ConvBlock(
    (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (con

In [5]:
audio_tensor = preprocess(sample_audio_path)
# preprocess() signals a load failure by returning None; stop here with a clear
# error instead of an AttributeError on `.shape` below.
if audio_tensor is None:
    raise RuntimeError(f"오디오 전처리 실패: {sample_audio_path}")
print("입력 텐서 크기:", audio_tensor.shape)

입력 텐서 크기: torch.Size([1, 16000])


In [6]:
pred_labels, class_prob = predict(model, audio_tensor)

if pred_labels:
    # Single quotes inside the replacement field: reusing the outer double
    # quotes is only valid syntax on Python 3.12+.
    print(f"예측결과: {','.join(pred_labels)}")

# Per-class scores, listed in CLASS_NAME order.
print("\n클래스별 확률:")
for idx, class_name in enumerate(CLASS_NAME):
    print(f" - {class_name}: {class_prob[idx]:.4f}")

예측결과: fire

클래스별 확률:
 - danger: 0.0000
 - fire: 1.0000
 - gas: 0.0000
 - non: 0.0000
 - tsunami: 0.0000
