In [1]:
import torch
import torch.nn.functional as F
import numpy as np
import Preprocessor as pp
import sounddevice as sd # 마이크 입력을 위한 라이브러리

In [2]:
# List the audio devices sounddevice can see, with their indices, so that an
# input device can be picked for recording.
print(sd.query_devices())

   0 Microsoft 사운드 매퍼 - Input, MME (2 in, 0 out)
>  1 마이크(ABKO MP3300), MME (1 in, 0 out)
   2 마이크(Steam Streaming Microphone), MME (8 in, 0 out)
   3 Microsoft 사운드 매퍼 - Output, MME (0 in, 2 out)
<  4 스피커(High Definition Audio Devic, MME (0 in, 6 out)
   5 스피커(Steam Streaming Speakers), MME (0 in, 8 out)
   6 스피커(Steam Streaming Microphone), MME (0 in, 8 out)
   7 Display(2- High Definition Audi, MME (0 in, 2 out)
   8 디지털 출력(High Definition Audio De, MME (0 in, 2 out)
   9 주 사운드 캡처 드라이버, Windows DirectSound (2 in, 0 out)
  10 마이크(ABKO MP3300), Windows DirectSound (1 in, 0 out)
  11 마이크(Steam Streaming Microphone), Windows DirectSound (8 in, 0 out)
  12 주 사운드 드라이버, Windows DirectSound (0 in, 2 out)
  13 스피커(High Definition Audio Device), Windows DirectSound (0 in, 6 out)
  14 스피커(Steam Streaming Speakers), Windows DirectSound (0 in, 8 out)
  15 스피커(Steam Streaming Microphone), Windows DirectSound (0 in, 8 out)
  16 Display(2- High Definition Audio Device), Windows DirectSound (0 in, 2 

In [3]:
# Default devices as an (input, output) pair: record from device index 1.
# The index comes from this machine's device listing and will differ on other
# hardware — re-check it with sd.query_devices() before running elsewhere.
# NOTE(review): None presumably leaves the output device on the host default — confirm.
sd.default.device = (1, None)

In [4]:
# --- Inference settings ---
SAVE_BEST_PATH = "./result/efficientnet_v2_l_best.pt"  # checkpoint file to load
CLASS_NAME = ["danger", "fire", "gas", "non", "tsunami"]  # label printed for each predicted index
CLASS_NUM = len(CLASS_NAME)

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- Audio window: one clip is DURATION seconds sampled at SAMPLE_RATE Hz ---
SAMPLE_RATE = 16000
DURATION = 1
NUM_SAMPLES = DURATION * SAMPLE_RATE

# --- Hyperparameters ---
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

In [5]:
def logmel_transform(audio_np):
    """Convert a raw audio clip into a model-ready log-mel batch.

    The clip is down-mixed to mono, zero-padded or truncated to NUM_SAMPLES
    samples, peak-normalized, and then passed to ``pp.logmel``.

    Args:
        audio_np: Array-like audio laid out as (channels, samples). A 1-D
            array of samples is also accepted and treated as mono.

    Returns:
        The result of ``pp.logmel`` with a leading batch dimension added
        (presumably [batch, channel, mel_bins, time] — confirm against
        ``Preprocessor.logmel``).

    Raises:
        ValueError: If the input is not 1-D or 2-D.
    """
    # Copy the input into a float32 tensor.
    waveform = torch.tensor(audio_np, dtype=torch.float32)

    # Give a bare 1-D signal a channel axis: (samples,) -> (1, samples).
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    if waveform.ndim != 2:
        raise ValueError(
            f"expected audio shaped (channels, samples) or (samples,), got {tuple(waveform.shape)}"
        )

    # Down-mix multi-channel audio to mono by averaging ACROSS channels (dim 0).
    # keepdim=True preserves the (1, samples) layout the steps below index into.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Force the clip to exactly NUM_SAMPLES samples.
    missing = NUM_SAMPLES - waveform.shape[1]
    if missing > 0:
        # Too short: append zeros (silence) on the right.
        waveform = F.pad(waveform, (0, missing))
    else:
        # Long enough: keep only the first NUM_SAMPLES samples.
        waveform = waveform[:, :NUM_SAMPLES]

    # Peak-normalize into [-1, 1]; the epsilon avoids division by zero on silence.
    waveform = waveform / (waveform.abs().max() + 1e-9)

    # Add the batch dimension expected by the model.
    logmel = pp.logmel(waveform).unsqueeze(0)

    return logmel


def predict(model, logmel, device=DEVICE):
    """Classify one input and return the winning class index with all probabilities.

    Args:
        model: Callable network that maps the input batch to per-class scores.
        logmel: Input tensor; only the first item of the batch is reported.
        device: Device the input is moved to before the forward pass.

    Returns:
        (idx, probabilities): index of the most probable class and the NumPy
        array of softmax probabilities for the first batch item.
    """
    batch = logmel.to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(batch)
        per_item = F.softmax(logits, dim=1).cpu().numpy()

    probabilities = per_item[0]
    idx = probabilities.argmax()
    return idx, probabilities

In [7]:
# Restore the saved model object and place it on DEVICE.
# NOTE(security): weights_only=False unpickles arbitrary Python objects, so only
# load checkpoints that come from a trusted source.
model = torch.load(SAVE_BEST_PATH, weights_only=False, map_location=DEVICE).to(DEVICE)
# Switch to evaluation mode; as the cell's last expression this also displays the module.
model.eval()

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): FusedMBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  

In [9]:
print("실시간 마이크 입력을 시작합니다.")

try:
    # Classify consecutive clips from the microphone until the kernel is interrupted.
    while True:
        # Start capturing one clip (NUM_SAMPLES frames) of mono float32 audio
        # from the default input device.
        recording = sd.rec(frames=NUM_SAMPLES,
                           samplerate=SAMPLE_RATE,
                           channels=1,
                           dtype="float32")
        sd.wait()  # block until the buffer has been filled completely

        # sd.rec yields (frames, channels); logmel_transform indexes the input as
        # (channels, samples), so flip the axes once the data is complete.
        audio_np = recording.T

        logmel = logmel_transform(audio_np)
        idx, probabilities = predict(model, logmel)

        print(f"{CLASS_NAME[idx]} | {probabilities[idx] * 100:.2f}%")

except KeyboardInterrupt as e:
    print("\n실시간 마이크 입력을 종료합니다.\n", e)

finally:
    # Stop any recording still in flight (e.g. an interrupt that arrived between
    # rec() and wait()) so the input device is released.
    sd.stop()

실시간 마이크 입력을 시작합니다.
non | 99.76%
non | 99.97%
non | 41.99%
tsunami | 92.59%
tsunami | 96.26%
tsunami | 96.34%
tsunami | 59.25%
non | 99.69%
non | 96.51%
non | 99.40%
non | 99.94%
non | 98.51%
non | 93.22%
fire | 99.08%
fire | 46.19%
fire | 98.42%
fire | 99.90%
fire | 89.45%
fire | 74.11%
fire | 83.64%
non | 98.44%
non | 99.82%
fire | 99.04%
fire | 77.22%
fire | 99.40%
fire | 99.76%
non | 99.15%
non | 99.49%
non | 90.87%
fire | 99.98%
fire | 99.27%
fire | 99.38%
fire | 53.11%
non | 99.94%
fire | 99.86%
fire | 97.92%
fire | 99.97%
non | 96.77%
non | 99.33%
non | 99.90%

실시간 마이크 입력을 종료합니다.
 
