In [9]:
import torch
import torch.nn.functional as F
import numpy as np
import Preprocessor as pp
import sounddevice as sd # 마이크 입력을 위한 라이브러리

In [10]:
# Make device index 1 sounddevice's default device. The index is specific to
# this machine: in the listing printed below it is the "ABKO MP3300" microphone
# (MME host API). Re-check the listing whenever the audio hardware changes.
sd.default.device = 1
# Print every audio device sounddevice can see so the chosen index can be verified.
print(sd.query_devices())

   0 Microsoft 사운드 매퍼 - Input, MME (2 in, 0 out)
*  1 마이크(ABKO MP3300), MME (1 in, 0 out)
   2 마이크(Steam Streaming Microphone), MME (8 in, 0 out)
   3 Microsoft 사운드 매퍼 - Output, MME (0 in, 2 out)
   4 스피커(High Definition Audio Devic, MME (0 in, 6 out)
   5 스피커(Steam Streaming Speakers), MME (0 in, 8 out)
   6 스피커(Steam Streaming Microphone), MME (0 in, 8 out)
   7 Display(2- High Definition Audi, MME (0 in, 2 out)
   8 디지털 출력(High Definition Audio De, MME (0 in, 2 out)
   9 주 사운드 캡처 드라이버, Windows DirectSound (2 in, 0 out)
  10 마이크(ABKO MP3300), Windows DirectSound (1 in, 0 out)
  11 마이크(Steam Streaming Microphone), Windows DirectSound (8 in, 0 out)
  12 주 사운드 드라이버, Windows DirectSound (0 in, 2 out)
  13 스피커(High Definition Audio Device), Windows DirectSound (0 in, 6 out)
  14 스피커(Steam Streaming Speakers), Windows DirectSound (0 in, 8 out)
  15 스피커(Steam Streaming Microphone), Windows DirectSound (0 in, 8 out)
  16 Display(2- High Definition Audio Device), Windows DirectSound (0 in, 2 

In [11]:
# Checkpoint file holding the trained model that this notebook loads.
SAVE_BEST_PATH = "./result/resnet152_best.pt"
# Class labels; a predicted class index is turned into a label via this list.
CLASS_NAME = ["danger", "fire", "gas", "non", "tsunami"]
# Run on CUDA when it is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters
SAMPLE_RATE = 16_000  # used as the microphone recording sample rate
DURATION = 1  # length of one analysis window (presumably in seconds)
NUM_SAMPLE = DURATION * SAMPLE_RATE  # number of samples in one window
CLASS_NUM = len(CLASS_NAME)
# The two values below are not referenced by the visible inference cells;
# presumably carried over from the training setup.
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

In [12]:
# def logmel_transform(audio_np):
#     # numpy 배열을 torch 텐서로 변환, 배치 차원 추가: (1, num_samples)
#     waveform = torch.tensor(audio_np, dtype=torch.float32)

#     # 2채널 이상일때 1채널로 변환
#     if waveform.shape[0] > 1:
#         waveform = waveform.mean(axis=-1)
    
#     # 오디오 샘플의 길이조정
#     if waveform.shape[1] < NUM_SAMPLES: # 길이가 부족하면 0(무음)을 채워 길이를 연장
#         waveform = F.pad(waveform, (0, NUM_SAMPLES - waveform.shape[1])) # num_samples와 현재의 길이의 차 만큼 0을 패딩
#     else:
#         waveform = waveform[:, :NUM_SAMPLES] # 길이가 길면 슬라이싱

#     # 절댓값 정규화(-1 ~ 1)
#     waveform = waveform / (waveform.abs().max() + 1e-9)

#     logmel = pp.logmel(waveform).unsqueeze(0) # [batch, channel, mel_bins, time]

#     return logmel


def predict(model, logmel, device=DEVICE):
    """Classify one input and return the winning class index with its probabilities.

    Args:
        model: Callable network, invoked as ``model(logmel)``. Its output must
            have the class scores along dimension 1 (softmax is taken there).
        logmel: Input features as a NumPy array or a ``torch.Tensor``. The
            leading dimension is treated as the batch; only the first item of
            the batch is reported. (Exact feature layout is whatever ``model``
            expects -- not checked here.)
        device: Device the input is moved to before the forward pass. The model
            is assumed to already live on this device.

    Returns:
        tuple: ``(idx, probabilities)`` where ``probabilities`` is a 1-D NumPy
        array of softmax probabilities for the first batch item and ``idx`` is
        the NumPy integer index of its largest entry.
    """
    # as_tensor wraps an ndarray without copying (same as from_numpy) and also
    # passes an existing tensor straight through, so either input type works.
    features = torch.as_tensor(logmel).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(features)
        # [0] selects the first (and, in this notebook, only) batch item.
        probabilities = F.softmax(outputs, dim=1).cpu().numpy()[0]
        idx = probabilities.argmax()

    return idx, probabilities

In [13]:
# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. Only load checkpoints from a trusted source.
# The checkpoint holds the whole model object, which is why the result can be
# moved to DEVICE and used directly.
model = torch.load(SAVE_BEST_PATH, weights_only=False, map_location=DEVICE).to(DEVICE)
# Switch to evaluation mode for inference. Kept as the cell's last expression
# so the notebook still displays the module structure.
model.eval()

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [14]:
print("실시간 마이크 입력을 시작합니다.")

try:
    # Real-time microphone loop: record one window, classify it, print the
    # result, and repeat until the user interrupts the kernel.
    while True:
        # Record one window from the microphone.
        # Fix: the original passed `frames=TIME`, a name that is never defined
        # in this notebook (NameError on a fresh kernel). NUM_SAMPLE is the
        # one-window sample count and matches the value given to pp.logmel.
        audio_np = sd.rec(frames=NUM_SAMPLE,  # samples to record (one window)
                          samplerate=SAMPLE_RATE,  # sampling rate
                          channels=1,  # mono
                          dtype="float32")
        sd.wait()  # sd.rec returns immediately; block until the buffer is filled

        # sd.rec yields (frames, channels); swap the axes to (channels, frames)
        # only after the recording has finished.
        audio_np = audio_np.transpose(1, 0)

        logmel = pp.logmel(audio=audio_np, num_samples=NUM_SAMPLE)
        idx, probabilities = predict(model, logmel)

        print(f"{CLASS_NAME[idx]} | {probabilities[idx] * 100:.2f}%")

except KeyboardInterrupt as e:
    print("\n실시간 마이크 입력을 종료합니다.\n", e)

finally:
    # Make sure an in-flight recording does not keep running in the background
    # after the loop exits (interrupt or error).
    sd.stop()

실시간 마이크 입력을 시작합니다.
non | 98.11%
non | 99.99%
non | 99.03%
non | 99.17%
non | 99.79%
non | 99.46%
non | 99.46%
non | 99.50%
non | 99.87%
non | 96.06%
non | 99.12%
non | 98.98%
non | 99.35%
non | 99.84%
non | 96.82%
non | 99.41%
danger | 69.53%
danger | 53.99%
fire | 60.15%
tsunami | 75.83%
tsunami | 75.96%
non | 99.71%
fire | 79.13%
fire | 87.20%
non | 53.75%
fire | 56.35%
non | 99.36%
non | 80.59%
non | 52.24%
non | 82.80%
non | 81.99%
non | 57.75%
non | 99.92%
non | 83.29%
non | 73.68%
fire | 99.24%
gas | 53.03%
fire | 83.99%
non | 60.60%
fire | 73.93%
non | 99.79%
fire | 99.78%
fire | 99.91%
fire | 99.84%
fire | 45.31%
fire | 92.52%
fire | 99.84%
fire | 98.53%
non | 99.08%
non | 62.98%
tsunami | 76.47%
fire | 85.82%
tsunami | 96.77%
fire | 87.32%
non | 91.90%
danger | 72.73%
danger | 73.06%
non | 99.59%
non | 84.69%
danger | 74.20%
non | 99.36%
danger | 98.32%
non | 49.96%
non | 99.85%
non | 98.39%
non | 99.49%
non | 65.05%
non | 81.62%
danger | 54.64%
tsunami | 52.84%
tsunami | 95.2