In [None]:
import torch
import torch.nn.functional as F
import preprocessor as pp
import sounddevice as sd # 마이크 입력을 위한 라이브러리

In [3]:
# Make device index 1 the sounddevice default, then list every audio device
# so the selection can be checked by eye in the cell output.
# NOTE(review): the index is machine-specific (in the recorded output below,
# 1 is the "ABKO MP3300" microphone) — confirm it on any other machine.
sd.default.device = 1
print(sd.query_devices())

   0 Microsoft 사운드 매퍼 - Input, MME (2 in, 0 out)
*  1 마이크(ABKO MP3300), MME (1 in, 0 out)
   2 마이크(Steam Streaming Microphone), MME (8 in, 0 out)
   3 Microsoft 사운드 매퍼 - Output, MME (0 in, 2 out)
   4 스피커(High Definition Audio Devic, MME (0 in, 6 out)
   5 스피커(Steam Streaming Speakers), MME (0 in, 8 out)
   6 스피커(Steam Streaming Microphone), MME (0 in, 8 out)
   7 Display(2- High Definition Audi, MME (0 in, 2 out)
   8 디지털 출력(High Definition Audio De, MME (0 in, 2 out)
   9 주 사운드 캡처 드라이버, Windows DirectSound (2 in, 0 out)
  10 마이크(ABKO MP3300), Windows DirectSound (1 in, 0 out)
  11 마이크(Steam Streaming Microphone), Windows DirectSound (8 in, 0 out)
  12 주 사운드 드라이버, Windows DirectSound (0 in, 2 out)
  13 스피커(High Definition Audio Device), Windows DirectSound (0 in, 6 out)
  14 스피커(Steam Streaming Speakers), Windows DirectSound (0 in, 8 out)
  15 스피커(Steam Streaming Microphone), Windows DirectSound (0 in, 8 out)
  16 Display(2- High Definition Audio Device), Windows DirectSound (0 in, 2 

In [9]:
# --- Checkpoint and labels -------------------------------------------------
# Path of the serialized model that the loading cell passes to torch.load.
SAVE_BEST_PATH = "./result/resnet50_best.pt"
# Class labels; the inference loop indexes this list with the predicted index.
CLASS_NAME = ["danger", "fire", "gas", "non", "tsunami"]
CLASS_NUM = len(CLASS_NAME)

# --- Audio capture ---------------------------------------------------------
SAMPLE_RATE = 16000  # passed to sd.rec as `samplerate`
DURATION = 1         # length of one capture window, in seconds
NUM_SAMPLE = int(SAMPLE_RATE * DURATION)  # frames recorded per window

# --- Hyperparameters -------------------------------------------------------
# Not referenced by the visible inference cells; presumably kept for parity
# with the training notebook — TODO confirm before removing.
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

# --- Compute device --------------------------------------------------------
# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [5]:
def predict(model, logmel, device=DEVICE):
    """Run one forward pass and return the top class with its probabilities.

    Args:
        model: Callable network. Its output is treated as logits with the
            class scores along dimension 1 (softmax is taken over ``dim=1``).
        logmel: Input features as a NumPy array or a ``torch.Tensor``. The
            data is handed to ``model`` without reshaping or dtype changes,
            so it must already have the layout the model expects
            (presumably batched log-mel features — confirm against
            ``pp.logmel``).
        device: Device the features are moved to before the forward pass.
            It must be the device the model's weights live on.

    Returns:
        tuple: ``(idx, probabilities)`` where ``probabilities`` is a NumPy
        array holding the softmax scores of the first batch item and ``idx``
        is the position of its largest entry.
    """
    # torch.as_tensor wraps an ndarray without copying (same as
    # torch.from_numpy) and, unlike from_numpy, also accepts features that
    # are already a tensor instead of raising TypeError.
    features = torch.as_tensor(logmel).to(device)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(features)
        # Only the first batch item is reported; any further rows are ignored.
        probabilities = F.softmax(outputs, dim=1).cpu().numpy()[0]

    idx = probabilities.argmax()
    return idx, probabilities

In [7]:
# Restore the full pickled model object onto DEVICE.
# SECURITY NOTE(review): weights_only=False lets torch.load unpickle arbitrary
# Python objects, which can execute code contained in the file. Only load
# checkpoints from a trusted source.
model = torch.load(
    SAVE_BEST_PATH,
    map_location=DEVICE,
    weights_only=False,
).to(DEVICE)

# Switch to evaluation mode. Left as the last expression so the notebook
# displays the module structure as the cell output.
model.eval()

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [10]:
# Real-time inference loop: record one fixed-length window from the default
# input device, turn it into log-mel features, classify it and print the top
# class with its probability. Runs until the kernel is interrupted.
print("실시간 마이크 입력을 시작합니다.")

try:
    while True:
        # blocking=True makes rec() return only once the whole window has
        # been captured, so the buffer is never touched while it is still
        # being filled and no separate sd.wait() call is required.
        recording = sd.rec(frames=NUM_SAMPLE,
                           samplerate=SAMPLE_RATE,
                           channels=1,
                           dtype="float32",
                           blocking=True)
        # Swap the two axes of the recording: sounddevice documents the
        # result as (frames, channels), so this yields (channels, frames).
        audio_np = recording.transpose(1, 0)

        logmel = pp.logmel(audio=audio_np, num_samples=NUM_SAMPLE)
        idx, probabilities = predict(model, logmel)

        print(f"{CLASS_NAME[idx]} | {probabilities[idx] * 100:.2f}%")

except KeyboardInterrupt as e:
    print("\n실시간 마이크 입력을 종료합니다.\n", e)
finally:
    # Ensure no background recording is left running, whichever statement
    # the interrupt (or any other error) arrived in.
    sd.stop()

실시간 마이크 입력을 시작합니다.
non | 99.75%
non | 98.92%
non | 99.87%
non | 99.84%
non | 99.49%
non | 99.81%
non | 99.83%
non | 98.91%
fire | 96.96%
fire | 97.45%
tsunami | 71.11%
tsunami | 62.15%
non | 88.42%
non | 99.85%
non | 99.49%
non | 99.72%
non | 99.74%

실시간 마이크 입력을 종료합니다.
 
