In [13]:
import torch
import torch.nn.functional as F
import Preprocessor as pp
import sounddevice as sd # 마이크 입력을 위한 라이브러리

In [14]:
# Make device index 0 sounddevice's default. A single value is applied to both
# the input and the output side. Device indices depend on the hardware that is
# attached, so check the choice against the table printed below.
sd.default.device = 0
# Show every audio device PortAudio can see, after the default has been set.
print(sd.query_devices())

* 0 ABKO MP3300: USB Audio (hw:2,0), ALSA (1 in, 0 out)
  1 pulse, ALSA (32 in, 32 out)
  2 default, ALSA (32 in, 32 out)


In [15]:
# --- Model and labels --------------------------------------------------------
# Checkpoint path handed to torch.load when the model is restored.
SAVE_BEST_PATH = "model/resnet50_best.pt"

# Display label per class index. Four of the five indices share "alarm".
CLASS_NAME = [
    "alarm",  # index 0
    "alarm",  # index 1
    "alarm",  # index 2
    "non",    # index 3
    "alarm",  # index 4
]
CLASS_NUM = len(CLASS_NAME)

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Audio capture -----------------------------------------------------------
SAMPLE_RATE = 16000  # sample rate passed to the recorder
DURATION = 0.5  # length of one capture window, in seconds
NUM_SAMPLE = int(SAMPLE_RATE * DURATION)  # frames per capture window

# --- Other settings ----------------------------------------------------------
# Not referenced by the cells visible here; presumably carried over from
# training -- confirm before removing.
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

In [16]:
def predict(model, logmel, device=DEVICE):
    """Run a single forward pass and report the top-scoring class.

    Args:
        model: Network invoked as ``model(tensor)``.
        logmel: NumPy array converted with ``torch.from_numpy``. Assumed to be
            a batched log-mel input whose layout matches what ``model``
            expects -- TODO confirm against ``pp.logmel``.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        tuple: ``(idx, probabilities)``. ``probabilities`` is a NumPy array
        holding the softmax scores of the first item in the batch, and
        ``idx`` is the position of its largest entry.
    """
    batch = torch.from_numpy(logmel).to(device)

    # Inference only, so autograd tracking is switched off.
    with torch.no_grad():
        logits = model(batch)
        scores = torch.softmax(logits, dim=1)
        # Only the first row of the batch is reported back.
        probabilities = scores.cpu().numpy()[0]

    idx = probabilities.argmax()
    return idx, probabilities

In [17]:
# SECURITY: weights_only=False lets torch.load unpickle arbitrary Python
# objects, so SAVE_BEST_PATH must only ever point at a trusted checkpoint.
model = torch.load(
    SAVE_BEST_PATH,
    map_location=DEVICE,
    weights_only=False,
).to(DEVICE)
# Switch to evaluation mode. As the last expression of the cell, this also
# displays the module's repr.
model.eval()

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [18]:
print("실시간 마이크 입력을 시작합니다.")

try:
    # Capture and classify one window after another until the kernel is
    # interrupted. A new recording only starts after the previous window has
    # been classified, so audio arriving during inference is not captured.
    while True:
        # blocking=True makes rec() return only once the buffer is completely
        # filled, so the array is never touched while it is still being
        # written (previously it was transposed before sd.wait() was called).
        recording = sd.rec(
            frames=NUM_SAMPLE,
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocking=True,
        )
        # Swap the two axes: (frames, channels) -> (channels, frames).
        audio_np = recording.transpose(1, 0)

        logmel = pp.logmel(audio=audio_np, num_samples=NUM_SAMPLE)
        idx, probabilities = predict(model, logmel)

        # NOTE: the percentage is the softmax score of the single top class.
        # Several class indices share the label "alarm", so this is not the
        # combined probability of all "alarm" classes.
        print(f"{CLASS_NAME[idx]} | {probabilities[idx] * 100:.2f}%")

except KeyboardInterrupt as e:
    # Interrupting the kernel is the intended way to stop the loop.
    print("\n실시간 마이크 입력을 종료합니다.\n", e)

실시간 마이크 입력을 시작합니다.
non | 99.04%
non | 99.67%
non | 99.02%
non | 99.83%
non | 99.03%
non | 99.62%
non | 98.41%
non | 99.13%
non | 99.61%
non | 96.17%
non | 99.68%
non | 99.85%
non | 98.91%
non | 99.09%
non | 99.95%
non | 98.79%
non | 98.61%
non | 96.71%
non | 97.28%
non | 99.46%
non | 99.00%
non | 98.70%
non | 54.22%
alarm | 86.46%
non | 79.27%
non | 93.50%
alarm | 80.19%
alarm | 99.62%
alarm | 63.46%
alarm | 93.75%
alarm | 98.53%
alarm | 91.42%
alarm | 99.46%
alarm | 99.15%
alarm | 60.87%
alarm | 79.11%
alarm | 78.46%
alarm | 97.90%
alarm | 97.95%
alarm | 87.79%
alarm | 99.88%
alarm | 82.33%
alarm | 45.55%
non | 53.61%
non | 97.97%
non | 99.53%
non | 97.65%
non | 99.76%
non | 99.47%
non | 99.36%
non | 69.50%
alarm | 77.15%
alarm | 92.45%
alarm | 58.78%
alarm | 77.19%
alarm | 73.07%
non | 96.25%
non | 98.16%
non | 99.71%
non | 99.43%
alarm | 75.29%
alarm | 88.44%
alarm | 70.89%
alarm | 72.75%
non | 99.85%
non | 97.40%
non | 99.87%
non | 92.57%
non | 82.07%
non | 60.47%
non | 98.66%
non 