In [1]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
            nn.BatchNorm2d(out_channels)
        ) if stride != 1 or in_channels != out_channels else None

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


In [3]:
class AudioCNN(nn.Module):
    def __init__(self, num_classes, conv1_channels=128, conv2_channels=256, fc1_out_features=256):
        super(AudioCNN, self).__init__()
        self.conv1 = ResidualBlock(1, conv1_channels)
        self.conv2 = ResidualBlock(conv1_channels, conv2_channels)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(conv2_channels, fc1_out_features)
        self.fc2 = nn.Linear(fc1_out_features, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = AudioCNN(num_classes=50, conv1_channels=128, conv2_channels=256, fc1_out_features=256)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

dummy_input = torch.randn(32, 1, 44, 14)

output = model(dummy_input.to(device))
print("Output size:", output.size())

Output size: torch.Size([32, 50])


In [4]:
learning_rate = 0.001
batch_size = 16
num_epochs = 35

In [6]:
class ESC50Dataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        mel_spec = torch.tensor(self.data[idx], dtype=torch.float32).unsqueeze(0)  # Add channel dimension
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return mel_spec, label

In [7]:
def load_esc50_data(base_path):
    data = []
    labels = []

    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.endswith(".wav"):
                file_path = os.path.join(root, file)
                label = int(file.split("-")[-1].split(".")[0])
                audio_data, _ = librosa.load(file_path, sr=None)
                mel_spec = librosa.feature.melspectrogram(y=audio_data, sr=44100)
                mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
                data.append(mel_spec_db)
                labels.append(label)

    return np.array(data), np.array(labels)

In [7]:
esc50_path = "./ESC-50-master"

data, labels = load_esc50_data(esc50_path)

split_ratio = 0.8
split_idx = int(len(data) * split_ratio)

train_data, train_labels = data[:split_idx], labels[:split_idx]
test_data, test_labels = data[split_idx:], labels[split_idx:]

train_dataset = ESC50Dataset(train_data, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)

test_dataset = ESC50Dataset(test_data, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)

In [8]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [9]:
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs.to(device), labels.to(device))
        loss.backward()
        optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            outputs = model(inputs.to(device))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(device)).sum().item()

    accuracy = correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Test Accuracy: {accuracy:.4f}')

torch.save(model.state_dict(), 'audio_cnn_model.pth')

NameError: name 'train_dataloader' is not defined

In [10]:
model = AudioCNN(num_classes=50, conv1_channels=128, conv2_channels=256, fc1_out_features=256)
model.load_state_dict(torch.load('./audio_cnn_model.pth', map_location=torch.device('cpu')))
model.eval()  

AudioCNN(
  (conv1): ResidualBlock(
    (conv1): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Sequential(
      (0): Conv2d(1, 128, kernel_size=(1, 1), stride=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (conv2): ResidualBlock(
    (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats

In [11]:
class_index_to_name = {
    0: 'airplane',
    1: 'breathing',
    2: 'brushing_teeth',
    3: 'can_opening',
    4: 'car_horn',
    5: 'cat',
    6: 'chainsaw',
    7: 'chirping_birds',
    8: 'church_bells',
    9: 'clapping',
    10: 'clock_alarm',
    11: 'clock_tick',
    12: 'coughing',
    13: 'cow',
    14: 'crackling_fire',
    15: 'crickets',
    16: 'crow',
    17: 'crying_baby',
    18: 'dog',
    19: 'door_wood_creaks',
    20: 'door_wood_knock',
    21: 'drinking_sipping',
    22: 'engine',
    23: 'fireworks',
    24: 'footsteps',
    25: 'frog',
    26: 'glass_breaking',
    27: 'hand_saw',
    28: 'helicopter',
    29: 'hen',
    30: 'insects',
    31: 'keyboard_typing',
    32: 'laughing',
    33: 'mouse_click',
    34: 'pig',
    35: 'pouring_water',
    36: 'rain',
    37: 'rooster',
    38: 'sea_waves',
    39: 'sheep',
    40: 'siren',
    41: 'sneezing',
    42: 'snoring',
    43: 'thunderstorm',
    44: 'toilet_flush',
    45: 'train',
    46: 'vacuum_cleaner',
    47: 'washing_machine',
    48: 'water_drops',
    49: 'wind'
}


In [16]:
import torchaudio
import torchaudio.transforms as T
import torch

def preprocess_audio(audio_path, num_mels=128, sample_rate=44100):
    waveform, _ = torchaudio.load(audio_path, normalize=True)

    mel_transform = T.MelSpectrogram(sample_rate=sample_rate, n_mels=num_mels)
    mel_spec = mel_transform(waveform)

    mel_spec_db = torchaudio.transforms.AmplitudeToDB()(mel_spec)
    mel_spec_db_scaled = torch.nn.functional.interpolate(mel_spec_db.unsqueeze(0), size=(44, 14)).squeeze(0)

    return mel_spec_db_scaled

def inference(model, audio_path, class_index_to_name):
    input_waveform = preprocess_audio(audio_path)

    input_waveform = input_waveform.unsqueeze(0)

    with torch.no_grad():
        output = model(input_waveform)

    _, predicted_class = torch.max(output, 1)

    predicted_class_name = class_index_to_name[predicted_class.item()]

    return predicted_class_name

audio_file_path = '/Users/mugi/course/2023Fall/Statistical-Learning/project/1-32318-A-0.wav'

predicted_class_name = inference(model, audio_file_path, class_index_to_name)

print(f"The predicted class for {audio_file_path} is: {predicted_class_name}")

The predicted class for /Users/mugi/course/2023Fall/Statistical-Learning/project/1-32318-A-0.wav is: mouse_click
