In [1]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
            nn.BatchNorm2d(out_channels)
        ) if stride != 1 or in_channels != out_channels else None

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


In [3]:
class AudioCNN(nn.Module):
    def __init__(self, num_classes, conv1_channels=128, conv2_channels=256, fc1_out_features=256, num_residual_blocks=3):
        super(AudioCNN, self).__init__()
        self.conv1 = ResidualBlock(1, conv1_channels)
        self.conv2 = ResidualBlock(conv1_channels, conv2_channels)
        # Add more residual blocks
        self.res_blocks = nn.ModuleList([ResidualBlock(conv2_channels, conv2_channels) for _ in range(num_residual_blocks)])
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(conv2_channels, fc1_out_features)
        self.fc2 = nn.Linear(fc1_out_features, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        for block in self.res_blocks:
            x = block(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


model = AudioCNN(
    num_classes=50,
    conv1_channels=128,
    conv2_channels=256,
    fc1_out_features=256,
    num_residual_blocks=3 
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

dummy_input = torch.randn(32, 1, 44, 14)

output = model(dummy_input.to(device))
print("Output size:", output.size())


Output size: torch.Size([32, 50])


In [4]:
learning_rate = 0.001
batch_size = 16
num_epochs = 35

In [5]:
class ESC50Dataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        mel_spec = torch.tensor(self.data[idx], dtype=torch.float32).unsqueeze(0)  
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return mel_spec, label

In [6]:
def load_esc50_data(base_path):
    data = []
    labels = []

    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.endswith(".wav"):
                file_path = os.path.join(root, file)
                label = int(file.split("-")[-1].split(".")[0])
                audio_data, _ = librosa.load(file_path, sr=None)
                mel_spec = librosa.feature.melspectrogram(y=audio_data, sr=44100)
                mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
                data.append(mel_spec_db)
                labels.append(label)

    return np.array(data), np.array(labels)

In [7]:
esc50_path = "./ESC-50-master"

data, labels = load_esc50_data(esc50_path)

split_ratio = 0.8
split_idx = int(len(data) * split_ratio)

train_data, train_labels = data[:split_idx], labels[:split_idx]
test_data, test_labels = data[split_idx:], labels[split_idx:]

train_dataset = ESC50Dataset(train_data, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)

test_dataset = ESC50Dataset(test_data, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)

In [8]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [9]:
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs.to(device), labels.to(device))
        loss.backward()
        optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            outputs = model(inputs.to(device))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(device)).sum().item()

    accuracy = correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Test Accuracy: {accuracy:.4f}')

torch.save(model.state_dict(), 'audio_cnn_model.pth')

Epoch 1/35, Loss: 3.3013, Test Accuracy: 0.0675
Epoch 2/35, Loss: 3.2468, Test Accuracy: 0.0550
Epoch 3/35, Loss: 3.0744, Test Accuracy: 0.1375
Epoch 4/35, Loss: 2.9996, Test Accuracy: 0.1475
Epoch 5/35, Loss: 3.1119, Test Accuracy: 0.2050
Epoch 6/35, Loss: 2.5650, Test Accuracy: 0.1975
Epoch 7/35, Loss: 2.3747, Test Accuracy: 0.2700
Epoch 8/35, Loss: 1.9242, Test Accuracy: 0.2100
Epoch 9/35, Loss: 1.8662, Test Accuracy: 0.2025
Epoch 10/35, Loss: 2.4055, Test Accuracy: 0.2525
Epoch 11/35, Loss: 2.6771, Test Accuracy: 0.2425
Epoch 12/35, Loss: 2.0656, Test Accuracy: 0.2675
Epoch 13/35, Loss: 1.4134, Test Accuracy: 0.2250
Epoch 14/35, Loss: 1.7206, Test Accuracy: 0.3100
Epoch 15/35, Loss: 1.9509, Test Accuracy: 0.2350
Epoch 16/35, Loss: 1.2457, Test Accuracy: 0.3350
Epoch 17/35, Loss: 1.1802, Test Accuracy: 0.3275
Epoch 18/35, Loss: 1.4648, Test Accuracy: 0.3775
Epoch 19/35, Loss: 1.6516, Test Accuracy: 0.3525
Epoch 20/35, Loss: 1.7162, Test Accuracy: 0.2450
Epoch 21/35, Loss: 1.9048, Te

In [31]:
model = AudioCNN(
    num_classes=50,
    conv1_channels=128,
    conv2_channels=256,
    fc1_out_features=256,
    num_residual_blocks=3
)

model.load_state_dict(torch.load('2_audio_cnn_model_finetuned.pth'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs.to(device), labels.to(device))
        loss.backward()
        optimizer.step()

    torch.save(model.state_dict(), f'audio_cnn_model_epoch_{epoch+1}.pth')

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            outputs = model(inputs.to(device))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(device)).sum().item()

    accuracy = correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Test Accuracy: {accuracy:.4f}')

torch.save(model.state_dict(), 'audio_cnn_model_final.pth')

Epoch 1/5, Loss: 0.3217, Test Accuracy: 0.6225
Epoch 2/5, Loss: 0.8038, Test Accuracy: 0.6100
Epoch 3/5, Loss: 0.9560, Test Accuracy: 0.6325
Epoch 4/5, Loss: 0.8010, Test Accuracy: 0.6250
Epoch 5/5, Loss: 0.1838, Test Accuracy: 0.6350


In [7]:
class_index_to_name = {
    0: 'airplane',
    1: 'breathing',
    2: 'brushing_teeth',
    3: 'can_opening',
    4: 'car_horn',
    5: 'cat',
    6: 'chainsaw',
    7: 'chirping_birds',
    8: 'church_bells',
    9: 'clapping',
    10: 'clock_alarm',
    11: 'clock_tick',
    12: 'coughing',
    13: 'cow',
    14: 'crackling_fire',
    15: 'crickets',
    16: 'crow',
    17: 'crying_baby',
    18: 'dog',
    19: 'door_wood_creaks',
    20: 'door_wood_knock',
    21: 'drinking_sipping',
    22: 'engine',
    23: 'fireworks',
    24: 'footsteps',
    25: 'frog',
    26: 'glass_breaking',
    27: 'hand_saw',
    28: 'helicopter',
    29: 'hen',
    30: 'insects',
    31: 'keyboard_typing',
    32: 'laughing',
    33: 'mouse_click',
    34: 'pig',
    35: 'pouring_water',
    36: 'rain',
    37: 'rooster',
    38: 'sea_waves',
    39: 'sheep',
    40: 'siren',
    41: 'sneezing',
    42: 'snoring',
    43: 'thunderstorm',
    44: 'toilet_flush',
    45: 'train',
    46: 'vacuum_cleaner',
    47: 'washing_machine',
    48: 'water_drops',
    49: 'wind'
}


In [None]:
import torch
from torchvision import transforms
from scipy.io import wavfile
import numpy as np

def preprocess_audio(file_path):
    audio_data, _ = librosa.load(file_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=audio_data, sr=44100)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    mel_spec_db = torch.tensor(mel_spec_db, dtype=torch.float32)
    mel_spec_db = mel_spec_db.unsqueeze(0).unsqueeze(0)
    
    return mel_spec_db

model = AudioCNN(
    num_classes=50,
    conv1_channels=128,
    conv2_channels=256,
    fc1_out_features=256,
    num_residual_blocks=3
)

model.load_state_dict(torch.load('audio_cnn_model_final.pth'))

model.eval()

audio_file_path = './1-32318-A-0.wav'

with torch.no_grad():
    input_data = preprocess_audio(audio_file_path)
    output_probs = model(input_data)

predicted_class_index = torch.argmax(output_probs).item()

predicted_class_name = class_index_to_name.get(predicted_class_index, "Unknown")

print(f"The predicted class is: {predicted_class_name}")