In [1]:
# Importing modules
import glob
import torch
import torchaudio
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim

In [2]:
# Custom dataset
class datasets(Dataset):
    """Map-style dataset over ``.wav`` files whose label is encoded in the file name.

    Every path matching ``f"{path}*.wav"`` is collected. The integer label is
    the text between the last ``-`` and the following ``.`` in the path, e.g.
    ``"train_dir/1-100-A-3.wav"`` -> ``3``.

    Args:
        path: Prefix placed directly in front of ``*.wav``. Pass a directory
            with a trailing separator (``"train_dir/"``) to match the files
            inside that directory.

    Attributes:
        items: List of ``(filename, label)`` tuples, sorted by filename.
        length: Number of items.

    Raises:
        ValueError: If a matched file name does not end in ``-<int>.wav``.
    """

    def __init__(self, path):
        # glob returns matches in arbitrary, filesystem-dependent order; sort so
        # that a given index always refers to the same file across runs/machines.
        files = sorted(glob.glob(f"{path}*.wav"))
        self.items = [(f, int(f.split("-")[-1].split(".")[0])) for f in files]
        self.length = len(self.items)

    def __getitem__(self, index):
        """Load one clip and return ``(audio_tensor, label_tensor)``.

        ``audio_tensor`` is the waveform returned by ``torchaudio.load`` and
        ``label_tensor`` is a 0-d integer tensor holding the class index.
        """
        filename, label = self.items[index]
        # The sample rate returned by torchaudio.load is not used.
        # NOTE(review): default DataLoader batching stacks the waveforms, which
        # presumably requires every clip to have the same shape - confirm.
        audio_tensor, _ = torchaudio.load(filename)
        label_tensor = torch.tensor(label)
        return audio_tensor, label_tensor

    def __len__(self):
        """Return the number of ``(filename, label)`` items."""
        return self.length

In [3]:
# Datasets
train_dataset = datasets("train_dir/")
val_dataset = datasets("val_dir/")

# Dataloaders
BATCH_SIZE = 32

# Only the training data is shuffled. Validation is just a read-only pass over
# the data, so a fixed order keeps the evaluation repeatable from epoch to epoch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [4]:
# AudioNet Model
class AudioNet(nn.Module):
    """1-D convolutional classifier mapping a waveform to class log-probabilities.

    The network is four Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d stages,
    followed by global average pooling over time, one linear layer, and a
    log-softmax over the class dimension.

    Args:
        num_classes: Size of the output (number of classes). Defaults to 50.
        in_channels: Number of channels in the input waveform. Defaults to 1.

    Shape:
        - Input: ``(batch, in_channels, time)``. ``time`` must be long enough to
          survive the strided first convolution and four 4x poolings (about
          1.8k samples at minimum).
        - Output: ``(batch, num_classes)`` log-probabilities; pair with
          ``nn.NLLLoss``.
    """

    def __init__(self, num_classes=50, in_channels=1):
        super().__init__()
        # Stage 1: wide kernel (80) with stride 4 on the raw waveform.
        self.conv1 = nn.Conv1d(in_channels, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(4)
        # Stages 2-4: kernel-3 convolutions, each followed by 4x max pooling.
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.MaxPool1d(4)
        # Collapse whatever time dimension remains to a single value per channel.
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(512, num_classes)
        # Built once here rather than on every forward call. It has no
        # parameters, so state_dict keys are unaffected.
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return ``(batch, num_classes)`` log-probabilities for waveform batch ``x``."""
        stages = (
            (self.conv1, self.bn1, self.relu1, self.pool1),
            (self.conv2, self.bn2, self.relu2, self.pool2),
            (self.conv3, self.bn3, self.relu3, self.pool3),
            (self.conv4, self.bn4, self.relu4, self.pool4),
        )
        for conv, bn, relu, pool in stages:
            x = pool(relu(bn(conv(x))))
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)  # (batch, 512, 1) -> (batch, 512)
        x = self.fc1(x)
        return self.log_softmax(x)

In [5]:
# Model
model = AudioNet()
model = model.to(torch.device("cpu"))

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Loss function
# AudioNet.forward ends in LogSoftmax, i.e. it already returns log-probabilities.
# NLLLoss is the criterion that consumes log-probabilities directly;
# CrossEntropyLoss expects raw logits and would apply log-softmax a second time.
criterion = nn.NLLLoss()

In [6]:
# Training
NUM_EPOCHS = 10
device = torch.device("cpu")  # same device the model was moved to

for epoch in range(1, NUM_EPOCHS + 1):

    # Training loop
    model.train()
    for X, y in train_dataloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        y_hat = model(X)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()

    # Validation loop
    val_loss_sum = 0.0
    val_samples = 0
    model.eval()
    with torch.no_grad():
        for X, y in val_dataloader:
            X, y = X.to(device), y.to(device)
            y_hat = model(X)
            loss = criterion(y_hat, y)
            # The criterion is built with default arguments, so it returns the
            # mean over the batch. Multiply by the batch size to recover the
            # summed loss, so a smaller final batch is not over-weighted.
            batch_size = y.size(0)
            val_loss_sum += loss.item() * batch_size
            val_samples += batch_size

    # Logging: per-sample mean validation loss (NaN if nothing was evaluated).
    val_loss = val_loss_sum / val_samples if val_samples else float("nan")
    print(f"Epoch: {epoch} | Validation loss: {val_loss}")

Epoch: 1 | Validation loss: 3.9388858721806455
Epoch: 2 | Validation loss: 4.003377125813411
Epoch: 3 | Validation loss: 3.701098295358511
Epoch: 4 | Validation loss: 3.356357849561251
Epoch: 5 | Validation loss: 3.1326538966252255
Epoch: 6 | Validation loss: 3.059098848929772
Epoch: 7 | Validation loss: 3.07684326171875
Epoch: 8 | Validation loss: 2.9576371082892785
Epoch: 9 | Validation loss: 2.9602874792539158
Epoch: 10 | Validation loss: 3.208728606884296
