In [1]:
import zipfile
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchsummary import summary
import torchvision
from torchvision import datasets, transforms as T
from efficientnet_pytorch import EfficientNet
import os
import torch.optim as optim
from PIL import ImageFile
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import copy
import plotly_express as px

#Checking the availability of a GPU
use_cuda = torch.cuda.is_available()

Load Data From Disk and Hyperparameters:

In [2]:
featuresdf = pd.read_pickle('LogGC_224_comp3.pkl')
img_height = len(featuresdf['feature'][0])
img_width = img_height

batch_size = 32
n_epochs = 30
train_base = True
optimizer_name = "Adam"
SGD_momentum = 0.9
learning_rate = 0.0015

Image Transforms:

In [3]:
class MyAddGaussNoise(object):
    def __init__(self, input_size, mean=0.0, std=None, add_noise_probability=1.0):
        assert isinstance(input_size, (int, tuple))
        assert isinstance(mean, (int, float))
        assert isinstance(std, (int, float)) or std is None
        assert isinstance(add_noise_probability, (float))


        if isinstance(input_size, int):
            self.input_size = (input_size, input_size)
        else:
            assert len(input_size) == 2
            self.input_size = input_size

        self.mean = mean

        if std is not None:
            assert std > 0.0
            self.std = std
        else:
            self.std = std

        assert add_noise_probability > 0.0 and add_noise_probability <= 1.0
        self.add_noise_prob = add_noise_probability


    def __call__(self, spectrogram):
        if np.random.random() > self.add_noise_prob:
            return spectrogram

        # set some std value 
        min_pixel_value = np.min(spectrogram)
        if self.std is None:
            std_factor = 0.03     # factor number 
        std = np.abs(min_pixel_value*std_factor)

        # generate a white noise spectrogram
        gauss_mask = np.random.normal(self.mean, 
                                    std, 
                                    size=self.input_size).astype('float32')
        
        # add white noise to the sound spectrogram
        noisy_visual = spectrogram + gauss_mask

        return noisy_visual

class MyRightShift(object):
    def __init__(self, input_size, width_shift_range, shift_probability=1.0):
        assert isinstance(input_size, (int, tuple))
        assert isinstance(width_shift_range, (int, float))
        assert isinstance(shift_probability, (float))

        if isinstance(input_size, int):
            self.input_size = (input_size, input_size)
        else:
            assert len(input_size) == 2
            self.input_size = input_size

        if isinstance(width_shift_range, int):
            assert width_shift_range > 0
            assert width_shift_range <= self.input_size[1]
            self.width_shift_range = width_shift_range
        else:
            assert width_shift_range > 0.0
            assert width_shift_range <= 1.0
            self.width_shift_range = int(width_shift_range * self.input_size[1])
                        
        assert shift_probability > 0.0 and shift_probability <= 1.0
        self.shift_prob = shift_probability

    def __call__(self, image):
        if np.random.random() > self.shift_prob:
            return image

        # create a new array filled with the min value
        shifted_image= np.full(self.input_size, np.min(image), dtype='float32')

        # randomly choose a start postion
        rand_position = np.random.randint(1, self.width_shift_range)

        # shift the image
        shifted_image[:,rand_position:] = copy.deepcopy(image[:,:-rand_position])

        return shifted_image

#applying required transformations on the dataset
img_transforms = {
    'train':
    T.Compose([
        MyAddGaussNoise(input_size = img_height,add_noise_probability=0.5),
        MyRightShift(input_size = img_height, width_shift_range=0.9, shift_probability=0.5),
        T.ToTensor(),
        T.Normalize([0.5],[0.5])
        ]),

    'valid':
    T.Compose([
        T.ToTensor(),
        T.Normalize([0.5], [0.5])
        ]),

    'test':
    T.Compose([
        T.ToTensor(),
        T.Normalize([0.5], [0.5])
        ]),
     }

Custom Dataset Object (UrbanSound8kDataset):

In [4]:
class UrbanSound8kDataset(Dataset):
    def __init__(self, featuresdf, transform=None):
        assert isinstance(featuresdf, pd.DataFrame)
        assert len(featuresdf.columns) == 3
        
        self.transform = transform

        self.featuresdf = featuresdf

    def __len__(self):
        return len(self.featuresdf)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        cochleagram, label, fold = self.featuresdf.iloc[index]

        if self.transform is not None:
           cochleagram = self.transform(cochleagram)

        if not torch.is_tensor(cochleagram):
            cochleagram = torch.as_tensor(cochleagram.astype('float'))

        label = torch.as_tensor(np.array(label)).type(torch.LongTensor)

        cochleagram = cochleagram.expand(3,-1,-1).float()

        return cochleagram, label

Model Architecture:

In [5]:
#importing the pretrained EfficientNet model

model_transfer = EfficientNet.from_pretrained('efficientnet-b0')

# Freeze weights
for param in model_transfer.parameters():
    param.requires_grad = train_base
in_features = model_transfer._fc.in_features

# Defining Dense top layers after the convolutional layers
model_transfer._fc = nn.Sequential(   
    nn.Linear(in_features, 10),
    )

# selecting loss function
criterion_transfer = nn.CrossEntropyLoss()

#if scheduler_toggle == True:
#    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

#using Adam classifier
if optimizer_name == 'Adam':
    optimizer_transfer = optim.Adam(model_transfer.parameters(), lr=learning_rate)

if optimizer_name == 'SGD':
    optimizer_transfer = optim.SGD(model_transfer.parameters(), lr=learning_rate, momentum=SGD_momentum)

if use_cuda:
    model_transfer = model_transfer.cuda()

model_transfer

Loaded pretrained weights for efficientnet-b0


EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d((0, 1, 0, 1))
  )
  (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d((1, 1, 1, 1))
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
        32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False
    

Train Loop:

In [6]:
# Creating the function for training
def train_model(n_epochs, loaders, model, optimizer, criterion, save_path):
    """returns trained model"""
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf 
    trainingloss = []
    validationloss = []

    for epoch in range(1, n_epochs+1):
        # initialize the variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        correct = 0.0
        total = 0.0

        preds = []
        targets = []
        
        ###################
        # training the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):     
            if use_cuda:
                data, target = data.cuda(), target.cuda()   
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            #scheduler.step()
           
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
    
        ######################    
        # validating the model #
        ######################
        model.eval()
        for batch_idx, (data, target) in enumerate(loaders['valid']):
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            output = model(data)
            loss = criterion(output, target)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.data - valid_loss))
            pred = output.data.max(1,keepdim=True)[1]
            preds.append(pred)
            targets.append(target)
            # compare predictions
            correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
            total += data.size(0)
        
        train_loss = train_loss/len(train_ds)
        valid_loss = valid_loss/len(val_ds)
        valid_acc = correct / total
        current_lr = optimizer.param_groups[0]['lr']

        trainingloss.append(train_loss)
        validationloss.append(valid_loss)

        # printing training/validation statistics 
        print('Epoch: {} \nTraining Loss: {:.6f} \nValidation Loss: {:.6f} \nValidation Accuracy: {:.6f} \nCorrect: {} / {} \nCurrent LR: {}'.format(
            epoch, 
            train_loss,
            valid_loss,
            valid_acc,
            correct,
            total,
            current_lr
            ))
        
        ## saving the model if validation loss has decreased
        if valid_loss < valid_loss_min:
            torch.save(model.state_dict(), save_path)
            
            valid_loss_min = valid_loss
            
    # return trained model
    return model, trainingloss, validationloss

Split Data into Train, Test, Validation and Create DataLoaders:

In [7]:
# creating data: train, validation, test
train_df, val_df = train_test_split(featuresdf, test_size=0.4)
val_df, test_df = train_test_split(val_df,test_size=0.5)

train_ds = UrbanSound8kDataset(train_df, transform=img_transforms['train'])
val_ds = UrbanSound8kDataset(val_df, transform=img_transforms['valid'])
test_ds = UrbanSound8kDataset(test_df, transform=img_transforms['valid'])

#Creating loaders for the dataset
loaders_transfer={
    'train':torch.utils.data.DataLoader(train_ds,batch_size,shuffle=True),
    'valid':torch.utils.data.DataLoader(val_ds,batch_size,shuffle=False),
    'test':torch.utils.data.DataLoader(test_ds,batch_size,shuffle=False),
}

Train and Save the Model:

In [8]:
train_model(n_epochs, loaders_transfer, model_transfer, optimizer_transfer, criterion_transfer, 'model.pt')

Epoch: 1 
Training Loss: 0.000213 
Validation Loss: 0.001064 
Validation Accuracy: 0.570201 
Correct: 995.0 / 1745.0 
Current LR: 0.0015
Epoch: 2 
Training Loss: 0.000132 
Validation Loss: 0.000715 
Validation Accuracy: 0.738682 
Correct: 1289.0 / 1745.0 
Current LR: 0.0015
Epoch: 3 
Training Loss: 0.000095 
Validation Loss: 0.000370 
Validation Accuracy: 0.807450 
Correct: 1409.0 / 1745.0 
Current LR: 0.0015
Epoch: 4 
Training Loss: 0.000078 
Validation Loss: 0.000429 
Validation Accuracy: 0.778797 
Correct: 1359.0 / 1745.0 
Current LR: 0.0015
Epoch: 5 
Training Loss: 0.000072 
Validation Loss: 0.000441 
Validation Accuracy: 0.760458 
Correct: 1327.0 / 1745.0 
Current LR: 0.0015
Epoch: 6 
Training Loss: 0.000057 
Validation Loss: 0.000346 
Validation Accuracy: 0.822350 
Correct: 1435.0 / 1745.0 
Current LR: 0.0015
Epoch: 7 
Training Loss: 0.000053 
Validation Loss: 0.000272 
Validation Accuracy: 0.861891 
Correct: 1504.0 / 1745.0 
Current LR: 0.0015
Epoch: 8 
Training Loss: 0.000047 


(EfficientNet(
   (_conv_stem): Conv2dStaticSamePadding(
     3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
     (static_padding): ZeroPad2d((0, 1, 0, 1))
   )
   (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
   (_blocks): ModuleList(
     (0): MBConvBlock(
       (_depthwise_conv): Conv2dStaticSamePadding(
         32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
         (static_padding): ZeroPad2d((1, 1, 1, 1))
       )
       (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
       (_se_reduce): Conv2dStaticSamePadding(
         32, 8, kernel_size=(1, 1), stride=(1, 1)
         (static_padding): Identity()
       )
       (_se_expand): Conv2dStaticSamePadding(
         8, 32, kernel_size=(1, 1), stride=(1, 1)
         (static_padding): Identity()
       )
       (_project_conv): Conv2dStaticSamePadding(
         32, 16, kernel_size=(1, 1), stride=

Load the Trained Model

In [9]:
model = EfficientNet.from_pretrained('efficientnet-b0')
in_features = model._fc.in_features
model._fc = nn.Sequential(   
    nn.Linear(in_features, 10),
    )
model.load_state_dict(torch.load('model.pt'))
model.cuda()
model.eval()

Loaded pretrained weights for efficientnet-b0


EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d((0, 1, 0, 1))
  )
  (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d((1, 1, 1, 1))
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
        32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False
    

Perform Inference on the Test Set:

In [10]:
preds = []
targets = []
correct = 0

class_map = {'0' : 'air_conditioner', '1' : 'car_horn', '2' : 'children_playing', '3' : 'dog_bark', '4' : 'drilling', 
                 '5' : 'engine_idling', '6' : 'gun_shot', '7' : 'jackhammer', '8' : 'siren', '9' : 'street_music'}


for batch_idx, (data, target) in enumerate(loaders_transfer['test']):
    data, target = data.cuda(), target.cuda()
    output = model(data)
    prediction = torch.argmax(output, dim=1)
    preds.append(prediction.cpu().numpy())
    targets.append(target.cpu().numpy())

targets = np.concatenate(targets)
preds = np.concatenate(preds)

results_df = pd.DataFrame(data=zip(targets,preds),columns=['target','prediction'])
results_df['correct'] = results_df['target'] == results_df['prediction']
results_df['correct'] = results_df['correct'].astype(str)
results_df = results_df.groupby(by=['target','correct'],as_index=False).count()
results_df = results_df.pivot_table(index='target',columns='correct',values='prediction').fillna(0).reset_index()
results_df['acc'] = results_df['True'] / (results_df['True']+results_df['False'])
results_df['target'] = results_df['target'].astype(str)
results_df = results_df.replace({"target":class_map})
results_df = results_df.sort_values(by=['acc'],ascending=True).reset_index()

total = sum(results_df['True']) + sum(results_df['False'])
correct = sum(results_df['True'])

fig = px.bar(results_df,x="acc",y="target",orientation='h', template='plotly_dark',
           labels={"acc":"Classification Accuracy",
           'target' : 'Audio Class'
           },
           title='Performance Report by Class')

fig.layout.showlegend = False
fig.update_layout(xaxis_tickformat="2%")

print("Final Test Accuracy: {} ({} / {})".format(
   correct/total,
   correct,
   total
))

fig.show(), results_df

Final Test Accuracy: 0.9226361031518625 (1610 / 1745)


(None,
 correct  index            target  False  True       acc
 0            9      street_music     43   177  0.804545
 1            2  children_playing     21   179  0.895000
 2            7        jackhammer     16   159  0.908571
 3            5     engine_idling     13   179  0.932292
 4            0   air_conditioner     13   203  0.939815
 5            4          drilling     12   190  0.940594
 6            3          dog_bark      9   171  0.950000
 7            1          car_horn      4    94  0.959184
 8            6          gun_shot      2    69  0.971831
 9            8             siren      2   189  0.989529)