# Pytorch Audio Classification (In Progress)

## Imports

In [None]:
# Install torchsummary, whose `summary` helper is imported and used further down.
!pip install torchsummary

In [None]:
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

In [None]:
from __future__ import print_function, division

import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torchsummary import summary

import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as I
import torch.optim as optim
import torchvision

from torch.utils.data import Dataset, DataLoader
import multiprocessing
from tqdm import tqdm
import pandas as pd
import librosa

import shutil
from pathlib import Path
import gc

## Analysing CSV

In [None]:
# Load the example test-audio metadata table and preview its first rows.
example_test_audio_metadata = pd.read_csv(
    filepath_or_buffer='../input/birdsong-recognition/example_test_audio_metadata.csv'
)
example_test_audio_metadata.head(n=5)

In [None]:
# Load the example test-audio summary table and preview its first rows.
example_test_audio_summary = pd.read_csv(
    filepath_or_buffer='../input/birdsong-recognition/example_test_audio_summary.csv'
)
example_test_audio_summary.head(n=5)

In [None]:
# Load the test manifest and preview its first rows.
test = pd.read_csv(
    filepath_or_buffer='../input/birdsong-recognition/test.csv'
)
test.head(n=5)

In [None]:
# Load the training manifest (one row per recording) and preview its first rows.
train_df = pd.read_csv(
    filepath_or_buffer='../input/birdsong-recognition/train.csv'
)
train_df.head(n=5)

## Constants

In [None]:
# Number of passes over the training loader in the training loop.
EPOCHS = 50
# Length of the audio window read from every recording (passed to librosa.load as `duration`).
DURATION = 5
# When True, only the first NO_OF_TEST_CLASSES classes are used (quick trial run).
TEST = True
NO_OF_TEST_CLASSES = 5
# Root folder containing one sub-directory of recordings per class.
data_dir = pathlib.Path('/kaggle/input/birdsong-recognition/train_audio/')
# Image size of spectrogram for 5 seconds.
# This tuple is handed to PIL's `Image.resize`, so it is presumably (width, height) — confirm.
IMG_SIZE=(552, 128)

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Running on : {device}')

# Train Data Processing

Create a new data frame with only the columns required for the analysis. For training, keep only the recordings that are at least 5 seconds long, then compute the offset needed to read 5 seconds of audio from the middle of each recording.

In [None]:
# Keep only the columns that later cells use.
train_df = train_df[["ebird_code", "channels", "duration", "filename", "species","bird_seen","latitude", "longitude"]]
# Recordings shorter than DURATION cannot supply a full window, so drop them.
# `.copy()` makes the filtered frame independent, so the column assignments below
# write to a real frame instead of a slice (no SettingWithCopyWarning).
train_df = train_df[train_df["duration"] >= DURATION].copy()
# Whole-second offset at which a DURATION-long window centred in the recording starts.
# Vectorised equivalent of int((duration - DURATION) / 2) applied row by row.
train_df['offset'] = ((train_df['duration'] - DURATION) / 2).astype(int)
# Location of every recording: <train_audio>/<ebird_code>/<filename>.
train_df['path'] = (
    '/kaggle/input/birdsong-recognition/train_audio/'
    + train_df['ebird_code'] + '/' + train_df['filename']
)
train_df.head()

In [None]:
# One class per sub-directory of `data_dir`. `glob` yields entries in an arbitrary,
# filesystem-dependent order, so the names are sorted: this keeps the TEST-mode subset
# and the class indices derived from this list identical across runs.
# Non-directory entries are skipped because they cannot be classes.
CLASS_NAMES = sorted(item.name for item in data_dir.glob('*') if item.is_dir())
if TEST:
    # Trial run: restrict the notebook to the first few classes.
    CLASS_NAMES = CLASS_NAMES[:NO_OF_TEST_CLASSES]
    print(f"Test running on {NO_OF_TEST_CLASSES} classes")
    print("Class Names", CLASS_NAMES)

In [None]:
# Map every class name to its position in CLASS_NAMES; that position is the integer label
# returned by the dataset. `enumerate` avoids the quadratic cost of calling
# `list.index` once per name.
CLASS_MAP = {name: index for index, name in enumerate(CLASS_NAMES)}
print(f"Total Classes: {len(CLASS_NAMES)}")
print("Classes: ", CLASS_MAP)

Parsing class name.

## Creating a batch of data

Create batches of data. With split_window = [0, 0.25, 0.5, 1], the data is split into 0%-25%, 25%-50% and 50%-100%. This was created for test runs. Here a single 100% split is used, because in test mode the model is trained on only a few classes (`NO_OF_TEST_CLASSES`).

In [None]:
# Restrict the manifest to the selected classes and keep only what the
# audio-to-image conversion needs: where the clip is and where to start reading.
in_selected_classes = train_df.ebird_code.isin(CLASS_NAMES)
training_data = train_df.loc[in_selected_classes, ['path', 'offset']]
training_data.head()

In [None]:
# Pair every clip path (as a pathlib.Path) with its start offset.
paths = [[pathlib.Path(item[0]), item[1]] for item in training_data.values]

# Bucket the clips by class (the clip's parent directory name) in one pass over the
# clips instead of rescanning the full list once per class.
clips_by_class = {name: [] for name in CLASS_NAMES}
for clip_path, clip_offset in paths:
    bucket = clips_by_class.get(clip_path.parts[-2])
    if bucket is not None:  # clips outside CLASS_NAMES are ignored, as before
        bucket.append([clip_path, clip_offset])
groups = [clips_by_class[name] for name in CLASS_NAMES]

# One sample clip per class, kept for the plotting cell.
plot_data = []

# Percentage window split. if split_window = [0, 0.25, 0.5, 1], the data splitted into 0%-25%, 25%-50% and 50%-100%
split_window = [0, 1]
batches = [[] for _ in range(len(split_window)-1)]

for group in groups:
    # Use the second clip as the sample when there is one; fall back to the only clip
    # for single-clip classes and skip empty classes rather than raising IndexError.
    if group:
        plot_data.append(group[min(1, len(group) - 1)])
    # Cut this class's clips at the window boundaries ...
    tr_b = [group[int(len(group) * split_window[i]) : int(len(group) * split_window[i+1])] for i in range(len(split_window)-1)]
    # ... and prepend each slice to the matching batch.
    batches = [tr_b[i] + batches[i] for i in range(len(batches))]

print("Trainning audio batches count: ", [len(l) for l in batches])

# Analysis

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
# Pie chart of recordings whose `bird_seen` field is 'yes' versus all the others.
# Slices are ordered and plotted counter-clockwise.
labels = ('Bird Seen', 'Bird Unseen')
bird_seen_count = int((train_df.bird_seen == 'yes').sum())
sizes = [bird_seen_count, len(train_df) - bird_seen_count]
explode = (0, 0.1)  # pull the second wedge slightly out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
# Equal aspect ratio ensures that pie is drawn as a circle.
ax1.axis('equal')

plt.show()

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
# Pie chart of recordings whose `channels` field is '1 (mono)' versus all the others.
# Slices are ordered and plotted counter-clockwise.
labels = ('Stereo Audio Files', 'Mono Audio Files')
mono_audio_count = int((train_df.channels == '1 (mono)').sum())
sizes = [len(train_df) - mono_audio_count, mono_audio_count]
explode = (0, 0.1)  # pull the second wedge slightly out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
# Equal aspect ratio ensures that pie is drawn as a circle.
ax1.axis('equal')

plt.show()

# Test Mode Data

Data distribution in test mode

In [None]:
if TEST:
    # Number of recordings per selected class, ordered by class code.
    distribution = (
        train_df.loc[train_df.ebird_code.isin(CLASS_NAMES), 'ebird_code']
        .value_counts()
        .sort_index()
        .to_frame(name='count')
    )
    # Labels and sizes are read from the same frame so that every wedge carries the
    # name of the class it actually counts; pairing the counts with CLASS_NAMES would
    # mislabel wedges whenever the two orderings differ.
    labels = list(distribution.index)
    sizes = list(distribution['count'].values)

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    plt.show()

## Plotting audio samples

[**MelSpectrogram:**](https://medium.com/analytics-vidhya/understanding-the-mel-spectrogram-fca2afa2ce53#:~:text=A%20mel%20spectrogram%20is%20a,converted%20to%20the%20mel%20scale) A mel spectrogram is a spectrogram where the frequencies are converted to the mel scale.

In [None]:
# Plot the waveform and mel spectrogram of up to five sample clips. Slicing (rather than
# indexing 0..4) keeps the cell working when fewer than five samples were collected.
for data in plot_data[:5]:
    path = data[0]
    name = path.parts[-2]  # class name = parent directory of the clip
    offset = data[1]
    S, sr = librosa.load(path, mono=True, duration=DURATION, offset=offset)
    # NOTE(review): MelSpectrogram() is built with its default settings while librosa
    # returns its own sample rate `sr`; confirm the two agree.
    specgram = torchaudio.transforms.MelSpectrogram()(torch.tensor(S))
    img = torchvision.transforms.ToPILImage()(specgram).convert("RGB")
    # Image.LANCZOS is the same filter the removed Image.ANTIALIAS alias pointed to.
    img = img.resize(IMG_SIZE, Image.LANCZOS)
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(S)
    plt.title(name)
    plt.subplot(2, 1, 2)
    plt.imshow(img)
    plt.title(name + ": "+ 'Mel spectrogram')
    plt.tight_layout()

## Converting Audio To Images

Here the audio files are read from the offset for the required duration (5 seconds) as mono-channel audio, converted to a MelSpectrogram and saved as images. The images are resized to 552 × 128 (the image size of a spectrogram for 5 seconds) when they are loaded by the input pipeline. The audio-to-image conversion runs in a multiprocessing pool to speed up the process.

The conversion processing happens as mini-batches, each mini-batch processing 100 audio files.

In [None]:
def delete_previous_image_dir():
    """Remove the image directory left over from an earlier run, if there is one."""
    image_root = '/kaggle/working/train_images/'
    if not os.path.exists(image_root):
        return
    shutil.rmtree(image_root)
    
def create_dir():
    """Create ``/kaggle/working/train_images/<class>/`` for every name in CLASS_NAMES.

    The root is addressed by the same absolute path the image writer and the
    cleanup helper use, instead of a path relative to the current working
    directory. ``exist_ok=True`` makes the call safe to repeat and removes the
    check-then-create race of testing for existence first.
    """
    image_root = Path("/kaggle/working/train_images/")
    image_root.mkdir(parents=True, exist_ok=True)
    for name in CLASS_NAMES:
        (image_root / name).mkdir(parents=True, exist_ok=True)
    

def convert_audio_to_image(data):
    """Turn one audio clip into a mel-spectrogram PNG.

    Args:
        data: a ``[path, offset]`` pair, where ``path`` is a ``pathlib.Path`` of the
            form ``.../<class_name>/<file>`` and ``offset`` is the position at which
            reading starts (passed to ``librosa.load``).

    The image is written to ``/kaggle/working/train_images/<class_name>/<stem>.png``.
    Failures are reported on stdout and otherwise ignored, so one unreadable clip
    does not abort a whole pool run.
    """
    path = data[0]
    offset = data[1]
    class_name = path.parts[-2]
    # `stem` drops only the final extension, so dotted file names stay distinct.
    name = path.stem
    try:
        S, sr = librosa.load(path, mono=True, duration=DURATION, offset=offset)
        # NOTE(review): MelSpectrogram() is built with its default settings while
        # librosa returns its own sample rate `sr`; confirm the two agree.
        specgram = torchaudio.transforms.MelSpectrogram()(torch.tensor(S))
        img = torchvision.transforms.ToPILImage()(specgram).convert("RGB")
        img.save(f"/kaggle/working/train_images/{class_name}/{name}.png")

        # Drop the large intermediates before the explicit collection below.
        del S
        del specgram
        del img
    except Exception as e:
        # Report what went wrong, not just where.
        print(f"Exception in reading: {path} ({type(e).__name__}: {e})")
    gc.collect()

def process_audio(paths):
    """Convert every ``[path, offset]`` entry one after another, with a progress bar."""
    progress = tqdm(paths)
    for entry in progress:
        convert_audio_to_image(entry)
        
def process_audio_in_pool(data):
    """Convert a list of ``[path, offset]`` entries in parallel, one worker per CPU.

    Args:
        data: sequence of entries accepted by ``convert_audio_to_image``.

    The pool is used as a context manager so that its worker processes are always
    released, even when ``map`` raises. ``map`` blocks until every entry has been
    handled, so no work is outstanding when the pool is torn down.
    """
    if not data:
        # Nothing to convert: do not spawn workers for an empty batch.
        return
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        pool.map(convert_audio_to_image, data)

# Start from a clean output tree.
delete_previous_image_dir()
create_dir()

# Number of audio files handed to the worker pool at a time.
MINI_BATCH_SIZE = 100

batch = batches[0]
# Ceiling division: a final partial mini batch counts too, even if it holds one file.
total = -(-len(batch) // MINI_BATCH_SIZE)
current = 0
while len(batch) > 0:
    # Batch contain 100 images
    print(f"Processing mini batch(may take some time): {current + 1} / {total}")
    process_audio_in_pool(batch[:MINI_BATCH_SIZE])
    batch = batch[MINI_BATCH_SIZE:]
    current += 1

In [None]:
# From here on `data_dir` points at the generated spectrogram images.
data_dir = pathlib.Path('/kaggle/working/train_images/')
# `glob` returns files in an arbitrary order; sorting makes the list, and the
# position-based train/test split derived from it, identical across runs.
paths = sorted(data_dir.glob('*/*.png'))
image_count = len(paths)
print(f"Total Image: {image_count}")

In [None]:
# Images of each class, in CLASS_NAMES order (class = parent directory name).
groups = [
    [image for image in paths if image.parts[-2] == class_name]
    for class_name in CLASS_NAMES
]

train_path = []
test_path = []

# Per class: the first 80% of the images go to training, the remainder to testing.
for group in groups:
    cut = int(len(group) * 0.8)
    train_path.extend(group[:cut])
    test_path.extend(group[cut:])

print("Trainning images: ", len(train_path))
print("Testing audio: ", len(test_path))

# Input Pipeline

In [None]:
class AudioDataset(Dataset):
    """Dataset of spectrogram images addressed by file path.

    Args:
        paths: sequence of ``pathlib.Path`` objects of the form
            ``.../<class_name>/<image>``; the parent directory name selects the
            label through ``CLASS_MAP``.
    """

    def __init__(self, paths):
        self.paths = paths

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``(image_tensor, class_index)`` for the image at position ``idx``."""
        path = self.paths[idx]
        # The context manager closes the file handle as soon as the pixels are read,
        # so long epochs do not accumulate open files.
        with Image.open(path) as source:
            # Image.LANCZOS is the filter the removed Image.ANTIALIAS alias pointed to.
            img = source.convert("RGB").resize(IMG_SIZE, Image.LANCZOS)
        pil_to_tensor = torchvision.transforms.ToTensor()(img)
        return pil_to_tensor, CLASS_MAP[path.parts[-2]]

In [None]:
train_data = AudioDataset(paths=train_path)
test_data = AudioDataset(paths=test_path)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# Evaluation only accumulates counts, so a fixed order is used: the result is the same
# and the pass is repeatable.
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

# Network

# Transfer Learning

Creating transfer learning network with resnet18 as base model.

In [None]:
# Build the ResNet-18 backbone.
# NOTE(review): no `pretrained`/`weights` argument is passed, so this presumably creates
# a randomly initialised network rather than one with pretrained weights — confirm this
# is intended given the "transfer learning" heading.
net = torchvision.models.resnet18()
# Move the model to the GPU when one is available.
net = net.cuda() if torch.cuda.is_available() else net
# Display the architecture as the cell output.
net

In [None]:
# Replace the stock classification head with one sized for our classes. This must
# happen BEFORE the optimizer is created: the optimizer only updates the parameters
# it was given at construction time, so a head swapped in afterwards would never
# be trained.
num_ftrs = net.fc.in_features
net.fc = nn.Linear(num_ftrs, len(CLASS_NAMES))
net.fc = net.fc.cuda() if torch.cuda.is_available() else net.fc

criterion = nn.CrossEntropyLoss()
# Built after the head swap so that `net.parameters()` includes the new layer.
optimizer = optim.SGD(net.parameters(), lr=0.0001, momentum=0.9)

In [None]:
# IMG_SIZE is the size given to PIL's resize, i.e. (width, height), whereas the tensors
# produced by ToTensor are laid out as (channels, height, width); swap the two entries
# so the summary reports the shapes the network really sees.
summary(net, (3, IMG_SIZE[1], IMG_SIZE[0]))

In [None]:
for epoch in range(EPOCHS):
    print(f"Running Epoch {epoch + 1}")

    # ---- training pass ----
    # Put layers such as BatchNorm/Dropout into training behaviour; the evaluation
    # pass below switches them off, so this must be restored every epoch.
    net.train()
    running_loss = 0.0
    no_of_batches = 0
    tk0 = tqdm(train_loader, total=int(len(train_loader)))
    for i, data in enumerate(tk0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate statistics
        running_loss += loss.item()
        no_of_batches += 1

    # ---- evaluation pass ----
    # Evaluation mode makes BatchNorm use its running statistics instead of the
    # statistics of each test batch, and stops the test data from updating them.
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        tk1 = tqdm(test_loader, total=int(len(test_loader)))
        for i, data in enumerate(tk1):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard the averages so an empty loader reports a value instead of raising
    # ZeroDivisionError.
    mean_loss = running_loss / no_of_batches if no_of_batches else float('nan')
    accuracy = 100 * correct / total if total else 0.0
    print('[Epoch - %d] loss: %.3f   Accuracy: %d %%' % (epoch + 1, mean_loss, accuracy))
print('Finished Training')

Ref: https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html