## Using an RNN classifier

Results: ~3% accuracy

Optimizations applied:
- initially I tried an MLP, which ran into problems because the audio files have different lengths; I then switched to an RNN, which can handle variable-length inputs
- find a suitable minimum duration of the audio files: 24s (median)
- pad the audio files to the minimum length (repeat shorter files until they reach the median length)
- split the audio files into frames of {seconds} seconds each instead of truncating them, because some audio files are much longer and truncating them would waste data
- take class weights into account (the dataset is imbalanced)
- played with the hyperparameters of the RNN a bit

Conclusion: This is a poor approach for the given problem. The RNN is not able to learn much from the audio files. Changing from 1s to 24s does not change the learning outcome significantly. 

Next: A spectrogram approach (computer vision).

In [None]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.models import resnet18, ResNet18_Weights
from torchvision import transforms
from IPython.display import Audio
import librosa
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import torch.nn.functional as F

import random
import glob
import os

import sys
sys.path.append("..")
import utils

In [None]:
RANDOM_SEED = 21

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) so that experiments are repeatable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
# Ask cuDNN for deterministic kernels and switch its auto-tuner off: with
# benchmark enabled cuDNN may select different algorithms from run to run,
# which works against the reproducibility goal of this cell.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Environment detection is delegated to the project-local utils module.
is_in_kaggle_env = utils.get_is_in_kaggle_env()

# On Kaggle the competition data is read from /kaggle/input/birdclef-2023/,
# otherwise from the local ../data/ directory.
data_path = '/kaggle/input/birdclef-2023/' if is_in_kaggle_env else '../data/'

# Always use the CPU on Kaggle; locally let utils pick the device.
device = 'cpu' if is_in_kaggle_env else utils.determine_device()

# Local runs only: fetch and unpack the competition archive once, i.e. when
# ../data does not exist yet. The "!" lines are IPython shell escapes and
# require the kaggle CLI to be installed and configured.
if not is_in_kaggle_env and not os.path.exists('../data'):
    print("Downloading data...")
    !kaggle competitions download -c 'birdclef-2023'
    !mkdir ../data
    !unzip -q birdclef-2023.zip -d ../data
    !rm birdclef-2023.zip

# NOTE: data_path already ends with "/", so the paths built below contain a
# double slash ("...//train_metadata.csv"); this is harmless on POSIX systems.
df_metadata_csv = pd.read_csv(f"{data_path}/train_metadata.csv")

audio_data_dir = f"{data_path}/train_audio/"

In [None]:
# Number of recordings per species (primary label).
class_counts = df_metadata_csv["primary_label"].value_counts()

# All rows that belong to a species with fewer than 3 recordings in total.
two_or_less_samples_rows = df_metadata_csv[df_metadata_csv["primary_label"].isin(class_counts[class_counts < 3].index)]

# The threshold above is "< 3", so the report talks about "fewer than 3 samples".
print(f"Number of unique classes with fewer than 3 samples: {len(two_or_less_samples_rows['primary_label'].unique())}")
print(f"Number of rows with fewer than 3 samples: {len(two_or_less_samples_rows)}")
print(f"Primary labels with fewer than 3 samples: {two_or_less_samples_rows['primary_label'].unique()}")

In [None]:
# Remove every recording whose species occurs fewer than three times overall.
print(f"Number of rows before dropping: {len(df_metadata_csv)}")
df_metadata_csv = df_metadata_csv.loc[
    ~df_metadata_csv["primary_label"].isin(
        class_counts.loc[class_counts.lt(3)].index
    )
]
print(f"Number of rows after dropping: {len(df_metadata_csv)}")

In [None]:
# Distinct species labels that remain in the metadata.
unique_classes = df_metadata_csv["primary_label"].unique()
print(f"Number of classes: {len(unique_classes)}")

In [None]:
# Debug switch: when True, the dataset and the training loop print array/tensor shapes.
log_dims = False


class BirdClef23Dataset(Dataset):
    """Dataset that yields one padded and framed recording per item.

    Each recording is loaded at 32 kHz, repeated ("wrap" padded) until it is
    at least ``seconds`` long, and then cut into windows of ``seconds``
    seconds whose start positions are ``hop_size_s`` seconds apart.

    Args:
        df: Metadata frame with a ``filename`` column (path relative to
            ``audio_data_dir``) and a ``primary_label`` column.
        audio_data_dir: Directory that contains the audio files.
        label_encoder: Fitted encoder whose ``transform`` maps a list of raw
            labels to encoded class ids.
        seconds: Window length in seconds; also the minimum clip length.
        hop_size_s: Distance in seconds between the starts of two
            consecutive windows.
    """

    # Sampling rate every recording is loaded with.
    SAMPLE_RATE = 32000

    def __init__(self, df, audio_data_dir, label_encoder, seconds, hop_size_s=5.0):
        self.df = df
        self.audio_data_dir = audio_data_dir
        self.label_encoder = label_encoder
        self.seconds = seconds
        self.hop_size_s = hop_size_s

    @staticmethod
    def _wrap_pad(audio, min_samples):
        """Repeat ``audio`` at its end until it holds at least ``min_samples`` samples."""
        missing = int(min_samples - audio.shape[0])
        if missing <= 0:
            return audio
        if audio.shape[0] == 0:
            # np.pad cannot wrap an empty signal; fail with a clear message.
            raise ValueError("Cannot pad an empty audio signal")
        return np.pad(audio, (0, missing), 'wrap')

    def __getitem__(self, index):
        """Load, pad and frame the recording at position ``index``.

        Returns:
            A tuple ``(row_id, audio_tensor, audio_numpy, primary_label)``:
            the file name without directory and extension, the frames as a
            float tensor with an extra dimension inserted at position 1
            (moved to the module-level ``device``), the frames as a NumPy
            array, and the encoded label.

        Raises:
            ValueError: If the loaded sample rate is not 32000 or the audio
                file contains no samples.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        # Columns are addressed by name so that the dataset keeps working if
        # the column order of the metadata frame changes.
        audio_path = os.path.join(self.audio_data_dir, self.df["filename"].iloc[index])
        audio_numpy, audio_sr = librosa.load(audio_path, sr=self.SAMPLE_RATE)

        # Defensive check; librosa is asked for SAMPLE_RATE above, so this is
        # not expected to trigger.
        if audio_sr != self.SAMPLE_RATE:
            raise ValueError(f"Sample rate is not {self.SAMPLE_RATE}, it is {audio_sr} for {audio_path}")

        if log_dims:
            print(f"audio_numpy dims 0: {audio_numpy.shape}")

        # Artificially increase the audio length if it is below {seconds}
        frame_length = int(self.seconds * audio_sr)
        if audio_numpy.shape[0] < frame_length:
            if log_dims:
                print(f"Padding audio from {audio_numpy.shape[0]} to {self.SAMPLE_RATE * self.seconds}")
            audio_numpy = self._wrap_pad(audio_numpy, frame_length)

        if log_dims:
            print(f"audio_numpy dims 1: {audio_numpy.shape}")

        # Frame the audio: split it into windows of {seconds} seconds whose
        # starts are {hop_size_s} seconds apart (windows overlap whenever
        # hop_size_s < seconds).
        hop_length = int(self.hop_size_s * audio_sr)
        audio_numpy = librosa.util.frame(audio_numpy, frame_length=frame_length, hop_length=hop_length, axis=0)

        if log_dims:
            print(f"audio_numpy dims 2: {audio_numpy.shape}")

        # Convert to tensor (copying the framed array first) and add a channel dimension
        audio_tensor = torch.from_numpy(audio_numpy.copy()).float().to(device)
        if log_dims:
            print(f"audio_tensor dims 0: {audio_tensor.shape}")

        audio_tensor = audio_tensor.unsqueeze(1)
        if log_dims:
            print(f"audio_tensor dims 1: {audio_tensor.shape}\n")

        primary_label_raw = self.df["primary_label"].iloc[index]
        primary_label = self.label_encoder.transform([primary_label_raw])[0]

        # basename() honours the platform's path separator, unlike split('/').
        row_id = os.path.basename(audio_path).split('.')[0]

        return row_id, audio_tensor, audio_numpy, primary_label

    def __len__(self):
        """Return the number of recordings (rows of the metadata frame)."""
        return len(self.df)


def get_data_loader(dataset, batch_size=32, data_percentage=None, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader``, optionally on a random subset.

    Args:
        dataset: Any dataset accepted by ``DataLoader``.
        batch_size: Number of items per batch.
        data_percentage: Fraction of the dataset to keep (e.g. ``0.1``).
            ``None`` uses the dataset as is; any other value draws a random
            subset of ``int(len(dataset) * data_percentage)`` items.
        shuffle: Whether the loader reshuffles the data every epoch.

    Returns:
        The configured ``DataLoader``.
    """
    if data_percentage is None:
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    total = len(dataset)
    kept = int(total * data_percentage)
    # The second part of the split is the discarded remainder.
    subset, _ = random_split(dataset, [kept, total - kept])
    return DataLoader(subset, batch_size=batch_size, shuffle=shuffle)


def split_df(df, primary_label='primary_label', percentages=(60, 20, 20)):
    """Split ``df`` into stratified train, validation and test frames.

    Args:
        df: Frame to split.
        primary_label: Name of the column to stratify by.
        percentages: Three numbers ``(train, valid, test)`` given in percent.
            The default is a tuple so that it cannot be mutated between calls.

    Returns:
        ``(train_df, valid_df, test_df, class_weights)``. ``class_weights``
        holds the "balanced" weights computed from the training frame, one
        entry per label present in the training frame, ordered like
        ``np.unique`` of those labels.
    """
    print(f"Splitting dataframe into train {percentages[0]}%, valid {percentages[1]}%, test {percentages[2]}%, stratified by {primary_label}")

    train_perc, valid_perc, test_perc = [perc / 100 for perc in percentages]
    # Share of the training data within the combined train+valid remainder.
    train_valid_split = round(train_perc / (train_perc + valid_perc), 2)

    # First cut off the test set, then divide the remainder into train and valid.
    temp_df, test_df = train_test_split(df, test_size=test_perc, stratify=df[primary_label], random_state=RANDOM_SEED)

    train_df, valid_df = train_test_split(temp_df, test_size=1-train_valid_split, stratify=temp_df[primary_label], random_state=RANDOM_SEED)

    classes = np.unique(train_df[primary_label])
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df[primary_label])

    return train_df, valid_df, test_df, class_weights


# class AudioClassifier(torch.nn.Module):
#     def __init__(self, num_classes, seconds):
#         super().__init__()
#         self.num_classes = num_classes
#         self.seconds = seconds
#         # 32000 because of the sampling rate
#         self.fc1 = torch.nn.Linear(32000 * self.seconds, 1000)
#         self.fc2 = torch.nn.Linear(1000, 100)
#         self.fc3 = torch.nn.Linear(100, self.num_classes)
#         self.sigmoid = torch.nn.Sigmoid()

#     def forward(self, x):
#         x = x.view(-1, 32000 * self.seconds)
#         x = self.fc1(x)
#         x = self.fc2(x)
#         x = self.fc3(x)
#         x = self.sigmoid(x)
#         return x


class AudioRNN(torch.nn.Module):
    """Single-layer RNN classifier over a sequence of raw-audio frames.

    Every frame of ``sample_rate * seconds`` samples is one time step; the
    hidden state after the last step is mapped to class logits.

    Args:
        num_classes: Number of output classes.
        seconds: Length of one frame in seconds.
        hidden_size: Size of the RNN hidden state.
        sample_rate: Samples per second of the input audio (default 32000).
    """

    # RNN in pytorch: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html and https://blog.floydhub.com/a-beginners-guide-on-recurrent-neural-networks-with-pytorch/
    def __init__(self, num_classes, seconds, hidden_size, sample_rate=32000):
        super().__init__()
        self.num_classes = num_classes
        self.seconds = seconds
        self.hidden_size = hidden_size
        self.sample_rate = sample_rate
        # nn.RNN needs an integer feature size, also when seconds is a float.
        self.input_size = int(self.sample_rate * self.seconds)
        self.rnn = torch.nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(self.hidden_size, self.num_classes)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Classify a batch of framed recordings.

        Args:
            x: Tensor whose first dimension is the batch and whose remaining
                elements can be arranged as ``(seq_len, input_size)``.

        Returns:
            ``(logits, probas)``, each of shape ``(batch, num_classes)``.
        """
        batch_size = x.size(0)
        # reshape input to batch_size x seq_len x input_size
        # (reshape also accepts non-contiguous tensors, unlike view)
        x = x.reshape(batch_size, -1, self.input_size)
        # Create the initial hidden state next to the input instead of on a
        # global device, so the module works wherever its input lives.
        h0 = torch.zeros(1, batch_size, self.hidden_size, device=x.device, dtype=x.dtype)
        out, _ = self.rnn(x, h0)  # pass input and hidden state through RNN
        # Only the output of the last time step is used for classification.
        logits = self.fc(out[:, -1, :])
        probas = self.softmax(logits)
        return logits, probas


def train(model, train_loader, valid_loader, loss_func, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs and evaluate after each one.

    Args:
        model: Module whose forward pass returns ``(logits, probas)``.
        train_loader: Loader yielding ``(row_id, audio_tensor, audio_numpy,
            primary_label)`` batches used for optimisation.
        valid_loader: Loader with the same batch layout, used for validation.
        loss_func: Loss applied to ``(logits, targets)``.
        optimizer: Optimizer that updates the model parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(minibatch_loss, train_loss_lst, valid_loss_lst, train_acc_lst,
        valid_acc_lst)``: the loss of every optimisation step, followed by
        the per-epoch losses and accuracies on both loaders.
    """
    minibatch_loss = []
    train_loss_lst, valid_loss_lst = [], []
    train_acc_lst, valid_acc_lst = [], []

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()

        for step, batch in enumerate(train_loader):
            _row_id, audio_tensor, _audio_numpy, primary_label = batch
            print(f"Batch {step + 1}/{len(train_loader)}", end="\r")

            features = audio_tensor.to(device)
            targets = primary_label.to(device)

            if log_dims:
                print(f"Input tensor shape: {audio_tensor.shape}")

            # Forward pass and loss
            logits, _probas = model(features)
            loss = loss_func(logits, targets)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            minibatch_loss.append(loss.item())

        # Evaluate on both loaders once the epoch is finished
        # (validate returns accuracy first, then loss).
        train_acc, train_loss = validate(model, train_loader, loss_func)
        valid_acc, valid_loss = validate(model, valid_loader, loss_func)

        train_loss_lst.append(train_loss)
        valid_loss_lst.append(valid_loss)
        train_acc_lst.append(train_acc)
        valid_acc_lst.append(valid_acc)

        print(f"Epoch {epoch+1}/{num_epochs} done. Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Train Accuracy: {train_acc:.2f}%, Valid Accuracy: {valid_acc:.2f}%")

    return minibatch_loss, train_loss_lst, valid_loss_lst, train_acc_lst, valid_acc_lst


def validate(model, data_loader, loss_fn=F.cross_entropy):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module whose forward pass returns ``(logits, probas)``.
        data_loader: Iterable of ``(row_id, audio_tensor, audio_numpy,
            primary_label)`` batches.
        loss_fn: Loss applied to ``(logits, targets)``; expected to return
            the mean loss of the batch.

    Returns:
        ``(accuracy, loss)`` as Python floats: the accuracy in percent and
        the average loss per example. Both are ``nan`` when the loader
        yields no examples.
    """
    model.eval()

    # Run on the device the model lives on (CPU for parameter-free models).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    correct_pred, num_examples, loss_sum = 0, 0, 0.0

    with torch.no_grad():
        for _row_id, audio_tensor, _audio_numpy, primary_label in data_loader:
            features = audio_tensor.to(target_device)
            targets = primary_label.to(target_device)

            logits, probas = model(features)  # forward propagation z=logits, a=f(z)

            batch_n = targets.size(0)
            # loss_fn yields a batch mean, so weight it by the batch size
            # before dividing by the total number of examples below.
            loss_sum += loss_fn(logits, targets).item() * batch_n

            predicted_labels = probas.argmax(dim=1)
            correct_pred += (predicted_labels == targets).sum().item()
            num_examples += batch_n

    if num_examples == 0:
        # Nothing to evaluate: avoid a division by zero.
        return float("nan"), float("nan")

    accuracy = correct_pred / num_examples * 100
    loss = loss_sum / num_examples
    return accuracy, loss


# --- training
# Fit (and persist) a label encoder on the remaining classes. With the flag
# below set to True an existing label_encoder.joblib is always overwritten.
ignore_existing_label_encoder = True
if ignore_existing_label_encoder or not os.path.exists('label_encoder.joblib'):
    print('Creating label encoder...')
    label_encoder = LabelEncoder()
    label_encoder.fit(list(unique_classes))
    joblib.dump(label_encoder, 'label_encoder.joblib')
else:
    print('Loading label encoder...')
    label_encoder = joblib.load('label_encoder.joblib')

# Stratified 60/20/20 split (split_df defaults); class_weights come from the training part.
train_df, valid_df, test_df, class_weights = split_df(df_metadata_csv)

seconds = 10
batch_size = 1 # otherwise the training will fail because of the varying length of the audio files, which results in varying shape of tensors due to framing the audio
data_percentage = 1 # 1 means 100% of the data (get_data_loader still performs a random_split because the value is not None)
num_epochs = 2

train_dataset = BirdClef23Dataset(train_df, audio_data_dir, label_encoder, seconds)
valid_dataset = BirdClef23Dataset(valid_df, audio_data_dir, label_encoder, seconds)
test_dataset = BirdClef23Dataset(test_df, audio_data_dir, label_encoder, seconds)

train_loader = get_data_loader(train_dataset, batch_size, data_percentage, shuffle=True)
valid_loader = get_data_loader(valid_dataset, batch_size, data_percentage, shuffle=False)
test_loader = get_data_loader(test_dataset, batch_size, data_percentage, shuffle=False)

# model = AudioClassifier(num_classes=len(unique_classes), seconds=seconds).to(device)
model = AudioRNN(num_classes=len(unique_classes), seconds=seconds, hidden_size=128).to(device)
print(f"Initialized model {model._get_name()}")

# NOTE(review): class_weights has one entry per label present in train_df; this
# assumes every class in unique_classes occurs in the training split - confirm.
# NOTE(review): with batch_size = 1 and the default 'mean' reduction the weighted
# loss is presumably normalised by the weight of the single target, so the class
# weights would have no effect - verify against the CrossEntropyLoss documentation.
loss_function = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(class_weights).float().to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): the scheduler is created but not passed to train() and not
# stepped anywhere in the visible code, so the learning rate stays constant.
# The verbose argument may be deprecated in recent PyTorch versions - TODO confirm.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# Run the training loop; the returned histories are plotted in the following cells.
minibatch_loss, train_loss_lst, valid_loss_lst, train_acc_lst, valid_acc_lst = train(model, train_loader, valid_loader, loss_function, optimizer, num_epochs)

In [None]:
# Plot the loss recorded after every optimisation step (helper from the local utils module).
utils.plot_minibatch_loss(minibatch_loss)

In [None]:
# Plot the per-epoch loss and accuracy histories for the training and validation loaders.
utils.plot_train_and_valid_loss_and_accuracy(train_loss_lst, valid_loss_lst, train_acc_lst, valid_acc_lst)

In [None]:
# get first item from train loader
row_id, audio_tensor, audio_numpy, primary_label = next(iter(train_loader))
print(f"primary_label: {primary_label}, row_id: {row_id}")

class_name = label_encoder.inverse_transform(primary_label)
print(f"class_name: {class_name}")

df_metadata_csv[df_metadata_csv['filename'].str.contains("XC321277")]