In [None]:
import csv
import math
import os
from pathlib import Path
import random
import shutil
import time
import uuid

import IPython.display as ipd
import numpy as np
import librosa
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# Select torchaudio's I/O backend explicitly.
# NOTE(review): `set_audio_backend` is deprecated in recent torchaudio releases — confirm the installed version still accepts it.
torchaudio.set_audio_backend("sox_io")

# Sample rate (Hz) used when loading audio and building spectrograms in the cells below.
SAMPLE_RATE = 48000
# Recording length in samples (60 s at SAMPLE_RATE); not referenced by the cells shown here.
RECORDING_LENGTH = 2880000

In [None]:
!pip install git+https://github.com/facebookresearch/fvcore.git

In [None]:
# One seed shared by every random number generator the notebook touches.
rng_seed = 37

# Hash seed is exported as a string because environment variables are text.
os.environ['PYTHONHASHSEED'] = str(rng_seed)

# Python, NumPy and PyTorch (CPU + current CUDA device) generators.
random.seed(rng_seed)
np.random.seed(rng_seed)
torch.manual_seed(rng_seed)
torch.cuda.manual_seed(rng_seed)

# Ask cuDNN for deterministic kernels and switch its auto-tuner off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Competition data (input) and the writable working directory (output).
DATA_DIR = Path('/kaggle/input/rfcx-species-audio-detection')
OUTPUT_DIR = Path('/kaggle/working')

TRAIN_DIR, TEST_DIR = DATA_DIR / 'train', DATA_DIR / 'test'
MODEL_PATH = Path('../input/timm-resnest-weights/resnest50-528c19ca.pth')

# `dest` receives the extracted clip tensors, `weights_dir` the per-fold checkpoints.
dest = OUTPUT_DIR / 'waveform-tensors'
weights_dir = OUTPUT_DIR / 'weights'

# Create both output folders; an already existing folder is not an error.
dest.mkdir(exist_ok=True)
weights_dir.mkdir(exist_ok=True)

In [None]:
# Annotation table; later cells read its recording_id, species_id, t_min, t_max, f_min and f_max columns.
df = pd.read_csv(DATA_DIR / 'train_tp.csv')

In [None]:
# Preview the first rows of the annotation table.
df.head()

In [None]:
# Frequency band handed to the mel-spectrogram transforms below: the smallest annotated f_min
# scaled by 0.9 and the largest annotated f_max scaled by 1.1, both truncated to int.
f_min_df = int(min(df['f_min']) * 0.9)
f_max_df = int(max(df['f_max']) * 1.1)

In [None]:
# Show the upper frequency bound computed above.
f_max_df

In [None]:
class AudioResNest(nn.Module):
    """torch.hub image backbone applied to log-mel spectrograms of raw waveforms.

    Args:
        n_outputs: Number of values produced by the classification head.
        mod_path: Checkpoint file loaded into the backbone with
            ``load_state_dict`` (strict, so it must match the architecture).
        load_path: Repository spec passed to ``torch.hub.load``.
        load_name: Entry-point name passed to ``torch.hub.load``.
        linear: If true the head ends in ``nn.Linear(1024, n_outputs)`` and
            returns logits. If false an ``nn.Softmax`` is appended after that
            layer. (Previously the ``False`` branch had no projection to
            ``n_outputs`` at all and returned a 1024-way softmax.)
            NOTE(review): the training cell uses ``nn.CrossEntropyLoss``,
            which expects logits, so ``linear=False`` is unsuitable there.
    """

    def __init__(self, n_outputs, mod_path, load_path, load_name, linear):
        super().__init__()
        self.preprocess = nn.Sequential(
            torchaudio.transforms.MelSpectrogram(
                sample_rate=SAMPLE_RATE,
                n_fft=4096,
                hop_length=512,
                f_min=f_min_df,
                f_max=f_max_df,
                n_mels=64,
                power=2.,
            ),
            torchaudio.transforms.AmplitudeToDB(stype='power'),
        )

        # Every weight comes from `mod_path`, so do not fetch the hub's
        # pretrained weights first only to overwrite them.
        self.resnest = torch.hub.load(load_path, load_name, pretrained=False)
        # map_location keeps loading independent of the device the checkpoint was saved from.
        self.resnest.load_state_dict(torch.load(mod_path, map_location='cpu'))

        # Replacement head; 2048 is the feature width assumed for the backbone's
        # final layer — TODO confirm for entry points other than resnest50.
        head = [
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, n_outputs),
        ]
        if not linear:
            head.append(nn.Softmax(dim=1))
        self.resnest.fc = nn.Sequential(*head)

    def forward(self, x):
        """Normalise the waveform batch, build spectrograms and run the backbone.

        The input tensor is left untouched (all arithmetic is out of place).
        NOTE(review): mean / max / std are taken over the whole batch, not per
        sample, so a sample's normalisation depends on its batch mates.
        """
        eps = torch.finfo(x.dtype).eps
        x = x - torch.mean(x)
        # clamp guards against division by zero for an all-constant batch
        x = x / torch.clamp(torch.max(torch.abs(x)), min=eps)
        mel_spec = self.preprocess(x)
        mel_spec = mel_spec - torch.mean(mel_spec)
        mel_spec = mel_spec / torch.clamp(torch.std(mel_spec), min=eps)
        # Replicate the spectrogram along a new dim 1 — presumably the 3 colour
        # channels the image backbone expects.
        mel_spec = torch.stack((mel_spec, mel_spec, mel_spec), dim=1)
        return self.resnest(mel_spec)

In [None]:
import warnings
# Hide warnings whose text starts with this message (presumably raised while librosa loads the .flac files — TODO confirm).
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")

def get_segment(audio, sr, t_min, t_max):
    """Return the ``[t_min, t_max)`` seconds of ``audio``, zero-mean and peak-normalised.

    Args:
        audio: 1-D sample sequence (NumPy array or anything ``np.asarray`` accepts).
        sr: Sample rate used to convert seconds to sample indices.
        t_min: Segment start in seconds.
        t_max: Segment end in seconds.

    Returns:
        A new array; ``audio`` itself is never modified. A segment with zero
        peak after mean removal is returned unscaled (all zeros) instead of
        being divided by zero.
    """
    start_idx = int(sr * t_min)
    end_idx = int(sr * t_max)
    seg = np.asarray(audio)[start_idx:end_idx]
    # Out-of-place arithmetic: the slice above is a view, so `-=` / `/=` would
    # silently rewrite the caller's recording.
    seg = seg - np.mean(seg)
    peak = np.max(np.abs(seg))
    if peak > 0:
        seg = seg / peak
    return seg

def get_stft(audio, sr, f_min, f_max, n_fft, hop_len):
    """Magnitude STFT of ``audio`` cropped to the bins spanning ``f_min``..``f_max``.

    The lower bin index is rounded down and the upper one rounded up, and the
    upper bin is included in the returned rows.
    """
    lowest_bin = int(np.floor(f_min / sr * n_fft))
    highest_bin = int(np.ceil(f_max / sr * n_fft))
    magnitudes = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_len))
    return magnitudes[lowest_bin:highest_bin + 1, :]

def find_call_segments(arr, window_size=50, hop_len=50):
    """Pick the peak position inside every above-average-energy window of ``arr``.

    ``arr`` is scanned with windows of ``window_size`` elements every
    ``hop_len`` elements. Windows whose sum is at least the mean window sum
    each contribute the index of their largest element.

    Args:
        arr: 1-D sequence of non-negative intensities.
        window_size: Window length in elements.
        hop_len: Step between window starts.

    Returns:
        List of ``int`` indices into ``arr``. If ``arr`` is no longer than one
        window the single global argmax is returned; an empty ``arr`` yields
        an empty list.
    """
    n = len(arr)
    if n == 0:
        return []
    if n <= window_size:
        return [int(np.argmax(arr))]

    # `+ 1`: a window that ends exactly on the last element is a full window
    # and must be kept (a strict `<` test used to drop it).
    windows = [(start, start + window_size)
               for start in range(0, n - window_size + 1, hop_len)]
    energies = [np.sum(arr[lo:hi]) for lo, hi in windows]
    threshold = np.mean(energies)

    return [int(lo + np.argmax(arr[lo:hi]))
            for (lo, hi), energy in zip(windows, energies)
            if energy >= threshold]

def get_call_indices(stft, sr, t_min, n_fft, hop_length):
    """Translate call peaks found in ``stft`` into sample positions of the full recording.

    The spectrogram is averaged over its first axis to get one intensity per
    frame, peaks are located with ``find_call_segments``, converted from
    frames to samples, and shifted by the segment's start (``t_min * sr``).
    """
    per_frame_intensity = np.mean(stft, axis=0)
    segment_offset = int(t_min * sr)
    return [
        segment_offset + librosa.frames_to_samples(frame, hop_length=hop_length, n_fft=n_fft)
        for frame in find_call_segments(per_frame_intensity)
    ]

def get_audio_segment(audio, sr, mid_idx, segment_len_samples):
    """Slice ``segment_len_samples`` samples of ``audio`` centred on ``mid_idx``.

    The window is shifted (not shortened) when it would run past either end of
    ``audio``. If ``audio`` has no more than ``segment_len_samples`` samples
    the whole of it is returned.

    Args:
        audio: Sliceable sample sequence.
        sr: Unused; kept so existing callers keep working.
        mid_idx: Desired centre sample.
        segment_len_samples: Length of the returned slice.

    Returns:
        ``audio[start:start + segment_len_samples]`` (a slice of the input).
    """
    n_samples = len(audio)
    if n_samples <= segment_len_samples:
        # Nothing to centre: hand back everything, including the last sample.
        return audio[0:n_samples]

    # Integer arithmetic keeps the window exactly `segment_len_samples` long,
    # also for odd lengths near the start of the recording.
    start_idx = int(mid_idx) - segment_len_samples // 2
    start_idx = min(max(start_idx, 0), n_samples - segment_len_samples)
    return audio[start_idx:start_idx + segment_len_samples]

# Length of every clip cut around a detected call and saved to disk.
CALL_LEN_SECONDS = 1.0
CALL_LEN_SAMPLES = int(CALL_LEN_SECONDS * SAMPLE_RATE)

# Length of the crop actually fed to the model (random crop of a clip in training, sliding chunk at test time).
INPUT_LEN_SECONDS = 0.5
INPUT_LEN_SAMPLES = int(INPUT_LEN_SECONDS * SAMPLE_RATE)

# STFT settings used only to locate calls inside each annotated box
# (loop-invariant, so defined once).
n_fft = 1024
hop_length = 512

# For every annotated box: find the call peaks inside it, cut a fixed-length
# clip around each peak from the full recording and store it as its own tensor file.
data = []
for row_idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    fpath = TRAIN_DIR / (row['recording_id'] + '.flac')
    audio, sr = librosa.load(fpath, sr=SAMPLE_RATE)
    seg = get_segment(audio, sr, row['t_min'], row['t_max'])
    stft = get_stft(seg, sr, row['f_min'], row['f_max'], n_fft, hop_length)
    call_mid_indices = get_call_indices(stft, sr, row['t_min'], n_fft, hop_length)
    for call_mid_idx in call_mid_indices:
        call = get_audio_segment(audio, sr, call_mid_idx, CALL_LEN_SAMPLES)
        assert len(call) == CALL_LEN_SAMPLES, (
            f"{row['recording_id']}: clip has {len(call)} samples, expected {CALL_LEN_SAMPLES}"
        )
        # Random file name: one row of `df` can produce several clips.
        fname = str(uuid.uuid4()) + '.pt'
        torch.save(torch.from_numpy(call), dest / fname)
        data.append((row['recording_id'], row['species_id'], fname))

# One row per saved clip.
df_segmented = pd.DataFrame(data, columns=['recording_id', 'species_id', 'filename'])

In [None]:
# Preview the per-clip table (recording_id, species_id, filename).
df_segmented.head()

In [None]:
class RainforestDataset(Dataset):
    """In-memory dataset of saved call clips that serves random fixed-length crops.

    Args:
        dataframe: Table with a ``species_id`` column (label) and a
            ``filename`` column (tensor file inside ``dest``). All files are
            loaded eagerly in the constructor.
    """

    def __init__(self, dataframe):
        # Number of distinct species present in *this* dataframe.
        self.n_species = len(set(list(dataframe['species_id'])))
        self.labels = list(dataframe['species_id'])
        self.waveforms = [torch.load(dest / fname) for fname in dataframe['filename']]

        # Used by show_sample only; parameters mirror AudioResNest.preprocess
        # (n_mels was 65 here, one more than the model's 64).
        self.preprocess = nn.Sequential(
            torchaudio.transforms.MelSpectrogram(
                sample_rate=SAMPLE_RATE,
                n_fft=4096,
                hop_length=512,
                f_min=f_min_df,
                f_max=f_max_df,
                n_mels=64,
                power=2.,
            ),
            torchaudio.transforms.AmplitudeToDB(stype='power'),
        )

    def get_n_species(self):
        """Return the number of distinct species seen at construction time."""
        return self.n_species

    def __len__(self):
        return len(self.waveforms)

    def __getitem__(self, idx):
        """Return ``(crop, label)`` where crop is a random INPUT_LEN_SAMPLES window of clip ``idx``."""
        waveform = self.waveforms[idx]
        offset = random.randrange(CALL_LEN_SAMPLES - INPUT_LEN_SAMPLES)
        waveform = waveform[offset:offset+INPUT_LEN_SAMPLES]
        return waveform, self.labels[idx]

    def show_sample(self, idx):
        """Play clip ``idx`` and plot its normalised log-mel spectrogram."""
        # Clone first: normalising the stored tensor in place would change the
        # data that __getitem__ later serves for training.
        seg = self.waveforms[idx].clone()
        seg -= torch.mean(seg)
        seg /= torch.max(torch.abs(seg))
        ipd.display(ipd.Audio(seg, rate=SAMPLE_RATE))
        specgram = self.preprocess(seg)
        specgram -= torch.mean(specgram)
        specgram /= torch.std(specgram)
        print(specgram.shape)
        plt.figure()
        plt.imshow(specgram)
        plt.title(f'Sample {idx} (species {self.labels[idx]})')

    def show_random_sample(self):
        """Call ``show_sample`` on a uniformly chosen index."""
        idx = random.randrange(len(self))
        self.show_sample(idx)

In [None]:
# Dataset over every extracted clip; preview one random clip (audio player + spectrogram plot).
ds = RainforestDataset(df_segmented)
ds.show_random_sample()

In [None]:
# Hand cached-but-unused GPU memory back before training starts.
torch.cuda.empty_cache()

In [None]:
# Training schedule: cross-validation folds, and epochs run for each fold.
n_splits = 3
n_epochs = 10

In [None]:
batch_size = 16
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# An explicit random_state pins fold membership, independent of how much of
# the global NumPy RNG earlier cells have consumed.
# NOTE(review): folds are stratified by species only, so clips cut from the
# same recording can end up in both train and validation.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rng_seed)
targets = df_segmented.species_id

# Each entry: [checkpoint path, torch.hub repo, torch.hub entry point, AudioResNest `linear` flag].
# Alternative configurations (disabled):
# params = [['../input/timm-resnest-weights/resnest50-528c19ca.pth',
#            'zhanghang1989/ResNeSt',
#            'resnest50',
#            True],
#           ['../input/timm-resnest-weights/resnest50_fast_1s4x24d-d4a4f76f.pth',
#            'zhanghang1989/ResNeSt',
#            'resnest50',
#            False],
#           ['../input/timm-resnest-weights/gluon_resnest26-50eb607c.pth',
#            'zhanghang1989/ResNeSt',
#            'resnest50',
#            False
#           ]]

params = [['../input/timm-resnest-weights/resnest50_fast_1s4x24d-d4a4f76f.pth',
           'zhanghang1989/ResNeSt',
           'resnest50',
           True]]

for p in params:
    print(f'Training for {p}')
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(targets)), targets)):
        print(f'Training with fold {fold_idx}')
        # NOTE(review): the name depends on the fold only, so a second entry in
        # `params` would overwrite the checkpoints of the first.
        weights_path = weights_dir / f'weights_{fold_idx}.pt'
        # skf.split yields positions, not index labels -> .iloc
        train_ds = RainforestDataset(df_segmented.iloc[train_idx])
        val_ds = RainforestDataset(df_segmented.iloc[val_idx])
        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=batch_size)

        model = AudioResNest(train_ds.get_n_species(), p[0], p[1], p[2], p[3])

        optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3, amsgrad=False, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=4, verbose=True, factor=0.1)

        criterion = nn.CrossEntropyLoss()

        model = model.to(device)
        criterion = criterion.to(device)

        best_val_acc = 0.

        for epoch in tqdm(range(n_epochs), desc='Training'):
            start_time = time.time()
            model.train()
            train_loss = 0.
            train_correct = 0

            for x_batch, y_batch in train_dl:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                logits = model(x_batch)
                loss = criterion(logits, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
                # .item(): keep the running count a plain int instead of a device tensor
                train_correct += (torch.argmax(logits, dim=1) == y_batch).sum().item()

            train_loss /= len(train_dl)
            train_acc = train_correct / len(train_ds)

            model.eval()
            val_loss = 0.
            val_correct = 0
            with torch.no_grad():
                for x_batch, y_batch in val_dl:
                    x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                    logits = model(x_batch)
                    loss = criterion(logits, y_batch)
                    val_loss += loss.item()
                    val_correct += (torch.argmax(logits, dim=1) == y_batch).sum().item()

            val_loss /= len(val_dl)
            val_acc = val_correct / len(val_ds)

            elapsed = time.time() - start_time
            print(f'Epoch {epoch} (time: {elapsed:.0f}s): train_loss: {train_loss}, train_acc: {train_acc}, val_loss: {val_loss}, val_acc: {val_acc}')

            # Checkpoint on validation accuracy; the scheduler follows validation loss.
            if val_acc > best_val_acc:
                print(f'Saving new best model at epoch {epoch} (val_acc improved from {best_val_acc} to {val_acc})')
                # The whole module object is pickled (get_probabilities loads it back as such).
                torch.save(model, weights_path)
                best_val_acc = val_acc

            scheduler.step(val_loss)

In [None]:
# # using ResNeSt-50 as an example
# from resnest.torch import resnest50
# net = resnest50(pretrained=True)

In [None]:
# import torchvision.models as models
# resnet18 = models.resnet18(pretrained=True)

In [None]:
# Delete the clip tensor files; datasets created earlier keep their waveforms in memory,
# but building a new RainforestDataset after this point requires re-running the extraction cell.
shutil.rmtree(dest)

In [None]:
def load_test_file(fpath, n_chunks=5):
    """Load a recording and return its highest-energy model-sized chunks.

    The audio is made zero-mean and peak-normalised, cut into windows of
    ``INPUT_LEN_SAMPLES`` with 50 % overlap (plus one window aligned to the
    end), and the ``n_chunks`` windows with the largest sum of squares are kept.

    Args:
        fpath: Audio file to load.
        n_chunks: Maximum number of chunks to return (default 5).

    Returns:
        Tensor with one chunk per row, ordered from highest to lowest energy.
    """
    # Load at SAMPLE_RATE: INPUT_LEN_SAMPLES is defined in samples at that
    # rate, and the training clips were loaded the same way.
    audio, _ = librosa.load(fpath, sr=SAMPLE_RATE)
    audio -= np.mean(audio)
    peak = np.max(np.abs(audio))
    if peak > 0:  # leave an all-silent file at zero instead of dividing by zero
        audio /= peak

    hop_len = INPUT_LEN_SAMPLES // 2
    chunks = [audio[i : i + INPUT_LEN_SAMPLES] for i in range(0, len(audio) - INPUT_LEN_SAMPLES, hop_len)]
    # Tail window so the end of the file is always covered.
    chunks.append(audio[-INPUT_LEN_SAMPLES:])

    loudest = sorted(chunks, key=lambda chunk: np.sum(chunk**2), reverse=True)[:n_chunks]
    return torch.stack([torch.from_numpy(chunk) for chunk in loudest])

def get_probabilities(melspecs, weights_dir, device, n_classes, batch_size=512):
    """Average the softmax outputs of every saved model and reduce to one score per class.

    Args:
        melspecs: Tensor of model inputs, one per row. (Despite the name, the
            callers in this notebook pass raw waveform chunks.)
        weights_dir: Directory whose entries are all loaded as pickled models.
        device: Device on which inference runs.
        n_classes: Number of model outputs.
        batch_size: Inference batch size.

    Returns:
        List of ``n_classes`` NumPy floats: for each class, the maximum over
        input rows of the fold-averaged probability. With no checkpoint in
        ``weights_dir`` every score is 0.
    """
    ds = TensorDataset(melspecs)
    dl = DataLoader(ds, batch_size=batch_size)
    probs = torch.zeros((len(melspecs), n_classes))
    # Sorted so that checkpoints are always visited in the same order.
    ws = sorted(weights_dir.iterdir())
    for w in ws:
        # NOTE(review): this unpickles a full model object — only load files
        # this notebook wrote itself. Recent PyTorch releases may require
        # weights_only=False here; confirm against the installed version.
        model = torch.load(w, map_location=device)
        model.to(device)
        model.eval()
        fold_probs = []
        with torch.no_grad():  # inference only: no autograd graph needed
            for batch in dl:
                x = batch[0].to(device)
                logits = model(x).detach().cpu()
                fold_probs.append(F.softmax(logits, dim=1))
        probs += torch.vstack(fold_probs)
    if ws:
        # Mean over folds so the values stay in [0, 1] (they used to be a plain sum).
        probs /= len(ws)
    max_prob_per_class, _ = probs.max(dim=0)
    return list(max_prob_per_class.numpy())

In [None]:
# Write one row per test recording: recording id followed by the 24 species scores.
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id'] + [f's{i}' for i in range(24)])

    # Scoring and writing must happen inside the loop, once per file.
    # Sorted for a reproducible row order.
    # NOTE(review): get_probabilities reloads every fold checkpoint for each file.
    for fpath in tqdm(sorted(TEST_DIR.iterdir())):
        data = load_test_file(fpath)
        # `train_ds` is the dataset left over from the last training fold.
        maxed_output = get_probabilities(data, weights_dir, device, train_ds.get_n_species())
        write_array = [fpath.stem]
        for out in maxed_output:
            write_array.append(out.item())
        submission_writer.writerow(write_array)

print('Submission generated')

# Test demo

In [None]:
# Three training recordings; only referenced by the commented-out loop in the next cell.
paths_train = [
    f'/kaggle/input/rfcx-species-audio-detection/train/{recording_id}.flac'
    for recording_id in ('00204008d', '003b04435', '0079ff47b')
]

In [None]:
# Run the test-time chunking on a single training recording.
fpath = '/kaggle/input/rfcx-species-audio-detection/train/006ab765f.flac'
# for fpath in paths_train:
print(fpath)
data_train = load_test_file(fpath)

In [None]:
# Score the chunks with every saved fold model; `train_ds` is whatever the last training fold left behind.
maxed_output = get_probabilities(data_train, weights_dir, device, train_ds.get_n_species())

In [None]:
# Index of the highest-scoring class for this recording.
np.argmax(maxed_output)

In [None]:
# Annotation tables, read through DATA_DIR like the rest of the notebook instead of
# repeating the absolute Kaggle path.
mapping_tp = pd.read_csv(DATA_DIR / 'train_tp.csv')
mapping_fp = pd.read_csv(DATA_DIR / 'train_fp.csv')

In [None]:
# Distinct species ids present in train_tp.
mapping_tp.species_id.unique()

In [None]:
# Play and plot clip 1 of the full clip dataset `ds`.
ds.show_sample(1)

In [None]:
# Index label of the first train_tp row for the recording loaded into `data_train`; raises IndexError if there is none.
mapping_tp.loc[mapping_tp.recording_id == '006ab765f'].index[0]

In [None]:
# First five rows of train_tp.
mapping_tp.iloc[:5]

In [None]:
# All train_fp rows for recording 003b04435.
mapping_fp.loc[mapping_fp.recording_id == '003b04435']