In [3]:
!pip install torch==1.6.0
!pip install opencv-python
!pip install torchvision==0.2.2
!pip install albumentations
!pip install tensorflow
!pip install pytorch-lightning

Defaulting to user installation because normal site-packages is not writeable
Collecting torch==1.6.0
  Downloading torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl (748.8 MB)
[K     |████████████████████████████████| 748.8 MB 16 kB/s 
Installing collected packages: torch
Successfully installed torch-1.6.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting opencv-python
  Downloading opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl (49.5 MB)
[K     |████████████████████████████████| 49.5 MB 29.4 MB/s 
Installing collected packages: opencv-python
Successfully installed opencv-python-4.4.0.46
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting torchvision==0.2.2
  Downloading torchvision-0.2.2-py2.py3-none-any.w

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import typing as tp
import yaml
import random
import os
import sys
import soundfile as sf
import librosa
import cv2
import matplotlib.pyplot as plt
import time
import glob
from tqdm import tqdm

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

from pytorch_lightning.callbacks.early_stopping import EarlyStopping
# import resnest.torch as resnest_torch
from timm.models.efficientnet import tf_efficientnet_b0_ns


from torchvision import models

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
# from resnet import ResNet, Bottleneck

from albumentations.core.transforms_interface import DualTransform, BasicTransform
import albumentations as albu
from functools import partial


from sklearn.model_selection import StratifiedKFold

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

## util

In [2]:
# Central configuration dictionary plus the scalar constants shared by the
# cells below.
config_set = {
    'dataset': {
          'name': 'SpectrogramDataset',
          'params': {
            'img_size': 224, 
            # Audio front-end settings. LitModule reads n_fft, hop_length,
            # win_length, n_mels, fmin and fmax from this sub-dict.
            'melspectrogram_parameters': {
                'n_fft': 2048,
                'hop_length': 512,
                'win_length': 2048,
                'n_mels': 384, 
                'fmin': 50, 
                'fmax': 24000, 
            }
      }
    },
    # Per-split loader settings (presumably torch DataLoader keyword
    # arguments; they are not consumed in the visible part of this notebook).
    'loader': {
      'train': {
        'batch_size': 6,
        'shuffle': True,
        'num_workers': 2,
        'pin_memory': True,
        'drop_last': True,
      },
      'valid': {
        'batch_size': 2,
        'shuffle': False,
        'num_workers': 2,
        'pin_memory': True,
        'drop_last': True,
      }
    }
}
# Seed handed to set_seed().
SEED=1213
# Chunk length in seconds; signal_to_mel cuts int(SR * PERIOD) samples per chunk.
PERIOD = 10
# Number of classes: default `num_classes` of LitModule and length of the
# pos_weight vector built in get_criterion().
SPECIES_NUM = 24
# EPOCH and HOP_LEN are not referenced in the visible part of this notebook.
EPOCH = 50
HOP_LEN = 512
# Sample rate passed to LogmelFilterBank and used to size the audio chunks.
SR = 48000
# Number of audio chunks sent through the model per forward pass at inference.
BATCH = 2

In [3]:
# Alias read by the model-construction and inference cells.
config = config_set

In [4]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``
    module, NumPy's global generator, PyTorch and PyTorch's CUDA generator.

    Args:
      seed: integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Seed all generators once, before any model or data code runs.
set_seed(SEED)

In [5]:
# Dataset and output locations.
# NOTE(review): INPUT_ROOT is an absolute, machine-specific path; adjust it
# when running this notebook elsewhere.
INPUT_ROOT = Path("/home/knikaido/work/Rainforest-Connection/data")
RAW_DATA = INPUT_ROOT / "rfcx-species-audio-detection"
TRAIN_AUDIO_DIR = RAW_DATA / "train"
# TRAIN_RESAMPLED_AUDIO_DIRS = [
#   INPUT_ROOT / "birdsong-resampled-train-audio-{:0>2}".format(i)  for i in range(5)
# ]
TEST_AUDIO_DIR = RAW_DATA / "test"
# Plain string (not a Path): later cells build file names with
# `OUTPUT_DIR + '<name>'`, so the trailing slash matters.
OUTPUT_DIR = './output/'

In [6]:
# Load the pickled training annotation table and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_gby = pd.read_pickle(RAW_DATA / "train_gby_mel.pkl")
train_gby.head()

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,name
0,003bec244,[14],[1],[44.544],[2531.25],[45.1307],[5531.25],/home/knikaido/work/Rainforest-Connection/Git/...
1,006ab765f,[23],[1],[39.9615],[7235.16],[46.0452],[11283.4],/home/knikaido/work/Rainforest-Connection/Git/...
2,007f87ba2,[12],[1],[39.135999999999996],[562.5],[42.272],[3281.25],/home/knikaido/work/Rainforest-Connection/Git/...
3,0099c367b,[17],[4],[51.4206],[1464.26],[55.1996],[4565.04],/home/knikaido/work/Rainforest-Connection/Git/...
4,009b760e6,[10],[1],[50.0854],[947.461],[52.5293],[10852.7],/home/knikaido/work/Rainforest-Connection/Git/...


In [7]:
def mono_to_color(
    X: np.ndarray, mean=None, std=None,
    norm_max=None, norm_min=None, eps=1e-6
):
    """Turn a single-channel array into a 3-channel ``uint8`` image.

    The input is replicated three times along a new trailing axis,
    standardized, clipped to ``[norm_min, norm_max]`` and rescaled to
    ``[0, 255]``.

    Args:
      X: array to convert; the output has shape ``X.shape + (3,)``.
      mean: value subtracted before scaling; defaults to the mean of ``X``.
      std: value the centred data is divided by (plus ``eps``); defaults to
        the standard deviation of the centred data.
      norm_max: upper clipping bound in standardized units; defaults to the
        maximum of the standardized data.
      norm_min: lower clipping bound in standardized units; defaults to the
        minimum of the standardized data.
      eps: small constant guarding the division and the flatness check.

    Returns:
      ``np.uint8`` array of shape ``X.shape + (3,)``. All zeros when the
      standardized data is (almost) constant or when the clipping range is
      empty (``norm_max <= norm_min``).
    """
    # Stack X as [X, X, X] on a new last axis.
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Defaults are selected with `is None` rather than
    # truthiness so that an explicit 0 / 0.0 passed by the caller is honoured.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    # The second condition avoids a division by zero when the caller supplies
    # a degenerate clipping range.
    if (_max - _min) > eps and norm_max > norm_min:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

In [8]:
def get_criterion():
    """Build the training loss.

    Returns:
      ``nn.BCEWithLogitsLoss`` whose ``pos_weight`` is a vector of length
      ``SPECIES_NUM`` with every entry equal to ``SPECIES_NUM``.
    """
    class_weights = torch.ones(SPECIES_NUM) * SPECIES_NUM
    return nn.BCEWithLogitsLoss(pos_weight=class_weights)

In [9]:
def interpolate(x: torch.Tensor, ratio: int):
    """Upsample along the time axis by repeating each time step.

    Used to compensate for the time-resolution reduction introduced by the
    downsampling inside a CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, number of times each time step is repeated
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    # Unpacking doubles as a check that the input is exactly 3-D.
    _batch_size, _time_steps, _classes_num = x.shape
    # Every frame is emitted `ratio` times in a row along dim 1.
    return torch.repeat_interleave(x, ratio, dim=1)


def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
    """Bring framewise_output to exactly ``frames_num`` frames.

    Shorter inputs are padded at the end by repeating the last frame. Inputs
    that are already longer than ``frames_num`` are truncated (previously this
    case failed with a negative repeat count).

    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, number of frames the output must have
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]
    if missing < 0:
        # More frames than requested: keep only the first `frames_num`.
        return framewise_output[:, :frames_num, :]

    # Tensor for padding: the last frame repeated `missing` times.
    pad = framewise_output[:, -1:, :].repeat(1, missing, 1)

    # (batch_size, frames_num, classes_num)
    output = torch.cat((framewise_output, pad), dim=1)

    return output

def init_layer(layer):
    """Initialise a layer: Xavier-uniform weights and, if present, a zero bias.

    Args:
      layer: module exposing a ``weight`` tensor and optionally a ``bias``.
    """
    nn.init.xavier_uniform_(layer.weight)

    # Covers both "no bias attribute" and "bias disabled (None)".
    bias = getattr(layer, "bias", None)
    if bias is not None:
        bias.data.fill_(0.)
            
def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: bias to 0, weight to 1."""
    for param, value in ((bn.bias, 0.), (bn.weight, 1.0)):
        param.data.fill_(value)


class AttBlock(nn.Module):
    """Attention pooling head over the time axis.

    Two parallel 1x1 convolutions produce per-time-step attention scores
    (``att``) and per-time-step class outputs (``cla``). The attention scores
    are squashed with ``tanh`` and normalised with a softmax over time, then
    used as weights to pool the class outputs into one vector per sample.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 activation="linear",
                 temperature=1.0):
        """
        Args:
          in_features: number of input channels.
          out_features: number of output channels (classes).
          activation: ``"linear"`` or ``"sigmoid"``; applied to the ``cla``
            branch in :meth:`nonlinear_transform`.
          temperature: stored on the instance; not used by ``forward``.
        """
        super().__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True)

        # Not applied in forward(); kept so the module's parameter/buffer set
        # (and therefore its state_dict keys) stays unchanged.
        self.bn_att = nn.BatchNorm1d(out_features)
        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions and the batch-norm layer."""
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        """Pool per-time-step outputs with learned attention weights.

        Args:
          x: (n_samples, n_in, n_time)
        Returns:
          Tuple ``(pooled, norm_att, cla)`` where ``pooled`` is
          (n_samples, n_out) and ``norm_att`` / ``cla`` are
          (n_samples, n_out, n_time). ``norm_att`` sums to 1 over time.
        """
        # tanh bounds the scores before the softmax over the time axis.
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
          ValueError: if ``self.activation`` is neither ``"linear"`` nor
            ``"sigmoid"`` (previously this silently returned ``None``).
        """
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError(
            "Unsupported activation {!r}; expected 'linear' or 'sigmoid'".format(
                self.activation))

In [10]:
# Registry of available encoders, keyed by name. Each entry stores the encoder
# feature width ("features") and a zero-argument factory ("init_op") that
# builds the network with pretrained weights and drop_path_rate=0.2.
encoder_params = {
    "tf_efficientnet_b0_ns": {
        "features": 1280,
        "init_op": partial(tf_efficientnet_b0_ns, pretrained=True, drop_path_rate=0.2)
    }
}

In [11]:
class LitModule(pl.LightningModule):
    """Waveform classifier: log-mel front-end, EfficientNet-B0 encoder and an
    attention pooling head.

    ``forward`` takes raw waveforms and returns clip-level and frame-level
    class predictions.

    NOTE(review): ``training_step`` and ``validation_step`` rely on
    ``self.criterion`` and on the helpers ``mixup_data``, ``mixup_criterion``
    and ``lwlap_wrapper``; none of them is defined in the visible part of this
    notebook, so they must be provided before training. ``self.criterion`` is
    deliberately not created in ``__init__`` here because registering a loss
    module with buffers would add keys to the state dict.
    """

    def __init__(self, train_len, mel_params, num_classes=SPECIES_NUM):
        """
        Args:
          train_len: unused; kept so existing call sites keep working.
          mel_params: dict providing ``n_fft``, ``hop_length``, ``win_length``,
            ``n_mels``, ``fmin`` and ``fmax``.
          num_classes: number of output classes.
        """
        super().__init__()
        # Fixed STFT / log-mel settings.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        self.interpolate_ratio = 30  # Downsampled ratio
        # Pretrained tf_efficientnet_b0_ns backbone from the encoder registry.
        encoder_cfg = encoder_params['tf_efficientnet_b0_ns']
        self.encoder = encoder_cfg["init_op"]()

        in_features = 1024

        self.fc1 = nn.Linear(encoder_cfg["features"], in_features, bias=True)
        self.att_block = AttBlock(in_features, num_classes, activation="sigmoid")

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=mel_params['n_fft'], hop_length=mel_params['hop_length'],
            win_length=mel_params['win_length'], window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=SR, n_fft=mel_params['n_fft'],
            n_mels=mel_params['n_mels'], fmin=mel_params['fmin'], fmax=mel_params['fmax'], ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter (constructed but not applied in forward()).
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # One batch norm per feature map returned by the log-mel extractor.
        self.bn0 = nn.BatchNorm2d(mel_params['n_mels'])
        self.bn1 = nn.BatchNorm2d(mel_params['n_mels'])
        self.bn2 = nn.BatchNorm2d(mel_params['n_mels'])

        self.init_weight()

    def init_weight(self):
        """Initialise the projection layer and the three input batch norms."""
        init_layer(self.fc1)
        init_bn(self.bn0)
        init_bn(self.bn1)
        init_bn(self.bn2)

    def forward(self, x, istrain=True):
        """Run the model on a batch of raw waveforms.

        Args:
          x: batch of waveforms fed to the spectrogram extractor
            (assumed (batch_size, samples) -- TODO confirm against ``stft``).
          istrain: currently unused; augmentation inside forward is disabled.

        Returns:
          dict with
            ``framewise_output``: (batch_size, frames_num, classes_num),
            ``logit``: attention-pooled output of the ``cla`` convolution
              before the sigmoid, (batch_size, classes_num),
            ``clipwise_output``: attention-pooled sigmoid output,
              (batch_size, classes_num).
        """
        x = self.spectrogram_extractor(x)
        # batch_size x 1 x time_steps x freq_bins
        # The local `stft.LogmelFilterBank` returns three feature maps here
        # (presumably three log-mel variants -- TODO confirm).
        x, x1, x2 = self.logmel_extractor(x)
        # batch_size x 1 x time_steps x mel_bins

        frames_num = x.shape[2]

        # Move the mel axis to the channel position so BatchNorm2d normalises
        # per mel bin, then move it back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x1 = x1.transpose(1, 3)
        x1 = self.bn1(x1)
        x1 = x1.transpose(1, 3)

        x2 = x2.transpose(1, 3)
        x2 = self.bn2(x2)
        x2 = x2.transpose(1, 3)

        # Stack the three maps on the channel axis:
        # (batch size, channels, time, frequency)
        x = torch.cat((x, x1, x2), 1)

        x = self.encoder.forward_features(x)

        # Aggregate in frequency axis
        x = torch.mean(x, dim=3)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        # Same attention pooling as clipwise_output, but on the pre-sigmoid
        # class scores.
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        output_dict = {
            "framewise_output": framewise_output,
            "logit": logit,
            "clipwise_output": clipwise_output
        }

        return output_dict

    def configure_optimizers(self):
        """Return AdamW plus a per-epoch cosine warm-restart schedule.

        The optimizer is built from ``self.parameters()``; the previous code
        read a global variable named ``model`` and so only worked when such a
        global existed and referred to this very instance.
        """
        base_lr = 0.001
        optimizer = torch.optim.AdamW(self.parameters(), lr=base_lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=True)
        scheduler = {'scheduler': torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=base_lr * 0.01),
                                'name': 'learning_rate',
                                'interval':'epoch',
                                'frequency': 1}
        return [optimizer], [scheduler]

    def training_step(self, train_batch, batch_idx):
        """Compute the mixup loss on the clip-level output and log it.

        NOTE(review): if ``self.criterion`` is the ``BCEWithLogitsLoss`` from
        ``get_criterion()``, be aware that ``clipwise_output`` has already
        been through a sigmoid -- confirm this is intended.
        """
        x, y = train_batch
        x, y_a, y_b, lam = mixup_data(x, y, alpha=0.5)
        y_pred = self.forward(x, istrain=True)
        loss = mixup_criterion(self.criterion, y_pred['clipwise_output'], y_a, y_b, lam)
        self.log('train_loss', loss,  on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Return ``(loss, lwlap)`` for one validation batch and log both.

        The loss is the clip-level loss plus half the loss on the per-class
        maximum of the frame-level output.
        """
        x, y = val_batch
        y_pred = self.forward(x, istrain=False)
        loss_max = self.criterion(torch.max(y_pred['framewise_output'], 1)[0], y)
        loss_clip = self.criterion(y_pred['clipwise_output'], y)
        loss = loss_clip + 0.5*loss_max
        y_pred_act = y_pred['clipwise_output']
        lwlap_step, weight_step = lwlap_wrapper(y, y_pred_act)
        lwlap_step = (lwlap_step * weight_step).sum()
        self.log('val_loss', loss, on_epoch=True, prog_bar=True, logger=True)
        self.log('lwlap_score', lwlap_step, on_epoch=True, prog_bar=True, logger=True)
        return loss, lwlap_step

    def validation_epoch_end(self, validation_step_outputs):
        """Average the per-batch ``(loss, lwlap)`` pairs, print and log them.

        The pairs are unpacked directly instead of going through
        ``np.array``, which avoids converting tensors to NumPy (assumes both
        elements of each pair are tensors, as ``torch.stack`` requires).
        """
        step_losses = [step_out[0] for step_out in validation_step_outputs]
        step_scores = [step_out[1] for step_out in validation_step_outputs]
        mean_loss = torch.stack(step_losses).mean()
        mean_score = torch.stack(step_scores).mean()

        print('valid_epoch_loss = ', mean_loss)
        print('valid_epoch_lwlap = ', mean_score)
        self.log('valid_epoch_loss', mean_loss, prog_bar=True, logger=True)
        self.log('valid_epoch_lwlap', mean_score, prog_bar=True, logger=True)
        return mean_loss, mean_score

In [12]:
# Build the network for inference. The first argument (train_len) is not used
# by LitModule, hence the placeholder 0.
model = LitModule(0, config['dataset']['params']['melspectrogram_parameters'])

  "Empty filters detected in mel frequency basis. "


In [13]:
# Restore the trained weights. map_location='cpu' makes the load independent
# of the device the checkpoint was saved from; the model is moved to the
# inference device in a later cell.
model.load_state_dict(torch.load(OUTPUT_DIR + 'model', map_location='cpu'))

<All keys matched successfully>

In [14]:
def signal_to_mel(y, sr, mel_params, effective_length=None):
    """Split a waveform into consecutive, non-overlapping fixed-length chunks.

    Despite the name, no mel transform happens here; the chunks are raw
    samples cast to ``float32``.

    Args:
      y: np.ndarray waveform with samples along axis 0.
      sr: unused; kept for interface compatibility (the default chunk length
        is derived from the module-level ``SR``).
      mel_params: unused; kept for interface compatibility.
      effective_length: number of samples per chunk. Defaults to
        ``int(SR * PERIOD)``.

    Returns:
      ``float32`` array of shape ``(n_chunks, effective_length) + y.shape[1:]``.
      A trailing remainder shorter than one chunk is dropped. When ``y`` is
      shorter than one chunk the result has zero rows but keeps the same
      trailing dimensions and dtype (previously a 1-D float64 empty array).

    Raises:
      ValueError: if ``effective_length`` is not positive.
    """
    if effective_length is None:
        effective_length = int(SR * PERIOD)
    if effective_length <= 0:
        raise ValueError("effective_length must be positive, got {}".format(effective_length))

    y = np.asarray(y)
    n_chunks = len(y) // effective_length
    # Drop the incomplete tail, then view the rest as (n_chunks, chunk_len, ...).
    usable = y[:n_chunks * effective_length]
    chunks = usable.reshape((n_chunks, effective_length) + y.shape[1:])
    return chunks.astype(np.float32)

In [15]:
# Collect every .flac file in the test directory, sorted by path so the
# prediction order is deterministic; the cell displays how many were found.
test_wav_pathes = sorted(glob.glob(str(TEST_AUDIO_DIR / '*.flac')))
len(test_wav_pathes)

1992

In [16]:
# Clip-level inference: every test recording is cut into fixed-length chunks,
# the chunks go through the model BATCH at a time, and the per-chunk
# `clipwise_output` vectors are averaged into one prediction per recording.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval().to(device)
preds = []

# Inference only: no_grad avoids building the autograd graph for each batch.
with torch.no_grad():
    for path in tqdm(test_wav_pathes):
        y, sr = sf.read(path)
        mel_img = signal_to_mel(y, sr, config["dataset"]["params"])
        if mel_img.shape[0] == 0:
            # Without this the mean over an empty list would silently be NaN.
            raise ValueError(f"{path}: audio is shorter than one {PERIOD}s chunk")

        b_preds = []
        for b_start in range(0, mel_img.shape[0], BATCH):
            # Slicing past the end is safe, so the last batch may be smaller.
            mel_batch = torch.from_numpy(mel_img[b_start:b_start + BATCH]).to(device)
            pred = model(mel_batch, istrain=False)
            b_preds.extend(pred['clipwise_output'].cpu().numpy())
        preds.append(np.mean(b_preds, axis=0))

# One row per recording, one column per class.
preds = np.array(preds)

100%|██████████| 1992/1992 [02:18<00:00, 14.39it/s]


In [17]:
# Load the submission template; its prediction columns are overwritten below.
sub = pd.read_csv(str(RAW_DATA / 'sample_submission.csv'))

In [18]:
# Write the predictions into columns s0..s23.
# NOTE(review): assumes the template rows are in the same order as the sorted
# test_wav_pathes and that preds has one row per template row -- confirm.
sub.loc[:, 's0':'s23'] = preds

In [19]:
# Save the submission file (without the index column) under OUTPUT_DIR.
sub.to_csv(OUTPUT_DIR + '130_sub.csv', index=False)

In [20]:
# Display the filled submission table for a visual sanity check.
sub

Unnamed: 0,recording_id,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,s22,s23
0,000316da7,0.008565,0.000345,0.000871,0.531262,0.000050,0.033817,0.000169,0.001880,0.000128,0.000055,0.000030,0.000185,0.277754,0.000228,0.002744,0.000417,0.000173,0.001890,0.080444,0.000534,0.000824,0.000241,0.000358,0.001283
1,003bc2cb2,0.000187,0.193345,0.000046,0.221639,0.000293,0.000494,0.000195,0.001274,0.000028,0.000113,0.000462,0.000252,0.000040,0.000321,0.000343,0.000169,0.904719,0.000091,0.000005,0.000076,0.000085,0.001210,0.000013,0.000093
2,0061c037e,0.022046,0.001115,0.002901,0.207012,0.000837,0.114210,0.002348,0.115878,0.001536,0.003012,0.002117,0.007497,0.000074,0.000714,0.000507,0.001250,0.009131,0.013275,0.000166,0.003483,0.312101,0.000584,0.007449,0.016187
3,010eb14d3,0.918325,0.000167,0.000032,0.001351,0.000130,0.000630,0.000042,0.000102,0.251901,0.000089,0.000257,0.000033,0.000017,0.000047,0.000112,0.000024,0.000053,0.000010,0.010194,0.000249,0.000166,0.000798,0.002331,0.000026
4,011318064,0.000549,0.000110,0.000023,0.100294,0.000004,0.008597,0.000522,0.000746,0.000032,0.000003,0.000014,0.000176,0.000145,0.000648,0.827654,0.060311,0.000011,0.000128,0.097481,0.000088,0.000012,0.004502,0.000034,0.000324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1987,ff68f3ac3,0.000439,0.000012,0.000083,0.042681,0.015786,0.164982,0.000213,0.046398,0.000077,0.000225,0.000434,0.000781,0.000466,0.000685,0.000105,0.107990,0.000279,0.000383,0.000172,0.001027,0.069142,0.000066,0.000070,0.447579
1988,ff973e852,0.000163,0.000385,0.000170,0.008976,0.000044,0.000843,0.000335,0.588329,0.000307,0.065406,0.001579,0.008121,0.000052,0.000288,0.003748,0.070470,0.000740,0.049459,0.000091,0.000527,0.033657,0.000046,0.027300,0.006367
1989,ffa5cf6d6,0.000238,0.002620,0.001317,0.556760,0.000260,0.012209,0.000764,0.114726,0.000243,0.089620,0.000241,0.000765,0.001089,0.000980,0.001375,0.340814,0.005060,0.028921,0.000395,0.000497,0.000880,0.000744,0.000126,0.001661
1990,ffa88cbb8,0.000985,0.002954,0.006017,0.760843,0.000173,0.008583,0.000185,0.230889,0.000474,0.043133,0.000073,0.000528,0.007026,0.000621,0.002182,0.000230,0.153878,0.000564,0.000313,0.000747,0.005109,0.000603,0.000868,0.001242


In [None]:
# Frame-level inference variant: every test recording is cut into fixed-length
# chunks, the chunks go through the model BATCH at a time, and the prediction
# for a recording is the per-class maximum over all frames of all its chunks.
# NOTE(review): this rebinds `preds`, which the submission cells read; re-run
# them afterwards if these predictions should be written out.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval().to(device)
preds = []

# Inference only: no_grad avoids building the autograd graph for each batch.
with torch.no_grad():
    for path in tqdm(test_wav_pathes):
        y, sr = sf.read(path)
        mel_img = signal_to_mel(y, sr, config["dataset"]["params"])
        if mel_img.shape[0] == 0:
            # Without this the max over an empty array fails with an unclear error.
            raise ValueError(f"{path}: audio is shorter than one {PERIOD}s chunk")

        b_preds = []
        for b_start in range(0, mel_img.shape[0], BATCH):
            # Slicing past the end is safe, so the last batch may be smaller.
            mel_batch = torch.from_numpy(mel_img[b_start:b_start + BATCH]).to(device)
            pred = model(mel_batch)
            b_preds.extend(pred['framewise_output'].cpu().numpy())

        # (chunks, frames, classes) -> (chunks * frames, classes)
        b_preds = np.array(b_preds).reshape(-1, SPECIES_NUM)
        preds.append(np.max(b_preds, axis=0))

# One row per recording, one column per class.
preds = np.array(preds)