In [None]:
!pip install soundfile

In [None]:
!pip install efficientnet_pytorch

In [None]:
!pip install torchcontrib

In [None]:
# Import FMix's `sample_and_apply` from the attached FMix source folder:
# change into that folder, import from its `fmix` module, then change back to
# /kaggle/working.
%cd ../input/fmixpytorch/FMix-master
from fmix import sample_and_apply
%cd /kaggle/working

In [None]:
# `copy` (used for copy.deepcopy of the model) is part of the standard library,
# so import the version that ships with the running interpreter instead of a
# CPython source checkout, which may target a different Python version.
import copy

In [None]:
import pandas as pd
import numpy as np                                                             
import soundfile as sf
import matplotlib.pyplot as plt
import torch
import torchaudio
from torchvision import transforms
from torch.utils.data import Dataset
from sklearn import model_selection
from PIL import Image
import albumentations
from torch.utils.data import DataLoader
import efficientnet_pytorch
import torch.nn.functional as F
from torchcontrib.optim import SWA
import gc
import torch.nn as nn
from sklearn.metrics import label_ranking_average_precision_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
from torch import device as device_

if torch.cuda.is_available():
    device = device_("cuda")
else:
    device = device_("cpu")
print(device)

In [None]:
# Smoke-test the audio pipeline on a single training recording: read a short
# slice, convert it to a spectrogram and plot the first channel on a log2 scale.
filename = '../input/rfcx-species-audio-detection/train/06c44d203.flac'

# 2880000/60 scales the two time values into the frame offsets passed to sf.read
# (presumably samples per second of the recordings -- TODO confirm).
clip_start = int((2880000/60) * 1.28)
clip_stop = int((2880000/60) * 2.0213)
waveform, sample_rate = sf.read(filename, start=clip_start, stop=clip_stop)

# numpy (num_samples,) -> torch (1, num_samples)
waveform = torch.from_numpy(waveform)
waveform = waveform.reshape(1, waveform.shape[0])

# Compute the spectrogram and replicate it into three identical channels.
to_spectrogram = torchaudio.transforms.Spectrogram()
specgram = to_spectrogram(waveform).repeat(3, 1, 1)

print("Shape of spectrogram: {}".format(specgram.size()))

plt.figure()
plt.imshow(specgram[0].log2().numpy(), cmap='bwr')

In [None]:
# Load the true-positive training annotations and show the row count per species.
df = pd.read_csv("../input/rfcx-species-audio-detection/train_tp.csv")
df["species_id"].value_counts()

In [None]:
# Number of distinct species labels in the annotations.
df.species_id.nunique()

In [None]:
# (rows, columns) of the annotation table.
df.shape

In [None]:
# Assign every annotation row to one of 5 folds, stratified by species_id.
# The shuffle is seeded so that fold membership (and therefore every score
# computed from it) is reproducible across kernel restarts.
FOLD_SEED = 42

df["kfold"] = -1    
df = df.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True)
y = df.species_id.values
kf = model_selection.StratifiedKFold(n_splits=5)

# kf.split yields (train_indices, validation_indices); each row is tagged with
# the fold in which it is part of the validation indices.
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f
df.head(10)

In [None]:
# Hold out one fold for validation; all remaining rows form the training frame.
fold = 0
is_holdout = df["kfold"] == fold
df_train = df.loc[~is_holdout].reset_index(drop=True)
df_valid = df.loc[is_holdout].reset_index(drop=True)

In [None]:
class audio_classification(Dataset):
    """Dataset of labelled audio clips rendered as 3-channel spectrogram images.

    For item ``index`` the ``[t_min, t_max]`` slice of the matching FLAC file is
    read, converted to a spectrogram, replicated to 3 channels, resized to
    256x256 (plus a random ShiftScaleRotate when ``is_valid != 1``) and returned
    together with a one-hot species label.

    Args:
        ids: sequence whose length defines the dataset size.
        recording_id: per-item recording file stems (indexed by position).
        t_min: per-item clip start times, multiplied by ``sample_rate`` to get
            the first frame to read.
        t_max: per-item clip end times, multiplied by ``sample_rate`` to get
            the frame to stop at.
        species_id: per-item integer class labels.
        is_valid: ``1`` disables the random augmentation (validation mode).
        sample_rate: factor converting ``t_min``/``t_max`` into frame offsets;
            defaults to the value the notebook previously hard-coded.
        num_classes: length of the one-hot label vector.
    """

    def __init__(self, ids, recording_id, t_min, t_max, species_id, is_valid = 0,
                 sample_rate = 2880000/60, num_classes = 24):
        self.ids = ids
        self.recording_id = recording_id
        self.t_min = t_min
        self.t_max = t_max
        self.species_id = species_id
        self.is_valid = is_valid
        self.sample_rate = sample_rate
        self.num_classes = num_classes

        # Build the spectrogram transform once instead of once per item.
        self.to_spectrogram = torchaudio.transforms.Spectrogram()

        # Every image is resized; only training images are randomly perturbed.
        steps = [albumentations.Resize(256 , 256, always_apply = True)]
        if self.is_valid != 1:
            steps.append(albumentations.ShiftScaleRotate(shift_limit = 0.0625,
                                                         scale_limit = 0.1 ,
                                                         rotate_limit = 5,
                                                         p = 0.9))
        self.aug = albumentations.Compose(steps)

    def __len__(self):
        """Return the number of items (one per entry in ``ids``)."""
        return len(self.ids)

    def __getitem__(self, index):
        """Return ``{'specgram': float tensor (3, 256, 256), 'label': float64 one-hot}``."""
        filename = "../input/rfcx-species-audio-detection/train/"+ self.recording_id[index] +".flac"
        waveform , _ = sf.read(filename,
                               start = int(self.sample_rate * self.t_min[index]),
                               stop = int(self.sample_rate * self.t_max[index]))

        # numpy (num_samples,) -> torch (1, num_samples)
        waveform = torch.from_numpy(waveform)
        waveform = torch.reshape(waveform, (1, waveform.shape[0]))

        # Single spectrogram replicated into three identical channels.
        specgram = self.to_spectrogram(waveform).repeat(3, 1, 1)

        # albumentations expects channel-last images; convert back afterwards.
        specgram = np.transpose(specgram.numpy(), (1,2,0))
        specgram = self.aug(image = specgram)['image']
        specgram = np.transpose(specgram, (2,0,1)).astype(np.float32)

        # One-hot label built directly instead of slicing a fresh identity matrix.
        label = torch.zeros(self.num_classes, dtype = torch.float64)
        label[int(self.species_id[index])] = 1.0

        return {
            'specgram' : torch.tensor(specgram, dtype = torch.float) ,
            'label' : label
        }

In [None]:
# Wrap the fold's training / validation frames in datasets; the validation
# dataset is created with is_valid = 1.
train_data = audio_classification(
    ids = list(range(len(df_train))),
    recording_id = df_train['recording_id'],
    t_min = df_train['t_min'],
    t_max = df_train['t_max'],
    species_id = df_train['species_id'],
)

val_data = audio_classification(
    ids = list(range(len(df_valid))),
    recording_id = df_valid['recording_id'],
    t_min = df_valid['t_min'],
    t_max = df_valid['t_max'],
    species_id = df_valid['species_id'],
    is_valid = 1,
)

In [None]:
# Inspect one validation item: print its one-hot label and plot channel 0 of
# its spectrogram on a log2 scale.
idx = 1

print(val_data[idx]['label'])

img = val_data[idx]['specgram']
plt.figure()
plt.imshow(img[0].log2().numpy(), cmap='bwr')

In [None]:
TRAIN_BATCH_SIZE = 8

# Training batches are shuffled and the last partial batch is dropped.
training_dataloader = DataLoader(
    train_data,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)

# Validation keeps a fixed order and every sample.
val_dataloader = DataLoader(
    val_data,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=4,
    shuffle=False,
    drop_last=False,
)

In [None]:
class EfficientNet_b5(nn.Module):
    """Pretrained EfficientNet-B5 feature extractor with a 24-way linear head."""

    def __init__(self):
        super().__init__()
        self.model = efficientnet_pytorch.EfficientNet.from_pretrained('efficientnet-b5')
        self.dropout = nn.Dropout(0.1)
        self.final_layer = nn.Linear(2048 , 24)

    def forward(self, image_inputs):
        """Map a 4-D image batch to a ``(batch, 24)`` tensor of logits."""
        # Unpacking enforces a 4-D input, exactly like the head expects.
        batch_size, _channels, _height, _width = image_inputs.shape

        features = self.model.extract_features(image_inputs)
        features = self.model._avg_pooling(features)
        # Collapse the spatial dimensions and flatten to (batch, features).
        pooled = F.adaptive_avg_pool2d(features, 1)
        flat = pooled.reshape(batch_size, -1)

        return self.final_layer(self.dropout(flat))
    
# Instantiate the network; the constructor loads the pretrained 'efficientnet-b5' backbone.
model = EfficientNet_b5()

In [None]:
EPOCHS = 6
# Optimizer steps per epoch. The training loader drops the last partial batch,
# so only full batches count (the previous formula also divided by EPOCHS,
# which under-reported the value).
num_train_steps = len(train_data) // TRAIN_BATCH_SIZE

# printing the no of training steps for each epoch of our training dataloader  
print(f'num_train_steps = {num_train_steps}')

model = model.to(device)

base_optimizer = torch.optim.Adadelta(model.parameters(), lr = 1e-3 * 0.95)

# Stochastic Weight Averaging wrapper around the base optimizer.
# NOTE(review): presumably SWA switches the learning rate to swa_lr once
# averaging starts (after swa_start steps), which would override any reduction
# made by ReduceLROnPlateau below -- confirm against the torchcontrib docs.
optimizer = SWA(base_optimizer, swa_start=5, swa_freq=5, swa_lr=0.05)

# Multi-label style loss on raw logits; call as loss_fn(logits, targets).
loss_fn = torch.nn.BCEWithLogitsLoss()

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience = 3, verbose = True)

In [None]:
# defining the training loop
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    """Train ``model`` for one pass over ``data_loader`` using FMix-mixed inputs.

    Args:
        data_loader: iterable of batches shaped like
            ``{"specgram": tensor, "label": tensor}``.
        model: network mapping a spectrogram batch to logits.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional scheduler; when given, ``scheduler.step(train_loss)``
            is called once at the end of the pass.

    Returns:
        ``(train_loss, lrap)`` where ``train_loss`` is the mean of the per-batch
        losses and ``lrap`` is the label ranking average precision of the
        logits against the un-mixed labels.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    running_loss = 0.0
    # Collected per batch and concatenated once at the end (avoids re-copying
    # the growing arrays on every step).
    batch_targets = []
    batch_predictions = []

    model.train()

    alpha, decay_power = 1.0, 3.0

    for dataset in data_loader:
        image = dataset["specgram"]
        label = dataset["label"]

        # FMix: returns the mixed images, the permutation used to pair samples
        # and the mixing coefficient.
        image, perm, lambda_value = sample_and_apply(image, alpha, decay_power, (256, 256))

        image = image.to(device, dtype=torch.float)
        label = label.to(device, dtype=torch.float)

        optimizer.zero_grad()

        outputs = model(image)

        # Loss is interpolated between the original and the permuted labels
        # with the same coefficient that mixed the images.
        loss = loss_fn(outputs, label) * lambda_value + loss_fn(outputs, label[perm]) * (1 - lambda_value)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batch_targets.append(label.detach().cpu().numpy())
        batch_predictions.append(outputs.detach().cpu().numpy())

        del image, label
        gc.collect()
        torch.cuda.empty_cache()

    if not batch_targets:
        raise ValueError("data_loader produced no batches")

    # Each loss is already a batch mean, so average over the batches that were
    # actually processed rather than over a globally defined dataset size.
    train_loss = running_loss / len(batch_targets)
    all_targets = np.concatenate(batch_targets, axis=0)
    all_predictions = np.concatenate(batch_predictions, axis=0)
    train_label_ranking_average_precision_score = label_ranking_average_precision_score(all_targets, all_predictions)

    # The scheduler is optional (default None), so only step it when provided.
    if scheduler is not None:
        scheduler.step(train_loss)

    return train_loss, train_label_ranking_average_precision_score

In [None]:
def eval_loop_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        data_loader: iterable of batches shaped like
            ``{"specgram": tensor, "label": tensor}``.
        model: network mapping a spectrogram batch to logits.
        device: device the batch tensors are moved to.

    Returns:
        ``(valid_loss, lrap)`` where ``valid_loss`` is the mean of the per-batch
        losses and ``lrap`` is the label ranking average precision of the
        logits against the labels.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    running_loss = 0.0
    batch_targets = []
    batch_predictions = []

    model.eval()

    # No gradients are needed for evaluation; skipping autograd saves memory.
    with torch.no_grad():
        for dataset in data_loader:
            image = dataset["specgram"]
            label = dataset["label"]

            image = image.to(device, dtype=torch.float)
            label = label.to(device, dtype=torch.float)

            outputs = model(image)

            # The loss takes (logits, targets) in that order; the sigmoid is
            # applied to the first argument.
            loss = loss_fn(outputs, label)

            running_loss += loss.item()
            batch_targets.append(label.detach().cpu().numpy())
            batch_predictions.append(outputs.detach().cpu().numpy())

            del image, label
            gc.collect()
            torch.cuda.empty_cache()

    if not batch_targets:
        raise ValueError("data_loader produced no batches")

    # Each loss is already a batch mean, so average over the batches that were
    # actually processed rather than over a globally defined dataset size.
    valid_loss = running_loss / len(batch_targets)
    all_targets = np.concatenate(batch_targets, axis=0)
    all_predictions = np.concatenate(batch_predictions, axis=0)
    valid_label_ranking_average_precision_score = label_ranking_average_precision_score(all_targets, all_predictions)

    return valid_loss , valid_label_ranking_average_precision_score

In [None]:
def _run():
    """Run 5-fold cross-validation and return the best validated model snapshot.

    For every fold the training / validation frames are rebuilt from ``df``,
    wrapped in fresh datasets and loaders, and a freshly initialised model,
    optimizer and scheduler are trained for ``EPOCHS`` epochs. Starting each
    fold from scratch keeps its validation rows unseen by the model being
    scored. The snapshot with the highest validation label ranking average
    precision over all folds and epochs is saved to disk and returned.

    Returns:
        A deep copy of the model taken right after the epoch that achieved the
        best validation score.

    Raises:
        RuntimeError: if no epoch was run, so there is no model to save.
    """
    no_of_folds = 5
    best_accuracy = None
    best_model = None

    for i in range(no_of_folds):
        a_string = "*" * 20

        print(a_string, " FOLD NUMBER ", i, a_string)

        df_train = df[df.kfold != i].reset_index(drop=True)
        df_valid = df[df.kfold == i].reset_index(drop=True)

        # Build loaders from THIS fold's split (the module-level loaders only
        # ever cover a single fixed fold).
        fold_train_loader = DataLoader(
            audio_classification(ids = list(range(len(df_train))),
                                 recording_id = df_train['recording_id'],
                                 t_min = df_train['t_min'],
                                 t_max = df_train['t_max'],
                                 species_id = df_train['species_id']),
            num_workers = 4,
            batch_size = TRAIN_BATCH_SIZE,
            shuffle = True,
            drop_last = True)
        fold_valid_loader = DataLoader(
            audio_classification(ids = list(range(len(df_valid))),
                                 recording_id = df_valid['recording_id'],
                                 t_min = df_valid['t_min'],
                                 t_max = df_valid['t_max'],
                                 species_id = df_valid['species_id'],
                                 is_valid = 1),
            num_workers = 4,
            batch_size = TRAIN_BATCH_SIZE,
            shuffle = False,
            drop_last = False)

        # Fresh model / optimizer / scheduler per fold, configured like the
        # module-level ones.
        fold_model = EfficientNet_b5().to(device)
        fold_optimizer = SWA(torch.optim.Adadelta(fold_model.parameters(), lr = 1e-3 * 0.95),
                             swa_start=5, swa_freq=5, swa_lr=0.05)
        fold_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(fold_optimizer, patience = 3, verbose = True)

        for epoch in range(EPOCHS):
            print(f"Epoch --> {epoch+1} / {EPOCHS}")
            print(f"-------------------------------")

            train_loss, train_label_ranking_average_precision_score = train_loop_fn(fold_train_loader, fold_model, fold_optimizer, device, fold_scheduler)
            print('training Loss: {:.4f} & training Validation Label Ranking Average Precision Score : {:.2f}%'.format(train_loss, train_label_ranking_average_precision_score*100))

            valid_loss , valid_label_ranking_average_precision_score = eval_loop_fn(fold_valid_loader, fold_model, device)
            print('validation Loss: {:.4f} & Validation Label Ranking Average Precision Score : {:.2f}%'.format(valid_loss , valid_label_ranking_average_precision_score*100))

            # Snapshot exactly the weights that produced the best score, so the
            # saved model matches the reported number.
            if best_accuracy is None or valid_label_ranking_average_precision_score > best_accuracy:
                best_accuracy = valid_label_ranking_average_precision_score
                best_model = copy.deepcopy(fold_model)
        print('\n')

        # The fold's model is discarded here, so swapping the SWA average into
        # it afterwards would have no effect; release its memory instead.
        del fold_model, fold_optimizer, fold_scheduler
        gc.collect()
        torch.cuda.empty_cache()

    if best_model is None:
        raise RuntimeError("no epochs were run, so there is no model to save")

    torch.save(best_model.state_dict(),'./Audio_Classsification_GPU_CutMix_EfficientNet-B5_FOLD.pt')
    print()
    print("The highest accuracy we got among all the folds is {:.2f}%".format(best_accuracy*100))

    return best_model

if __name__ == "__main__":
    best_model = _run()

In [None]:
# Load the submission template and show its first five rows.
sample_submission = pd.read_csv("../input/rfcx-species-audio-detection/sample_submission.csv")
sample_submission.head(n=5)

In [None]:
class audio_test_set_classification(Dataset):
    """Dataset of unlabelled test recordings rendered as 3-channel spectrogram images.

    Each item reads the whole FLAC file, converts it to a spectrogram,
    replicates it to 3 channels, resizes it to 256x256 and applies a random
    ShiftScaleRotate, so repeated reads of the same item can differ.

    Args:
        ids: sequence whose length defines the dataset size.
        recording_id: per-item recording file stems (indexed by position).
    """

    def __init__(self, ids, recording_id):
        self.ids = ids
        self.recording_id = recording_id
        # Build the spectrogram transform once instead of once per item.
        self.to_spectrogram = torchaudio.transforms.Spectrogram()
        self.aug = albumentations.Compose([
               albumentations.Resize(256 , 256, always_apply = True),
               albumentations.ShiftScaleRotate(shift_limit = 0.0625,
                                                scale_limit = 0.1 ,
                                                rotate_limit = 5,
                                                p = 0.9)
            ])

    def __len__(self):
        """Return the number of items (one per entry in ``ids``)."""
        return len(self.ids)

    def __getitem__(self, index):
        """Return ``{'recording_id': stem, 'specgram': float tensor (3, 256, 256)}``."""
        filename = "../input/rfcx-species-audio-detection/test/"+ self.recording_id[index] +".flac"

        waveform , _ = sf.read(filename)

        # numpy (num_samples,) -> torch (1, num_samples)
        waveform = torch.from_numpy(waveform)
        waveform = torch.reshape(waveform, (1, waveform.shape[0]))

        # Single spectrogram replicated into three identical channels.
        specgram = self.to_spectrogram(waveform).repeat(3, 1, 1)

        # albumentations expects channel-last images; convert back afterwards.
        specgram = np.transpose(specgram.numpy(), (1,2,0))
        specgram = self.aug(image = specgram)['image']
        specgram = np.transpose(specgram, (2,0,1)).astype(np.float32)

        return {
            'recording_id' : self.recording_id[index],
            'specgram' : torch.tensor(specgram, dtype = torch.float)
        }

In [None]:
# One test item per row of the submission template.
test_data = audio_test_set_classification(
    ids = list(range(len(sample_submission))),
    recording_id = sample_submission['recording_id'],
)

test_dataloader = DataLoader(
    test_data,
    batch_size=8,
    num_workers=4,
    drop_last=False,
)

# Preview one test item: print its file name and plot channel 0 on a log2 scale.
idx = 111 
print(test_data[idx]['recording_id'],".flac")
img = test_data[idx]['specgram']
plt.figure()
plt.imshow(img[0].log2().numpy(), cmap='bwr')

In [None]:
# test time augmentation for running  inference 5 times
#
# The model is in eval mode, so feeding it the same tensor repeatedly yields
# identical outputs. Each TTA pass therefore re-iterates the test loader, which
# makes the test dataset re-apply its random augmentation and produce a new
# view of every recording. The per-pass predictions are then averaged.
# This relies on test_dataloader iterating in a fixed order (no shuffling) so
# that rows line up across passes.
N_TTA = 5

best_model.eval()

tta_sum = None

with torch.no_grad():
    for tta_pass in range(N_TTA):
        pass_preds = []
        for dataset in test_dataloader:
            specgram = dataset["specgram"].to(device, dtype=torch.float)
            pass_preds.append(best_model(specgram).detach().cpu().numpy())
        pass_preds = np.concatenate(pass_preds, axis=0)
        tta_sum = pass_preds if tta_sum is None else tta_sum + pass_preds

# Mean logits over all passes, one row per test recording.
final_preds = tta_sum / N_TTA

In [None]:
# Shape of the stacked prediction array (rows are concatenated along axis 0).
final_preds.shape

In [None]:
# Overwrite every column after the first with the model predictions.
sample_submission.iloc[:,1:] = final_preds

In [None]:
# Write the filled-in template to submission.csv without the index column.
sample_submission.to_csv("submission.csv", index=False)