In [1]:
%%html
<style type='text/css'>
.CodeMirror{
font-size: 14px;
</style>
CUDA_LAUNCH_BLOCKING=1

In [2]:
# I had to find the right version of pytorch with the widget here https://pytorch.org/
# I *think* this will work with AWS
#!pip3 install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [3]:
# other dependencies
#!pip install timm ipywidgets

In [4]:
## nnAudio
#!pip install git+https://github.com/KinWaiCheuk/nnAudio.git#subdirectory=Installation

### 1 Import the kitchen sink

In [5]:
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [6]:
# humbug main imports

import os
import pandas as pd
import sys
sys.path.insert(0, os.path.abspath('../lib'))
import config
from evaluate import get_results
import numpy as np

# Troubleshooting and visualisation
import IPython.display as ipd


In [7]:
# humbug lib imports
from sklearn.metrics import accuracy_score
from PyTorch import config_pytorch
from datetime import datetime
import math
import pickle

from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score
from datetime import datetime
import os
import time

import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve
from sklearn.metrics import average_precision_score
import sys

from tqdm.notebook import tqdm

In [8]:
# additional pytorch tools
import random
import torchaudio
import torchaudio.transforms as T
import torchvision.transforms as VT
from torch.cuda.amp import autocast, GradScaler
from timm.scheduler.cosine_lr import CosineLRScheduler
import timm
import timm.optim
from timm.loss import BinaryCrossEntropy
from timm.utils import NativeScaler
from timm.models import model_parameters
from glob import glob

In [9]:
## nnAudio
from nnAudio import features
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset, DataLoader

In [10]:
#Global Training variables 
num_workers= 8
pin_memory=True
train_size = 100000
batch_size = 64
test_batch_size = 64

### Run all these function definition cells
These have been extracted from the lib folder and are here to make them more easily editable.  Most of the action happens in *get_feat_torch*, which does feature extraction and *train_model*

In [11]:
def load_model(filepath, model):
    # Instantiate model to inspect
    device = torch.device('cuda:0' if torch.cuda.is_available() else torch.device("cpu"))
    print(f'Training on {device}')
        
    if torch.cuda.device_count() > 1:
        print("Using data parallel")
        model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
    model = model.to(device)
    # Load trained parameters from checkpoint (may need to download from S3 first)


    if torch.cuda.is_available():
        map_location=lambda storage, loc: storage.cuda()
    else:
        map_location='cpu'
        
    checkpoint = model.load_state_dict(torch.load(filepath))

    return model

In [12]:
def test_model(model, test_loader, criterion, class_threshold=0.5, device=None):
    with torch.no_grad():
        if device is None:
            torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        
        sigmoid = nn.Sigmoid()
        test_loss = 0.0
        model.eval()
        
        all_y = []
        all_y_pred = []
        counter = 1
        for x, y, idx in tqdm(test_loader, desc='validation', leave=True):
            
            x, y = x.to(device), y.unsqueeze(1).float().to(device)
            
            y_pred = model(x)['prediction']
                        
            loss = criterion(y_pred, y)

            test_loss += loss.item()
            
            all_y.append(y.cpu().detach())
            all_y_pred.append(y_pred.cpu().detach())
            
            del x
            del y
            del y_pred
            
            counter +=1

        all_y = torch.cat(all_y)
        all_y_pred = torch.cat(all_y_pred)
        
        test_loss = test_loss/len(test_loader)
        test_acc = accuracy_score(all_y.numpy(), (sigmoid(all_y_pred).numpy() > class_threshold).astype(float))
    
    
    return test_loss, test_acc

In [13]:
def test_model_siamese(model, test_loader, criterion, class_threshold=0.5, device=None):
    with torch.no_grad():
        model.eval()
        mean_val_y_pred = []
        all_y_pred = []
        val_running_loss = 0.0
        for img1, img2, y in val_loader:
            x1 = img1.to(device)
            x2 = img2.to(device)
            y = y.to(device)
            model = model.to(device)
            y_pred = model(x1,x2)
            loss = criterion(y_pred, y)
            val_running_loss += loss.item()
            #test_loss += loss.item()
            #all_y.append(y.cpu().detach())
            all_y_pred.append(y_pred.cpu().detach())
            mean_val_y_pred.append(torch.mean(y_pred.cpu().detach()))
            
            del x1, 
            del x2
            del y
            del y_pred
            
        #all_y = torch.cat(all_y)
        all_y_pred = torch.cat(all_y_pred)
        avg_val_loss = val_running_loss / len(val_loader)
        #test_loss = test_loss/len(test_loader)
        #test_acc = accuracy_score(all_y.numpy(), (sigmoid(all_y_pred).numpy() > class_threshold).astype(float))
    
    
    return avg_val_loss

In [14]:
def train_siamese(model, train_loader, val_loader,test_loader , classes, test_batch_size):
    loss_scaler = NativeScaler()
    global_step = 0
    torch.manual_seed(0)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'Training on {device}')    
    model = model.to(device)
    criterion = BinaryCrossEntropy(smoothing=0.1)
    optimiser = timm.optim.RAdam(model.parameters(), lr=config_pytorch.lr/10)
    num_epochs = config_pytorch.epochs
    all_train_loss = []
    all_val_acc = []
    best_val_loss = np.inf
    best_train_loss = np.inf
    best_epoch = -1
    checkpoint_name = None
    overrun_counter = 0
    lr_log = []
        
    for e in range(num_epochs):
        train_running_loss = 0.0 
        model.train()
        all_y_pred = []
        y_pred_mean = []
        start_time = time.time()
        
        for ind , (img1, img2, y) in enumerate(train_loader):
            # Forward
            x1 = img1.to(device)
            x2 = img2.to(device)
            y = y.to(device)
            global_step += 1
            optimiser.zero_grad()
            # AMP
            if ind % 300 == 0 :
                elapsed_time = time.time()
                time_since_epoch = (elapsed_time - start_time)/60
                print("epoch = "+ str(e) + "processed batch " + str(ind) + " of " + str(len(train_loader)))
                print("duration = " + str(time_since_epoch))
                
            
            with autocast():
                y_pred = model(x1,x2)
                #print("y_pred = " + str(y_pred))
                loss = criterion(y_pred, y)
                   
            loss_scaler(loss, optimiser, parameters=model_parameters(model))
                       
            train_running_loss += loss.item()
            avg_train_loss = train_running_loss / len(train_loader)
            #y_pred_bat_mean = (torch.mean(y_pred.cpu().detach()))
            lr_log.append(optimiser.param_groups[0]['lr'])
            del x1
            del x2
            del y
        
#         optimiser.sync_lookahead()
        all_train_loss.append(train_running_loss/len(train_loader))
        val_loss = test_model_siamese(model, val_loader, criterion, 0.5, device=device)
        
        #check if the current val_loss is less than the best -val loss
        if val_loss < best_val_loss:
            print("updating the best val loss..")
            best_val_loss = val_loss
            print("saving the model ...")
            checkpoint_name = f'model_e{e}_{datetime.now().strftime("%Y_%m_%d_%H_%M_%S")}.pth'
            torch.save(model.state_dict(), os.path.join(config.model_dir, 'pytorch', checkpoint_name))
            print('Saving model to:', os.path.join(config.model_dir, 'pytorch', checkpoint_name)) 
            overrun_counter = -1
            #print("also get the test score over here")
            eval_siamese_dupe(model , test_loader, classes,test_batch_size)
            #eval_siamese(model , test_loader, classes)
        else:
            overrun_counter += 1
            
        if overrun_counter > config_pytorch.max_overrun:
            break
            
    
    return model, lr_log
        
   

In [15]:
def eval_siamese_dupe(model, test_loader,classes, test_bat_size = test_batch_size):
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    ind_other = classes.index('others')
    print("length of test loader = " + str(len(test_loader)))
    test_recs = len(test_loader)*test_bat_size
    print()
    with torch.no_grad():
        model.eval()
        correct = 0
        count = 0
        y_hat_list = []
        y_test = []
        y_hat_dict = {}
        for ind,(mainImg, imgSets, label) in enumerate(test_loader):
            #print("loader ind = " +str(ind))
            #print("num of images in the loader = " +str(len(mainImg)))
            #print(" len ImgSets = " +str(len(imgSets)))
            imgSets_stk = torch.stack(imgSets)
            #print("imgSets_stk shape = " + str(imgSets_stk.shape))
            imgSets_stk_tpose  = torch.transpose(imgSets_stk , 0 ,1)
            #print("imgSets_stk tpose = " + str(imgSets_stk_tpose.shape))
            bat_len = mainImg.shape[0]
            for i in range(bat_len):
                anc_img = mainImg[i,:,:]
                #print("anc_img shape =" + str(anc_img.shape))
                anc_img.to(device)
                label_anc_img = label[i]
                y_test.append(label_anc_img.cpu().detach().numpy().item())
                
                #now loop over images to determine their classification
                output_list = []
                testImg_stack = imgSets_stk_tpose[i,:,:,:].to(device)
                for j in range(imgSets_stk_tpose.shape[1]):
                    #print("i = " +str(i))
                    testImg = testImg_stack[j,:,:]
                    #print("testImg shape = " + str(testImg.shape))
                    output = model(anc_img.cuda(), testImg.cuda())
                    output_cpu = output.cpu().detach()
                    #print("output = \n" + str(output_cpu) )
                    output_list.append(output_cpu)
                    del output
                #print("output_list = " +str(output_list))
                #print("max value in the list = " +str(max(output_list)))
                y_hat_ind = float(output_list.index(max(output_list)))
                y_hat_list.append(y_hat_ind)
                #y_hat_dict.update(ind = str(y_hat_ind) )
                #with_in_loader_ind+=1
            
        #print("prediction list = " + str(y_hat_list))
        #print("Label  list = " + str(y_test))
        
        print("Now printing classification report...")
        from sklearn.metrics import classification_report
        #print("len of y_hat = " + str(len(y_hat_list)))
        #print("len of y_test = " + str(len(y_test)))
        
        print(classification_report(np.array(y_test), np.array(y_hat_list), target_names= classes))
        
     
            
#                 if output > predVal:
#                     pred = i
#                     predVal = output
#             label = label.to(device)
#             if pred == label:
#                 correct += 1
#             count += 1
#             if count % 20 == 0:
#                 print("Current Count is: {}".format(count))
#                 print('Accuracy on n way: {}'.format(correct/count))

In [16]:
def eval_siamese(model, test_loader,classes, test_bat_size = 2):
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    ind_other = classes.index('others')
    print("length of test loader = " + str(len(test_loader)))
    with torch.no_grad():
        model.eval()
        correct = 0
        count = 0
        y_hat_list = []
        y_test = []
        y_hat_dict = {}
        for ind,(mainImg, imgSets, label) in enumerate(test_loader):
            print("loader ind = " +str(ind))
            print("len main Image = " + str(len(mainImg)))
            print("len label = " + str(len(label)))
            mainImg = mainImg.to(device)
            predVal = 0
            pred = -1
            y_hat = label.cpu().detach().numpy()
            print("y_hat = " +str(y_hat))
            y_test.append(y_hat)
            # determine which category an image belongs to
            output_list = []
            for i , testImg in enumerate(imgSets):
                print("i = " +str(i))
                testImg = testImg.to(device)
                output = model(mainImg, testImg)
                output_cpu = output.cpu().detach()
                print("output = \n" + str(output_cpu) )
                output_list.append(output_cpu)
                del output
            print("output_list = " +str(output_list))
            print("max value in the list = " +str(max(output_list)))
            y_hat_ind = output_list.index(max(output_list))
            y_hat_list.append(y_hat_ind)
            y_hat_dict.update(ind = str(y_hat_ind) )
            
        print("prediction list = " + str(y_hat_list))
        print("Label  list = " + str(y_test))
        
        print("Now printing classification report...")
        from sklearn.metrics import classification_report
        #print("len of y_hat = " + str(len(y_hat_list)))
        #print("len of y_test = " + str(len(y_test)))
        
        print(classification_report(y_test, y_hat_list, target_names= classes))
        
     
            
#                 if output > predVal:
#                     pred = i
#                     predVal = output
#             label = label.to(device)
#             if pred == label:
#                 correct += 1
#             count += 1
#             if count % 20 == 0:
#                 print("Current Count is: {}".format(count))
#                 print('Accuracy on n way: {}'.format(correct/count))

In [17]:
def get_offsets_df(df, short_audio=False):
    audio_offsets = []
    min_length = config.win_size*config.NFFT/(((1/config.n_hop)*config.NFFT)*config.rate)
    step_frac = config.step_size/config.win_size
    for _,row in df.iterrows():
        if row['length'] > min_length:
            step_size = step_frac*min_length
            audio_offsets.append({'id':row['id'], 'offset':0,'sound_type': row['sound_type'], 'length': row['length'],'specie_ind': row['specie_ind']})
            for i in range(1, int((row['length']-min_length)//step_size)):
                audio_offsets.append({'id': row['id'], 'offset':int(min_length+(i*step_size)*config.rate),'sound_type':row['sound_type'], 'length': row['length'],'specie_ind': row['specie_ind']})
        elif short_audio:
            audio_offsets.append({'id':row['id'], 'offset':0,'sound_type': row['sound_type'], 'length': row['length'],'specie_ind': row['specie_ind']})
    return pd.DataFrame(audio_offsets)       

In [18]:
def concat_df(df_offset, indices):
    list_df_ind = []
    #print("len of indices = " + str(len(indices)))
    for ind in indices :
        df_name = "df_"+ str(ind)
        df_name = df_offset[df_offset['specie_ind'] == ind]
        list_df_ind.append(df_name)
    df_offset_trimmed = pd.concat(list_df_ind)
    return(df_offset_trimmed)

In [19]:
#check the min length based on config params
min_length = (config.win_size * config.n_hop) / config.rate
min_length

1.92

### 3 The Data

### Read CSV and get train/test groups

In [20]:
classes = ['an arabiensis','culex pipiens complex', 'ae aegypti','an funestus ss','an squamosus',
               'an coustani','ma uniformis','ma africanus' , 'others']
classes_no_other = ['an arabiensis','culex pipiens complex', 'ae aegypti','an funestus ss','an squamosus',
               'an coustani','ma uniformis','ma africanus' ]
other_ind = classes.index('others')

In [21]:
df = pd.read_csv(config.data_df)
#df = df.loc[df['Grade'].notnull()]
df = df.loc[df['species'].notnull()]
df

Unnamed: 0,id,length,name,sample_rate,record_datetime,sound_type,species,gender,fed,plurality,age,method,mic_type,device_type,country,district,province,place,location_type
1,53,0.463456,CDC_Ae-aegypti_labelled_800.wav,8000,08-09-16 08:00,mosquito,ae aegypti,,,Single,,,phone,Alcatel 4009X,USA,Georgia,Atlanta,"CDC insect cultures, Atlanta",culture
2,57,0.170249,CDC_Ae-aegypti_labelled_800.wav,8000,08-09-16 08:00,mosquito,ae aegypti,,,Single,,,phone,Alcatel 4009X,USA,Georgia,Atlanta,"CDC insect cultures, Atlanta",culture
3,61,0.104041,CDC_Ae-aegypti_labelled_800.wav,8000,08-09-16 08:00,mosquito,ae aegypti,,,Single,,,phone,Alcatel 4009X,USA,Georgia,Atlanta,"CDC insect cultures, Atlanta",culture
4,69,0.274290,CDC_Ae-aegypti_labelled_800.wav,8000,08-09-16 08:00,mosquito,ae aegypti,,,Single,,,phone,Alcatel 4009X,USA,Georgia,Atlanta,"CDC insect cultures, Atlanta",culture
5,56,0.420894,CDC_Ae-aegypti_labelled_800.wav,8000,08-09-16 08:00,mosquito,ae aegypti,,,Plural,,,phone,Alcatel 4009X,USA,Georgia,Atlanta,"CDC insect cultures, Atlanta",culture
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8999,3562,6.083093,#988-1001.wav,44100,01-07-18 12:00,mosquito,an harrisoni,Female,t,Single,,ABN,telinga,olympus,Thailand,Sai Yok District,Kanchanaburi Province,field site near Pu Teuy Village,cup
9000,3556,6.719908,#988-1001.wav,44100,01-07-18 12:00,mosquito,an maculatus,Female,t,Single,,ABN,telinga,olympus,Thailand,Sai Yok District,Kanchanaburi Province,field site near Pu Teuy Village,cup
9009,3553,6.128580,#988-1001.wav,44100,01-07-18 12:00,mosquito,an maculatus,Female,t,Single,,ABN,telinga,olympus,Thailand,Sai Yok District,Kanchanaburi Province,field site near Pu Teuy Village,cup
9011,3561,11.614280,#988-1001.wav,44100,01-07-18 12:00,mosquito,an harrisoni,Female,t,Single,,ABN,telinga,olympus,Thailand,Sai Yok District,Kanchanaburi Province,field site near Pu Teuy Village,cup


In [22]:
# adding a colum for specie encoding
df['specie_ind'] = "NULL_VAL"

    

In [23]:
# Adding a new column to encode specie_index in the same order as the list "classes"
ind = 0
for specie in classes_no_other:
    print("specie = " + str(specie) + "and its index = " + str(ind) )
    row_indexes=df[df['species']==specie].index 
    df.loc[row_indexes,'specie_ind']= ind
    ind+=1

    
other_df_ind = df[df['specie_ind'] == "NULL_VAL"].index
df.loc[other_df_ind,'specie_ind']= other_ind                  

specie = an arabiensisand its index = 0
specie = culex pipiens complexand its index = 1
specie = ae aegyptiand its index = 2
specie = an funestus ssand its index = 3
specie = an squamosusand its index = 4
specie = an coustaniand its index = 5
specie = ma uniformisand its index = 6
specie = ma africanusand its index = 7


In [24]:
USE_SHORT_AUDIO = True

In [25]:
df_offset = get_offsets_df(df, short_audio=USE_SHORT_AUDIO)

In [26]:
from sklearn.model_selection import train_test_split
df_train_offset_temp,df_test_offset  = train_test_split(df_offset, test_size=0.2)
df_train_offset,df_val_offset  = train_test_split(df_train_offset_temp, test_size=0.2)


In [27]:
for i in range(0,len(classes)):
    df_temp = df_val_offset[df_val_offset['specie_ind'] == i]
    print("i = " +str(i))
    print(len(df_temp))

i = 0
7528
i = 1
3455
i = 2
566
i = 3
3350
i = 4
914
i = 5
497
i = 6
674
i = 7
317
i = 8
6989


In [28]:
#df_temp.reset_index(inplace = True)
df_train_offset.reset_index(inplace = True)
df_test_offset.reset_index(inplace = True)
df_val_offset.reset_index(inplace = True)



In [29]:
# get the frame offsets for each audio file into dataframes
# audio_df_train = get_offsets_df(df_train, short_audio=USE_SHORT_AUDIO)
# audio_df_test_A = get_offsets_df(df_test_A, short_audio=False)
# audio_df_test_B = get_offsets_df(df_test_B, short_audio=False)

In [30]:
# This function pads a file with 0s to make it a 1.92 sec file
def pad_zero(x_temp,rate = config.rate, min_length = config.min_duration ):
    #print("inside padding zero...")
    left_pad_amt = int((rate*min_length-x_temp.shape[1])//2)
    #print("left_pad_amt = " + str(left_pad_amt))
    left_pad = torch.zeros(1,left_pad_amt) #+ (0.1**0.5)*torch.randn(1, left_pad_amt)
    right_pad_amt = int(rate*min_length-x_temp.shape[1]-left_pad_amt)
    right_pad = torch.zeros(1,right_pad_amt)# + (0.1**0.5)*torch.randn(1, right_pad_amt)
    f = torch.cat([left_pad,x_temp,right_pad],dim=1)[0]
    f = f.unsqueeze(dim = 0)
    #print("returning a tensor of shape = " + str(f.shape))
    return(f)

In [31]:
class Moz_train_dataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, audio_df, setSize , data_dir, min_length, cache=None, transform=None,rate = config.rate):
        """
        Args:
            audio_df (DataFrame): from get_offsets_df function 
            data_dir (string): Directory with all the wavs.
            cache (dict): Empty dictionary used as cache
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.audio_df = audio_df
        self.data_dir = data_dir
        self.min_length = min_length
        self.transform = transform
        self.cache = cache
        self.setSize = setSize
        

    def __len__(self):
        return self.setSize
    
    def _get_tensor_(self, path, resample=None):
        waveform, inp_samp_rate = torchaudio.load(path)
        
        if inp_samp_rate != config.rate:
            import torchaudio.transforms as T
            resampler = T.Resample(inp_samp_rate, config.rate, dtype=waveform.dtype)
            waveform = resampler(waveform)
        
        if waveform.shape[1] < config.rate*config.min_duration :
            #print("need to pad")
            waveform = pad_zero(waveform)
            #waveform = waveform.unsqueeze(dim = 0)
        
        f = waveform[0]
        mu = torch.std_mean(f)[1]
        st = torch.std_mean(f)[0]
            #return waveform, rate, waveform
        f_out = torch.clamp(f, min=mu-st*3, max=mu+st*3).unsqueeze(0)
        
        return f_out, config.rate

    def __getitem__(self, idx):
        all_uniq_ind = self.audio_df.specie_ind.unique()
        if idx % 2 == 0:
            # select the same character for both images
            category = int(np.random.choice(a= all_uniq_ind, size= 1, replace=False))
            #select two images belonging to the category chosen.
            df_temp = self.audio_df[self.audio_df['specie_ind']== category].sample(2,replace = False )
            df_temp.reset_index(inplace = True)
            #print("df_temp = " + str(df_temp))
            row1 = df_temp.iloc[0]
            row2 = df_temp.iloc[1]
            #print("row1.id = " + str(row1.id))
            #print("row2.id = " + str(row2.id))
            label = 1.0
            # x_full  and x2_full represnts the entire tensor representations of the wav file
            x1_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{row1.id}.wav"), resample=config.rate)
            x2_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{row2.id}.wav"), resample=config.rate)
            #print("x1_full shape = " + str(x1_full.shape))
            #print("x2_full shape = " + str(x2_full.shape))
            
            
            r1_offset = row1.offset
            r2_offset = row2.offset
            #print("row1_offset = " + str(r1_offset))
            #print("row2_offset = " + str(r2_offset))
            
            x1 = x1_full[:,r1_offset:int(r1_offset+config.rate*self.min_length)]
            x2 = x2_full[:,r2_offset:int(r2_offset+config.rate*self.min_length)]
            #print("x1 shape = " + str(x1.shape))
            #print("x2 shape = " + str(x2.shape))
            
        else:
            #print("^^^^^^^^ODD INDEX^^^^^^^^^^")
            category1, category2 = np.random.choice(a= all_uniq_ind, size= 2, replace=False)
            df_temp_cat1 = self.audio_df[self.audio_df['specie_ind']== int(category1)].sample(1,replace = False )
            df_temp_cat2 = self.audio_df[self.audio_df['specie_ind']== int(category2)].sample(1,replace = False )
            df_temp_cat1.reset_index(inplace = True)
            df_temp_cat2.reset_index(inplace = True)
            label = 0.0
            
            row1 = df_temp_cat1.iloc[0]
            row2 = df_temp_cat2.iloc[0]
            #print("row1.id = " + str(row1.id))
            #print("row2.id = " + str(row2.id))
            
            # x_full  and x2_full represnts the entire tensor representations of the wav file
            x1_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{row1.id}.wav"), resample=config.rate)
            x2_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{row2.id}.wav"), resample=config.rate)
            #print("x1_full shape = " + str(x1_full.shape))
            #print("x2_full shape = " + str(x2_full.shape))
            
            r1_offset = row1.offset
            r2_offset = row2.offset
            #print("row1_offset = " + str(r1_offset))
            #print("row2_offset = " + str(r2_offset))
            
            x1 = x1_full[:,r1_offset:int(r1_offset+config.rate*self.min_length)]
            x2 = x2_full[:,r2_offset:int(r2_offset+config.rate*self.min_length)]
            #print("x1 shape = " + str(x1.shape))
            #print("x2 shape = " + str(x2.shape))
            
            
                                   
        
        return x1,x2,torch.from_numpy(np.array([label], dtype=np.float32))  
    
    

In [32]:
class Moz_test_dataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, audio_df, setSize , data_dir, min_length, cache=None, transform=None,rate = config.rate,numway = len(classes)):
        """
        Args:
            audio_df (DataFrame): from get_offsets_df function 
            data_dir (string): Directory with all the wavs.
            cache (dict): Empty dictionary used as cache
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.audio_df = audio_df
        self.data_dir = data_dir
        self.min_length = min_length
        self.transform = transform
        self.cache = cache
        self.setSize = setSize
        self.numway = numway

    def __len__(self):
        return self.setSize
    
    def _get_tensor_(self, path, resample=None):
        waveform, inp_samp_rate = torchaudio.load(path)
        
        if inp_samp_rate != config.rate:
            import torchaudio.transforms as T
            resampler = T.Resample(inp_samp_rate, config.rate, dtype=waveform.dtype)
            waveform = resampler(waveform)
        
        if waveform.shape[1] < config.rate*config.min_duration :
            #print("need to pad")
            waveform = pad_zero(waveform)
            #waveform = waveform.unsqueeze(dim = 0)
        
        f = waveform[0]
        mu = torch.std_mean(f)[1]
        st = torch.std_mean(f)[0]
            #return waveform, rate, waveform
        f_out = torch.clamp(f, min=mu-st*3, max=mu+st*3).unsqueeze(0)
        
        return f_out, config.rate

    def __getitem__(self, idx):
        all_uniq_ind = self.audio_df.specie_ind.unique()
        # find one main image
        all_uniq_ind = df.specie_ind.unique()
        category =  int(np.random.choice(a= all_uniq_ind, size= 1, replace=False))
        #sample a rando value from the category chosen above
        df_main = self.audio_df[self.audio_df['specie_ind']== category].sample(1,replace = False )
        df_main.reset_index(inplace = True)
        #print("df_main = " + str(df_main))
                 
        # find n numbers of distinct images, 1 in the same set as the main
        testSet = []
        label = int(np.random.choice(a= all_uniq_ind, size= 1, replace=False))
        #print("label ->" +str(label))
        #print("self.numway = " +str(self.numway))
        for i in range(self.numway):
            #print("i = " +str(i))
            if i == label:
                #estImgName = random.choice(os.listdir(imgDir))
                df_name = "df_temp_" + str(label)
                df_temp = self.audio_df[self.audio_df['specie_ind']== label].sample(1,replace = False )
                #print("i == label")
                #print("df_temp = " +str(df_temp))
                            
            else:
                df_name = "df_temp_" + str(i)
                testCategory = int(np.random.choice(a= all_uniq_ind, size= 1, replace=False))
                df_temp = self.audio_df[self.audio_df['specie_ind']== testCategory].sample(1,replace = False )
                
                

            testSet.append(df_temp)
        df_test = pd.concat(testSet, ignore_index=True)
        #print("df_test = " +str(df_test))
            #you loophere on the dataframe to get x's 
            
            
            
        # x_full  and x2_full represnts the entire tensor representations of the wav file
        #x1_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{row1.id}.wav"), resample=config.rate)
        main_row = df_main.iloc[0]
        #print(main_row)
        x_main_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{main_row.id}.wav"), resample=config.rate)
        #print("x_main_full shape = " + str(x_main_full.shape))
        
        main_offset = main_row.offset
        #print("main_offset  = " + str(main_offset))
        
        x_main = x_main_full[:,main_offset:int(main_offset+config.rate*self.min_length)]
        #print("x_main shape = " + str(x_main.shape))
        
        x_test = []
        for ind,row in df_test.iterrows():
            row_id = row['id']
            #print("inside loop.... row_id = " + str(row_id))
            x_full, _ = self._get_tensor_(os.path.join(self.data_dir,f"{str(row_id)}.wav"), resample=config.rate)
            #print("inside loop.... x_full = " + str(x_full.shape))
            offset = row['offset']
            #print("inside loop...offset  = " + str(offset))
            x_temp = x_full[:,offset:int(offset+config.rate*self.min_length)]
            #print("inside loop...x_temp shape   = " + str(x_temp.shape))
            
            x_test.append(x_temp)
            
        #x_test = torch.stack(x_test)
        #x_test_tpose = torch.transpose(x_test, 0,1)
        
        return x_main,x_test,torch.from_numpy(np.array([label], dtype=np.float32))  
    
    

In [33]:
class Model(nn.Module):
    def __init__(self, model_name, image_size,threshold = .5):
        super().__init__()
        # num_classes=0 removes the pretrained head
        self.backbone = timm.create_model(model_name,
                        pretrained=True,  in_chans=1, 
                        drop_path_rate=0.0, global_pool='max',
                        drop_rate=0.0)
        
        self.spec_layer = features.STFT(n_fft=config.NFFT, freq_bins=None, hop_length=config.n_hop,
                              window='hann', freq_scale='linear', center=True, pad_mode='reflect',
                          fmin=400, fmax=2000, sr=config.rate, output_format="Magnitude", trainable=False)
        self.out = nn.Linear(self.backbone.num_features, 1)
        self.sizer = VT.Resize((image_size,image_size))
        self.timeMasking = T.TimeMasking(time_mask_param=int(config.win_size*0.4), iid_masks=True)
        self.freqMasking = T.FrequencyMasking(freq_mask_param=int((config.NFFT//4)*0.15), iid_masks=True)
        #self.norm_layer = Normalization(mode='framewise')
        #self.pcen_layer = PCENTransform(eps=1e-6, s=0.025, alpha=0.6, delta=0.1, r=0.2, trainable=True)
        #1000 due to the size of the final layer in convnext
        self.fcOut = nn.Linear(1000, 1)
        self.out_new  = nn.Sigmoid()
        self.threshold  = threshold
        
        
    def forward(self, x1,x2):
        # first compute spectrogram
        spec1 = self.spec_layer(x1)  # (B, F, T)
        # normalize
#         spec = spec.transpose(1,2) # (B, T, F)
        #spec = self.pcen_layer(spec)
        #spec = self.norm_layer(spec)
        
#         if self.training:
#             spec = self.timeMasking(spec)
#             spec = self.freqMasking(spec)

        # then size for CNN model
        # and create a channel
        spec1 = self.sizer(spec1)
        #print("spec1 shape post STFT = " +str(spec1.shape))
        x1 = spec1.unsqueeze(1)
        #print("post unsqueeze x1 shape = " +str(x1.shape))
        # then repeat channels
        x1 = self.backbone(x1)
        #print("post backbone . x1 shape = " +str(x1.shape))
        
        
        spec2 = self.spec_layer(x2)  # (B, F, T)
        # normalize
#         spec = spec.transpose(1,2) # (B, T, F)
        #spec = self.pcen_layer(spec)
        #spec = self.norm_layer(spec)
        
#         if self.training:
#             spec = self.timeMasking(spec)
#             spec = self.freqMasking(spec)

        # then size for CNN model
        # and create a channel
        spec2 = self.sizer(spec2)
        #print("spec2 shape post STFT = " +str(spec2.shape))
        x2 = spec2.unsqueeze(1)
        #print("post unsqueeze x2 shape = " +str(x2.shape))
        # then repeat channels
        x2 = self.backbone(x2)
        #print("post backbone . x2 shape = " +str(x2.shape))
              
        x = torch.abs(x1-x2)
        #print(" x shape = " +str(x.shape))
        out = self.fcOut(x)
        #print("output of fcout = " +str(x))
        #print("output = " +str(out))
        return out

In [34]:

train_dataset = Moz_train_dataset(audio_df = df_train_offset,data_dir = config.data_dir, setSize = train_size, min_length = config.min_duration)
val_dataset = Moz_train_dataset(audio_df = df_train_offset,data_dir = config.data_dir, setSize = int(train_size*.01), min_length = config.min_duration)
test_dataset = Moz_test_dataset(audio_df = df_test_offset, setSize = int(train_size*.01),  data_dir = config.data_dir, min_length= config.min_duration, numway = len(classes))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=True, shuffle = True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=config_pytorch.batch_size, num_workers=num_workers, pin_memory=False , shuffle = False )
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, num_workers= 0, pin_memory=False , shuffle = False )


In [35]:
len(train_dataset)

100000

In [36]:
#a.shape



In [37]:
# Test block

#temp_ten = torch.rand(64, 9, 1, 15360)
# temp_ten.shape
# bat_len = temp_ten.shape[0]
# print("bat_len = " +str(bat_len))
# for i in range (bat_len):
#     print("i = " + str(i))
#     elem = temp_ten[i,:,:,:]
#     print("elem shape = " +str(elem.shape))
#     for j in range(elem.shape[0]):
#         img = elem[j,:,:]
#         print("img shape = " +str(img.shape))
        

## Training

In [38]:
model =Model('convnext_small',224)
model, lr_log = train_siamese(model , train_loader, val_loader,test_loader,classes , test_batch_size )

sampling rate = 8000. Please make sure the sampling rate is correct in order toget a valid freq range
STFT kernels created, time used = 0.0772 seconds
Training on cuda:0
epoch = 0processed batch 0 of 1563
duration = 0.09183247089385986
epoch = 0processed batch 300 of 1563
duration = 3.4866631825764975
epoch = 0processed batch 600 of 1563
duration = 6.858712307612101
epoch = 0processed batch 900 of 1563
duration = 10.298628548781076
epoch = 0processed batch 1200 of 1563
duration = 13.681343309084575
epoch = 0processed batch 1500 of 1563
duration = 17.118856398264565
updating the best val loss..
saving the model ...
Saving model to: ../outputs/models/pytorch/model_e0_2022_09_09_02_29_04.pth
length of test loader = 16

Now printing classification report...
                       precision    recall  f1-score   support

        an arabiensis       0.17      0.15      0.16       129
culex pipiens complex       0.08      0.09      0.08        94
           ae aegypti       0.14      0.13    

In [39]:
prediction  = [2.0, 3.0, 8.0, 2.0, 8.0, 8.0, 4.0, 0.0, 7.0, 8.0, 8.0, 2.0, 8.0, 0.0, 1.0, 3.0, 4.0, 8.0, 5.0, 8.0, 2.0, 2.0, 2.0, 0.0, 5.0, 5.0, 3.0, 8.0, 4.0, 1.0, 2.0, 5.0, 6.0, 1.0, 8.0, 0.0, 2.0, 4.0, 6.0, 8.0, 7.0, 0.0, 7.0, 0.0, 8.0, 5.0, 8.0, 2.0, 0.0, 0.0, 4.0, 7.0, 5.0, 4.0, 1.0, 2.0, 4.0, 1.0, 6.0, 4.0, 3.0, 7.0, 8.0, 8.0, 0.0, 6.0, 7.0, 1.0, 6.0, 5.0, 7.0, 0.0, 5.0, 0.0, 0.0, 2.0, 1.0, 5.0, 8.0, 1.0, 5.0, 7.0, 5.0, 7.0, 3.0, 6.0, 6.0, 6.0, 2.0, 6.0, 2.0, 6.0, 6.0, 3.0, 6.0, 6.0, 0.0, 4.0, 6.0, 6.0, 0.0, 8.0, 7.0, 1.0, 4.0, 1.0, 3.0, 0.0, 8.0, 6.0, 5.0, 7.0, 7.0, 3.0, 2.0, 0.0, 4.0, 3.0, 4.0, 2.0, 4.0, 2.0, 7.0, 3.0, 1.0, 3.0, 6.0, 5.0, 5.0, 2.0, 0.0, 2.0, 0.0, 6.0, 3.0, 0.0, 3.0, 4.0, 8.0, 6.0, 4.0, 6.0, 0.0, 4.0, 5.0, 2.0, 6.0, 1.0, 1.0, 5.0, 4.0, 6.0, 5.0, 8.0, 0.0, 3.0, 4.0, 4.0, 4.0, 8.0, 8.0, 5.0, 5.0, 0.0, 1.0, 3.0, 3.0, 7.0, 7.0, 1.0, 5.0, 7.0, 6.0, 5.0, 8.0, 3.0, 8.0, 5.0, 2.0, 3.0, 7.0, 3.0, 7.0, 8.0, 4.0, 2.0, 0.0, 6.0, 8.0, 1.0, 3.0, 6.0, 2.0, 2.0, 7.0, 7.0, 2.0, 0.0, 4.0, 8.0]
Label   = [3.0, 5.0, 0.0, 0.0, 2.0, 0.0, 1.0, 3.0, 4.0, 7.0, 1.0, 3.0, 2.0, 1.0, 4.0, 1.0, 3.0, 8.0, 8.0, 8.0, 3.0, 4.0, 7.0, 6.0, 4.0, 1.0, 3.0, 0.0, 7.0, 7.0, 3.0, 1.0, 4.0, 3.0, 5.0, 3.0, 4.0, 2.0, 8.0, 8.0, 7.0, 4.0, 7.0, 2.0, 6.0, 1.0, 8.0, 4.0, 8.0, 0.0, 1.0, 4.0, 2.0, 2.0, 0.0, 8.0, 1.0, 7.0, 4.0, 1.0, 2.0, 1.0, 2.0, 3.0, 6.0, 3.0, 6.0, 5.0, 4.0, 0.0, 5.0, 8.0, 5.0, 7.0, 1.0, 1.0, 6.0, 8.0, 6.0, 6.0, 4.0, 6.0, 6.0, 6.0, 8.0, 4.0, 5.0, 8.0, 0.0, 7.0, 4.0, 1.0, 6.0, 1.0, 5.0, 5.0, 5.0, 3.0, 8.0, 7.0, 8.0, 1.0, 2.0, 7.0, 2.0, 0.0, 4.0, 3.0, 2.0, 5.0, 1.0, 1.0, 2.0, 3.0, 0.0, 4.0, 7.0, 5.0, 2.0, 2.0, 6.0, 7.0, 2.0, 2.0, 6.0, 4.0, 2.0, 5.0, 8.0, 3.0, 6.0, 1.0, 1.0, 6.0, 5.0, 0.0, 2.0, 5.0, 3.0, 3.0, 0.0, 3.0, 3.0, 3.0, 1.0, 4.0, 3.0, 0.0, 6.0, 3.0, 7.0, 1.0, 2.0, 7.0, 3.0, 6.0, 1.0, 3.0, 8.0, 3.0, 6.0, 1.0, 4.0, 4.0, 6.0, 2.0, 5.0, 3.0, 0.0, 5.0, 7.0, 3.0, 5.0, 6.0, 4.0, 7.0, 1.0, 1.0, 6.0, 1.0, 4.0, 5.0, 8.0, 4.0, 6.0, 4.0, 5.0, 1.0, 6.0, 0.0, 3.0, 3.0, 3.0, 4.0, 7.0, 5.0, 2.0, 6.0, 2.0, 4.0]
print(classification_report(np.array(Label), np.array(prediction), target_names= classes))

                       precision    recall  f1-score   support

        an arabiensis       0.08      0.13      0.10        15
culex pipiens complex       0.00      0.00      0.00        28
           ae aegypti       0.08      0.09      0.09        22
       an funestus ss       0.15      0.10      0.12        30
         an squamosus       0.00      0.00      0.00        26
          an coustani       0.09      0.10      0.10        20
         ma uniformis       0.08      0.08      0.08        24
         ma africanus       0.15      0.17      0.16        18
               others       0.15      0.24      0.18        17

             accuracy                           0.09       200
            macro avg       0.09      0.10      0.09       200
         weighted avg       0.08      0.09      0.08       200



In [40]:
label = []
pred = []
for i in range(10):
    label.append(np.random.rand(9))
    pred.append(np.random.rand(9))
print(label)
print(pred)
print(classification_report(label, pred, target_names= classes, labels= classes))

[array([0.87760734, 0.42133766, 0.36638277, 0.09577508, 0.64968163,
       0.18326218, 0.96760532, 0.83578111, 0.35453328]), array([0.96328729, 0.68696909, 0.41232021, 0.32922509, 0.8607725 ,
       0.10209842, 0.45504749, 0.07426344, 0.60985828]), array([0.73629949, 0.24183255, 0.78827908, 0.07632208, 0.89225306,
       0.4841422 , 0.16714914, 0.71596956, 0.42667964]), array([0.40951721, 0.11767799, 0.00766084, 0.10717003, 0.25960055,
       0.96982968, 0.66877324, 0.71768267, 0.32782502]), array([0.322194  , 0.25549861, 0.44225265, 0.56199903, 0.76576567,
       0.58182731, 0.87022707, 0.51925664, 0.40541712]), array([0.68478602, 0.00381556, 0.58405232, 0.81279448, 0.57776442,
       0.49142398, 0.87475936, 0.57296155, 0.16161817]), array([0.3120195 , 0.52491875, 0.44130272, 0.57516214, 0.60038413,
       0.72439919, 0.57462143, 0.99240305, 0.74659416]), array([0.78780704, 0.99466601, 0.90690109, 0.40492058, 0.86618598,
       0.72272776, 0.26961624, 0.70340493, 0.15072064]), array([

ValueError: continuous-multioutput is not supported

In [None]:
label = torch.tensor(8, device = "cuda")
print(label)
label_cpu = label.cpu().detach()
print(label_cpu)
label_np = label_cpu.numpy()
print(type(label_np))
label_np_item = label_np.item()
print(type(label_np_item))


