In [59]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset,DataLoader
import librosa 
import os
from glob import glob
import numpy as np
from tqdm.notebook import tqdm

In [2]:
class Prologue(nn.Module):
    """Entry stage of the encoder: Conv1d -> BatchNorm1d -> ReLU.

    The convolution is created with ``padding='same'``, so the size of the
    last axis of the input is preserved.

    Args:
        in_channels: number of channels of the incoming tensor.
        out_channels: number of channels produced by the convolution.
        kernel_size: width of the convolution kernel (defaults to 3).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding='same')
        self.norm = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run convolution, batch normalisation and ReLU, in that order."""
        return self.relu(self.norm(self.conv(x)))

In [3]:
class SubBlock(nn.Module):
    """Repeated unit of a MegaBlock: conv -> 1x1 conv -> BatchNorm -> ReLU -> Dropout.

    The number of channels and the sequence length are both preserved, so
    several sub-blocks can be chained and the result added to a residual.

    Args:
        out_channels: number of input *and* output channels.
        kernel_size: width of the first convolution's kernel.
        dilation: dilation of the first convolution (defaults to 1).
    """

    def __init__(self,
                 out_channels,
                 kernel_size,
                 dilation=1):
        super(SubBlock,self).__init__()
        # 'same' padding takes the dilation (and even kernel sizes) into
        # account.  The previous `kernel_size//2` only preserved the length
        # for odd kernels with dilation == 1, for which both are identical.
        # NOTE(review): this layer is built without a `groups` argument, so it
        # is an ordinary dense convolution despite its name -- confirm whether
        # `groups=out_channels` was intended (it would change weight shapes).
        self.depthwise_conv=nn.Conv1d(in_channels=out_channels,
                                      out_channels=out_channels,
                                      kernel_size=kernel_size,
                                      padding='same',
                                      dilation=dilation)
        self.pointwise_conv=nn.Conv1d(in_channels=out_channels,
                                      out_channels=out_channels,
                                      kernel_size=1)
        self.norm=nn.BatchNorm1d(out_channels)
        self.relu=nn.ReLU()
        # nn.Dropout() with no argument uses PyTorch's default drop probability.
        self.dropout=nn.Dropout()

    def forward(self,x):
        """Apply the sub-block; the output has the same shape as ``x``."""
        x=self.depthwise_conv(x)
        x=self.pointwise_conv(x)
        x=self.norm(x)
        x=self.relu(x)
        x=self.dropout(x)
        return x

In [4]:
class SqueezeExcitation(nn.Module):
    """Channel-wise gating of a 3-D tensor.

    The last axis is averaged away, the per-channel averages pass through a
    two-layer bottleneck (Linear -> ReLU -> Linear -> Sigmoid), and the
    resulting gates in (0, 1) multiply every position of the input.

    Args:
        in_channels: size of axis 1 of the input.
        reduction: the bottleneck has ``in_channels // reduction`` units.
    """

    def __init__(self, in_channels, reduction):
        super().__init__()
        bottleneck = in_channels // reduction
        self.squeeze = nn.AdaptiveAvgPool1d(1)
        self.linear1 = nn.Linear(in_channels, bottleneck)
        self.linear2 = nn.Linear(bottleneck, in_channels)
        self.relu = nn.ReLU()
        self.gate = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled per channel; the shape is unchanged."""
        # Average over the last axis, then drop that singleton axis.
        pooled = self.squeeze(x).squeeze(-1)
        hidden = self.relu(self.linear1(pooled))
        # Restore the trailing axis so the gates broadcast over it.
        gates = self.gate(self.linear2(hidden)).unsqueeze(-1)
        return x * gates.expand_as(x)

In [5]:
class MegaBlock(nn.Module):
    """Residual block: a stack of SubBlocks plus a 1x1-conv skip connection.

    Main path: ``repeat`` SubBlocks -> two further convolutions ->
    squeeze-and-excitation.  Skip path: 1x1 convolution -> BatchNorm.
    The two paths are summed, then passed through ReLU and dropout.

    Args:
        out_channels: number of input and output channels.
        kernel_size: kernel width used by the SubBlocks and the two extra
            convolutions.
        dilation: dilation handed to every SubBlock (the two extra
            convolutions are not dilated).
        repeat: how many SubBlocks to chain.
        reduction: bottleneck ratio of the squeeze-and-excitation module.
    """

    def __init__(self, out_channels, kernel_size, dilation, repeat, reduction):
        super().__init__()
        half_kernel = kernel_size // 2

        # Kept as a plain list attribute; the modules are registered through
        # the nn.Sequential built from it just below.
        self.sub_block_list = []
        for _ in range(repeat):
            self.sub_block_list.append(
                SubBlock(out_channels=out_channels,
                         kernel_size=kernel_size,
                         dilation=dilation)
            )
        self.repeat_block = nn.Sequential(*self.sub_block_list)

        self.depthwise_conv1 = nn.Conv1d(out_channels, out_channels,
                                         kernel_size, padding=half_kernel)
        self.depthwise_conv2 = nn.Conv1d(out_channels, out_channels,
                                         kernel_size, padding=half_kernel)
        self.dropout = nn.Dropout()
        self.relu = nn.ReLU()
        self.pointwise_conv = nn.Conv1d(out_channels, out_channels, kernel_size=1)
        self.norm = nn.BatchNorm1d(out_channels)
        self.se_block = SqueezeExcitation(out_channels, reduction)

    def forward(self, x):
        """Return ``dropout(relu(skip(x) + main(x)))``."""
        # Main path.
        main = self.repeat_block(x)
        main = self.depthwise_conv2(self.depthwise_conv1(main))
        main = self.se_block(main)
        # Skip path.
        skip = self.norm(self.pointwise_conv(x))
        return self.dropout(self.relu(skip + main))

In [6]:
class Epilogue(nn.Module):
    """Final stage of the encoder: Conv1d -> BatchNorm1d -> ReLU.

    Unlike ``Prologue`` the convolution is created without padding; with the
    default ``kernel_size=1`` the length of the last axis is unchanged.

    Args:
        in_channels: number of channels of the incoming tensor.
        out_channels: number of channels produced by the convolution.
        kernel_size: width of the convolution kernel (defaults to 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size=1):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.norm = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run convolution, batch normalisation and ReLU, in that order."""
        return self.relu(self.norm(self.conv(x)))

In [7]:
class Encoder(nn.Module):
    """Convolutional encoder: Prologue -> three MegaBlocks -> Epilogue.

    Every MegaBlock works at ``prolog_out_channels`` channels; only the
    Epilogue changes the width, to ``epilog_out_channels``.

    Args:
        prolog_in_channels: channels of the input features.
        prolog_out_channels: channel width used from the Prologue through
            the last MegaBlock.
        epilog_out_channels: channels of the encoder output.
        kernel_b1, dilation_b1, repeat_b1, reduction_b1: kernel size,
            dilation, number of SubBlocks and squeeze-excitation reduction
            of the first MegaBlock.
        kernel_b2, dilation_b2, repeat_b2, reduction_b2: same, second block.
        kernel_b3, dilation_b3, repeat_b3, reduction_b3: same, third block.
    """

    def __init__(self,
                 prolog_in_channels=80,
                 prolog_out_channels=256,
                 epilog_out_channels=256,
                 kernel_b1=7,
                 dilation_b1=1,
                 repeat_b1=2,
                 reduction_b1=16,
                 kernel_b2=11,
                 dilation_b2=1,
                 repeat_b2=2,
                 reduction_b2=16,
                 kernel_b3=15,
                 dilation_b3=1,
                 repeat_b3=2,
                 reduction_b3=16):
        super().__init__()
        width = prolog_out_channels
        self.prolog = Prologue(in_channels=prolog_in_channels, out_channels=width)
        self.block1 = MegaBlock(out_channels=width,
                                kernel_size=kernel_b1,
                                dilation=dilation_b1,
                                repeat=repeat_b1,
                                reduction=reduction_b1)
        self.block2 = MegaBlock(out_channels=width,
                                kernel_size=kernel_b2,
                                dilation=dilation_b2,
                                repeat=repeat_b2,
                                reduction=reduction_b2)
        self.block3 = MegaBlock(out_channels=width,
                                kernel_size=kernel_b3,
                                dilation=dilation_b3,
                                repeat=repeat_b3,
                                reduction=reduction_b3)
        self.epilog = Epilogue(in_channels=width, out_channels=epilog_out_channels)

    def forward(self, x):
        """Feed ``x`` through the five stages in order and return the result."""
        stages = (self.prolog, self.block1, self.block2, self.block3, self.epilog)
        for stage in stages:
            x = stage(x)
        return x

In [29]:
# Random smoke-test input: 2 items, 80 channels, 1 position on the last axis.
x = torch.rand(2, 80, 1)

In [9]:
class AttentiveStatisticalPooling(nn.Module):
    """Collapse the last axis of a 3-D tensor into attention-weighted statistics.

    For an input of shape ``(batch, in_size, length)`` a small network
    (Linear -> Tanh -> Linear) scores every position of every channel.  The
    scores are soft-maxed over the last axis and used as weights for a
    per-channel mean and standard deviation.

    Args:
        in_size: size of axis 1 of the input.
        hidden_size: width of the attention network's hidden layer.
        eps: lower bound applied to the variance before the square root.

    Returns (from ``forward``):
        Tensor of shape ``(batch, 2 * in_size)``: means followed by stds.
    """

    def __init__(self, in_size, hidden_size, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.linear1 = nn.Linear(in_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, in_size)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        """Return the concatenated weighted means and standard deviations."""
        # The linear layers act on the last axis, so put channels there.
        positions_last = x.transpose(1, 2)
        scores = self.linear2(self.tanh(self.linear1(positions_last)))
        # Back to (batch, in_size, length); normalise over the last axis.
        weights = self.softmax(scores.transpose(1, 2))

        weighted_mean = (weights * x).sum(dim=2)
        # E[x^2] - E[x]^2 under the attention weights.
        weighted_var = (weights * x ** 2).sum(dim=2) - weighted_mean ** 2
        # Clamp so rounding noise cannot produce sqrt of a negative number.
        weighted_std = weighted_var.clamp(min=self.eps).sqrt()
        return torch.cat([weighted_mean, weighted_std], dim=1)


In [10]:
class Decoder(nn.Module):
    """Pool encoder features into a fixed-size embedding and classify it.

    Pipeline: attentive statistical pooling -> BatchNorm -> Linear ->
    BatchNorm (the embedding) -> Linear (the class logits).

    Args:
        in_size: number of channels of the encoder output.
        hidden_size: hidden width of the attention network used for pooling.
        num_class: number of output classes.
        eps: variance floor forwarded to the pooling layer.
        embedding_size: width of the returned embedding.  Defaults to 192,
            the value that used to be hard-coded.

    Returns (from ``forward``):
        ``(logits, embeddings)`` of shapes ``(batch, num_class)`` and
        ``(batch, embedding_size)``.
    """

    def __init__(self,
                 in_size,
                 hidden_size,
                 num_class,
                 eps=1e-8,
                 embedding_size=192):
        super(Decoder,self).__init__()
        self.attention=AttentiveStatisticalPooling(in_size=in_size,
                                                  hidden_size=hidden_size,
                                                  eps=eps)
        # Pooling concatenates means and stds, hence twice the channel count.
        self.norm1=nn.BatchNorm1d(in_size*2)
        self.linear1=nn.Linear(in_features=in_size*2,out_features=embedding_size)
        self.norm2=nn.BatchNorm1d(embedding_size)
        self.linear2=nn.Linear(in_features=embedding_size,out_features=num_class)

    def forward(self,x):
        """Return the class logits together with the embedding they come from."""
        x=self.attention(x)
        x=self.norm1(x)
        x=self.linear1(x)
        embeddings=self.norm2(x)
        logits=self.linear2(embeddings)
        return logits,embeddings

In [11]:
class TiTaNet(nn.Module):
    """Full model: an ``Encoder`` followed by a ``Decoder``.

    Args:
        prolog_in_channels: channels of the input features.
        prolog_out_channels: channel width inside the encoder blocks.
        epilog_out_channels: channels of the encoder output, which is also
            the decoder's input size.
        kernel_b1 .. reduction_b3: per-MegaBlock kernel size, dilation,
            SubBlock count and squeeze-excitation reduction, passed to
            ``Encoder`` unchanged.
        hidden_size: hidden width of the decoder's attention network.
        num_class: number of output classes.
        eps: variance floor used by the decoder's pooling layer.

    Returns (from ``forward``):
        Whatever ``Decoder.forward`` returns: ``(logits, embeddings)``.
    """

    def __init__(self,
                 prolog_in_channels=80,
                 prolog_out_channels=256,
                 epilog_out_channels=1536,
                 kernel_b1=3,
                 dilation_b1=1,
                 repeat_b1=3,
                 reduction_b1=16,
                 kernel_b2=3,
                 dilation_b2=1,
                 repeat_b2=3,
                 reduction_b2=16,
                 kernel_b3=3,
                 dilation_b3=1,
                 repeat_b3=3,
                 reduction_b3=16,
                 hidden_size=128,
                 num_class=100,
                 eps=1e-8):
        super().__init__()
        # Keyword arguments keep the long parameter list from being mis-ordered.
        self.encoder = Encoder(
            prolog_in_channels=prolog_in_channels,
            prolog_out_channels=prolog_out_channels,
            epilog_out_channels=epilog_out_channels,
            kernel_b1=kernel_b1,
            dilation_b1=dilation_b1,
            repeat_b1=repeat_b1,
            reduction_b1=reduction_b1,
            kernel_b2=kernel_b2,
            dilation_b2=dilation_b2,
            repeat_b2=repeat_b2,
            reduction_b2=reduction_b2,
            kernel_b3=kernel_b3,
            dilation_b3=dilation_b3,
            repeat_b3=repeat_b3,
            reduction_b3=reduction_b3,
        )
        self.decoder = Decoder(
            in_size=epilog_out_channels,
            hidden_size=hidden_size,
            num_class=num_class,
            eps=eps,
        )

    def forward(self, x):
        """Encode ``x`` and decode the result."""
        encoded = self.encoder(x)
        return self.decoder(encoded)
        

In [55]:
# Smoke test: build the model for 103 classes, run the random batch through it
# and display the shapes of the two outputs (logits, embeddings).
titanet = TiTaNet(num_class=103)
y = titanet(x)
tuple(output.shape for output in y)

(torch.Size([2, 103]), torch.Size([2, 192]))

In [56]:
# Number of trainable parameters, in millions.
sum(
    param.numel()
    for param in titanet.parameters()
    if param.requires_grad
) / 1_000_000

5.245335

In [15]:
class AdditiveAngularMarginLoss(nn.Module):
    """Additive angular margin softmax loss.

    For every sample the target-class logit is treated as a cosine: it is
    clamped to the open interval (-1, 1), converted to an angle, shifted by
    ``margin`` and converted back.  All logits are multiplied by ``scale``
    and the mean negative log-softmax of the target class is returned.

    NOTE(review): the target logit is clamped as if it were a cosine
    similarity, while the non-target logits are used as they are.  The
    ``Decoder`` in this file produces logits with a plain ``nn.Linear``, whose
    output is not bounded to [-1, 1] -- confirm whether embeddings and class
    weights should be L2-normalised before this loss is applied.

    Args:
        scale: multiplier applied to every logit.
        margin: angle in radians added to the target class.
    """

    def __init__(self, scale=20.0, margin=1.35):
        super().__init__()

        self.eps = 1e-7
        self.scale = scale
        self.margin = margin

    def forward(self, logits, labels):
        """Compute the loss.

        Args:
            logits: float tensor of shape ``(batch, num_classes)``.
            labels: integer class indices of shape ``(batch,)``.

        Returns:
            Scalar tensor holding the mean loss over the batch.
        """
        labels = torch.as_tensor(labels, dtype=torch.long, device=logits.device)
        target_index = labels.unsqueeze(1)

        # logits[i, labels[i]] without materialising a batch x batch matrix.
        target_cos = logits.gather(1, target_index).squeeze(1)
        # Keep strictly inside (-1, 1): acos has an infinite gradient at +-1.
        target_cos = torch.clamp(target_cos, -1.0 + self.eps, 1 - self.eps)
        numerator = self.scale * torch.cos(torch.acos(target_cos) + self.margin)

        # Scaled logits with the target column replaced by its margin-adjusted
        # value, i.e. exactly the terms that make up the softmax denominator.
        scaled = (self.scale * logits).scatter(1, target_index, numerator.unsqueeze(1))
        # logsumexp == log(sum(exp(.))) but cannot overflow to inf the way the
        # explicit exp() does once scale * logit exceeds ~88 in float32.
        log_denominator = torch.logsumexp(scaled, dim=1)
        return -torch.mean(numerator - log_denominator)

In [44]:
class TiTaNetDataset(Dataset):
    """Audio classification dataset laid out as ``data_folder/<label>/<file>``.

    Each item is a fixed-length log-mel spectrogram together with the integer
    id of the directory the audio file lives in.

    Args:
        data_folder: root directory containing one sub-directory per label.
        sample_rate: rate the audio is resampled to when loaded.
        audio_len: clip length in seconds; longer files are truncated and
            shorter ones are zero-padded at the end.
        file_ext: extension (without the dot) of the audio files to collect.

    Attributes:
        audio_files: sorted list of the audio file paths that were found.
        labels: sorted list of the sub-directory names of ``data_folder``.
        arg_to_label: maps a directory name to its integer id.
        label_to_arg: maps an integer id back to its directory name.
        num_samples: number of audio samples in every returned clip.
    """

    def __init__(self,data_folder,
                 sample_rate=16_000,
                 audio_len=1,
                 file_ext='wav'):
        super(TiTaNetDataset,self).__init__()
        self.data_folder=data_folder
        self.audio_len=audio_len
        self.sample_rate=sample_rate
        self.file_ext=file_ext
        # Integer sample count so that a fractional audio_len (e.g. 1.5 s)
        # does not end up as a float slice index / pad width.
        self.num_samples=int(round(self.audio_len*self.sample_rate))
        # Sorted so that an index always refers to the same file.
        self.audio_files=sorted(glob(f"{self.data_folder}/**/*.{file_ext}"))
        # os.listdir returns entries in arbitrary order and also lists plain
        # files; keep directories only and sort them so that label ids are
        # stable across runs and machines.
        self.labels=sorted(
            entry for entry in os.listdir(self.data_folder)
            if os.path.isdir(os.path.join(self.data_folder,entry))
        )
        self.arg_to_label={name:idx for idx,name in enumerate(self.labels)}
        self.label_to_arg={idx:name for idx,name in enumerate(self.labels)}

    def __len__(self):
        """Number of audio files found under ``data_folder``."""
        return len(self.audio_files)

    def __getitem__(self, index):
        """Return ``(log_mel, label_id)`` for the file at ``index``.

        ``log_mel`` is a tensor with 80 mel bands on its first axis and
        ``label_id`` is a scalar integer tensor.
        """
        file=self.audio_files[index]
        data,_=librosa.load(file,sr=self.sample_rate,mono=True)
        # The parent directory name is the label; os.path copes with either
        # path separator, unlike splitting on '/'.
        label=os.path.basename(os.path.dirname(file))
        label=self.arg_to_label[label]
        if data.shape[0]>=self.num_samples:
            data=data[:self.num_samples]
        else:
            data=np.pad(data, (0,self.num_samples - data.shape[0]), mode='constant')
        mels=librosa.feature.melspectrogram(y=data,sr=self.sample_rate,n_fft=512,hop_length=160,win_length=400,n_mels=80)
        # dB scale relative to the loudest bin of this clip.
        mels=librosa.power_to_db(mels,ref=np.max)
        return torch.tensor(mels),torch.tensor(label)

In [38]:
# Root folder of the dataset (one sub-directory per class label).  The default
# is a machine-specific path; set the TITANET_DATA_DIR environment variable to
# run the notebook elsewhere without editing this cell.
data_dir=os.environ.get("TITANET_DATA_DIR","/mnt/c/Users/rahim/Downloads/archive/data")

In [40]:
# Sanity check: how many FLAC files sit one directory level below data_dir.
# (Without recursive=True, glob treats "**" like a single "*".)
files = glob(
    f"{data_dir}/**/*.flac"
)
len(files)

4017

In [45]:
# Build the dataset over the FLAC files found under data_dir.
dataset = TiTaNetDataset(
    data_folder=data_dir,
    file_ext='flac',
)

In [42]:
# Inspect the mapping from directory name to integer class id built by the dataset.
dataset.arg_to_label

{'0001': 0,
 '0002': 1,
 '0003': 2,
 '0004': 3,
 '0005': 4,
 '0006': 5,
 '0007': 6,
 '0008': 7,
 '0009': 8,
 '0010': 9,
 '0011': 10,
 '0012': 11,
 '0013': 12,
 '0014': 13,
 '0015': 14,
 '0016': 15,
 '0017': 16,
 '0018': 17,
 '0019': 18,
 '0020': 19,
 '0021': 20,
 '0022': 21,
 '0023': 22,
 '0024': 23,
 '0025': 24,
 '0026': 25,
 '0027': 26,
 '0028': 27,
 '0029': 28,
 '0030': 29,
 '0031': 30,
 '0032': 31,
 '0033': 32,
 '0034': 33,
 '0035': 34,
 '0036': 35,
 '0037': 36,
 '0038': 37,
 '0039': 38,
 '0040': 39,
 '0041': 40,
 '0042': 41,
 '0043': 42,
 '0044': 43,
 '0045': 44,
 '0046': 45,
 '0047': 46,
 '0048': 47,
 '0049': 48,
 '0050': 49,
 '0051': 50,
 '0052': 51,
 '0053': 52,
 '0054': 53,
 '0055': 54,
 '0056': 55,
 '0057': 56,
 '0058': 57,
 '0059': 58,
 '0060': 59,
 '0061': 60,
 '0062': 61,
 '0063': 62,
 '0064': 63,
 '0065': 64,
 '0066': 65,
 '0067': 66,
 '0068': 67,
 '0069': 68,
 '0070': 69,
 '0071': 70,
 '0072': 71,
 '0073': 72,
 '0074': 73,
 '0075': 74,
 '0076': 75,
 '0077': 76,
 '0078': 

In [51]:
# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [52]:
# Put the model on its final device *before* creating the optimizer: the
# torch.optim documentation advises that optimised parameters live in their
# final location when the optimizer is constructed.  Re-run this cell whenever
# `titanet` is re-created, otherwise the optimizer keeps updating the
# parameters of the previous model instance.
titanet=titanet.to(device)
optimizer=torch.optim.Adam(titanet.parameters())
criterion=AdditiveAngularMarginLoss()

In [57]:
# Move the model's parameters and buffers to the selected device.
titanet=titanet.to(device)

In [54]:
# Mini-batches of 10 clips, reshuffled every epoch.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=10,
    shuffle=True,
)

In [60]:
# Single training epoch.  `train_loss` is the *sum* of the per-batch losses,
# not their mean, so its magnitude grows with the number of batches.
for epoch in range(1):
    titanet.train()
    train_loss = 0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        # The model returns (logits, embeddings); the loss uses the logits.
        pred = titanet(x)
        loss = criterion(pred[0], y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(train_loss)

  0%|          | 0/402 [00:00<?, ?it/s]

18032.066257476807
