In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
import os,glob,random
import librosa
import soundfile as sf  
import numpy as np
from itertools import permutations
from tqdm import tqdm
from time import perf_counter

# TasNet

In [2]:
class Encoder(nn.Module):
    def __init__(self,L,N):
        super(Encoder,self).__init__()
        """
        L: Number of input channels(number of samples per segment)
        N: Number of output channels(number of basis signals)
        """
        self.L = L
        self.N = N
        self.EPS = 1e-8
        self.conv1d_U=nn.Conv1d(in_channels=L,out_channels=N,kernel_size=1,stride=1,bias=False)
        self.conv1d_V=nn.Conv1d(in_channels=L,out_channels=N,kernel_size=1,stride=1,bias=False)
    
    def forward(self,mixture):
        """
        mixture:Tensor of shape [B,K,L] where K are the number of segment being processed at once
        output: Tensor of shape [B,K,N] where N are the number of basis signals
        """
        B,K,L=mixture.size()
        norm_coef=torch.norm(mixture,p=2,dim=2,keepdim=True)
        normed_mixture=mixture/(norm_coef+self.EPS)
        normed_mixture=torch.unsqueeze(normed_mixture.view(-1,L),2)
        conv=F.relu(self.conv1d_U(normed_mixture))
        gate=F.sigmoid(self.conv1d_V(normed_mixture))
        mixture_w=conv*gate
        mixture_w=mixture_w.view(B,K,self.N)
        return mixture_w,norm_coef

In [3]:
class Separator(nn.Module):
    def __init__(self,N:int,hidden_size,num_layers,bidirectional=False,nspk=2) -> None:
        super(Separator,self).__init__()
        self.N=N
        self.hidden_size=hidden_size
        self.bidirectional=bidirectional
        self.num_layers=num_layers
        self.nspk=nspk
        self.layer_norm=nn.LayerNorm(N)
        self.LSTM=nn.LSTM(input_size=N,hidden_size=hidden_size,num_layers=num_layers,bidirectional=bidirectional,batch_first=True)
        fc_in_dim=hidden_size*2 if bidirectional else hidden_size
        self.fc=nn.Linear(fc_in_dim,nspk*N)
    
    def forward(self,mixture_w):
        """
        mixture_w: Tensor of shape [B,K,N]
        output: Tensor of shape [B,K,nspk,N]
        """
        B,K,N=mixture_w.size()
        normed_mixture_w=self.layer_norm(mixture_w)
        output,_=self.LSTM(normed_mixture_w)
        score=self.fc(output)
        score=score.view(B,K,self.nspk,N)
        est_mask=F.softmax(score,dim=2)
        return est_mask

In [4]:
class Decoder(nn.Module):
    def __init__(self,N,L):
        super(Decoder,self).__init__()
        self.N=N
        self.L=L
        self.basis_signals=nn.Linear(N,L,bias=False)
    
    def forward(self,mixture_w,est_mask,norm_coef):
        """
        mixture_w: Tensor of shape [B,K,N]
        est_mask: Tensor of shape [B,K,nspk,N]
        norm_coef: Tensor of shape [B,K,1]
        output: Tensor of shape [B,nspk,K,L]
        """
        source_w=torch.unsqueeze(mixture_w,2)*est_mask
        est_source=self.basis_signals(source_w)
        norm_coef=torch.unsqueeze(norm_coef,2)
        est_source=est_source*norm_coef
        est_source=est_source.permute(0,2,1,3).contiguous()
        return est_source

In [5]:
class TasNet(nn.Module):
    def __init__(self,L,N,hidden_size,num_layers,bidirectional=False,nspk=2):
        super(TasNet,self).__init__()
        self.L=L
        self.N=N
        self.hidden_size=hidden_size
        self.num_layers=num_layers
        self.bidirectional=bidirectional
        self.nspk=nspk
        self.encoder=Encoder(L,N)
        self.separator=Separator(N,hidden_size,num_layers,bidirectional,nspk)
        self.decoder=Decoder(N,L)
    
    def forward(self,mixture):
        mixture_w,norm_coef=self.encoder(mixture)
        est_mask=self.separator(mixture_w)
        est_source=self.decoder(mixture_w,est_mask,norm_coef)
        return est_source

In [6]:
class AudioDataset(Dataset):
    def __init__(self,L:int,K:int,folder_path:str,sample_rate=8000) -> None:
        self.L=L
        self.K=K
        self.folder_path=folder_path
        self.sample_rate=sample_rate
        self.files=glob.glob(os.path.join(folder_path,'*.wav'))
        self.audio_info=self.load_audio_info()
    
    def __len__(self):
        return len(self.audio_info['path'])
    
    def __getitem__(self,idx):
        audio_path=self.audio_info['path'][idx]
        start=self.audio_info['start'][idx]/self.sample_rate
        end=self.audio_info['end'][idx]/self.sample_rate
        audio1,_=librosa.load(audio_path,sr=self.sample_rate,mono=True,offset=start,duration=end-start)
        # load a random audio from the data
        i=random.randint(a=0,b=len(self.audio_info['path'])-2)
        while i==idx:
            i=random.randint(a=0,b=len(self.audio_info['path'])-2)
        
        audio_path=self.audio_info['path'][i]
        start=self.audio_info['start'][i]/self.sample_rate
        end=self.audio_info['end'][i]/self.sample_rate
        audio2,_=librosa.load(audio_path,sr=self.sample_rate,mono=True,offset=start,duration=end-start)
        mixture=audio1+audio2
        mixture=librosa.util.normalize(mixture)
        audio1=librosa.util.normalize(audio1)
        audio2=librosa.util.normalize(audio2)
        mixture=torch.from_numpy(mixture.reshape(self.K,self.L))
        sources=torch.from_numpy(np.array([audio1.reshape(self.K,self.L),audio2.reshape(self.K,self.L)]))
        return mixture,sources

    def load_audio_info(self):
        audio_info=dict(path=list(),start=list(),end=list())
        for file in self.files:
            info=sf.info(os.path.join(self.folder_path,file))
            duration=int(info.duration*self.sample_rate)
            chunk_length=self.L*self.K
            start=0
            for i in range(chunk_length,duration,chunk_length):
                if(i-start)==chunk_length:
                    audio_info['path'].append(info.name)
                    audio_info['start'].append(start)
                    audio_info['end'].append(i)
                    start=i
        return audio_info
        

In [7]:
L=1000
N=500
hidden_size=128
num_layers=3
bidirectional=True
nspk=2
num_epochs=100

In [8]:
model=TasNet(L,N,hidden_size,num_layers,bidirectional,nspk)

In [9]:
x=torch.randn(2,3,500)

In [57]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

3193648

In [11]:
out=model(x)

In [12]:
out.shape

torch.Size([2, 2, 3, 500])

In [10]:
audio_folder="/mnt/d/Programs/Python/PW/projects/asteroid/zip-hindi-2k"

In [58]:
dataset=AudioDataset(L=1000,K=8,folder_path=audio_folder)

In [59]:
dataset[0][0].shape,dataset[0][1].shape

(torch.Size([8, 1000]), torch.Size([2, 8, 1000]))

In [16]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [60]:
model=model.to(device)

In [67]:
dataloader=DataLoader(dataset,batch_size=800,shuffle=True)

In [20]:
list(permutations([0,1]))

[(0, 1), (1, 0)]

# SI-SNR
 SI-SNR is a metric that measures the quality of the separated audio signals. It is defined as the ratio of the energy of the target speaker to the energy of the interference speaker plus noise. The higher the SI-SNR, the better the separation quality. \
The formula for SI-SNR is given by:
$$
s_{target} = \frac{<s',s>s}{<s,s>} \\
e_{noise} = s' - s_{target} \\
$$
$$
SI-SNR = 10 \log_{10} \frac{||s_{target}||^2}{||e_{noise}||^2}
$$
where $s_{t}$ is the target speaker, $\hat{s}_{t}$ is the separated speaker, and $e_{t}$ is the noise. 


In [36]:
# SI-SNR with PIT(Permutation Invariant Training)
def calculate_si_snr(source:torch.tensor,estimate:torch.tensor,eps=1e-8):
    """
    source: Tensor of shape [B,C,K,L]
    estimate: Tensor of shape [B,C,K,L]
    eps: small value to avoid division by zero
    output: Tensor of shape [B,K]
    B: Batch size
    C: Number of speakers
    K: Number of segments
    L: Number of samples per segment
    T=K*L
    """
    B,C,K,L=source.size()
    flat_source=source.view(B,C,-1) # [B,C,T]
    flat_estimate=estimate.view(B,C,-1) # [B,C,T]
    s_target=torch.unsqueeze(flat_source,dim=1) # [B,1,C,T]
    s_estimate=torch.unsqueeze(flat_estimate,dim=2) # [B,C,1,T]
    pair_wise_dot=torch.sum(s_target*s_estimate,dim=3,keepdim=True) # [B,C,C,1]
    s_target_energy=torch.sum(s_target**2,dim=3,keepdim=True)+eps # [B,1,C,1]
    pairwise_proj=pair_wise_dot*s_target/s_target_energy # [B,C,C,T]
    e_noise=s_estimate-pairwise_proj # [B,C,C,T]
    pair_wise_si_snr=torch.sum(pairwise_proj**2,dim=3)/(torch.sum(e_noise**2,dim=3)+eps) # [B,C,C]
    pair_wise_si_snr=10*torch.log10(pair_wise_si_snr+eps) # [B,C,C]
    perms = source.new_tensor(list(permutations(range(C))), dtype=torch.long)
    # one-hot, [C!, C, C]
    index = torch.unsqueeze(perms, 2)
    perms_one_hot = source.new_zeros((*perms.size(), C)).scatter_(2, index, 1)
    # [B, C!] <- [B, C, C] einsum [C!, C, C], SI-SNR sum of each permutation
    snr_set = torch.einsum('bij,pij->bp', [pair_wise_si_snr, perms_one_hot])
    # max_snr = torch.gather(snr_set, 1, max_snr_idx.view(-1, 1))  # [B, 1]
    max_snr, _ = torch.max(snr_set, dim=1, keepdim=True)
    max_snr /= C
    loss=0-torch.mean(max_snr)
    return loss

In [65]:
optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)

In [68]:
model.train()
for epoch in tqdm(range(num_epochs),desc="Epochs"):
    start=perf_counter()
    train_loss=0
    for mixture,sources in tqdm(dataloader,desc="Training"):
        mixture=mixture.to(device)
        sources=sources.to(device)
        # print(f"mixture_shape: {mixture.shape}")
        # print(f"sources_shape: {sources.shape}")
        est_sources=model(mixture)
        optimizer.zero_grad()
        # print(f"est_sources_shape: {est_sources.shape}")
        si_snr=calculate_si_snr(sources,est_sources)
        # print(f"si_snr_shape: {si_snr.item()}")
        train_loss+=si_snr.item()
        si_snr.backward()
        optimizer.step()
    end=perf_counter()
    print(f"Epoch: {epoch+1}/{num_epochs} Loss: {train_loss} Time: {end-start}")


Training: 100%|██████████| 11/11 [02:37<00:00, 14.28s/it]
Epochs:   1%|          | 1/100 [02:37<4:19:06, 157.04s/it]

Epoch: 1/100 Loss: 222.48468589782715 Time: 142.58121526899959


Training: 100%|██████████| 11/11 [02:49<00:00, 15.38s/it]
Epochs:   2%|▏         | 2/100 [05:26<4:28:12, 164.20s/it]

Epoch: 2/100 Loss: 204.53797149658203 Time: 143.11632652700064


Training: 100%|██████████| 11/11 [02:31<00:00, 13.78s/it]
Epochs:   3%|▎         | 3/100 [07:57<4:16:09, 158.45s/it]

Epoch: 3/100 Loss: 186.55970573425293 Time: 137.26371982299952


Training: 100%|██████████| 11/11 [02:21<00:00, 12.86s/it]
Epochs:   4%|▍         | 4/100 [10:19<4:02:49, 151.77s/it]

Epoch: 4/100 Loss: 169.7868480682373 Time: 127.2203563990006


Training: 100%|██████████| 11/11 [02:21<00:00, 12.86s/it]
Epochs:   5%|▌         | 5/100 [12:40<3:54:26, 148.07s/it]

Epoch: 5/100 Loss: 155.81644344329834 Time: 127.15654144499968


Training: 100%|██████████| 11/11 [02:40<00:00, 14.63s/it]
Epochs:   6%|▌         | 6/100 [15:21<3:58:49, 152.44s/it]

Epoch: 6/100 Loss: 144.58962154388428 Time: 143.72755607199997


Training: 100%|██████████| 11/11 [02:28<00:00, 13.51s/it]
Epochs:   7%|▋         | 7/100 [17:50<3:54:19, 151.18s/it]

Epoch: 7/100 Loss: 136.26745319366455 Time: 141.45053883200035


Training: 100%|██████████| 11/11 [02:37<00:00, 14.29s/it]
Epochs:   8%|▊         | 8/100 [20:27<3:54:44, 153.09s/it]

Epoch: 8/100 Loss: 131.58578300476074 Time: 151.79753036500006


Training: 100%|██████████| 11/11 [02:33<00:00, 13.92s/it]
Epochs:   9%|▉         | 9/100 [23:00<3:52:12, 153.11s/it]

Epoch: 9/100 Loss: 125.48340797424316 Time: 153.1508325170007


Training: 100%|██████████| 11/11 [02:29<00:00, 13.58s/it]
Epochs:  10%|█         | 10/100 [25:30<3:47:55, 151.95s/it]

Epoch: 10/100 Loss: 120.37688446044922 Time: 149.34733808199962


Training: 100%|██████████| 11/11 [02:26<00:00, 13.31s/it]
Epochs:  11%|█         | 11/100 [27:56<3:42:51, 150.24s/it]

Epoch: 11/100 Loss: 115.4372091293335 Time: 146.36212526500003


Training: 100%|██████████| 11/11 [02:29<00:00, 13.56s/it]
Epochs:  12%|█▏        | 12/100 [30:25<3:39:51, 149.90s/it]

Epoch: 12/100 Loss: 112.33608341217041 Time: 149.13731325200024


Training: 100%|██████████| 11/11 [02:30<00:00, 13.72s/it]
Epochs:  13%|█▎        | 13/100 [32:56<3:37:49, 150.22s/it]

Epoch: 13/100 Loss: 109.68339157104492 Time: 150.94518325799982


Training: 100%|██████████| 11/11 [02:27<00:00, 13.43s/it]
Epochs:  14%|█▍        | 14/100 [35:24<3:34:14, 149.47s/it]

Epoch: 14/100 Loss: 106.67328548431396 Time: 147.74703666299956


Training: 100%|██████████| 11/11 [02:26<00:00, 13.36s/it]
Epochs:  15%|█▌        | 15/100 [37:51<3:30:40, 148.71s/it]

Epoch: 15/100 Loss: 102.66371250152588 Time: 146.95557555800042


Training: 100%|██████████| 11/11 [02:25<00:00, 13.23s/it]
Epochs:  16%|█▌        | 16/100 [40:16<3:26:52, 147.76s/it]

Epoch: 16/100 Loss: 99.7813720703125 Time: 145.55681887500032


Training: 100%|██████████| 11/11 [02:25<00:00, 13.20s/it]
Epochs:  17%|█▋        | 17/100 [42:41<3:23:19, 146.98s/it]

Epoch: 17/100 Loss: 98.39365577697754 Time: 145.1516074359988


Training: 100%|██████████| 11/11 [02:25<00:00, 13.27s/it]
Epochs:  18%|█▊        | 18/100 [45:07<3:20:26, 146.67s/it]

Epoch: 18/100 Loss: 94.74980735778809 Time: 145.9460554310008


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  19%|█▉        | 19/100 [47:31<3:16:54, 145.85s/it]

Epoch: 19/100 Loss: 93.99800109863281 Time: 143.94807640399995


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  20%|██        | 20/100 [49:55<3:13:42, 145.28s/it]

Epoch: 20/100 Loss: 91.30178642272949 Time: 143.95008503499957


Training: 100%|██████████| 11/11 [02:23<00:00, 13.05s/it]
Epochs:  21%|██        | 21/100 [52:19<3:10:36, 144.77s/it]

Epoch: 21/100 Loss: 90.88211965560913 Time: 143.5630422129998


Training: 100%|██████████| 11/11 [02:24<00:00, 13.12s/it]
Epochs:  22%|██▏       | 22/100 [54:43<3:08:01, 144.64s/it]

Epoch: 22/100 Loss: 88.1833610534668 Time: 144.33311950500138


Training: 100%|██████████| 11/11 [02:22<00:00, 13.00s/it]
Epochs:  23%|██▎       | 23/100 [57:06<3:04:58, 144.13s/it]

Epoch: 23/100 Loss: 85.94932508468628 Time: 142.96114570100144


Training: 100%|██████████| 11/11 [02:25<00:00, 13.25s/it]
Epochs:  24%|██▍       | 24/100 [59:32<3:03:11, 144.62s/it]

Epoch: 24/100 Loss: 84.35712623596191 Time: 145.76880521699968


Training: 100%|██████████| 11/11 [02:23<00:00, 13.03s/it]
Epochs:  25%|██▌       | 25/100 [1:01:55<3:00:17, 144.24s/it]

Epoch: 25/100 Loss: 82.816978931427 Time: 143.33630346099926


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  26%|██▌       | 26/100 [1:04:19<2:57:47, 144.15s/it]

Epoch: 26/100 Loss: 81.3454179763794 Time: 143.9550864339999


Training: 100%|██████████| 11/11 [02:23<00:00, 13.01s/it]
Epochs:  27%|██▋       | 27/100 [1:06:42<2:55:01, 143.85s/it]

Epoch: 27/100 Loss: 79.81549215316772 Time: 143.1492765160001


Training: 100%|██████████| 11/11 [02:24<00:00, 13.10s/it]
Epochs:  28%|██▊       | 28/100 [1:09:07<2:52:43, 143.94s/it]

Epoch: 28/100 Loss: 78.5211877822876 Time: 144.14263246100018


Training: 100%|██████████| 11/11 [02:21<00:00, 12.89s/it]
Epochs:  29%|██▉       | 29/100 [1:11:28<2:49:32, 143.28s/it]

Epoch: 29/100 Loss: 78.16467905044556 Time: 141.74493236999842


Training: 100%|██████████| 11/11 [02:23<00:00, 13.03s/it]
Epochs:  30%|███       | 30/100 [1:13:52<2:47:11, 143.31s/it]

Epoch: 30/100 Loss: 75.2542634010315 Time: 143.36048304399992


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  31%|███       | 31/100 [1:16:16<2:45:01, 143.50s/it]

Epoch: 31/100 Loss: 74.61617612838745 Time: 143.94991315800144


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  32%|███▏      | 32/100 [1:18:40<2:42:47, 143.63s/it]

Epoch: 32/100 Loss: 73.06675481796265 Time: 143.95072098399942


Training: 100%|██████████| 11/11 [02:25<00:00, 13.25s/it]
Epochs:  33%|███▎      | 33/100 [1:21:05<2:41:06, 144.28s/it]

Epoch: 33/100 Loss: 71.49845600128174 Time: 145.77232242399987


Training: 100%|██████████| 11/11 [02:23<00:00, 13.08s/it]
Epochs:  34%|███▍      | 34/100 [1:23:29<2:38:35, 144.17s/it]

Epoch: 34/100 Loss: 69.68909931182861 Time: 143.93036129100074


Training: 100%|██████████| 11/11 [03:33<00:00, 19.43s/it]
Epochs:  35%|███▌      | 35/100 [1:27:03<2:58:47, 165.04s/it]

Epoch: 35/100 Loss: 68.33493757247925 Time: 213.73481736400026


Training: 100%|██████████| 11/11 [02:21<00:00, 12.90s/it]
Epochs:  36%|███▌      | 36/100 [1:29:25<2:48:39, 158.11s/it]

Epoch: 36/100 Loss: 66.88784122467041 Time: 141.94153070999891


Training: 100%|██████████| 11/11 [02:24<00:00, 13.18s/it]
Epochs:  37%|███▋      | 37/100 [1:31:50<2:41:52, 154.16s/it]

Epoch: 37/100 Loss: 65.86032819747925 Time: 144.9506470010001


Training: 100%|██████████| 11/11 [02:23<00:00, 13.03s/it]
Epochs:  38%|███▊      | 38/100 [1:34:13<2:35:57, 150.92s/it]

Epoch: 38/100 Loss: 64.8126916885376 Time: 143.35861033000037


Training: 100%|██████████| 11/11 [02:23<00:00, 13.05s/it]
Epochs:  39%|███▉      | 39/100 [1:36:37<2:31:11, 148.71s/it]

Epoch: 39/100 Loss: 64.01849365234375 Time: 143.55487915199956


Training: 100%|██████████| 11/11 [02:25<00:00, 13.23s/it]
Epochs:  40%|████      | 40/100 [1:39:02<2:27:45, 147.76s/it]

Epoch: 40/100 Loss: 61.04585409164429 Time: 145.54483196600086


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  41%|████      | 41/100 [1:41:26<2:24:10, 146.62s/it]

Epoch: 41/100 Loss: 61.55712938308716 Time: 143.95245296800022


Training: 100%|██████████| 11/11 [02:24<00:00, 13.10s/it]
Epochs:  42%|████▏     | 42/100 [1:43:50<2:21:01, 145.88s/it]

Epoch: 42/100 Loss: 60.35132694244385 Time: 144.15617001200008


Training: 100%|██████████| 11/11 [02:25<00:00, 13.21s/it]
Epochs:  43%|████▎     | 43/100 [1:46:16<2:18:25, 145.72s/it]

Epoch: 43/100 Loss: 57.86133861541748 Time: 145.33605863699995


Training: 100%|██████████| 11/11 [02:23<00:00, 13.03s/it]
Epochs:  44%|████▍     | 44/100 [1:48:39<2:15:20, 145.01s/it]

Epoch: 44/100 Loss: 58.913119316101074 Time: 143.34850611399997


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  45%|████▌     | 45/100 [1:51:03<2:12:38, 144.69s/it]

Epoch: 45/100 Loss: 56.88757658004761 Time: 143.9613602520003


Training: 100%|██████████| 11/11 [02:25<00:00, 13.23s/it]
Epochs:  46%|████▌     | 46/100 [1:53:29<2:10:27, 144.95s/it]

Epoch: 46/100 Loss: 56.830570697784424 Time: 145.55745571700027


Training: 100%|██████████| 11/11 [02:23<00:00, 13.07s/it]
Epochs:  47%|████▋     | 47/100 [1:55:52<2:07:43, 144.59s/it]

Epoch: 47/100 Loss: 55.858739376068115 Time: 143.73480563799967


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  48%|████▊     | 48/100 [1:58:16<2:05:08, 144.40s/it]

Epoch: 48/100 Loss: 55.68235969543457 Time: 143.95834909699988


Training: 100%|██████████| 11/11 [02:24<00:00, 13.12s/it]
Epochs:  49%|████▉     | 49/100 [2:00:41<2:02:43, 144.39s/it]

Epoch: 49/100 Loss: 54.73190784454346 Time: 144.35629430900008


Training: 100%|██████████| 11/11 [02:23<00:00, 13.03s/it]
Epochs:  50%|█████     | 50/100 [2:03:04<2:00:03, 144.07s/it]

Epoch: 50/100 Loss: 53.03669834136963 Time: 143.3413040430005


Training: 100%|██████████| 11/11 [02:24<00:00, 13.12s/it]
Epochs:  51%|█████     | 51/100 [2:05:28<1:57:43, 144.16s/it]

Epoch: 51/100 Loss: 52.768497943878174 Time: 144.34982066200064


Training: 100%|██████████| 11/11 [02:24<00:00, 13.11s/it]
Epochs:  52%|█████▏    | 52/100 [2:07:53<1:55:19, 144.16s/it]

Epoch: 52/100 Loss: 51.397419452667236 Time: 144.16469067099933


Training: 100%|██████████| 11/11 [02:24<00:00, 13.16s/it]
Epochs:  53%|█████▎    | 53/100 [2:10:17<1:53:03, 144.33s/it]

Epoch: 53/100 Loss: 50.568994998931885 Time: 144.74154529800035


Training: 100%|██████████| 11/11 [02:24<00:00, 13.10s/it]
Epochs:  54%|█████▍    | 54/100 [2:12:41<1:50:36, 144.28s/it]

Epoch: 54/100 Loss: 51.565826416015625 Time: 144.15420414599976


Training: 100%|██████████| 11/11 [02:23<00:00, 13.01s/it]
Epochs:  55%|█████▌    | 55/100 [2:15:05<1:47:57, 143.94s/it]

Epoch: 55/100 Loss: 49.34139013290405 Time: 143.1570683810005


Training: 100%|██████████| 11/11 [02:23<00:00, 13.05s/it]
Epochs:  56%|█████▌    | 56/100 [2:17:28<1:45:28, 143.83s/it]

Epoch: 56/100 Loss: 48.997844219207764 Time: 143.55298024700096


Training: 100%|██████████| 11/11 [02:23<00:00, 13.01s/it]
Epochs:  57%|█████▋    | 57/100 [2:19:51<1:42:55, 143.62s/it]

Epoch: 57/100 Loss: 47.983006954193115 Time: 143.14977326500048


Training: 100%|██████████| 11/11 [02:24<00:00, 13.10s/it]
Epochs:  58%|█████▊    | 58/100 [2:22:15<1:40:38, 143.78s/it]

Epoch: 58/100 Loss: 47.88825345039368 Time: 144.144214603999


Training: 100%|██████████| 11/11 [02:23<00:00, 13.07s/it]
Epochs:  59%|█████▉    | 59/100 [2:24:39<1:38:14, 143.77s/it]

Epoch: 59/100 Loss: 47.33168363571167 Time: 143.75191736700071


Training: 100%|██████████| 11/11 [02:22<00:00, 12.96s/it]
Epochs:  60%|██████    | 60/100 [2:27:02<1:35:36, 143.41s/it]

Epoch: 60/100 Loss: 46.0676589012146 Time: 142.55160058


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  61%|██████    | 61/100 [2:29:26<1:33:19, 143.57s/it]

Epoch: 61/100 Loss: 46.30667209625244 Time: 143.94479861699983


Training: 100%|██████████| 11/11 [02:23<00:00, 13.05s/it]
Epochs:  62%|██████▏   | 62/100 [2:31:49<1:30:55, 143.56s/it]

Epoch: 62/100 Loss: 45.711820125579834 Time: 143.54916159999993


Training: 100%|██████████| 11/11 [02:24<00:00, 13.10s/it]
Epochs:  63%|██████▎   | 63/100 [2:34:13<1:28:38, 143.74s/it]

Epoch: 63/100 Loss: 44.70859217643738 Time: 144.15390620900143


Training: 100%|██████████| 11/11 [02:24<00:00, 13.10s/it]
Epochs:  64%|██████▍   | 64/100 [2:36:38<1:26:19, 143.87s/it]

Epoch: 64/100 Loss: 44.44026708602905 Time: 144.15842691899888


Training: 100%|██████████| 11/11 [02:26<00:00, 13.30s/it]
Epochs:  65%|██████▌   | 65/100 [2:39:04<1:24:21, 144.61s/it]

Epoch: 65/100 Loss: 46.26851439476013 Time: 146.34248988400032


Training: 100%|██████████| 11/11 [02:23<00:00, 13.09s/it]
Epochs:  66%|██████▌   | 66/100 [2:41:28<1:21:50, 144.42s/it]

Epoch: 66/100 Loss: 43.895299196243286 Time: 143.96088218600016


Training: 100%|██████████| 11/11 [02:26<00:00, 13.30s/it]
Epochs:  67%|██████▋   | 67/100 [2:43:54<1:19:44, 144.99s/it]

Epoch: 67/100 Loss: 44.14952731132507 Time: 146.3356687129999


Training: 100%|██████████| 11/11 [02:24<00:00, 13.16s/it]
Epochs:  68%|██████▊   | 68/100 [2:46:19<1:17:17, 144.93s/it]

Epoch: 68/100 Loss: 45.12539052963257 Time: 144.76964447699902


Training: 100%|██████████| 11/11 [02:25<00:00, 13.25s/it]
Epochs:  69%|██████▉   | 69/100 [2:48:45<1:15:00, 145.17s/it]

Epoch: 69/100 Loss: 44.47179961204529 Time: 145.74066024000058


Training: 100%|██████████| 11/11 [02:26<00:00, 13.32s/it]
Epochs:  70%|███████   | 70/100 [2:51:11<1:12:47, 145.58s/it]

Epoch: 70/100 Loss: 43.424460887908936 Time: 146.546852554


Training: 100%|██████████| 11/11 [02:58<00:00, 16.23s/it]
Epochs:  71%|███████   | 71/100 [2:54:10<1:15:09, 155.48s/it]

Epoch: 71/100 Loss: 41.39010691642761 Time: 178.58589285299968


Training: 100%|██████████| 11/11 [02:28<00:00, 13.47s/it]
Epochs:  72%|███████▏  | 72/100 [2:56:38<1:11:31, 153.28s/it]

Epoch: 72/100 Loss: 40.778159379959106 Time: 148.1395442510002


Training: 100%|██████████| 11/11 [02:27<00:00, 13.38s/it]
Epochs:  73%|███████▎  | 73/100 [2:59:05<1:08:08, 151.44s/it]

Epoch: 73/100 Loss: 40.4101505279541 Time: 147.14302888299972


Training: 100%|██████████| 11/11 [02:28<00:00, 13.52s/it]
Epochs:  74%|███████▍  | 74/100 [3:01:34<1:05:16, 150.63s/it]

Epoch: 74/100 Loss: 38.96952676773071 Time: 148.75212698500036


Training: 100%|██████████| 11/11 [02:25<00:00, 13.25s/it]
Epochs:  75%|███████▌  | 75/100 [3:04:00<1:02:09, 149.17s/it]

Epoch: 75/100 Loss: 39.921809673309326 Time: 145.74918646199876


Training: 100%|██████████| 11/11 [02:27<00:00, 13.38s/it]
Epochs:  76%|███████▌  | 76/100 [3:06:27<59:25, 148.56s/it]  

Epoch: 76/100 Loss: 38.508291244506836 Time: 147.1506671060015


Training: 100%|██████████| 11/11 [02:26<00:00, 13.30s/it]
Epochs:  77%|███████▋  | 77/100 [3:08:53<56:41, 147.90s/it]

Epoch: 77/100 Loss: 38.621994733810425 Time: 146.36125414400158


Training: 100%|██████████| 11/11 [02:26<00:00, 13.32s/it]
Epochs:  78%|███████▊  | 78/100 [3:11:20<54:04, 147.49s/it]

Epoch: 78/100 Loss: 37.165539503097534 Time: 146.53794005300006


Training: 100%|██████████| 11/11 [02:25<00:00, 13.21s/it]
Epochs:  79%|███████▉  | 79/100 [3:13:45<51:23, 146.85s/it]

Epoch: 79/100 Loss: 38.88856554031372 Time: 145.35223595299976


Training: 100%|██████████| 11/11 [02:26<00:00, 13.34s/it]
Epochs:  80%|████████  | 80/100 [3:16:12<48:56, 146.82s/it]

Epoch: 80/100 Loss: 37.89687418937683 Time: 146.755250143


Training: 100%|██████████| 11/11 [02:27<00:00, 13.41s/it]
Epochs:  81%|████████  | 81/100 [3:18:39<46:33, 147.04s/it]

Epoch: 81/100 Loss: 36.0511200428009 Time: 147.54143661600028


Training: 100%|██████████| 11/11 [02:28<00:00, 13.52s/it]
Epochs:  82%|████████▏ | 82/100 [3:21:08<44:15, 147.55s/it]

Epoch: 82/100 Loss: 37.0791335105896 Time: 148.75846038599775


Training: 100%|██████████| 11/11 [02:26<00:00, 13.29s/it]
Epochs:  83%|████████▎ | 83/100 [3:23:34<41:41, 147.14s/it]

Epoch: 83/100 Loss: 36.916221380233765 Time: 146.15459200999976


Training: 100%|██████████| 11/11 [02:26<00:00, 13.29s/it]
Epochs:  84%|████████▍ | 84/100 [3:26:00<39:09, 146.84s/it]

Epoch: 84/100 Loss: 36.343759298324585 Time: 146.16000002099827


Training: 100%|██████████| 11/11 [02:28<00:00, 13.54s/it]
Epochs:  85%|████████▌ | 85/100 [3:28:29<36:52, 147.48s/it]

Epoch: 85/100 Loss: 36.70704460144043 Time: 148.95706244900066


Training: 100%|██████████| 11/11 [02:26<00:00, 13.30s/it]
Epochs:  86%|████████▌ | 86/100 [3:30:56<34:19, 147.14s/it]

Epoch: 86/100 Loss: 36.3805627822876 Time: 146.33498027799942


Training: 100%|██████████| 11/11 [02:27<00:00, 13.45s/it]
Epochs:  87%|████████▋ | 87/100 [3:33:24<31:55, 147.38s/it]

Epoch: 87/100 Loss: 35.361780405044556 Time: 147.9396980800011


Training: 100%|██████████| 11/11 [02:26<00:00, 13.32s/it]
Epochs:  88%|████████▊ | 88/100 [3:35:50<29:25, 147.13s/it]

Epoch: 88/100 Loss: 34.6097776889801 Time: 146.54667494599926


Training: 100%|██████████| 11/11 [02:29<00:00, 13.56s/it]
Epochs:  89%|████████▉ | 89/100 [3:38:19<27:05, 147.73s/it]

Epoch: 89/100 Loss: 34.507757902145386 Time: 149.1439220989996


Training: 100%|██████████| 11/11 [02:28<00:00, 13.49s/it]
Epochs:  90%|█████████ | 90/100 [3:40:48<24:39, 147.92s/it]

Epoch: 90/100 Loss: 35.13728404045105 Time: 148.36637045899988


Training: 100%|██████████| 11/11 [02:27<00:00, 13.41s/it]
Epochs:  91%|█████████ | 91/100 [3:43:15<22:10, 147.81s/it]

Epoch: 91/100 Loss: 33.24251461029053 Time: 147.54279035200307


Training: 100%|██████████| 11/11 [02:26<00:00, 13.29s/it]
Epochs:  92%|█████████▏| 92/100 [3:45:41<19:38, 147.31s/it]

Epoch: 92/100 Loss: 33.857255935668945 Time: 146.14329550200273


Training: 100%|██████████| 11/11 [02:25<00:00, 13.25s/it]
Epochs:  93%|█████████▎| 93/100 [3:48:07<17:07, 146.84s/it]

Epoch: 93/100 Loss: 34.68957710266113 Time: 145.75317637899934


Training: 100%|██████████| 11/11 [02:28<00:00, 13.52s/it]
Epochs:  94%|█████████▍| 94/100 [3:50:36<14:44, 147.41s/it]

Epoch: 94/100 Loss: 33.44966673851013 Time: 148.7479594630022


Training: 100%|██████████| 11/11 [02:26<00:00, 13.32s/it]
Epochs:  95%|█████████▌| 95/100 [3:53:02<12:15, 147.16s/it]

Epoch: 95/100 Loss: 33.846524715423584 Time: 146.5574012620018


Training: 100%|██████████| 11/11 [02:26<00:00, 13.34s/it]
Epochs:  96%|█████████▌| 96/100 [3:55:29<09:48, 147.04s/it]

Epoch: 96/100 Loss: 34.36861181259155 Time: 146.75017255000057


Training: 100%|██████████| 11/11 [02:27<00:00, 13.45s/it]
Epochs:  97%|█████████▋| 97/100 [3:57:57<07:21, 147.31s/it]

Epoch: 97/100 Loss: 33.09628391265869 Time: 147.95004029899792


Training: 100%|██████████| 11/11 [02:27<00:00, 13.43s/it]
Epochs:  98%|█████████▊| 98/100 [4:00:25<04:54, 147.44s/it]

Epoch: 98/100 Loss: 32.70990014076233 Time: 147.74559151200083


Training: 100%|██████████| 11/11 [02:25<00:00, 13.27s/it]
Epochs:  99%|█████████▉| 99/100 [4:02:51<02:26, 146.99s/it]

Epoch: 99/100 Loss: 32.22913980484009 Time: 145.9459352369995


Training: 100%|██████████| 11/11 [02:30<00:00, 13.65s/it]
Epochs: 100%|██████████| 100/100 [4:05:21<00:00, 147.22s/it]

Epoch: 100/100 Loss: 32.75156044960022 Time: 150.14777073699952





In [69]:
torch.save(model,"./model.pt")

In [9]:
model=torch.load("./model.pt")

  model=torch.load("./model.pt")


In [11]:
data,sr=librosa.load("/mnt/d/Programs/Python/PW/projects/asteroid/zip-hindi-2k/4.wav",sr=8000,mono=True)

In [13]:
len(data)/8000

5.003625

In [14]:
data=librosa.util.normalize(data)
data=data[:8000*5]
data=torch.from_numpy(data.reshape(5,8,1000))

In [16]:
out=model(data.to('cuda'))

In [17]:
out.shape

torch.Size([5, 2, 8, 1000])

In [21]:
# out=out.cpu().detach().numpy()
print(out[:,0,:,:].reshape(-1).shape)

(40000,)


In [22]:
from IPython.display import Audio

In [23]:
Audio(out[:,0,:,:].reshape(-1),rate=8000)

In [24]:
Audio(out[:,1,:,:].reshape(-1),rate=8000)