In [1]:
%load_ext autoreload
%autoreload 2
import os
import argparse
import json
from tqdm import tqdm
from copy import deepcopy

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Subset
# from torch.utils.tensorboard import SummaryWriter

import time

import random
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

from scipy.io.wavfile import write as wavwrite
from scipy.io.wavfile import read as wavread

from dataset import load_CleanNoisyPairDataset
from util import rescale, find_max_epoch, print_size, sampling
from network import CleanUNet

In [2]:
def denoise(output_directory, ckpt_iter, subset, num, gpu, opt, dump=False):
    """
    Denoise audio

    Parameters:
    output_directory (str):         save generated speeches to this path
    ckpt_iter (int or 'max'):       the pretrained checkpoint to be loaded; 
                                    automitically selects the maximum iteration if 'max' is selected
    subset (str):                   training, testing, validation
    num (int):                      number of samples to use in inference, use all if 0.
    gpu (bool):                     whether to run on gpu
    opt (bool):                     wheter to use optimazation scheme
    dump (bool):                    whether save enhanced (denoised) audio
    """

    # setup local experiment path
    exp_path = train_config["exp_path"]
    print('exp_path:', exp_path)

    # load data
    loader_config = deepcopy(trainset_config)
    loader_config["crop_length_sec"] = 0
    dataloader = load_CleanNoisyPairDataset(
        **loader_config, 
        subset=subset,
        batch_size=1, 
        num_gpus=1
    )
    if num == 0:
        num = len(dataloader)

    # predefine model
    device = 'cuda' if gpu else 'cpu'
    if(gpu):
        assert torch.cuda.is_available()
    net = CleanUNet(**network_config, **opt_config).to(device)
    print_size(net)

    # load checkpoint
    ckpt_directory = os.path.join(train_config["log"]["directory"], exp_path, 'checkpoint')
    if ckpt_iter == 'max':
        ckpt_iter = find_max_epoch(ckpt_directory)
    if ckpt_iter != 'pretrained':
        ckpt_iter = int(ckpt_iter)
    model_path = os.path.join(ckpt_directory, '{}.pkl'.format(ckpt_iter))
    checkpoint = torch.load(model_path, map_location='cpu')
    net.load_state_dict(checkpoint['model_state_dict'])
    net.eval()

    # get output directory ready
    if ckpt_iter == "pretrained":
        speech_directory = os.path.join(output_directory, exp_path, 'speech', ckpt_iter) 
    else:
        speech_directory = os.path.join(output_directory, exp_path, 'speech', '{}k'.format(ckpt_iter//1000))
    if dump and not os.path.isdir(speech_directory):
        os.makedirs(speech_directory)
        os.chmod(speech_directory, 0o775)
    print("speech_directory: ", speech_directory, flush=True)

    # inference
    all_generated_audio = []
    all_clean_audio = []
    sortkey = lambda name: '_'.join(name.split('/')[-1].split('_')[1:])

    avg_time = 0
    iter = 1
    with tqdm(total = num, disable =True) as pbar:
        for clean_audio, noisy_audio, fileid in dataloader:
            # if not gpu:
                # clean_audio, noisy_audio = clean_audio.to('cpu'), noisy_audio.to('cpu')
            # else:
                # noisy_audio = noisy_audio.cuda()
            clean_audio, noisy_audio = clean_audio.to(device), noisy_audio.to(device)

            filename = sortkey(fileid[0][0])
            print("input shape", noisy_audio.shape)
            LENGTH = len(noisy_audio[0].squeeze())
            start_time = time.time()
            generated_audio = sampling(net, noisy_audio)
            
            if dump:
                wavwrite(os.path.join(speech_directory, 'enhanced_{}'.format(filename)), 
                        trainset_config["sample_rate"],
                        generated_audio[0].squeeze().cpu().numpy())
            else:
                all_clean_audio.append(clean_audio[0].squeeze().cpu().numpy())
                all_generated_audio.append(generated_audio[0].squeeze().cpu().numpy())
                
            end_time = time.time()
            elapsed_time = end_time - start_time
            avg_time += elapsed_time
            pbar.set_postfix({"Average Time": f"{avg_time / iter:.6f}"})
            pbar.update(1)
  
            
            if iter == num:
                break
            iter+=1

    print("Average time: ", avg_time / iter)
    return all_clean_audio, all_generated_audio

In [3]:
_config = "configs/valentini.json"
ckpt_iter = 85000
subset = "testing"
num = 1
gpu = False
opt = True

In [4]:
with open(_config) as f:
    data = f.read()
config = json.loads(data)
gen_config              = config["gen_config"]
global network_config
network_config          = config["network_config"]      # to define wavenet
global train_config
train_config            = config["train_config"]        # train config
global trainset_config
trainset_config         = config["trainset_config"]     # to read trainset configurations
if opt==True:
    global opt_config
    opt_config         = config["opt_config"] 
else:
    opt_config          = {}

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True


In [5]:
with torch.no_grad():
    denoise(gen_config["output_directory"],
            subset=subset,
            ckpt_iter=ckpt_iter,
            num=num,
            gpu=gpu,
            opt=opt,
            dump=True)

exp_path: valentini
CleanUNet Parameters: 46.071937M;  
speech_directory:  ./exp/valentini/speech/85k
input shape torch.Size([1, 1, 69174])
encoder start:  torch.Size([1, 1, 69374])
torch.Size([1, 64, 34686])
torch.Size([1, 128, 17342])
torch.Size([1, 256, 8670])
torch.Size([1, 512, 4334])
torch.Size([1, 768, 2166])
torch.Size([1, 768, 1082])
torch.Size([1, 768, 540])
torch.Size([1, 768, 269])
decoder start:  torch.Size([1, 768, 269])
torch.Size([1, 768, 540])
torch.Size([1, 768, 1082])
torch.Size([1, 768, 2166])
torch.Size([1, 512, 4334])
torch.Size([1, 256, 8670])
torch.Size([1, 128, 17342])
torch.Size([1, 64, 34686])
torch.Size([1, 1, 69374])
Average time:  1.0802733898162842


In [33]:
net = CleanUNet(**network_config, **opt_config).to('cuda')
from torchsummary import summary
print(summary(net,input_size = (1,147266)))

torch.Size([2, 768, 574])
torch.Size([2, 768, 1150])
torch.Size([2, 768, 1150])
torch.Size([2, 768, 2302])
torch.Size([2, 768, 2302])
torch.Size([2, 768, 4606])
torch.Size([2, 768, 4606])
torch.Size([2, 512, 9214])
torch.Size([2, 512, 9214])
torch.Size([2, 256, 18430])
torch.Size([2, 256, 18430])
torch.Size([2, 128, 36862])
torch.Size([2, 128, 36862])
torch.Size([2, 64, 73726])
torch.Size([2, 64, 73726])
torch.Size([2, 1, 147454])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1            [-1, 64, 73726]             320
              ReLU-2            [-1, 64, 73726]               0
            Conv1d-3           [-1, 128, 73726]           8,320
               GLU-4            [-1, 64, 73726]               0
            Conv1d-5           [-1, 128, 36862]          32,896
              ReLU-6           [-1, 128, 36862]               0
            Conv1d-7           [-1, 256, 36862]     

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [13]:
import torch
import torch.nn as nn
import numpy as np
channels_input=1
channels_output=1
channels_H=64
max_H=768
encoder_n_layers=8
kernel_size=4
stride=2

class AverageChannels(nn.Module):
    def __init__(self, factor):
        super(AverageChannels, self).__init__()
        self.factor = factor

    def forward(self, x):
        batch_size, channels, length = x.size()
        # Reshape the input to (batch_size, channels // factor, factor, length)
        x = x.view(batch_size, channels // self.factor, self.factor, length)
        # Average over the factor dimension
        x = x.mean(dim=2)
        return x



x = torch.randn(1,1,48126)
enc = [nn.Conv1d(channels_input, channels_H, kernel_size, stride),
                nn.ReLU(),
                nn.Conv1d(channels_H, channels_H * 2, 1), 
                nn.GLU(dim=1)]
# channels_H = channels_H//2
dec = [nn.Conv1d(channels_H, channels_H * 2, 1), 
                        nn.GLU(dim=1),
                        nn.ConvTranspose1d(channels_H, channels_output, kernel_size, stride)]
dec1 = [nn.Conv1d(channels_H, channels_H * 2, 1, padding=1), 
                        nn.GLU(dim=1),
                        nn.Upsample(scale_factor=stride, mode='linear', align_corners=False),
                        AverageChannels(factor=4),
                        nn.Conv1d(channels_H//4, channels_output, kernel_size,stride = 1, padding='same'),
                        nn.ReLU()]
print(x.shape)
for e in enc:
    x = e(x)
    print(x.shape)
x = torch.randn(1,64,24062)
print("decoder")
print(x.shape)
for d in dec:
    x = d(x)
    print(x.shape)

x = torch.randn(1,64,24062)
print("decoder1")
print(x.shape)
for d in dec1:
    x = d(x)
    print(x.shape)

torch.Size([1, 1, 48126])
torch.Size([1, 64, 24062])
torch.Size([1, 64, 24062])
torch.Size([1, 128, 24062])
torch.Size([1, 64, 24062])
decoder
torch.Size([1, 64, 24062])
torch.Size([1, 128, 24062])
torch.Size([1, 64, 24062])
torch.Size([1, 1, 48126])
decoder1
torch.Size([1, 64, 24062])
torch.Size([1, 128, 24064])
torch.Size([1, 64, 24064])
torch.Size([1, 64, 48128])
torch.Size([1, 16, 48128])
torch.Size([1, 1, 48128])
torch.Size([1, 1, 48128])
