In [2]:
import math
import sys
import time
from pathlib import Path

import librosa
import soundfile
import torch
import torch.nn as nn
from IPython.display import Audio
from torchsummary import summary

from conformer import ConformerBlock

In [3]:
# model = nn.Sequential(
#     ConformerBlock(dim=64),
#     nn.Conv1d(16, 32, 1),
#     nn.Upsample(scale_factor=2)
# )

# summary(model, (16, 64))

In [4]:
# x = torch.randn(1, 16, 64)
# output = model(x)
# print(output.shape)

In [5]:
# def upscale(x, H, W):
#     B, N, C = x.size()
#     assert N == H*W
#     x = x.permute(0, 2, 1)
#     x = x.view(-1, C, H, W)
#     x = nn.PixelShuffle(2)(x)
#     B, C, H, W = x.size()
#     x = x.view(-1, C, H*W)
#     x = x.permute(0, 2, 1)
#     return x, H, W

In [6]:
def upconv(x, in_channel, out_channel, conv=None, scale_factor=4, verbose=True):
    """Project the channel axis with a 1x1 convolution, then up-sample the sequence axis.

    The input is laid out as ``(batch, length, channels)``. It is permuted to
    ``(batch, channels, length)`` for ``Conv1d``/``Upsample`` and permuted back
    before returning.

    Args:
        x: Tensor of shape ``(batch, length, in_channel)``.
        in_channel: Number of input channels for the fallback ``Conv1d``.
        out_channel: Number of output channels for the fallback ``Conv1d``.
        conv: Optional pre-built module applied in place of the fallback
            convolution. Pass a layer that is registered on the calling
            ``nn.Module`` so its weights are trained, saved and moved between
            devices with the model. When given, ``in_channel``/``out_channel``
            are not used.
        scale_factor: Multiplier applied to the sequence length by
            ``nn.Upsample`` (default 4, the original hard-coded value).
        verbose: When true (default), print the intermediate shapes.

    Returns:
        Tensor of shape ``(batch, length * scale_factor, channels_out)``.
    """
    if conv is None:
        # Fallback kept for backward compatibility: a *fresh, randomly
        # initialised* kernel-size-1 convolution is created on every call, so
        # its weights are never trained or saved. Prefer passing ``conv``.
        conv = nn.Conv1d(in_channel, out_channel, 1).to(x.device)

    x = x.permute(0, 2, 1)
    if verbose:
        print('upconv permuted: ', x.shape)
    x = conv(x)
    if verbose:
        print('conv1d: ', x.shape)
    x = nn.Upsample(scale_factor=scale_factor)(x)
    if verbose:
        print('upsample: ', x.shape)
    output = x.permute(0, 2, 1)

    return output

In [7]:
class Generator(nn.Module):
    """Conformer-based generator mapping a latent vector to a flat 1-D signal.

    Pipeline, for a batch of ``B`` latent vectors:

    1. ``Linear``: ``(B, latent_dim)`` -> ``(B, wave_length * embed_dim)``,
       reshaped to ``(B, wave_length, embed_dim)``.
    2. A stack of head ``ConformerBlock`` layers at width ``embed_dim``.
    3. Two up-sampling stages. Stage ``i`` applies a kernel-size-1 ``Conv1d``
       reducing the channels to ``embed_dim // 4**(i + 1)``, up-samples the
       sequence axis by 4 with ``nn.Upsample``, then runs a stack of
       ``ConformerBlock`` layers at the new width.
    4. The result is flattened to one dimension.

    The 1x1 convolutions are registered sub-modules (``self.upconvs``). Creating
    them inside ``forward`` would give fresh random weights on every pass that
    are never trained, never stored in ``state_dict`` and never moved by
    ``.to(device)``.

    Assumes ``ConformerBlock(dim=d)`` maps ``(B, N, d)`` to ``(B, N, d)``
    (consistent with the shapes this notebook prints) -- TODO confirm.

    Args:
        latent_dim: Size of the latent input vector.
        embed_dim: Channel width after the first linear layer; must be >= 16 so
            that the narrowest stage (``embed_dim // 16``) has at least one channel.
        wave_length: Sequence length before up-sampling.
        verbose: When true (default), print the tensor shape after each step.

    Raises:
        ValueError: If ``embed_dim // 16`` is zero.
    """

    def __init__(self, latent_dim=100, embed_dim=16, wave_length=int(22050/4), verbose=True):
        super().__init__()

        if embed_dim // 16 < 1:
            raise ValueError(
                "embed_dim must be at least 16 (got {}); the last stage uses "
                "embed_dim // 16 channels".format(embed_dim)
            )

        self.latent_dim = latent_dim
        self.wave_length = wave_length
        self.embed_dim = embed_dim
        self.verbose = verbose

        self.l1 = nn.Linear(latent_dim, wave_length * embed_dim)

        # Head stack; append more ConformerBlock(dim=embed_dim) entries to deepen it.
        self.blocks = nn.ModuleList([
            ConformerBlock(dim=embed_dim),
        ])

        # One inner ModuleList per up-sampling stage, at that stage's channel width.
        stage_dims = [embed_dim // 4, embed_dim // 16]
        self.upsample_blocks = nn.ModuleList([
            nn.ModuleList([ConformerBlock(dim=dim)]) for dim in stage_dims
        ])

        # Channel-reducing 1x1 convolutions, one per stage. Built after the
        # modules above so their random initialisation order is unchanged.
        in_dims = [embed_dim] + stage_dims[:-1]
        self.upconvs = nn.ModuleList([
            nn.Conv1d(in_dim, out_dim, 1) for in_dim, out_dim in zip(in_dims, stage_dims)
        ])
        self.upsample = nn.Upsample(scale_factor=4)

    def _trace(self, *args):
        """Print a shape-trace line when ``verbose`` is enabled."""
        if self.verbose:
            print(*args)

    def _upconv(self, x, conv):
        """Apply ``conv`` and x4 up-sampling to ``x`` laid out as ``(B, N, C)``."""
        # Conv1d / Upsample operate on (B, C, N), so swap the last two axes.
        x = x.permute(0, 2, 1)
        self._trace('upconv permuted: ', x.shape)
        x = conv(x)
        self._trace('conv1d: ', x.shape)
        x = self.upsample(x)
        self._trace('upsample: ', x.shape)
        return x.permute(0, 2, 1)

    def forward(self, z):
        """Generate a signal from latent ``z``.

        Args:
            z: Tensor whose last dimension is ``latent_dim``; either a single
                vector ``(latent_dim,)`` or a batch ``(B, latent_dim)``.

        Returns:
            A 1-D tensor. NOTE(review): as in the original implementation the
            batch axis is flattened into the result, so a batch of ``B`` inputs
            yields one vector ``B`` times as long -- confirm this is intended.
        """
        self._trace("input: ", z.shape)
        x = self.l1(z)
        self._trace("Linear-1:", x.shape)
        x = x.view(-1, self.wave_length, self.embed_dim)
        self._trace("reshape:", x.shape)

        for index, blk in enumerate(self.blocks):
            x = blk(x)
            self._trace("Head-ConformerBlock-{}:".format(index), x.shape)

        for index, (conv, block) in enumerate(zip(self.upconvs, self.upsample_blocks)):
            x = self._upconv(x, conv)
            self._trace("upscale-{}:".format(index), x.shape)
            for j, b in enumerate(block):
                x = b(x)
                self._trace("ConformerBlock-{}-{}:".format(index, j), x.shape)

        # reshape (rather than view) also handles a non-contiguous permuted tensor.
        output = x.reshape(-1)

        return output

In [8]:
# Instantiate the generator with its default hyper-parameters
# (latent_dim=100, embed_dim=16, wave_length=int(22050/4)).
model = Generator()

In [9]:
# Layer-by-layer summary from torchsummary for a latent input of size 100 per sample.
summary(model, (100, ))

input:  torch.Size([2, 100])
Linear-1: torch.Size([2, 88192])
reshape: torch.Size([2, 5512, 16])
Head-ConformerBlock-0: torch.Size([2, 5512, 16])


RuntimeError: Given normalized_shape=[4], expected input with shape [*, 4], but got input of size[2, 5512, 16]

In [None]:
# Draw one latent vector and run it through the generator.
latent_dim = 100
z = torch.rand(latent_dim, dtype=torch.float32)
# Inference only: call the module itself (so registered hooks run) rather than
# .forward(), and skip autograd bookkeeping since the result is detached anyway.
# NOTE(review): if ConformerBlock uses dropout / batch-norm, call model.eval()
# before sampling -- confirm.
with torch.no_grad():
    output = model(z)
# Drop leading singleton axes (if any) and take an independent NumPy copy.
output = output.squeeze(0).squeeze(0).detach().numpy().copy()

In [122]:
# Sanity-check the size of the generated array before playing it back.
output.shape

(5632,)

In [123]:
# Play the generated array directly as audio samples at 22050 Hz.
# Alternative kept for reference: treat `output` as a mel spectrogram and
# invert it with librosa before playback.
# S = librosa.feature.inverse.mel_to_stft(output, sr=22050)
# y = librosa.griffinlim(S)
# print(y.shape)
Audio(output, rate=22050)

In [None]:
# Treat `output` as a mel spectrogram, invert it to a linear-frequency STFT
# magnitude, reconstruct a waveform with Griffin-Lim and save it as WAV.
# NOTE(review): mel_to_stft expects a 2-D (n_mels, frames) array -- confirm that
# `output` has that layout before relying on this cell.
S = librosa.feature.inverse.mel_to_stft(output, sr=22050)
y = librosa.griffinlim(S)
print(S.shape)
print(y.shape)
# Make sure the target directory exists; writing into a missing folder fails.
Path('./output').mkdir(parents=True, exist_ok=True)
soundfile.write('./output/result.wav', y, 22050, format="WAV")