# GAN training sample
Generator: 

## Import libraries and configure

In [1]:
import glob
import time

import numpy as np
import librosa
import torch
import torch.optim as optim
import torch.nn as nn
from torchsummary import summary
from tqdm import tqdm
from IPython.display import Audio

from models.Generator import Generator
from models.Discriminator import Discriminator

In [2]:
# --- Data ---------------------------------------------------------------
# Recursive glob pattern for the training audio files.
data_path = "./data/piano/**/*.wav"
# NOTE(review): sampling_rate is not passed to librosa.load in the loading
# cell, which therefore uses librosa's default rate — confirm intended.
sampling_rate = 16_000
# Length, in seconds, of each clip the recordings are split into.
generate_sounds_interval = 1

# --- Training -----------------------------------------------------------
batch_size = 32
z_dim = 100  # size of the generator's input noise vector
n_epoch = 100
lr = 1e-4
# learn_pecentage = 5  # (disabled) how many D updates to run per single G update

In [3]:
# Check whether a GPU is available and pick the compute device accordingly.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device:", device)

device: cpu


## Prepare Dataset

In [4]:
# Collect every file matched by data_path; with recursive=True the "**"
# component of the pattern also descends into sub-directories.
path_list = list(glob.glob(data_path, recursive=True))

In [5]:
# # Check the duration of each file
# s = 0
# for path in path_list:
#     y, sr = librosa.load(path)
#     sec = len(y)/sr
#     n = int(sec/generate_sounds_interval)
#     s += n
#     print("{}: {}[sec], {}".format(path, sec, n))
# print(s)

In [6]:
# Split every recording into consecutive clips of generate_sounds_interval seconds.
wave_data = []
for path in tqdm(path_list):
    # `sr` stays at notebook scope on purpose: GanDataset reads it later.
    raw_wave, sr = librosa.load(path)
    chunk_len = generate_sounds_interval * sr
    # The upper bound stops before any start whose clip would run past the
    # end of the recording, so a trailing partial clip is dropped.
    last_start_bound = len(raw_wave) - chunk_len + 1
    wave_data.extend(
        raw_wave[start:start + chunk_len]
        for start in range(0, last_start_bound, chunk_len)
    )

100%|██████████| 19/19 [01:08<00:00,  3.62s/it]


In [7]:
class GanDataset(torch.utils.data.Dataset):
    """Dataset of dB-scaled mel spectrograms computed once from audio clips.

    All clips are converted eagerly in ``__init__`` and kept in memory, so
    indexing is a plain list lookup.
    """

    def __init__(self, data, sr=22050):
        """Convert every waveform in ``data`` to a dB-scaled mel spectrogram.

        Args:
            data: Iterable of waveforms (the clips built by the chunking cell).
            sr: Sampling rate of the clips in Hz. It used to be read from a
                notebook-level global left over by the loading loop; it is now
                an explicit argument. The default of 22050 is the rate
                ``librosa.load`` resamples to when called without ``sr``, so
                ``GanDataset(wave_data)`` behaves as before.
        """
        self.data = []

        # Iterating the collection directly (instead of enumerate) lets tqdm
        # see its length and draw a real progress bar.
        for clip in tqdm(data):
            melspec = librosa.feature.melspectrogram(y=clip, sr=sr)
            # ref=np.max expresses each spectrogram relative to its own peak.
            melspec_db = librosa.power_to_db(melspec, ref=np.max)
            self.data.append(melspec_db)

    def __len__(self):
        """Return the number of stored spectrograms."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the spectrogram stored at position ``idx``."""
        return self.data[idx]

In [8]:
# Precompute the dB-scaled mel spectrogram of every clip in wave_data.
data = GanDataset(wave_data)

1590it [00:06, 246.48it/s]


In [9]:
# Serve the spectrograms in shuffled mini-batches of batch_size items.
data_loader = torch.utils.data.DataLoader(
    dataset=data,
    batch_size=batch_size,
    shuffle=True,
)

## Models

In [10]:
# Build the generator on the selected device (it was previously left on the
# CPU even when `device` pointed at a GPU).
net_G = Generator().to(device)
# Summarise with the configured noise size instead of a hard-coded 100, and
# tell torchsummary which device to build its dummy input on so that the
# input and the model always live in the same place.
summary(net_G, (z_dim,), device=device.type)

input:  torch.Size([2, 100])
0.005585908889770508
l1: torch.Size([2, 5632])
0.00016498565673828125
l1 -> reshape: torch.Size([2, 352, 16])
blk1-0: torch.Size([2, 352, 16])
0.045564889907836914
blk2-0 upscale: torch.Size([2, 1408, 4])
0.0009160041809082031
blk2-0: torch.Size([2, 1408, 4])
1.37642502784729
blk2-1 upscale: torch.Size([2, 5632, 1])
0.0011420249938964844
blk2-1: torch.Size([2, 5632, 1])
80.64254975318909
torch.Size([2, 1, 5632])
torch.Size([2, 1, 128, 44])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 5632]         568,832
         LayerNorm-2              [-1, 352, 16]              32
            Linear-3              [-1, 352, 64]           1,088
             Swish-4              [-1, 352, 64]               0
           Dropout-5              [-1, 352, 64]               0
            Linear-6              [-1, 352, 16]           1,040
           Dro

In [11]:
# Build the discriminator on the selected device (it was previously left on
# the CPU even when `device` pointed at a GPU).
net_D = Discriminator().to(device)
# Input is one mel spectrogram given as (channels, height, width) = (1, 128, 44).
# Passing the device explicitly keeps torchsummary's dummy input on the same
# device as the model.
summary(net_D, (1, 128, 44), device=device.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 1, 128, 44]              10
              ReLU-2           [-1, 1, 128, 44]               0
         MaxPool2d-3           [-1, 1, 128, 44]               0
            Linear-4                 [-1, 1024]       5,768,192
           Dropout-5                 [-1, 1024]               0
         LayerNorm-6               [-1, 4, 256]             512
            Linear-7              [-1, 4, 1024]         263,168
             Swish-8              [-1, 4, 1024]               0
           Dropout-9              [-1, 4, 1024]               0
           Linear-10               [-1, 4, 256]         262,400
          Dropout-11               [-1, 4, 256]               0
      FeedForward-12               [-1, 4, 256]               0
          PreNorm-13               [-1, 4, 256]               0
            Scale-14               [-1,

## Loss function and optimizer

In [12]:
# Binary cross-entropy between the discriminator output and the real/fake labels.
# NOTE(review): nn.BCELoss expects inputs in [0, 1], so this presumably relies
# on the Discriminator ending in a sigmoid — confirm in models/Discriminator.py.
loss_function = nn.BCELoss()

In [13]:
# Adam hyper-parameters shared by both networks.
beta1, beta2 = 0.5, 0.9
adam_kwargs = dict(lr=lr, betas=(beta1, beta2))

# One optimizer per network so each can be stepped independently.
optimizerD = optim.Adam(net_D.parameters(), **adam_kwargs)
optimizerG = optim.Adam(net_G.parameters(), **adam_kwargs)

## Training


In [14]:
# Per-iteration loss histories (two independent lists).
G_losses, D_losses = [], []
# Iteration counter.
ite = 0
# Target values for real and generated samples.
real_label, fake_label = 1, 0

In [15]:
# Noise vector to feed into G: a single z_dim-sized sample drawn uniformly from [-1, 1).
z = torch.empty(1, z_dim).uniform_(-1, 1)

In [None]:
# Discriminator training loop.
# NOTE(review): only the discriminator is updated here — optimizerG is never
# stepped in this cell, so the generator stays at its initial weights.
# Assumes net_D and net_G already live on `device`.
for epoch in range(n_epoch):
    # The loop variable is deliberately not called `data`, so the dataset
    # object bound to that name is not overwritten by a batch tensor.
    for ite, real_batch in enumerate(data_loader):
        print(ite)
        # training Discriminator
        # maximize log(D(x)) + log(1 - D(G(z)))

        # The last batch of an epoch can be smaller than batch_size, so every
        # tensor below is sized from the batch actually received.
        real_batch = real_batch.to(device)
        b_size = real_batch.size(0)

        ## Train with all-real batch
        net_D.zero_grad()
        labels = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        # unsqueeze(1) adds the channel axis in front of the spectrogram axes.
        output = net_D(real_batch.unsqueeze(1)).view(-1)
        err_D_real = loss_function(output, labels)
        err_D_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        noise = torch.randn(b_size, z_dim, device=device)
        fake = net_G(noise)
        labels.fill_(fake_label)
        # detach() keeps this backward pass from reaching the generator.
        output = net_D(fake.detach()).view(-1)
        err_D_fake = loss_function(output, labels)
        err_D_fake.backward()
        D_G_z1 = output.mean().item()

        # Gradients of the real and fake passes have accumulated; apply them.
        err_D = err_D_real + err_D_fake
        optimizerD.step()
        D_losses.append(err_D.item())

        # Output training stats
        print('[%d/%d][%d/%d]\tLoss_D: %.4f\tD(x): %.4f\t'
              % (epoch, n_epoch, ite, len(data_loader), err_D.item(), D_x))

0
input:  torch.Size([32, 100])
0.005950927734375
l1: torch.Size([32, 5632])
0.00016999244689941406
l1 -> reshape: torch.Size([32, 352, 16])
blk1-0: torch.Size([32, 352, 16])
0.6757700443267822
blk2-0 upscale: torch.Size([32, 1408, 4])
0.00484013557434082
blk2-0: torch.Size([32, 1408, 4])
39.72172021865845
blk2-1 upscale: torch.Size([32, 5632, 1])
0.008321046829223633
