In [1]:
import glob
import time

import numpy as np
import librosa
import torch
import torch.optim as optim
import torch.nn as nn
from torchsummary import summary
from tqdm import tqdm
from IPython.display import Audio

from models.Generator import Generator
from models.Discriminator import Discriminator

In [2]:
data_path = "./data/piano/**/*.wav"
batch_size = 16
z_dim = 100
n_epoch = 100
lr = 0.0001
sampling_rate = 16000
D_learn_perG_lean = 5
generate_sounds_interval = 1

In [3]:
path_list = []
for path in glob.glob(data_path, recursive=True):
    path_list.append(path)
    # print(path)

In [4]:
# 各ファイルの時間を確認
# s = 0
# for path in path_list:
#     y, sr = librosa.load(path)
#     sec = len(y)/sr
#     n = int(sec/generate_sounds_interval)
#     s += n
#     print("{}: {}[sec], {}".format(path, sec, n))
# print(s)

In [5]:
# データをgenerate_sounds_interval秒単位に分割
wave_data = []
# labels = []
for path in tqdm(path_list):
#     label = path.split('/')[2]
    raw_wave, sr = librosa.load(path)
    for i in range(0, len(raw_wave), generate_sounds_interval*sr):
        # 時間が足りないものは弾く
        if i+generate_sounds_interval*sr > len(raw_wave):
            continue
        wave_data.append(raw_wave[i:i+generate_sounds_interval*sr])
#         labels.append(label)

100%|██████████| 19/19 [01:43<00:00,  5.47s/it]


In [6]:
shape = wave_data[0].shape
for i, w in enumerate(wave_data):
    if w.shape != shape:
        print(i)

In [9]:
class MyData(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = []
        self.labels = []
        
        assert len(data) == len(labels)
        
        for i, d in tqdm(enumerate(data)):
            melspec = librosa.feature.melspectrogram(y=d, sr=sr)
            melspec_db = librosa.power_to_db(melspec, ref=np.max)
            self.data.append(melspec_db)
            self.labels.append(float(labels[i]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [10]:
labels = np.random.randint(0, 2, size = len(wave_data))
data = MyData(wave_data, labels)

1590it [00:07, 223.53it/s]


In [11]:
data_loader = torch.utils.data.DataLoader(data, batch_size=32, shuffle=True)

In [12]:
net_D = Discriminator()
# loss_function = nn.CrossEntropyLoss()
loss_function = nn.BCELoss()
optimizer = optim.Adam(net_D.parameters(), lr=lr)

In [13]:
summary(net_D, (1, 128, 44))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 1, 128, 44]              10
              ReLU-2           [-1, 1, 128, 44]               0
         MaxPool2d-3           [-1, 1, 128, 44]               0
            Linear-4                 [-1, 1024]       5,768,192
           Dropout-5                 [-1, 1024]               0
         LayerNorm-6               [-1, 4, 256]             512
            Linear-7              [-1, 4, 1024]         263,168
             Swish-8              [-1, 4, 1024]               0
           Dropout-9              [-1, 4, 1024]               0
           Linear-10               [-1, 4, 256]         262,400
          Dropout-11               [-1, 4, 256]               0
      FeedForward-12               [-1, 4, 256]               0
          PreNorm-13               [-1, 4, 256]               0
            Scale-14               [-1,

In [14]:
losses = []
for epoch in range(n_epoch):
    # training
    train_losses = 0
    for d in data_loader:
        optimizer.zero_grad()
        x, y = d
        x = x.unsqueeze(1)
        output = net_D(x).view(-1)
        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()
        train_losses += loss.item()
        
    # validation
    test_losses = 0
    actual_list, pred_list = [], []
    for data in data_loader:
        with torch.no_grad():
            x, y = data
            x = x.unsqueeze(1)
            output = net_D(x).view(-1)
            loss = loss_function(output, y)
            _, y_pred = torch.max(output, 1)
            test_losses += loss.item()

            actual_list.append(y.cpu().numpy())
            pred_list.append(y_pred.cpu().numpy())
    
    actual_list = np.concatenate(actual_list)
    pred_list = np.concatenate(pred_list)
    accuracy = np.mean(actual_list == pred_list)

    print("epoch", epoch, "\t train_loss", train_losses, "\t test_loss", test_losses, "\t accuracy", accuracy)

RuntimeError: Found dtype Double but expected Float

In [18]:
output

tensor([0.5336, 0.4634, 0.5175, 0.5143, 0.5132, 0.4794, 0.4304, 0.4368, 0.4511,
        0.5093, 0.6383, 0.5524, 0.4577, 0.4327, 0.5663, 0.3983, 0.4929, 0.5776,
        0.2837, 0.3752, 0.5219, 0.5273, 0.6529, 0.5802, 0.6046, 0.5144, 0.5148,
        0.4257, 0.3903, 0.5113, 0.4560, 0.4466], grad_fn=<ViewBackward>)

In [19]:
y

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0.],
       dtype=torch.float64)