In [1]:
import os
import glob
import numpy as np
from tqdm import tqdm
from preprocessing import preprocessing

In [2]:
sample_data_repo = os.path.join('.', 'wav_data', 'pretrain')

samples_data = glob.glob(os.path.join(sample_data_repo, '**', '*wav'), recursive=True)
samples_data = sorted(samples_data)
len(samples_data)

2880

In [3]:
np.random.seed(42)
idx = np.random.permutation(len(samples_data))
train_idx = idx[:int(len(samples_data)*0.8)]
eval_idx = idx[int(len(samples_data)*0.8):]

In [4]:
train_samples = list(np.array(samples_data)[train_idx])
eval_samples = list(np.array(samples_data)[eval_idx])

In [5]:
len(train_samples), len(eval_samples)

(2304, 576)

In [6]:
concat_train_tensors = []

for data_dir in tqdm(train_samples):
    concat_tensor = preprocessing(data_dir, method='mfcc', sr=22050)
    concat_train_tensors.append(concat_tensor)

100%|██████████| 2304/2304 [06:41<00:00,  5.74it/s]


In [7]:
X_train = np.concatenate(np.array(concat_train_tensors), axis=0)
X_train.shape

(14810, 128, 100, 1)

In [8]:
concat_eval_tensors = []

for data_dir in tqdm(eval_samples):
    concat_tensor = preprocessing(data_dir, method='mfcc', sr=22050)
    concat_eval_tensors.append(concat_tensor)

100%|██████████| 576/576 [01:40<00:00,  5.76it/s]


In [9]:
X_eval = np.concatenate(np.array(concat_eval_tensors), axis=0)
X_eval.shape

(3688, 128, 100, 1)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
BATCH_SIZE = 512
EPOCHS = 100
lr = 0.001

In [12]:
X_train = torch.tensor(X_train, device=device).float()
X_eval = torch.tensor(X_eval, device=device).float()
X_train = X_train.permute(0, 3, 1, 2)
X_eval = X_eval.permute(0, 3, 1, 2)

X_train.shape, X_eval.shape

(torch.Size([14810, 1, 128, 100]), torch.Size([3688, 1, 128, 100]))

In [13]:
train_dataloader = DataLoader(X_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True)
eval_dataloader = DataLoader(X_eval, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)

In [14]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, 5, padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Conv2d(16, 16, 5, padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.MaxPool2d(2, 2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, 5, padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, (5, 4), padding=0),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU()
        )
        
    def forward(self, x):
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out = out.reshape(x.shape[0], -1)
        return out

In [15]:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.conv_trans1 = nn.Sequential(
            nn.ConvTranspose2d(64, 64, 3, 1, 1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.ConvTranspose2d(64, 64, (5, 4), 2, 0, 1),
            nn.ReLU(),
            nn.BatchNorm2d(64)
        )
        self.conv_trans2 = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 5, 2, 0, 1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.ConvTranspose2d(32, 32, 5, 1, 0),
            nn.ReLU(),
            nn.BatchNorm2d(32)
        )
        self.conv_trans3 = nn.Sequential(
            nn.ConvTranspose2d(32, 16, 5, 2, 0, 1),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.ConvTranspose2d(16, 1, 5, 1, 0),
        )
        
    def forward(self, x):
        out = x.reshape(x.shape[0], 64, 11, 8)
        out = self.conv_trans1(out)
        out = self.conv_trans2(out)
        out = self.conv_trans3(out)
        return out

In [16]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        
    def forward(self, x):
        z = self.encoder(x)
        out = self.decoder(z)
        return out

In [17]:
model = AutoEncoder()
model = torch.nn.DataParallel(model)
model.cuda()

DataParallel(
  (module): AutoEncoder(
    (encoder): Encoder(
      (conv1): Sequential(
        (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1))
        (1): ReLU()
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (3): Conv2d(16, 16, kernel_size=(5, 5), stride=(1, 1))
        (4): ReLU()
        (5): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      )
      (conv2): Sequential(
        (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
        (1): ReLU()
        (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (3): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1))
        (4): ReLU()
        (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=Fal

In [18]:
loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [19]:
def train(train_dataloader, eval_dataloader, epochs):
    for epoch in range(epochs):
        # train
        train_loss = 0
        nb_train_steps = 0

        for x_batch in train_dataloader:
            optimizer.zero_grad()
            
            outputs = model(x_batch)
            
            loss = loss_func(outputs, x_batch)
            loss.backward()
            train_loss += loss.mean().item()
            nb_train_steps += 1
            
            # scheduler.step()
            optimizer.step()
        
        train_loss = train_loss / nb_train_steps
             
            
        # evaluate      
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        
        for x_batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(x_batch)
            
            tmp_eval_loss = loss_func(outputs, x_batch)
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            
        eval_loss = eval_loss / nb_eval_steps
            
        
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        print('epoch: {:3d},    lr={:6f},    loss={:5f},    eval_loss={:5f}'
              .format(epoch+1, lr, train_loss, eval_loss))

In [20]:
train(train_dataloader, eval_dataloader, EPOCHS)

epoch:   1,    lr=0.001000,    loss=0.086038,    eval_loss=0.030105
epoch:   2,    lr=0.001000,    loss=0.024615,    eval_loss=0.028986
epoch:   3,    lr=0.001000,    loss=0.023811,    eval_loss=0.028139
epoch:   4,    lr=0.001000,    loss=0.023147,    eval_loss=0.027346
epoch:   5,    lr=0.001000,    loss=0.022421,    eval_loss=0.026624
epoch:   6,    lr=0.001000,    loss=0.021882,    eval_loss=0.026023
epoch:   7,    lr=0.001000,    loss=0.021431,    eval_loss=0.025444
epoch:   8,    lr=0.001000,    loss=0.020966,    eval_loss=0.024913
epoch:   9,    lr=0.001000,    loss=0.020562,    eval_loss=0.024386
epoch:  10,    lr=0.001000,    loss=0.020084,    eval_loss=0.023869
epoch:  11,    lr=0.001000,    loss=0.019670,    eval_loss=0.023347
epoch:  12,    lr=0.001000,    loss=0.019177,    eval_loss=0.022893
epoch:  13,    lr=0.001000,    loss=0.018764,    eval_loss=0.022351
epoch:  14,    lr=0.001000,    loss=0.018360,    eval_loss=0.021923
epoch:  15,    lr=0.001000,    loss=0.018075,   