In [115]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AudioNet(nn.Module):
    
    def __init__(self, target_output_size=8000, sample_rate=16000, context=1, num_additional_convolutions=3, out_channels=32, kernal_size=3):
        super(AudioNet, self).__init__()
        self.output_size = target_output_size
        self.input_size = target_output_size + (2 * context * target_output_size)

        self.sample_rate = sample_rate
        self.context = context
        self.num_additional_convolutions = num_additional_convolutions
        self.out_channels = out_channels
        self.kernal_size = kernal_size
        self.stride = 1
        self.padding = (kernal_size - 1) // 2

        sr = self.sample_rate
        b_kernel_size = int(10 * sample_rate//sr) + 1
        b_padding = (b_kernel_size - 1) // 2

        self.base_layers = nn.ModuleList()
        self.base_layers.append(nn.Conv1d(1, self.out_channels, kernel_size=b_kernel_size, stride=self.stride, padding=b_padding))
        while sr > 400:
            sr = sr // 2
            b_kernel_size = int(10 * sample_rate//sr) + 1
            b_padding = (b_kernel_size - 1) // 2
            self.base_layers.append(nn.Conv1d(1, self.out_channels, kernel_size=b_kernel_size, stride=self.stride, padding=b_padding))
        
        length_of_base_layers = len(self.base_layers)
        self.pool1 = nn.MaxPool1d(kernel_size=self.kernal_size, stride=length_of_base_layers, padding=0)
        

        self.deeper_layers = nn.Sequential()
        for _ in range(self.num_additional_convolutions):
            self.deeper_layers.append(nn.Conv1d(self.out_channels, self.out_channels, kernel_size=self.kernal_size, stride=self.stride, padding=self.padding))
            self.deeper_layers.append(nn.ReLU())

        self.batch_norm = nn.BatchNorm1d(self.out_channels)
        self.pool2 = nn.MaxPool1d(kernel_size=self.kernal_size, stride=num_additional_convolutions, padding=0)
        self.fc1 = nn.Linear(self.out_channels, 1)

    def forward(self, x):
        print(x.shape)
    
        x = [layer(x) for layer in self.base_layers]
        print([x_.shape for x_ in x])
        x = torch.cat(x, 0)
        print(x.shape)

        x = self.pool1(x.t())
        print(f"pool {x.shape}")
        x = x.t()
        print(f"pool {x.shape}")

        # deeper layers
        for layer in self.deeper_layers:
            print(layer)
            x = layer(x)
            print(x.shape)

        x = self.batch_norm(x.t())
        print(f"batch norm {x.shape}")

        # x = self.deeper_layers(x)
        print(x.shape)
        x = self.pool2(x.t())
        print(x.shape)

        x = self.fc1(x.t())
        print(x.shape)
        return x.t()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"using {device} device.")
torch.cuda.empty_cache()
net = AudioNet(kernal_size=5, out_channels=16).to(device)

print(net)
x = torch.rand(1, 24000).to(device)
output = net(x)
print(output)

using cuda device.
AudioNet(
  (base_layers): ModuleList(
    (0): Conv1d(1, 16, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): Conv1d(1, 16, kernel_size=(21,), stride=(1,), padding=(10,))
    (2): Conv1d(1, 16, kernel_size=(41,), stride=(1,), padding=(20,))
    (3): Conv1d(1, 16, kernel_size=(81,), stride=(1,), padding=(40,))
    (4): Conv1d(1, 16, kernel_size=(161,), stride=(1,), padding=(80,))
    (5): Conv1d(1, 16, kernel_size=(321,), stride=(1,), padding=(160,))
    (6): Conv1d(1, 16, kernel_size=(641,), stride=(1,), padding=(320,))
  )
  (pool1): MaxPool1d(kernel_size=5, stride=7, padding=0, dilation=1, ceil_mode=False)
  (deeper_layers): Sequential(
    (0): Conv1d(16, 16, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): ReLU()
    (2): Conv1d(16, 16, kernel_size=(5,), stride=(1,), padding=(2,))
    (3): ReLU()
    (4): Conv1d(16, 16, kernel_size=(5,), stride=(1,), padding=(2,))
    (5): ReLU()
  )
  (batch_norm): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True

In [129]:
print(torch.cuda.current_device())
print(torch.device(output.device))



0
cuda:0


In [None]:
from torch.utils import data

class AudioDataset(data.IterableDataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    
    def __iter__(self):
        for i in range(len(self.data)):
            yield self.data[i]

    
    
# Create a dataset
data = torch.randn(100, 8000)

audio_dataset = AudioDataset(data)

print(len(audio_dataset))

trainloader = torch.utils.data.DataLoader(audio_dataset)

for i, data in enumerate(trainloader, 0):
    print(data.shape)
    if i >= 2:
        break


100
torch.Size([1, 8000])
torch.Size([1, 8000])
torch.Size([1, 8000])


In [None]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001)
optimizer.zero_grad()
# print(list(net.parameters()))

# Training loop
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        data = data.to(device)
        print(f"{i} - {data.shape}")
        
        # forward + backward + optimize
        outputs = net(data)
        ideal_input = data[:,net.target_output_size:-net.target_output_size]
        loss = criterion(outputs, ideal_input)
        print(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # print statistics
        running_loss += loss.item()
        if i % 100:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

0 - torch.Size([1, 8000])


TypeError: 'Conv1d' object is not subscriptable