In [66]:
import os

import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as transforms
from torch.utils.data import DataLoader, Dataset

In [67]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate variable-length feature tensors into one zero-padded batch.

    Args:
        batch: Iterable of ``(features, label)`` pairs. All feature tensors
            must agree on every dimension except the last one, which is
            treated as the variable-length axis.

    Returns:
        Tuple ``(inputs_padded, labels)``: the features right-padded with
        zeros along their last dimension up to the longest item and stacked
        on a new leading batch dimension, and the labels as a 1-D float
        tensor.

    Raises:
        ValueError: If ``batch`` contains no items.
    """
    batch = list(batch)
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    features, labels = zip(*batch)

    # F.pad with a 2-tuple always pads the LAST dimension, so the length has
    # to be measured on that same axis (index -1 rather than a hard-coded 2).
    max_len = max(feat.shape[-1] for feat in features)

    inputs_padded = torch.stack(
        [torch.nn.functional.pad(feat, (0, max_len - feat.shape[-1])) for feat in features]
    )

    labels = torch.tensor(labels, dtype=torch.float)

    return inputs_padded, labels

In [68]:
class VoiceDataset(Dataset):
    """Audio files converted to MFCC features with a binary "owner" label.

    A file is labelled 1.0 when its base name contains the substring
    "owner" and 0.0 otherwise.
    """

    def __init__(self, file_list, sample_rate=16000, n_mfcc=40):
        """Store the file list and build the MFCC transform.

        Args:
            file_list: Sequence of audio file paths.
            sample_rate: Sample rate in Hz the MFCC transform is configured
                for; files recorded at another rate are resampled to it.
            n_mfcc: Number of MFCC coefficients to compute.
        """
        self.file_list = file_list
        self.sample_rate = sample_rate
        self.transform = transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)

    def __len__(self):
        """Return the number of audio files."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one file and return ``(mfcc, label)``.

        Args:
            idx: Index into ``file_list``.

        Returns:
            Tuple of the MFCC tensor produced by the transform and a float
            label (1.0 for owner recordings, 0.0 otherwise).
        """
        filename = self.file_list[idx]
        waveform, sample_rate = torchaudio.load(filename)

        # Downmix multi-channel recordings so every item carries exactly one
        # channel (assumes torchaudio.load's default channels-first layout).
        if waveform.dim() > 1 and waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # The MFCC transform is configured for a fixed rate; feeding it audio
        # recorded at another rate would silently distort the features.
        if sample_rate != self.sample_rate:
            resample = transforms.Resample(orig_freq=sample_rate, new_freq=self.sample_rate)
            waveform = resample(waveform)

        mfcc = self.transform(waveform)

        # Match on the base name only, so a directory that happens to contain
        # "owner" cannot turn every sample into a positive.
        label = 1.0 if "owner" in os.path.basename(filename) else 0.0
        return mfcc, label

In [69]:
class VoiceAuthCNN(nn.Module):
    """Small 2-D CNN that produces one raw logit per sample.

    Three conv -> ReLU -> 2x2 max-pool stages feed an adaptive average pool
    that fixes the spatial size at 5x5, followed by a three-layer fully
    connected head. No sigmoid is applied to the output.
    """

    def __init__(self):
        super().__init__()

        # Convolutional stages; the first one takes a single input channel.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fixed 5x5 output, so the head's input width does not depend on the
        # spatial size of the incoming feature map.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((5, 5))

        # Classifier head: 64*5*5 -> 128 -> 64 -> 1.
        self.fc1 = nn.Linear(64 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Return logits with the trailing size-1 dimension squeezed away.

        Args:
            x: Input tensor; the first dimension is used as the batch axis
                when flattening, and ``conv1`` expects one input channel.
        """
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(torch.relu(conv(x)))

        pooled = self.adaptive_pool(x)

        # Flatten everything except the leading (batch) dimension.
        flat = pooled.view(pooled.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        hidden = torch.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return logits.squeeze(-1)

In [70]:
import os
import torch.optim as optim
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` in place and report the mean batch loss of each epoch.

    Args:
        model: Module to optimise; it is switched to training mode first.
        train_loader: Iterable yielding ``(inputs, labels)`` batches. It is
            iterated once per epoch and does not need to support ``len()``.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer that updates the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one float per epoch: the mean of that epoch's per-batch
        losses (the same number that is printed).

    Raises:
        ValueError: If ``train_loader`` yields no batches, instead of the
            division by zero the averaging would otherwise hit.
    """
    model.train()
    epoch_losses = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0  # counted manually so loaders without len() work too

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader produced no batches; nothing to train on")

        epoch_loss = running_loss / num_batches
        epoch_losses.append(epoch_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    return epoch_losses
        
# os.listdir returns entries in arbitrary order; sorting keeps the dataset
# indices stable across runs and machines.
file_list = sorted(
    os.path.join("recordings", f) for f in os.listdir("recordings") if f.endswith(".wav")
)
if not file_list:
    raise FileNotFoundError("No .wav files found in 'recordings'")

# Report the class balance up front: if only one class is present, the loss
# can drop to zero without the model learning to tell speakers apart.
num_owner = sum("owner" in os.path.basename(path) for path in file_list)
print(f"Found {len(file_list)} recordings: {num_owner} owner / {len(file_list) - num_owner} other")

dataset = VoiceDataset(file_list)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [71]:
# Seed PyTorch's global RNG so weight initialisation is repeatable on a fresh
# "Restart & Run All"; a DataLoader without its own generator also derives
# its shuffle seed from this RNG.
torch.manual_seed(42)

model = VoiceAuthCNN()
# BCEWithLogitsLoss applies the sigmoid itself, so it takes raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [72]:
# Run the full training schedule. The trailing semicolon keeps the notebook
# from echoing whatever the call returns.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=25,
);

Epoch [1/25], Loss: 0.4302
Epoch [2/25], Loss: 0.0038
Epoch [3/25], Loss: 0.0000
Epoch [4/25], Loss: 0.0000
Epoch [5/25], Loss: 0.0000
Epoch [6/25], Loss: 0.0000
Epoch [7/25], Loss: 0.0000
Epoch [8/25], Loss: 0.0000
Epoch [9/25], Loss: 0.0000
Epoch [10/25], Loss: 0.0000
Epoch [11/25], Loss: 0.0000
Epoch [12/25], Loss: 0.0000
Epoch [13/25], Loss: 0.0000
Epoch [14/25], Loss: 0.0000
Epoch [15/25], Loss: 0.0000
Epoch [16/25], Loss: 0.0000
Epoch [17/25], Loss: 0.0000
Epoch [18/25], Loss: 0.0000
Epoch [19/25], Loss: 0.0000
Epoch [20/25], Loss: 0.0000
Epoch [21/25], Loss: 0.0000
Epoch [22/25], Loss: 0.0000
Epoch [23/25], Loss: 0.0000
Epoch [24/25], Loss: 0.0000
Epoch [25/25], Loss: 0.0000
