In [2]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchvision
import tqdm
import tqdm.notebook
from sklearn.metrics import roc_curve
from torch import nn
from torch.utils.data import Dataset
from torchvision.models import resnet18

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Using {device}")

Using cuda


In [3]:
def pad_random(x: np.ndarray, max_len: int = 64000) -> np.ndarray:
    """Return a window of exactly ``max_len`` samples taken from ``x``.

    * Longer inputs are cropped at a random offset.
    * Inputs that already have ``max_len`` samples are returned unchanged.
    * Shorter inputs are repeated end-to-end until they exceed ``max_len``
      and a random window is then cut from the repeated signal.

    Args:
        x: Array whose first axis is the sample axis (the tiling step
            assumes a 1-D signal).
        max_len: Number of samples in the returned window.

    Returns:
        An array with ``max_len`` entries along the first axis.

    Raises:
        ValueError: If ``x`` is empty, because there is nothing to repeat.
    """
    x_len = x.shape[0]
    if x_len == 0:
        raise ValueError("pad_random() needs at least one sample, got an empty array")

    if x_len == max_len:
        # Already the requested size: tiling and re-cropping would only
        # rotate the clip and insert an artificial seam.
        return x

    if x_len < max_len:
        # One repeat more than strictly needed, so the tiled signal is
        # always longer than max_len and the random offset below is valid.
        x = np.tile(x, max_len // x_len + 1)
        x_len = x.shape[0]

    stt = np.random.randint(x_len - max_len)
    return x[stt:stt + max_len]

class Singfake(Dataset):
    """
    Dataset class for the Singfake dataset.

    Each partition is described by ``<base_dir>/<partition>.csv``, which must
    provide a ``file_path`` column and a ``label`` column. Every item is one
    audio file turned into a frame-wise RMS-energy feature.

    Attributes:
        base_dir: Root directory of the dataset. Relative ``file_path``
            entries are resolved against it; absolute entries are used as-is
            (``os.path.join`` semantics).
        csv_path: Path of the CSV index that was read for this partition.
        partition: Name of the split served by this instance.
        max_len: Number of waveform samples kept per clip.
        transforms: A torchvision resize pipeline. It is built here but not
            applied anywhere in this class.
        file_paths: Values of the CSV ``file_path`` column.
        label: Values of the CSV ``label`` column.
    """
    def __init__(self, base_dir, partition="train", max_len=64000):
        assert partition in ["train", "val", "test_t01", "test_t02", "test_t04"], "Invalid partition. Must be one of ['train', 'val', 'test_t01', 'test_t02', 'test_t04']"
        # Keep the dataset root and the CSV index in separate attributes:
        # the root is needed again in __getitem__ to locate the audio files.
        self.base_dir = base_dir
        self.partition = partition
        self.csv_path = os.path.join(base_dir, partition + ".csv")
        self.max_len = max_len

        self.transforms = torchvision.transforms.Compose([torchvision.transforms.Resize((224,224))])

        df = pd.read_csv(self.csv_path)
        self.file_paths = list(df["file_path"])
        self.label = list(df["label"])

    def __len__(self):
        """Return the number of rows in the partition's CSV index."""
        return len(self.file_paths)

    def __getitem__(self, index):
        """Load one clip and return ``(features, label, file_path)``.

        The waveform is loaded as 16 kHz mono, cropped or looped to
        ``max_len`` samples, normalised, and reduced to RMS energy with a
        320-sample frame and a 160-sample hop. ``features`` is the resulting
        float32 tensor, ``label`` is 1 for ``"bonafide"`` and 0 for any other
        CSV value, and ``file_path`` is the CSV entry unchanged.

        Returns ``None`` (after printing the error) when anything in the
        loading pipeline raises. PyTorch's default collate function cannot
        batch ``None``, so a ``DataLoader`` over this dataset needs a
        ``collate_fn`` that drops such entries.
        """
        file_path = self.file_paths[index]
        bonafide_or_spoof = self.label[index]
        label = 1 if bonafide_or_spoof == "bonafide" else 0
        try:
            x, _ = librosa.load(os.path.join(self.base_dir, file_path), sr=16000, mono=True)
            x = pad_random(x, self.max_len)
            x = librosa.util.normalize(x)
            x = librosa.feature.rms(y=x, hop_length=160, frame_length=320)
            return torch.tensor(x, dtype=torch.float32), label, file_path

        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None

In [4]:
# Folder that holds one "<partition>.csv" index per split.
DATASET_ROOT = '/content/drive/MyDrive/Singfake/Labeled_dataset'

# Build every split in one pass instead of repeating the call five times.
train_ds, val_ds, t01_ds, t02_ds, t04_ds = [
    Singfake(DATASET_ROOT, partition=split)
    for split in ('train', 'val', 'test_t01', 'test_t02', 'test_t04')
]

In [5]:
# Number of clips in the training split and in the T01 test split, one per line.
print(len(train_ds), len(t01_ds), sep="\n")

84404
43625


In [6]:
def _collate_skip_failed(batch):
    """Collate a batch after dropping samples that could not be loaded.

    The dataset's ``__getitem__`` returns ``None`` when reading a file fails,
    and the default collate function raises on ``None`` entries, so one bad
    file would otherwise abort the whole epoch.

    Raises:
        RuntimeError: If no sample in the batch could be loaded.
    """
    usable = [sample for sample in batch if sample is not None]
    if not usable:
        raise RuntimeError("Every sample in this batch failed to load")
    return torch.utils.data.default_collate(usable)


# Shuffle the training split so batches are not tied to the CSV row order;
# the evaluation split keeps its order.
train_loader = torch.utils.data.DataLoader(
    train_ds, batch_size=16, shuffle=True, collate_fn=_collate_skip_failed
)
test_loader = torch.utils.data.DataLoader(
    t01_ds, batch_size=16, collate_fn=_collate_skip_failed
)

In [7]:
# Shape of the feature tensor of the first training sample.
first_features = train_ds[0][0]
print(first_features.shape)

torch.Size([1, 401])


In [8]:
class auditory_conv1d(nn.Module):
    """1-D CNN that maps a single-channel feature sequence to one logit.

    Five ``Conv1d -> ReLU -> MaxPool1d`` stages are followed by dropout,
    flattening and a stack of fully connected layers ending in one output
    unit. The first ``Linear`` layer expects 640 flattened features, which is
    what a length-401 input produces (32 channels x 20 steps).

    All layers live in one ``nn.Sequential`` named ``input_embedding``, in a
    fixed order, so parameter names in the ``state_dict`` depend on the layer
    positions below.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, conv kernel size, pooling stride)
        conv_stages = [
            (1, 256, 5, 2),
            (256, 256, 4, 1),
            (256, 128, 4, 2),
            (128, 64, 4, 2),
            (64, 32, 5, 2),
        ]
        layers = []
        for c_in, c_out, kernel, pool_stride in conv_stages:
            layers.append(nn.Conv1d(in_channels=c_in, out_channels=c_out, stride=1, kernel_size=kernel))
            layers.append(nn.ReLU())
            layers.append(nn.MaxPool1d(kernel_size=5, stride=pool_stride, padding=1))

        layers.extend([nn.Dropout(p=0.2), nn.Flatten(), nn.ReLU()])

        # Classifier head: 640 -> 512 (with dropout) -> 256 -> 128 -> 64 -> 32 -> 1.
        layers.extend([
            nn.Linear(in_features=640, out_features=512, bias=True),
            nn.ReLU(),
            nn.Dropout(p=0.3),
        ])
        for f_in, f_out in [(512, 256), (256, 128), (128, 64), (64, 32)]:
            layers.append(nn.Linear(in_features=f_in, out_features=f_out, bias=True))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(in_features=32, out_features=1, bias=True))

        self.input_embedding = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the raw logits."""
        return self.input_embedding(x)


class AudioLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the linear layer's output for the final time step of ``x``.

        ``x`` is batch-first; its first dimension sizes the initial states.
        """
        # Hidden and cell states both start at zero, on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_state = tuple(torch.zeros(state_shape).to(x.device) for _ in range(2))

        sequence_out, _ = self.lstm(x, initial_state)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [10]:
# Hyper-parameters of the LSTM baseline.
input_size = 401   # length of one feature row, as in the torch.Size([1, 401]) sample printed earlier
hidden_size = 64
num_layers = 2
num_classes = 1    # one output logit per clip

model1 = auditory_conv1d().to(device)
model2 = AudioLSTM(input_size, hidden_size, num_layers, num_classes).to(device)

# Smoke test: push one training batch through both models and show the output shapes.
test = next(iter(train_loader))[0].to(device)

with torch.inference_mode():
    print(*(net(test).shape for net in (model1, model2)), sep="\n")

torch.Size([16, 1])
torch.Size([16, 1])


In [11]:
def accuracy_fn(logits, true):
    """Return the fraction of rows in ``logits`` whose prediction equals ``true``.

    Two output layouts are supported:

    * Single-logit binary output (shape ``(N, 1)`` or ``(N,)``): the
      prediction is 1 when the logit is positive, i.e. when its sigmoid
      exceeds 0.5. Taking a softmax/argmax over a single column would always
      predict class 0.
    * Multi-class output (shape ``(N, C)`` with ``C > 1``): the prediction is
      the argmax over the class dimension. Softmax is monotonic, so it is not
      needed to find the argmax.

    Args:
        logits: Raw model outputs.
        true: Ground-truth labels, one per row of ``logits``.

    Returns:
        Accuracy as a Python float in ``[0, 1]``.
    """
    if logits.dim() == 1 or logits.shape[1] == 1:
        preds = (logits.reshape(-1) > 0).long()
    else:
        preds = torch.argmax(logits, dim=1)
    # Flatten the targets so an (N, 1) label tensor is compared element-wise
    # instead of being broadcast against the (N,) predictions.
    return torch.eq(preds, true.reshape(-1)).sum().item() / len(logits)

In [12]:
def _equal_error_rate(labels, scores):
    """Return ``(eer, threshold)`` at the ROC point where FPR and FNR are closest."""
    fpr, tpr, thresholds = roc_curve(labels, scores)
    fnr = 1 - tpr
    idx = np.nanargmin(np.absolute(fnr - fpr))
    return fpr[idx], thresholds[idx]


def train_model(model, epochs):
    """Train ``model`` with Adam and BCE-with-logits loss, evaluating after every epoch.

    Batches come from the module-level ``train_loader``. After each epoch the
    model is evaluated on the module-level ``test_loader``, and the mean
    training loss, mean evaluation loss (printed as "Val Loss"), equal error
    rate and its threshold are printed.

    Args:
        model: Network that produces one logit per sample; it is moved to
            ``device`` and updated in place.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Results are only printed.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters())
    loss_fn = nn.BCEWithLogitsLoss()

    for i in tqdm.notebook.tqdm(range(epochs)):
        print("\nTraining:")
        model.train()
        temp_train_loss = []

        for x, y, _ in tqdm.notebook.tqdm(train_loader):
            x = x.to(device)
            y = y.to(device).type(torch.float32)

            optimizer.zero_grad()
            # reshape(-1) instead of squeeze(): squeeze() would also drop the
            # batch dimension of a single-sample batch, and the resulting
            # 0-d logits no longer match the shape of `y`.
            logits = model(x).reshape(-1)
            loss = loss_fn(logits, y)
            loss.backward()
            optimizer.step()

            temp_train_loss.append(loss.item())

        net_train_loss = sum(temp_train_loss) / len(temp_train_loss)

        print("Testing:")
        model.eval()
        temp_val_loss = []
        val_preds = []
        val_actual = []

        with torch.inference_mode():
            for x, y, _ in tqdm.notebook.tqdm(test_loader):
                x = x.to(device)
                y = y.to(device).type(torch.float32)

                logits = model(x).reshape(-1)
                temp_val_loss.append(loss_fn(logits, y).item())
                val_preds.append(torch.sigmoid(logits))
                val_actual.append(y)

            # Concatenate once per epoch rather than growing a tensor every batch.
            val_scores = torch.cat(val_preds).cpu().numpy()
            val_labels = torch.cat(val_actual).cpu().numpy()

        net_val_loss = sum(temp_val_loss) / len(temp_val_loss)

        eer, eer_threshold = _equal_error_rate(val_labels, val_scores)

        print(f"\nEpoch {i+1}:\nTrain Loss: {net_train_loss}\nVal Loss: {net_val_loss}\nEER: {eer}\nEER Threshold: {eer_threshold}\n")


In [13]:
# Train the 1-D CNN for 100 epochs.
train_model(model=model1, epochs=100)

  0%|          | 0/100 [00:00<?, ?it/s]


Training:


  0%|          | 0/5276 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# `model` exists only as a local name inside train_model; the network trained
# in this notebook is `model1`, so that is the one whose weights are saved.
torch.save(model1.state_dict(), 'epoch-100.pt')