In [6]:
import os
import pandas as pd
import librosa
import torch
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import torch.nn as nn
import torch.optim as optim

In [7]:

# Root folder that holds the audio clips.
directory_path = '../clips__test/'

# Sanity check: report how many entries sit directly under the clip folder.
entry_count = len(os.listdir(directory_path))
print(entry_count)

91


In [16]:
!pip install --upgrade librosa

Collecting numpy!=1.22.0,!=1.22.1,!=1.22.2,>=1.20.3 (from librosa)
  Using cached numpy-1.20.3-cp39-cp39-win_amd64.whl.metadata (2.0 kB)
Using cached numpy-1.20.3-cp39-cp39-win_amd64.whl (13.7 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.20.3


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.3.0 requires daal==2021.2.3, which is not installed.
tensorflow-intel 2.16.1 requires numpy<2.0.0,>=1.23.5; python_version <= "3.11", but you have numpy 1.20.3 which is incompatible.


In [11]:
def _pad_and_stack(sequences):
    """Front-pad (time, feature) arrays with zeros to a common length and stack them into one array."""
    if not sequences:
        return np.array(sequences)
    max_len = max(seq.shape[0] for seq in sequences)
    batch = np.zeros((len(sequences), max_len) + sequences[0].shape[1:], dtype=sequences[0].dtype)
    for row, seq in enumerate(sequences):
        # Padding goes in front so the final time step of every row is a real frame.
        batch[row, max_len - seq.shape[0]:] = seq
    return batch


def load_audio_files(directory_path, metadata_file, layer_indices=(-1,)):
    """Load every audio file of every speaker in the metadata and extract model features.

    For each file, the hidden states of the requested layers are concatenated
    along the feature axis, giving one (time, feature) array per file.

    Relies on the module-level names `feature_extractor`, `model` and `device`
    being defined before the call.

    Args:
        directory_path: Folder containing one sub-folder per speaker, named by
            the speaker's `client_id`.
        metadata_file: Tab-separated file with a `client_id` column.
        layer_indices: Indices into `outputs.hidden_states` selecting the layers
            whose outputs are concatenated. Defaults to the last entry only.

    Returns:
        A tuple `(data, labels, problematic_files)`:
        data is an array of shape (num_files, max_time, total_features), where
        shorter sequences are zero-padded at the front; labels is an array with
        the speaker id of each file, aligned with `data`; problematic_files is a
        list of paths that failed to load or process.
    """
    metadata = pd.read_csv(metadata_file, sep="\t")
    data = []
    labels = []
    problematic_files = []
    # Visit each speaker folder once, even when client_id repeats across
    # metadata rows; otherwise every file would be loaded once per row.
    speaker_ids = metadata["client_id"].drop_duplicates()
    for speaker_id in tqdm(speaker_ids, total=len(speaker_ids)):
        speaker_folder = os.path.join(directory_path, speaker_id)
        for filename in os.listdir(speaker_folder):
            file_path = os.path.join(speaker_folder, filename)
            try:
                audio, sr = librosa.load(file_path, sr=16000)
                input_values = feature_extractor(audio, return_tensors="pt", sampling_rate=sr).input_values
                input_values = input_values.to(device)
                with torch.no_grad():
                    # hidden_states is only populated when explicitly requested.
                    outputs = model(input_values, output_hidden_states=True)
                # torch.cat needs a sequence of tensors: gather the selected
                # layers first, then join them along the feature axis.
                selected_layers = [outputs.hidden_states[layer_index] for layer_index in layer_indices]
                # squeeze(0) drops the batch axis of size 1 (one file per forward pass).
                features = torch.cat(selected_layers, dim=-1).squeeze(0).cpu().numpy()
            except Exception as e:
                print(f"Error loading file: {file_path}, {e}")
                problematic_files.append(file_path)
                continue
            # Exactly one sample and one label per successfully processed file.
            data.append(features)
            labels.append(speaker_id)
    return _pad_and_stack(data), np.array(labels), problematic_files

In [None]:

# Directory path where audio files are stored
directory_path = '../clips__test/'
metadata_file = "../test.tsv"
# NOTE(review): load_audio_files reads the globals `feature_extractor`, `model`
# and `device`; they are not defined in the cells visible here -- make sure a
# cell above creates them before this call.
# NOTE(review): index 24 presumes the model exposes at least 25 hidden states --
# confirm against the loaded checkpoint.
data, labels, problematic_files = load_audio_files(directory_path, metadata_file, layer_indices=[0, 5, 10, 15, 20, 24])

# Convert labels to one-hot encoding.
# `labels` holds speaker ids (the folder names), which cannot index np.eye
# directly; map each id to an integer class index first.
class_names, label_indices = np.unique(labels, return_inverse=True)
num_classes = len(class_names)
labels_one_hot = np.eye(num_classes)[label_indices]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels_one_hot, test_size=0.2, random_state=42)


In [17]:
# Define the RNN model using PyTorch
class RNN(nn.Module):
    """Three stacked single-layer LSTMs followed by a linear classifier.

    The class scores are computed from the output of the third LSTM at the
    last time step of each sequence.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Hidden size shared by all three LSTM layers.
        num_classes: Number of output scores produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.lstm3 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for x of shape (batch, time, input_size)."""
        # No explicit (h0, c0): nn.LSTM starts from zero states when none are
        # given, so the module no longer depends on the globals `hidden_size`
        # and `device` and works for any configuration and input device.
        out, _ = self.lstm1(x)
        out, _ = self.lstm2(out)
        out, _ = self.lstm3(out)

        # Decode the hidden state of the last time step
        return self.fc(out[:, -1, :])

# Hyperparameters
input_size = X_train.shape[2]  # Input size is the size of concatenated hidden states
hidden_size = 128
learning_rate = 0.001
num_epochs = 10

# Initialize the model, loss function, and optimizer
# NOTE(review): this rebinds the global `model`, which load_audio_files also
# reads for feature extraction; re-running the data-loading cell after this one
# would call the RNN on raw audio. Consider a distinct name for the classifier.
model = RNN(input_size, hidden_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Convert the splits to tensors once instead of on every epoch. float32 matches
# the dtype of the LSTM parameters; CrossEntropyLoss takes class indices, so the
# one-hot rows are turned back into indices. Separate target names keep the
# `labels` array from the data-loading cell intact.
train_inputs = torch.tensor(X_train, dtype=torch.float32).to(device)
train_targets = torch.tensor(y_train).argmax(dim=1).to(device)
test_inputs = torch.tensor(X_test, dtype=torch.float32).to(device)
test_targets = torch.tensor(y_test).argmax(dim=1).to(device)

# Train the model (full-batch: one optimizer step per epoch)
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_inputs)
    loss = criterion(outputs, train_targets)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}')

# Test the model
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predicted = outputs.argmax(dim=1)
    total = test_targets.size(0)
    correct = (predicted == test_targets).sum().item()
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy}')

NameError: name 'X_train' is not defined