### Extracting embeddings

Importing packages.

In [1]:
import os
import librosa
import torch
from tqdm import tqdm
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

Pointing to the data directory on Google Drive (the drive must already be mounted in Colab).

In [5]:
# Dataset location on Google Drive as seen from Colab.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount call is visible here.
directory_path = '/content/drive/MyDrive/enhancing_speaker_recognition_evaluation/data'

# Sanity check: number of entries directly inside the data directory.
print(len(os.listdir(directory_path)))

3


In [10]:
# Local copy of the dataset under the user's home directory ("~" is expanded by expanduser).
directory_path = os.path.expanduser("~/Dokumente/NLP_daten_p3/all_speakers")

# Sanity check: number of entries directly inside the directory.
print(len(os.listdir(directory_path)))

139


Defining device.

In [8]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


Now we extract vector representations of the audio files from the individual layers of the wav2vec 2.0 (XLS-R) encoder and save them as `.npy` files, one sub-directory per layer.

In [11]:
# Feature extractor and encoder are loaded from the same pretrained checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-xls-r-300m"
)
# output_hidden_states=True so the forward pass exposes `outputs.hidden_states`,
# which the extraction code indexes per layer; the model is then moved to `device`.
model = Wav2Vec2Model.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    output_hidden_states=True,
).to(device)

def check_directories_exist(directory, layer_indices):
    """Report whether ``directory`` already holds a ``layer_<index>`` entry for every index.

    Returns True only if each path ``<directory>/layer_<index>`` exists; an empty
    ``layer_indices`` therefore yields True.
    """
    return all(
        os.path.exists(os.path.join(directory, f"layer_{layer}"))
        for layer in layer_indices
    )

def _save_array_atomically(save_path, array):
    """Write ``array`` to ``save_path`` without ever exposing a half-written file.

    The data goes to a sibling ``.tmp`` file first and is then renamed into
    place, so an interruption (e.g. stopping the cell) cannot leave a truncated
    ``.npy`` file under the final name.
    """
    tmp_path = f"{save_path}.tmp"
    try:
        # A file handle (not a path) keeps np.save from appending ".npy" to the temp name.
        with open(tmp_path, "wb") as handle:
            np.save(handle, array)
        os.replace(tmp_path, save_path)
    except BaseException:
        # BaseException so that KeyboardInterrupt also cleans up the partial temp file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def load_audio_files(directory, layer_indices=(-1,)):
    """Extract and save hidden-state representations for every MP3 file in ``directory``.

    Each ``*.mp3`` file is loaded at 16 kHz, passed through the module-level
    ``feature_extractor`` and ``model``, and for every requested layer the
    corresponding entry of ``outputs.hidden_states`` is stored as
    ``<directory>/layer_<index>/<file stem>_layer_<index>.npy``.

    Files whose outputs for all requested layers already exist are skipped, so
    an interrupted run can be resumed without recomputing finished files.
    NOTE(review): outputs written by an older, non-atomic version of this
    function could be truncated; delete them to force recomputation.

    Args:
        directory: Directory that contains the MP3 files (one speaker).
        layer_indices: Iterable of indices into ``outputs.hidden_states``;
            defaults to the last entry only.
    """
    # Materialise once: the indices are reused for every file, which a one-shot iterator cannot do.
    layer_indices = tuple(layer_indices)

    mp3_files = sorted(name for name in os.listdir(directory) if name.endswith(".mp3"))
    if not mp3_files:
        # Nothing to extract; do not create empty layer directories.
        return

    # One sub-directory per requested layer inside the speaker directory, created once up front.
    layer_dirs = {index: os.path.join(directory, f"layer_{index}") for index in layer_indices}
    for layer_dir in layer_dirs.values():
        os.makedirs(layer_dir, exist_ok=True)

    for filename in tqdm(mp3_files):
        stem = os.path.splitext(filename)[0]
        save_paths = {
            index: os.path.join(layer_dirs[index], f"{stem}_layer_{index}.npy")
            for index in layer_indices
        }
        # Resume support: skip the forward pass when every requested layer is already on disk.
        if all(os.path.exists(path) for path in save_paths.values()):
            continue

        file_path = os.path.join(directory, filename)
        audio, sr = librosa.load(file_path, sr=16000)
        input_values = feature_extractor(audio, return_tensors="pt", sampling_rate=sr).input_values
        input_values = input_values.to(device)
        with torch.no_grad():
            outputs = model(input_values)

        for index, save_path in save_paths.items():
            hidden_states = outputs.hidden_states[index]
            _save_array_atomically(save_path, hidden_states.cpu().numpy())

def _embeddings_complete(directory, layer_indices):
    """Return True when every MP3 in ``directory`` has a saved array for each layer.

    Looks for ``<directory>/layer_<index>/<file stem>_layer_<index>.npy``, the
    naming scheme used by ``load_audio_files``. A directory without MP3 files
    counts as complete because there is nothing to extract.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".mp3"):
            continue
        stem = os.path.splitext(filename)[0]
        for index in layer_indices:
            expected = os.path.join(directory, f"layer_{index}", f"{stem}_layer_{index}.npy")
            if not os.path.exists(expected):
                return False
    return True


def process_audio_directory(base_directory, layer_indices=range(25)):
    """Run the embedding extraction for every sub-directory that is not fully processed yet.

    A sub-directory is skipped only when each of its MP3 files already has an
    output file for every requested layer. Checking merely for the existence of
    the ``layer_<index>`` directories is not enough: they are all created while
    the first file is processed, so an interrupted run would otherwise be
    mistaken for a finished one and never resumed.

    Args:
        base_directory: Directory whose immediate sub-directories hold the MP3 files.
        layer_indices: Iterable of hidden-state indices to extract; the default
            ``range(25)`` selects indices 0-24 (presumably all hidden states of
            the loaded model — confirm against the checkpoint).
    """
    # Materialise once so one-shot iterables survive being reused for every sub-directory.
    layer_indices = tuple(layer_indices)
    for d in sorted(os.listdir(base_directory)):
        dir_path = os.path.join(base_directory, d)
        if os.path.isdir(dir_path) and not _embeddings_complete(dir_path, layer_indices):
            load_audio_files(dir_path, layer_indices)

# Root directory handed to process_audio_directory, which visits each sub-directory of it.
directory_path = os.path.expanduser("~/Dokumente/NLP_daten_p3/all_speakers")

# Run the extraction with the default layer_indices (range(25)).
process_audio_directory(directory_path)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 251/251 [03:17<00:00,  1.27it/s]
100%|██████████| 74/74 [01:13<00:00,  1.01it/s]
 92%|█████████▏| 159/173 [01:59<00:10,  1.33it/s]


KeyboardInterrupt: 