In [11]:
import os
import librosa
import torch
from tqdm import tqdm
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

In [12]:
from google.colab import drive

# Make Google Drive visible to the Colab runtime under /content/drive.
drive.mount('/content/drive')

# Root folder whose entries are processed further down in the notebook.
directory_path = '/content/drive/My Drive/NLP/clips__test'

# Sanity check: how many entries sit directly under the root folder.
entry_count = len(os.listdir(directory_path))
print(entry_count)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
5


In [9]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# The feature extractor and the encoder are loaded from the same checkpoint.
checkpoint = "facebook/wav2vec2-xls-r-300m"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)
# output_hidden_states=True is needed because load_audio_files reads
# outputs.hidden_states[index] from the forward pass.
model = Wav2Vec2Model.from_pretrained(checkpoint, output_hidden_states=True)
model.to(device)  # place the weights on the device selected above

def load_audio_files(directory, layer_indices=(-1,)):
    """Extract hidden-state representations for every MP3 file in a folder.

    Each ``*.mp3`` file directly inside ``directory`` is read with
    ``librosa.load(..., sr=16000)``, run through the module-level
    ``feature_extractor`` and ``model`` on ``device``, and the hidden states
    of the requested layers are written with ``np.save`` to
    ``<directory>/layer_<index>/<stem>_layer_<index>.npy``.

    Args:
        directory: Folder containing the MP3 clips. The ``layer_<index>``
            output folders are created inside this same folder.
        layer_indices: Iterable of indices into ``outputs.hidden_states``
            selecting which layers to save. Defaults to the last entry only.

    Returns:
        None. All results are written to disk.
    """
    # Materialise once: the indices are reused for every file, so a one-shot
    # iterable (e.g. a generator) must not be exhausted after the first clip.
    layer_indices = tuple(layer_indices)

    # Filter before wrapping in tqdm so the progress bar counts only the
    # clips that are actually processed (not layer_* folders or other files).
    mp3_files = [name for name in os.listdir(directory) if name.endswith(".mp3")]

    for filename in tqdm(mp3_files):
        file_path = os.path.join(directory, filename)
        audio, sr = librosa.load(file_path, sr=16000)
        input_values = feature_extractor(
            audio, return_tensors="pt", sampling_rate=sr
        ).input_values
        input_values = input_values.to(device)

        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            outputs = model(input_values)

        stem = os.path.splitext(filename)[0]
        for index in layer_indices:
            hidden_states = outputs.hidden_states[index]
            # One sub-directory per layer inside the speaker directory.
            layer_dir = os.path.join(directory, f"layer_{index}")
            os.makedirs(layer_dir, exist_ok=True)
            save_path = os.path.join(layer_dir, f"{stem}_layer_{index}.npy")
            np.save(save_path, hidden_states.cpu().numpy())



# Run the extraction once per sub-directory of the root folder, saving the
# hidden states at indices 0, 5, 10, 15, 20 and 24.
for entry in os.listdir(directory_path):
    speaker_dir = os.path.join(directory_path, entry)
    # Skip stray regular files: load_audio_files calls os.listdir() on its
    # argument, which raises NotADirectoryError for a non-directory.
    if not os.path.isdir(speaker_dir):
        continue
    load_audio_files(speaker_dir, layer_indices=[0, 5, 10, 15, 20, 24])

Using device: cuda


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 364/364 [01:14<00:00,  4.89it/s]
100%|██████████| 362/362 [01:04<00:00,  5.59it/s]
100%|██████████| 358/358 [01:12<00:00,  4.97it/s]
100%|██████████| 346/346 [01:06<00:00,  5.17it/s]
100%|██████████| 356/356 [01:21<00:00,  4.36it/s]
100%|██████████| 367/367 [01:14<00:00,  4.95it/s]
100%|██████████| 351/351 [01:15<00:00,  4.62it/s]
100%|██████████| 366/366 [01:08<00:00,  5.37it/s]
100%|██████████| 354/354 [01:09<00:00,  5.09it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 62/62 [00:44<00:00,  1.39it/s]
0it [00:00, ?it/s]
0it [00:00,