### Extracting embeddings

Importing packages.

In [2]:
import os
import librosa
import torch
from tqdm import tqdm
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

Mounting the data from drive

In [5]:
directory_path = '/content/drive/MyDrive/enhancing_speaker_recognition_evaluation/data'

print(len(os.listdir(directory_path)))

3


In [7]:
directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers")

print(len(os.listdir(directory_path)))

58


Defining device.

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Now we're extracting the vector representations of the audio files in different stages of the encoder.

In [6]:
import os
import librosa
import torch
from tqdm import tqdm
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m", output_hidden_states=True)
model.to(device)

def check_directories_exist(directory, layer_indices):
    """Prüft, ob die benötigten Verzeichnisse für jede Schicht bereits existieren."""
    all_exist = True
    for index in layer_indices:
        layer_dir = os.path.join(directory, f"layer_{index}")
        if not os.path.exists(layer_dir):
            all_exist = False
            break
    return all_exist

def load_audio_files(directory, layer_indices=[-1]):
    """Lädt alle MP3-Dateien im angegebenen Verzeichnis und extrahiert die Repräsentationen aus den spezifizierten Schichten."""
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(directory, filename)
            audio, sr = librosa.load(file_path, sr=16000)
            input_values = feature_extractor(audio, return_tensors="pt", sampling_rate=sr).input_values
            input_values = input_values.to(device)
            with torch.no_grad():
                outputs = model(input_values)
                for index in layer_indices:
                    hidden_states = outputs.hidden_states[index]
                    # creating sub directory for each layer in speaker directory
                    layer_dir = os.path.join(directory, f"layer_{index}")
                    os.makedirs(layer_dir, exist_ok=True)
                    save_path = os.path.join(layer_dir, f"{os.path.splitext(filename)[0]}_layer_{index}.npy")
                    np.save(save_path, hidden_states.cpu().numpy())

def process_audio_directory(base_directory, layer_indices=range(25)):
    """Verarbeitet Audio-Dateien in den angegebenen Verzeichnissen, falls die Ziellayer-Verzeichnisse noch nicht existieren."""
    for d in os.listdir(base_directory):
        dir_path = os.path.join(base_directory, d)
        if os.path.isdir(dir_path) and not check_directories_exist(dir_path, layer_indices):
            load_audio_files(dir_path, layer_indices)

directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_xls_r_300m")

process_audio_directory(directory_path)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return F.conv1d(input, weight, bias, self.stride,
100%|██████████| 50/50 [00:10<00:00,  4.94it/s]
100%|██████████| 50/50 [00:04<00:00, 12.24it/s]
100%|██████████| 50/50 [00:03<00:00, 13.75it/s]
100%|██████████| 50/50 [00:03<00:00, 14.41it/s]
100%|██████████| 50/50 [00:03<00:00, 14.84it/s]
100%|██████████| 50/50 [00:03<00:00, 14.47it/s]
100%|██████████| 50/50 [00:03<00:00, 15.86it/s]
100%|██████████| 50/50 [00:04<00:00, 11.87it/s]
100%|██████████| 50/50 [00:03<00:00, 12.56it/s]
100%|██████████| 50/50 [00:03<00:00, 12.90it/s]
100%|██████████| 50/50 [00:04<00:00, 11.81it/s]
100%|██████████| 50/50 [

# fine tuning von XLS R

In [None]:
import torch
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Trainer, TrainingArguments
from transformers import Wav2Vec2FeatureExtractor
from torch.nn.functional import cross_entropy

# Load dataset
dataset = load_dataset("voxceleb1")

# Prepare feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")

# Define dataset preprocessing
def prepare_dataset(batch):
    # Process audio files
    audio = batch["audio"]
    inputs = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    batch["input_values"] = inputs.input_values.squeeze(0)
    batch["labels"] = batch["speaker_id"]
    return batch

# Apply preprocessing
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], batch_size=8, num_proc=4, batched=True)

# Model
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xls-r-300m", num_labels=dataset["train"].features["speaker_id"].num_classes)

# Define Training Arguments
training_args = TrainingArguments(
  output_dir="./results",
  group_by_length=True,
  per_device_train_batch_shift_size=16,
  evaluation_strategy="steps",
  num_train_epochs=3,
  save_steps=500,
  eval_steps=500,
  logging_steps=10,
  learning_rate=1e-4,
  save_total_limit=2,
)

# Trainer with a custom compute_loss function
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = cross_entropy(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Define Trainer
trainer = CustomTrainer(
  model=model,
  args=training_args,
  train_dataset=dataset["train"],
  eval_dataset=dataset["test"],
  tokenizer=feature_extractor,
)

# Start training
trainer.train()


fine tuning on our dataset

In [3]:
import os
import sys
import torch
import librosa
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Trainer, TrainingArguments, TrainerCallback, Wav2Vec2FeatureExtractor
import math
from datasets import load_metric
from datetime import datetime
import torch.nn as nn

# Load the processor
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")

# Define the custom dataset class using pandas
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset, noise_factor=0.0):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor
        
        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, retry_count=0):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            audio = self._pad_or_truncate(audio, max_length=16000)
            if self.noise_factor > 0:
                audio = self._add_noise(audio)
            input_values = self.processor(audio, sampling_rate=16000, return_tensors="pt").input_values.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            if retry_count < 3:  # Retry up to 3 times
                return self.__getitem__((idx + 1) % len(self), retry_count + 1)
            else:
                print(f"Error loading {file_path}: {e}", file=sys.stderr)
                raise e  # Raise exception if retry limit is reached

    def _pad_or_truncate(self, audio, max_length):
        if len(audio) < max_length:
            pad_size = max_length - len(audio)
            audio = np.pad(audio, (0, pad_size), 'constant', constant_values=(0, 0))
        else:
            audio = audio[:max_length]
        return audio

    def _add_noise(self, audio):
        noise = np.random.randn(len(audio))
        augmented_audio = audio + self.noise_factor * noise
        augmented_audio = augmented_audio.astype(type(audio[0]))
        return augmented_audio

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train')
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate')
test_dataset = LocalAudioDataset(csv_file, processor, 'test')

num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

# Define a custom classification head on top of the base Wav2Vec2 model
class Wav2Vec2ClassificationHead(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the mean of the hidden states of the first token
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

# Define the full model by combining Wav2Vec2Model with the classification head
class Wav2Vec2ForCustomClassification(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m")
        self.classifier = Wav2Vec2ClassificationHead(self.wav2vec2.config, num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = self.classifier(hidden_states)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
        
        return (loss, logits) if loss is not None else logits

# Instantiate the model with the custom classification head
model = Wav2Vec2ForCustomClassification(num_labels=num_speakers)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

validate_labels(train_dataset)
validate_labels(validate_dataset)
validate_labels(test_dataset)

batch_size = 8
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_logxlsr_finetuning_{timestamp}.csv")
with open(log_file, "w") as f:
    f.write("Timestamp,Step,Training Loss,Validation Loss,Accuracy\n")

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Ensure 'no_cuda' parameter aligns with device availability
training_args = TrainingArguments(
    output_dir="./results",
    group_by_length=True,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="steps",
    num_train_epochs=100,
    save_steps=logging_steps,
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    learning_rate=5e-6,
    save_total_limit=2,
    no_cuda=not torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_strategy="steps"  # Save checkpoints every `save_steps`
)

# Add early stopping callback to the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
    callbacks=[SaveMetricsCallback(), EarlyStoppingCallback()]  # Include early stopping
)

# Train and evaluate
trainer.train()

metrics = trainer.evaluate(test_dataset)

print(f"Test set evaluation metrics: {metrics}")
print("Training and evaluation completed successfully!")

best_model_dir = "./results/best_model_xlsr_finetuning"
os.makedirs(best_model_dir, exist_ok=True)

trainer.save_model(best_model_dir)
processor.save_pretrained(best_model_dir)

print(f"Best model saved to {best_model_dir}")


Loaded 111 speakers: {'speaker_6': 0, 'speaker_156': 1, 'speaker_22': 2, 'speaker_19': 3, 'speaker_91': 4, 'speaker_27': 5, 'speaker_94': 6, 'speaker_34': 7, 'speaker_97': 8, 'speaker_100': 9, 'speaker_36': 10, 'speaker_128': 11, 'speaker_134': 12, 'speaker_68': 13, 'speaker_9': 14, 'speaker_17': 15, 'speaker_73': 16, 'speaker_42': 17, 'speaker_52': 18, 'speaker_151': 19, 'speaker_150': 20, 'speaker_141': 21, 'speaker_82': 22, 'speaker_130': 23, 'speaker_75': 24, 'speaker_58': 25, 'speaker_74': 26, 'speaker_104': 27, 'speaker_47': 28, 'speaker_135': 29, 'speaker_71': 30, 'speaker_83': 31, 'speaker_116': 32, 'speaker_99': 33, 'speaker_108': 34, 'speaker_31': 35, 'speaker_106': 36, 'speaker_28': 37, 'speaker_65': 38, 'speaker_48': 39, 'speaker_49': 40, 'speaker_53': 41, 'speaker_3': 42, 'speaker_63': 43, 'speaker_138': 44, 'speaker_98': 45, 'speaker_92': 46, 'speaker_123': 47, 'speaker_32': 48, 'speaker_10': 49, 'speaker_155': 50, 'speaker_153': 51, 'speaker_23': 52, 'speaker_59': 53, 's

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
All labels are valid.
All labels are valid.
All labels are valid.


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Step,Training Loss,Validation Loss,Accuracy
222,4.7061,4.708409,0.008108
444,4.7057,4.706166,0.012613
666,4.6998,4.701221,0.013514
888,4.695,4.695594,0.012613
1110,4.6848,4.686562,0.017117
1332,4.6689,4.672047,0.020721
1554,4.6539,4.652225,0.033333
1776,4.6358,4.627883,0.063063
1998,4.6109,4.597482,0.081081
2220,4.5818,4.540682,0.10991


Early stopping at step 95904


Test set evaluation metrics: {'eval_loss': 0.5091816782951355, 'eval_accuracy': 0.8729729729729729, 'eval_runtime': 13.3249, 'eval_samples_per_second': 83.303, 'eval_steps_per_second': 10.432, 'epoch': 86.4}
Training and evaluation completed successfully!
Best model saved to ./results/best_model_xlsr_finetuning


now we extract hidden states

In [10]:
import os
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2FeatureExtractor
from tqdm import tqdm
import librosa
from safetensors.torch import load_file as safe_load
from torch import nn

# Initialize the processor and model for xlsr
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")
finetuned_model_path = "/home/rag/experimental_trial/results/best_model_xlsr_finetuning/model.safetensors"

# Define a custom classification head on top of the base Wav2Vec2 model
class Wav2Vec2ClassificationHead(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the mean of the hidden states of the first token
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

# Define the full model by combining Wav2Vec2Model with the classification head
class Wav2Vec2ForCustomClassification(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m", output_hidden_states=True)
        self.classifier = Wav2Vec2ClassificationHead(self.wav2vec2.config, num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states
        logits = self.classifier(hidden_states[-1])
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
        
        return (loss, logits, hidden_states) if loss is not None else (logits, hidden_states)

# Instantiate the model with the custom classification head
model = Wav2Vec2ForCustomClassification(num_labels=111)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = safe_load(finetuned_model_path)
model.load_state_dict(state_dict)
model.to(device)

def check_directories_exist(directory, layer_indices):
    """Prüft, ob die benötigten Verzeichnisse für jede Schicht bereits existieren."""
    all_exist = True
    for index in layer_indices:
        layer_dir = os.path.join(directory, f"layer_{index}")
        if not os.path.exists(layer_dir):
            all_exist = False
            break
    return all_exist

def load_audio_files(directory, layer_indices=[-1]):
    """Lädt alle MP3-Dateien im angegebenen Verzeichnis und extrahiert die Repräsentationen aus den spezifizierten Schichten."""
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(directory, filename)
            audio, sr = librosa.load(file_path, sr=16000)
            inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs["input_values"].to(device)
            
            with torch.no_grad():
                logits, hidden_states = model(input_values)
                for index in layer_indices:
                    hidden_state = hidden_states[index]
                    # creating sub directory for each layer in speaker directory
                    layer_dir = os.path.join(directory, f"layer_{index}")
                    os.makedirs(layer_dir, exist_ok=True)
                    save_path = os.path.join(layer_dir, f"{os.path.splitext(filename)[0]}_layer_{index}.npy")
                    np.save(save_path, hidden_state.cpu().numpy())

def process_audio_directory(base_directory, layer_indices=range(25)):
    """Verarbeitet Audio-Dateien in den angegebenen Verzeichnissen, falls die Ziellayer-Verzeichnisse noch nicht existieren."""
    for d in os.listdir(base_directory):
        dir_path = os.path.join(base_directory, d)
        if os.path.isdir(dir_path) and not check_directories_exist(dir_path, layer_indices):
            load_audio_files(dir_path, layer_indices)

directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_xlrs_finetuned")

process_audio_directory(directory_path)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [00:04<00:00, 11.70it/s]
100%|██████████| 50/50 [00:05<00:00,  9.26it/s]
100%|██████████| 50/50 [00:04<00:00, 11.64it/s]
100%|██████████| 50/50 [00:03<00:00, 13.00it/s]
100%|██████████| 50/50 [00:03<00:00, 14.15it/s]
100%|██████████| 50/50 [00:03<00:00, 13.95it/s]
100%|██████████| 50/50 [00:03<00:00, 14.45it/s]
100%|██████████| 50/50 [00:04<00:00, 11.66it/s]
100%|██████████| 50/50 [00:03<00:00, 12.51it/s]
100%|██████████| 50/50 [00:04<00:00, 12.08it/s]
100%|██████████| 50/50 [00:05<00:00,  9.36it/s]
100%|██████████| 50/50 [00:05<00:00,  9.33it/s]
100%|██████████| 50/50 [00:0

# now we use optuna to optimize the number of parameter used 

In [12]:
import os
import sys
import torch
import librosa
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Trainer, TrainingArguments, TrainerCallback, Wav2Vec2FeatureExtractor, Wav2Vec2Config
import math
from datasets import load_metric
from datetime import datetime
import torch.nn as nn
import optuna

# Load the processor
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-xls-r-300m")

# Define the custom dataset class using pandas
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset, noise_factor=0.0):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor
        
        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, retry_count=0):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            audio = self._pad_or_truncate(audio, max_length=16000)
            if self.noise_factor > 0:
                audio = self._add_noise(audio)
            input_values = self.processor(audio, sampling_rate=16000, return_tensors="pt").input_values.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            if retry_count < 3:  # Retry up to 3 times
                return self.__getitem__((idx + 1) % len(self), retry_count + 1)
            else:
                print(f"Error loading {file_path}: {e}", file=sys.stderr)
                raise e  # Raise exception if retry limit is reached

    def _pad_or_truncate(self, audio, max_length):
        if len(audio) < max_length:
            pad_size = max_length - len(audio)
            audio = np.pad(audio, (0, pad_size), 'constant', constant_values=(0, 0))
        else:
            audio = audio[:max_length]
        return audio

    def _add_noise(self, audio):
        noise = np.random.randn(len(audio))
        augmented_audio = audio + self.noise_factor * noise
        augmented_audio = augmented_audio.astype(type(audio[0]))
        return augmented_audio

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train')
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate')
test_dataset = LocalAudioDataset(csv_file, processor, 'test')

num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

# Define a custom classification head on top of the base Wav2Vec2 model
class Wav2Vec2ClassificationHead(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the mean of the hidden states of the first token
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

# Define the full model by combining Wav2Vec2Model with the classification head
class Wav2Vec2ForCustomClassification(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-xls-r-300m")
        self.classifier = Wav2Vec2ClassificationHead(self.wav2vec2.config, num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = self.classifier(hidden_states)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
        
        return (loss, logits) if loss is not None else logits

# Instantiate the model with the custom classification head
model = Wav2Vec2ForCustomClassification(num_labels=num_speakers)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

validate_labels(train_dataset)
validate_labels(validate_dataset)
validate_labels(test_dataset)

batch_size = 8
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_logxlsr_finetuning_optimizing_layers_{timestamp}.csv")
with open(log_file, "w") as f:
    f.write("Timestamp,Step,Training Loss,Validation Loss,Accuracy\n")

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Define the Optuna objective function
def objective(trial):
    # Suggest the number of layers
    num_layers = trial.suggest_int('num_layers', 1, 24)

    # Load the model configuration with the suggested number of layers
    config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-xls-r-300m", num_labels=num_speakers)
    config.num_hidden_layers = num_layers

    # Instantiate the model with the custom classification head
    model = Wav2Vec2ForCustomClassification(num_labels=num_speakers)
    model.wav2vec2.encoder.layers = model.wav2vec2.encoder.layers[:num_layers]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Ensure 'no_cuda' parameter aligns with device availability
    training_args = TrainingArguments(
        output_dir="./results",
        group_by_length=True,
        per_device_train_batch_size=batch_size,
        evaluation_strategy="steps",
        num_train_epochs=10,  # Use 10 epochs
        save_steps=logging_steps,
        eval_steps=eval_steps,
        logging_steps=logging_steps,
        learning_rate=5e-6,  # Fixed learning rate
        save_total_limit=2,
        no_cuda=not torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_strategy="steps"
    )

    # Add early stopping callback to the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validate_dataset,
        tokenizer=processor,
        compute_metrics=compute_metrics,
        callbacks=[SaveMetricsCallback(), EarlyStoppingCallback()]
    )

    # Train and evaluate
    trainer.train()
    
    metrics = trainer.evaluate(validate_dataset)
    
    # Return the evaluation loss for Optuna to minimize
    return metrics["eval_loss"]

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)

# Print the best hyperparameters found
print(f"Best hyperparameters: {study.best_params}")

# Save the best model and processor
best_model_dir = "./results/best_model_xlsr_finetuning_optuna_layer_optimized"
os.makedirs(best_model_dir, exist_ok=True)

trainer.save_model(best_model_dir)
processor.save_pretrained(best_model_dir)

print(f"Best model saved to {best_model_dir}")


Loaded 111 speakers: {'speaker_6': 0, 'speaker_156': 1, 'speaker_22': 2, 'speaker_19': 3, 'speaker_91': 4, 'speaker_27': 5, 'speaker_94': 6, 'speaker_34': 7, 'speaker_97': 8, 'speaker_100': 9, 'speaker_36': 10, 'speaker_128': 11, 'speaker_134': 12, 'speaker_68': 13, 'speaker_9': 14, 'speaker_17': 15, 'speaker_73': 16, 'speaker_42': 17, 'speaker_52': 18, 'speaker_151': 19, 'speaker_150': 20, 'speaker_141': 21, 'speaker_82': 22, 'speaker_130': 23, 'speaker_75': 24, 'speaker_58': 25, 'speaker_74': 26, 'speaker_104': 27, 'speaker_47': 28, 'speaker_135': 29, 'speaker_71': 30, 'speaker_83': 31, 'speaker_116': 32, 'speaker_99': 33, 'speaker_108': 34, 'speaker_31': 35, 'speaker_106': 36, 'speaker_28': 37, 'speaker_65': 38, 'speaker_48': 39, 'speaker_49': 40, 'speaker_53': 41, 'speaker_3': 42, 'speaker_63': 43, 'speaker_138': 44, 'speaker_98': 45, 'speaker_92': 46, 'speaker_123': 47, 'speaker_32': 48, 'speaker_10': 49, 'speaker_155': 50, 'speaker_153': 51, 'speaker_23': 52, 'speaker_59': 53, 's

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
All labels are valid.
All labels are valid.
All labels are valid.


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[I 2024-06-18 13:32:37,615] A new study created in memory with name: no-name-7d154c77-f6c2-45e6-9c2a-f7d86fc663cb
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return F.conv1d(input, weight, bias, self.stride,


Step,Training Loss,Validation Loss,Accuracy
222,4.708,4.701856,0.016216
444,4.7028,4.689518,0.024324
666,4.6933,4.675404,0.036036
888,4.6819,4.656057,0.065766
1110,4.6694,4.632829,0.074775
1332,4.6546,4.609973,0.105405
1554,4.6314,4.581596,0.116216
1776,4.6139,4.551846,0.118018
1998,4.5888,4.521278,0.116216
2220,4.5652,4.481288,0.118919


[I 2024-06-18 14:04:04,138] Trial 0 finished with value: 3.700018882751465 and parameters: {'num_layers': 6}. Best is trial 0 with value: 3.700018882751465.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7088,4.70505,0.013514
444,4.7058,4.699969,0.016216
666,4.703,4.690352,0.02973
888,4.6996,4.679365,0.02973
1110,4.6918,4.664335,0.040541
1332,4.6763,4.648576,0.045946
1554,4.6678,4.626786,0.053153
1776,4.6541,4.604692,0.073874
1998,4.6398,4.573865,0.066667
2220,4.617,4.549234,0.071171


[I 2024-06-18 14:50:20,789] Trial 1 finished with value: 3.7878518104553223 and parameters: {'num_layers': 17}. Best is trial 0 with value: 3.700018882751465.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.707,4.701768,0.018919
444,4.7045,4.690088,0.023423
666,4.693,4.676493,0.047748
888,4.6842,4.657732,0.064865
1110,4.6735,4.634727,0.090991
1332,4.654,4.605322,0.098198
1554,4.6348,4.574787,0.120721
1776,4.6158,4.53982,0.151351
1998,4.592,4.506039,0.15045
2220,4.5656,4.470708,0.133333


[I 2024-06-18 15:19:47,137] Trial 2 finished with value: 3.6675639152526855 and parameters: {'num_layers': 4}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7091,4.709498,0.009009
444,4.7075,4.706626,0.004505
666,4.7012,4.70143,0.008108
888,4.6948,4.697382,0.018919
1110,4.6875,4.687127,0.034234
1332,4.672,4.682913,0.021622
1554,4.6612,4.663531,0.040541
1776,4.6394,4.634953,0.048649
1998,4.6254,4.609554,0.05045
2220,4.6025,4.575008,0.057658


[I 2024-06-18 16:19:43,081] Trial 3 finished with value: 3.748971462249756 and parameters: {'num_layers': 24}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7083,4.701194,0.020721
444,4.7068,4.690593,0.02973
666,4.6948,4.67784,0.038739
888,4.6852,4.661039,0.051351
1110,4.6808,4.644998,0.071171
1332,4.6628,4.621366,0.10991
1554,4.6465,4.598451,0.128829
1776,4.634,4.574369,0.138739
1998,4.6164,4.547435,0.137838
2220,4.5926,4.515973,0.144144


[I 2024-06-18 16:51:38,106] Trial 4 finished with value: 3.7498741149902344 and parameters: {'num_layers': 4}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7112,4.705831,0.016216
444,4.7084,4.702398,0.015315
666,4.7029,4.696335,0.027027
888,4.7001,4.687635,0.03964
1110,4.6934,4.676097,0.051351
1332,4.6854,4.661163,0.056757
1554,4.6757,4.637342,0.081982
1776,4.6595,4.614371,0.103604
1998,4.6426,4.588152,0.123423
2220,4.6258,4.560373,0.122523


[I 2024-06-18 17:47:18,181] Trial 5 finished with value: 3.731489419937134 and parameters: {'num_layers': 21}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7092,4.707587,0.009009
444,4.7085,4.707574,0.009009
666,4.7047,4.694062,0.01982
888,4.7,4.696763,0.016216
1110,4.6935,4.678085,0.020721
1332,4.6827,4.666332,0.023423
1554,4.6656,4.651262,0.022523
1776,4.655,4.635927,0.028829
1998,4.6437,4.618079,0.027928
2220,4.6313,4.601048,0.028829


[I 2024-06-18 18:45:29,463] Trial 6 finished with value: 3.9214565753936768 and parameters: {'num_layers': 23}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.709,4.699757,0.028829
444,4.7034,4.686362,0.054955
666,4.6916,4.66806,0.056757
888,4.6807,4.650777,0.073874
1110,4.672,4.629227,0.078378
1332,4.6486,4.602532,0.100901
1554,4.639,4.578953,0.118018
1776,4.6166,4.549279,0.127928
1998,4.5936,4.515528,0.11982
2220,4.5671,4.485372,0.137838


[I 2024-06-18 19:18:58,614] Trial 7 finished with value: 3.7033426761627197 and parameters: {'num_layers': 5}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7085,4.701094,0.024324
444,4.7073,4.693042,0.034234
666,4.7002,4.679426,0.045045
888,4.6888,4.665529,0.051351
1110,4.6797,4.642776,0.054054
1332,4.6638,4.618524,0.054054
1554,4.6481,4.587119,0.066667
1776,4.6218,4.557612,0.062162
1998,4.6001,4.522763,0.048649
2220,4.5742,4.486005,0.045045


[I 2024-06-18 20:03:46,785] Trial 8 finished with value: 3.822213649749756 and parameters: {'num_layers': 14}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7082,4.69878,0.017117
444,4.7009,4.685322,0.023423
666,4.6936,4.673082,0.033333
888,4.6838,4.654863,0.03964
1110,4.6744,4.63642,0.058559
1332,4.6561,4.612448,0.078378
1554,4.6382,4.585474,0.084685
1776,4.6222,4.558756,0.090991
1998,4.6054,4.533115,0.095495
2220,4.5789,4.500781,0.100901


[I 2024-06-18 20:35:17,473] Trial 9 finished with value: 3.767843723297119 and parameters: {'num_layers': 4}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7114,4.704685,0.010811
444,4.7064,4.696667,0.018018
666,4.6996,4.683206,0.02973
888,4.6965,4.673188,0.028829
1110,4.684,4.656142,0.035135
1332,4.6721,4.636421,0.043243
1554,4.6585,4.613287,0.04955
1776,4.646,4.590195,0.054054
1998,4.6285,4.563351,0.066667
2220,4.6116,4.534761,0.072072


[I 2024-06-18 21:15:13,698] Trial 10 finished with value: 3.7859463691711426 and parameters: {'num_layers': 10}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7094,4.701189,0.02973
444,4.7032,4.689653,0.030631
666,4.6981,4.673441,0.041441
888,4.6853,4.659721,0.048649
1110,4.6777,4.641739,0.054955
1332,4.6607,4.621482,0.056757
1554,4.6453,4.593356,0.053153
1776,4.63,4.562118,0.063964
1998,4.5991,4.526942,0.055856
2220,4.5759,4.487835,0.067568


[I 2024-06-18 21:54:08,970] Trial 11 finished with value: 3.763418436050415 and parameters: {'num_layers': 9}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7082,4.69955,0.009009
444,4.7034,4.689146,0.023423
666,4.6943,4.677635,0.054054
888,4.6854,4.662054,0.063964
1110,4.6775,4.642311,0.058559
1332,4.654,4.617785,0.085586
1554,4.6373,4.588194,0.099099
1776,4.6165,4.554558,0.103604
1998,4.5853,4.518782,0.130631
2220,4.5618,4.486297,0.145946


[I 2024-06-18 22:22:43,499] Trial 12 finished with value: 3.7373125553131104 and parameters: {'num_layers': 1}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7113,4.702909,0.01982
444,4.7028,4.691449,0.018018
666,4.6967,4.680123,0.032432
888,4.6865,4.662714,0.047748
1110,4.6756,4.645768,0.057658
1332,4.6603,4.625079,0.06036
1554,4.6461,4.597983,0.069369
1776,4.6283,4.573708,0.081081
1998,4.6061,4.537744,0.097297
2220,4.5831,4.512765,0.097297


[I 2024-06-18 23:00:31,014] Trial 13 finished with value: 3.7251672744750977 and parameters: {'num_layers': 8}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7061,4.703903,0.009009
444,4.7061,4.694532,0.025225
666,4.7002,4.684687,0.025225
888,4.6945,4.672815,0.040541
1110,4.6833,4.657837,0.059459
1332,4.6675,4.637659,0.069369
1554,4.653,4.61377,0.081081
1776,4.6364,4.587174,0.086486
1998,4.6137,4.554393,0.092793
2220,4.5878,4.518972,0.128829


[I 2024-06-18 23:29:36,913] Trial 14 finished with value: 3.73152756690979 and parameters: {'num_layers': 1}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7099,4.704793,0.020721
444,4.7084,4.697919,0.020721
666,4.7033,4.688953,0.033333
888,4.6922,4.674824,0.02973
1110,4.6879,4.661367,0.032432
1332,4.6733,4.643546,0.053153
1554,4.6633,4.62233,0.065766
1776,4.6486,4.597552,0.07027
1998,4.6368,4.571906,0.093694
2220,4.6145,4.54074,0.081982


[I 2024-06-19 00:13:29,518] Trial 15 finished with value: 3.7462451457977295 and parameters: {'num_layers': 13}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7074,4.699555,0.01982
444,4.7023,4.687501,0.036937
666,4.6938,4.672675,0.034234
888,4.684,4.654934,0.052252
1110,4.6752,4.633548,0.086486
1332,4.6568,4.612518,0.075676
1554,4.6382,4.582642,0.091892
1776,4.6159,4.549713,0.118018
1998,4.596,4.518878,0.094595
2220,4.5685,4.479667,0.117117


[I 2024-06-19 00:48:21,171] Trial 16 finished with value: 3.746232748031616 and parameters: {'num_layers': 6}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7095,4.70566,0.010811
444,4.7084,4.696444,0.021622
666,4.7034,4.689759,0.021622
888,4.6936,4.676877,0.042342
1110,4.6852,4.66354,0.043243
1332,4.6723,4.643582,0.048649
1554,4.6667,4.625642,0.06036
1776,4.6471,4.602217,0.068468
1998,4.6237,4.570015,0.073874
2220,4.604,4.531774,0.072973


[I 2024-06-19 01:29:45,792] Trial 17 finished with value: 3.7789292335510254 and parameters: {'num_layers': 11}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7095,4.70223,0.016216
444,4.705,4.689596,0.023423
666,4.6969,4.670762,0.038739
888,4.6782,4.649355,0.056757
1110,4.6697,4.629714,0.086486
1332,4.6492,4.605486,0.109009
1554,4.6315,4.57802,0.113514
1776,4.6181,4.547173,0.097297
1998,4.5878,4.51257,0.100901
2220,4.5616,4.474004,0.102703


[I 2024-06-19 02:05:43,769] Trial 18 finished with value: 3.7296619415283203 and parameters: {'num_layers': 7}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7088,4.70607,0.012613
444,4.7076,4.699078,0.020721
666,4.7005,4.688899,0.031532
888,4.6962,4.678683,0.02973
1110,4.6931,4.663395,0.038739
1332,4.6783,4.646754,0.05045
1554,4.6699,4.625658,0.048649
1776,4.6549,4.600386,0.059459
1998,4.6335,4.576926,0.063964
2220,4.616,4.54825,0.052252


[I 2024-06-19 02:52:07,003] Trial 19 finished with value: 3.8210413455963135 and parameters: {'num_layers': 15}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7094,4.703187,0.016216
444,4.7067,4.69245,0.032432
666,4.6991,4.680313,0.043243
888,4.6902,4.665735,0.075676
1110,4.679,4.64526,0.075676
1332,4.663,4.622732,0.104505
1554,4.6509,4.591664,0.11982
1776,4.6322,4.560389,0.128829
1998,4.6022,4.528654,0.143243
2220,4.5836,4.494323,0.18018


[I 2024-06-19 03:22:14,988] Trial 20 finished with value: 3.6773605346679688 and parameters: {'num_layers': 3}. Best is trial 2 with value: 3.6675639152526855.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7072,4.699301,0.018018
444,4.7047,4.688194,0.021622
666,4.6938,4.673615,0.032432
888,4.6868,4.654483,0.061261
1110,4.6715,4.632442,0.089189
1332,4.6557,4.605697,0.102703
1554,4.6384,4.578365,0.134234
1776,4.622,4.548279,0.13964
1998,4.5867,4.511109,0.152252
2220,4.5635,4.47313,0.168468


[I 2024-06-19 03:52:31,467] Trial 21 finished with value: 3.634565830230713 and parameters: {'num_layers': 3}. Best is trial 21 with value: 3.634565830230713.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7072,4.699294,0.018018
444,4.7046,4.688156,0.021622
666,4.6938,4.67357,0.034234
888,4.6868,4.654453,0.061261
1110,4.6715,4.632446,0.089189
1332,4.6557,4.605681,0.100901
1554,4.6383,4.57827,0.133333
1776,4.6219,4.548192,0.13964
1998,4.5867,4.511121,0.153153
2220,4.5635,4.473117,0.168468


[I 2024-06-19 04:22:35,358] Trial 22 finished with value: 3.634643077850342 and parameters: {'num_layers': 3}. Best is trial 21 with value: 3.634565830230713.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7067,4.700283,0.010811
444,4.7049,4.688091,0.022523
666,4.6956,4.67466,0.027928
888,4.6891,4.657594,0.04955
1110,4.6742,4.637477,0.083784
1332,4.6592,4.612431,0.138739
1554,4.6405,4.583565,0.165766
1776,4.6182,4.553049,0.179279
1998,4.5962,4.521086,0.206306
2220,4.5767,4.48789,0.199099


[I 2024-06-19 04:52:02,033] Trial 23 finished with value: 3.662409543991089 and parameters: {'num_layers': 2}. Best is trial 21 with value: 3.634565830230713.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
222,4.7075,4.698718,0.018919
444,4.7031,4.685509,0.033333
666,4.6957,4.670607,0.047748
888,4.6793,4.651435,0.055856
1110,4.6687,4.627486,0.081982
1332,4.649,4.597727,0.098198
1554,4.6274,4.56537,0.118018
1776,4.6038,4.536026,0.148649
1998,4.5824,4.498976,0.15045
2220,4.5532,4.462032,0.163964


[I 2024-06-19 05:21:50,192] Trial 24 finished with value: 3.6773271560668945 and parameters: {'num_layers': 2}. Best is trial 21 with value: 3.634565830230713.


Best hyperparameters: {'num_layers': 3}


NameError: name 'trainer' is not defined