In [2]:
import os
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperModel
from tqdm import tqdm
import librosa

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the processor and model for Whisper.
# output_hidden_states=True is requested at load time because load_audio_files
# (defined below) reads `outputs.hidden_states` from the encoder output.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperModel.from_pretrained("openai/whisper-large", output_hidden_states=True)
model.to(device)

def check_directories_exist(directory, layer_indices):
    """Check whether the per-layer output directories already exist.

    Args:
        directory: Base directory expected to contain the layer folders.
        layer_indices: Iterable of layer indices; each one maps to a folder
            named ``layer_<index>`` inside ``directory``.

    Returns:
        True if every ``layer_<index>`` path exists (also for an empty
        ``layer_indices``), False as soon as one is missing.
    """
    return all(
        os.path.exists(os.path.join(directory, f"layer_{layer}"))
        for layer in layer_indices
    )

def load_audio_files(input_directory, output_directory, layer_indices=(-1,)):
    """Extract mean-pooled encoder hidden states for every MP3 file in a directory.

    Each ``*.mp3`` file in ``input_directory`` is loaded at 16 kHz, converted
    to Whisper input features with the module-level ``processor`` and passed
    through ``model.encoder``. For every requested layer the hidden state is
    averaged over dimension 1 (the time axis) and written to
    ``<output_directory>/layer_<index>/<file stem>_layer_<index>.npy``.

    Args:
        input_directory: Directory that is scanned (non-recursively) for MP3 files.
        output_directory: Directory that receives one ``layer_<index>`` folder
            per requested layer.
        layer_indices: Iterable of indices into ``outputs.hidden_states``.
            Defaults to the last entry only.

    Returns:
        None. The results are written to disk.
    """
    # Materialise once: a one-shot iterable (e.g. a generator) would otherwise
    # be exhausted after the first file and all later files would be skipped.
    layer_indices = list(layer_indices)
    # Layer folders that were already created during this call, so that
    # os.makedirs is not repeated for every single file.
    prepared_dirs = {}

    # Sorted so that the processing order is reproducible between runs.
    for filename in tqdm(sorted(os.listdir(input_directory))):
        if not filename.endswith(".mp3"):
            continue

        file_path = os.path.join(input_directory, filename)
        audio, sr = librosa.load(file_path, sr=16000)
        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
        input_values = inputs["input_features"].to(device)
        stem = os.path.splitext(filename)[0]

        with torch.no_grad():
            # Ask for the hidden states explicitly instead of relying on how
            # the global model happened to be configured when it was loaded.
            outputs = model.encoder(input_values, output_hidden_states=True)
            for index in layer_indices:
                # Mean pooling over the time dimension.
                pooled = outputs.hidden_states[index].mean(dim=1)

                layer_dir = prepared_dirs.get(index)
                if layer_dir is None:
                    # Folders are created lazily, i.e. only once there is
                    # something to store in them.
                    layer_dir = os.path.join(output_directory, f"layer_{index}")
                    os.makedirs(layer_dir, exist_ok=True)
                    prepared_dirs[index] = layer_dir

                save_path = os.path.join(layer_dir, f"{stem}_layer_{index}.npy")
                np.save(save_path, pooled.cpu().numpy())

def process_audio_directory(input_base_directory, output_base_directory, layer_indices=range(25)):
    """Run the extraction for every sub-directory that has not been processed yet.

    Every entry of ``input_base_directory`` that is a directory is mirrored
    under ``output_base_directory``. An entry is skipped when
    ``check_directories_exist`` reports that all ``layer_<index>`` folders are
    already present in its output directory; otherwise ``load_audio_files``
    is called for it.

    Args:
        input_base_directory: Directory whose sub-directories contain the audio files.
        output_base_directory: Directory that receives one sub-directory per input one.
        layer_indices: Layer indices forwarded to the existence check and the extraction.
    """
    for entry in os.listdir(input_base_directory):
        source_dir = os.path.join(input_base_directory, entry)
        if not os.path.isdir(source_dir):
            continue

        target_dir = os.path.join(output_base_directory, entry)
        if check_directories_exist(target_dir, layer_indices):
            continue

        load_audio_files(source_dir, target_dir, layer_indices)

# Source tree scanned for sub-directories with MP3 files, and the target tree
# that receives the per-layer .npy files.
# NOTE(review): both paths are absolute and contain no "~", so os.path.expanduser
# has nothing to expand here.
input_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_backup")
output_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_whisper")
# Called with the default layer_indices=range(25), i.e. hidden-state indices 0..24.
process_audio_directory(input_directory_path, output_directory_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Now we fine-tune Whisper.

In [None]:
import os
import sys
import torch
import torch.nn as nn
import librosa
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import WhisperProcessor, WhisperModel, Trainer, TrainingArguments, TrainerCallback, WhisperConfig
import math
from datasets import load_metric
from datetime import datetime

class WhisperForSequenceClassification(nn.Module):
    """Sequence classifier on top of the pretrained Whisper encoder.

    The encoder output is mean-pooled over the time dimension and fed into a
    single linear layer that produces one logit per label.

    Args:
        config: Configuration object providing ``hidden_size`` (input width of
            the classifier) and ``num_labels`` (number of classes).
    """

    def __init__(self, config):
        super().__init__()
        # forward() needs the number of labels, so the configuration has to be
        # kept on the instance (it was read there before but never stored).
        self.config = config
        self.num_labels = config.num_labels
        self.whisper = WhisperModel.from_pretrained("openai/whisper-large")
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_values, labels=None):
        """Compute logits and, when labels are given, the cross-entropy loss.

        Args:
            input_values: Batch of Whisper input features.
            labels: Optional tensor with one class index per batch element.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
        """
        # Only the encoder is used: the full encoder-decoder forward pass
        # requires decoder inputs, which a classifier does not have.
        hidden_states = self.whisper.encoder(input_values).last_hidden_state
        pooled_output = hidden_states.mean(dim=1)  # mean pooling over the time dimension
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) if loss is not None else logits

# Initialize the Whisper processor only; the classification model is built further below.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class using pandas
class LocalAudioDataset(Dataset):
    """Dataset of labelled audio clips listed in a CSV file.

    The CSV must provide the columns ``path`` (audio file), ``label`` (speaker
    identifier) and ``subset`` (split name). Only the rows whose ``subset``
    equals the requested one are served.

    Attributes:
        processor: Callable that converts a waveform into model input features.
        data: Rows of the requested subset, with ``label`` replaced by its class index.
        speaker_ids: Mapping from the original speaker label to the class index.
    """

    def __init__(self, csv_file, processor, subset):
        """Read ``csv_file`` and keep the rows that belong to ``subset``."""
        self.processor = processor
        all_rows = pd.read_csv(csv_file)

        # The mapping is built from the complete CSV, not from the filtered
        # subset: a per-subset mapping depends on the row order of that subset,
        # so the same speaker could get different indices in different splits.
        self.speaker_ids = {label: idx for idx, label in enumerate(all_rows['label'].unique())}

        # .copy() makes the subset an independent frame so that the label
        # column can be replaced without writing into a slice of all_rows.
        self.data = all_rows[all_rows['subset'] == subset].copy()
        self.data['label'] = self.data['label'].map(self.speaker_ids)

        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the features and label of item ``idx``.

        If the file cannot be loaded or processed, the following items are
        tried in turn (wrapping around at the end of the dataset).

        Raises:
            IndexError: If ``idx`` is outside the dataset.
            RuntimeError: If not a single item of the dataset could be loaded.
        """
        total = len(self)
        if not -total <= idx < total:
            # Must stay an IndexError: iterating over the dataset relies on it
            # to detect the end.
            raise IndexError(f"index {idx} is out of range for a dataset of size {total}")

        start = idx % total
        # Bounded retry instead of unbounded recursion: every item is tried at
        # most once, so a systematic failure ends with a clear error.
        for offset in range(total):
            row = self.data.iloc[(start + offset) % total]
            file_path = row['path']
            try:
                # librosa.load returns a mono signal by default, so no extra
                # down-mixing step is needed.
                audio, _ = librosa.load(file_path, sr=16000)
                # 16000 samples at 16 kHz: every clip is cut/padded to one second.
                audio = self._pad_or_truncate(audio, max_length=16000)
                features = self.processor(audio, sampling_rate=16000, return_tensors="pt")
                # The Whisper processor output is read as `input_features`, as
                # in the other cells; the result is still returned under the
                # key "input_values" because that is the model's argument name.
                input_values = features.input_features.squeeze(0)
                return {"input_values": input_values, "labels": row['label']}
            except Exception as e:
                print(f"Error loading {file_path}: {e}", file=sys.stderr)

        raise RuntimeError("None of the audio files in the dataset could be loaded.")

    def _pad_or_truncate(self, audio, max_length):
        """Zero-pad at the end or cut ``audio`` so that it has exactly ``max_length`` samples."""
        if len(audio) < max_length:
            pad_size = max_length - len(audio)
            audio = np.pad(audio, (0, pad_size), 'constant', constant_values=(0, 0))
        else:
            audio = audio[:max_length]
        return audio

# Path to the dataset CSV file (relative to the working directory of the notebook)
csv_file = 'dataset_large.csv'
# One dataset object per split; each one filters the CSV on its 'subset' column.
train_dataset = LocalAudioDataset(csv_file, processor, 'train')
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate')
test_dataset = LocalAudioDataset(csv_file, processor, 'test')

# The classifier gets one output per entry of the training set's speaker mapping.
num_speakers = len(train_dataset.speaker_ids)
config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
model = WhisperForSequenceClassification(config)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

def validate_labels(dataset):
    """Verify that every item's label lies in ``[0, num_speakers)``.

    Iterates over the whole dataset and reads ``item['labels']`` of each item.
    ``num_speakers`` is taken from module scope.

    Raises:
        ValueError: On the first label outside the valid range (the offending
            item is printed beforehand).
    """
    for sample in dataset:
        speaker_label = sample['labels']
        in_range = not (speaker_label >= num_speakers or speaker_label < 0)
        if in_range:
            continue
        print(f"Invalid label {speaker_label} for item: {sample}")
        raise ValueError(f"Invalid label {speaker_label} found in dataset.")
    print("All labels are valid.")

# Sanity check of all three splits before any training time is spent.
# Note that this iterates over the datasets and therefore loads every audio file once.
validate_labels(train_dataset)
validate_labels(validate_dataset)
validate_labels(test_dataset)

batch_size = 8
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
# Log and evaluate five times per epoch. The lower bound of 1 keeps the step
# counts valid for small datasets, where the integer division would yield 0.
logging_steps = max(1, steps_per_epoch // 5)
eval_steps = max(1, steps_per_epoch // 5)

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute the classification accuracy for the Trainer.

    The accuracy is calculated directly with numpy, so the function does not
    depend on a metric object loaded through ``datasets.load_metric`` (an API
    that newer ``datasets`` releases no longer provide).

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has one row of class
            scores per example (or is a tuple whose first element has);
            ``labels`` holds the reference class indices.

    Returns:
        ``{"accuracy": value}`` with the fraction of correct predictions as a
        float; 0.0 if there are no examples.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Several model outputs were collected; the logits are the first one.
        logits = logits[0]

    predictions = np.argmax(np.asarray(logits), axis=-1)
    references = np.asarray(labels)
    if references.size == 0:
        return {"accuracy": 0.0}
    return {"accuracy": float(np.mean(predictions == references))}

# CSV log of this training run: a new, timestamped file is created up front
# and initialised with the header row; SaveMetricsCallback appends to it.
log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_100_epochs_5_layer{timestamp}.csv")
with open(log_file, "w") as f:
    f.write("Timestamp,Step,Training Loss,Validation Loss,Accuracy\n")

class SaveMetricsCallback(TrainerCallback):
    """Trainer callback that appends every log event as a CSV row to ``log_file``."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write timestamp, global step, training loss, validation loss and accuracy.

        Values that are missing in ``logs`` are written as empty fields.
        Nothing is written when ``logs`` is None.
        """
        if logs is None:
            return

        with open(log_file, "a") as sink:
            stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            global_step = state.global_step
            train_loss = logs.get("loss", "")
            val_loss = logs.get("eval_loss", "")
            val_accuracy = logs.get("eval_accuracy", "")
            row = f"{stamp},{global_step},{train_loss},{val_loss},{val_accuracy}\n"
            sink.write(row)

class EarlyStoppingCallback(TrainerCallback):
    """Stop training when ``eval_loss`` has not improved for a number of evaluations.

    Args:
        early_stopping_patience: Number of consecutive evaluations without
            improvement after which training is stopped.
        early_stopping_threshold: Minimum decrease of ``eval_loss`` relative to
            the best value that counts as an improvement.
    """

    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Update the patience counter and request a stop once it runs out."""
        current_loss = kwargs.get("metrics", {}).get("eval_loss")
        if current_loss is None:
            return

        improved = (
            self.best_metric is None
            or current_loss < self.best_metric - self.early_stopping_threshold
        )
        if improved:
            self.best_metric = current_loss
            self.patience_counter = 0
        else:
            self.patience_counter += 1

        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Training configuration of the fine-tuning run.
# group_by_length is switched off: the dataset items carry neither a length
# column nor an entry under the processor's input name, so the Trainer cannot
# infer lengths for length-grouped sampling. LocalAudioDataset also brings
# every clip to the same length, so grouping would have no benefit.
training_args = TrainingArguments(
    output_dir="./results",
    group_by_length=False,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="steps",
    num_train_epochs=100,
    # Saving and evaluating share the same interval, which is required for
    # load_best_model_at_end to find a checkpoint for every evaluation.
    save_steps=logging_steps,
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    learning_rate=5e-6,
    save_total_limit=2,
    no_cuda=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_strategy="steps"  # or "epoch" if you prefer to save every epoch
)

# Trainer with the CSV logging callback and the custom early-stopping callback
# (default patience: 100 evaluations without improvement of eval_loss).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
    callbacks=[SaveMetricsCallback(), EarlyStoppingCallback()]
)

trainer.train()

# Final evaluation on the held-out test split.
metrics = trainer.evaluate(test_dataset)

print(f"Test set evaluation metrics: {metrics}")
print("Training and evaluation completed successfully!")

# Store the model held by the trainer after training (load_best_model_at_end=True
# is set in training_args) together with the processor.
best_model_dir = "./results/best_model_100_epochs_5_layer"
os.makedirs(best_model_dir, exist_ok=True)

trainer.save_model(best_model_dir)
processor.save_pretrained(best_model_dir)

print(f"Best model saved to {best_model_dir}")

# Hyperparameter tuning for Whisper

We optimize the number of encoder layers used.

In [None]:
import optuna
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import Trainer, TrainingArguments, TrainerCallback, WhisperConfig, WhisperModel, WhisperProcessor
import pandas as pd
import librosa
import numpy as np
import os
import sys
import math
from datasets import load_metric
from datetime import datetime
import logging
import torch.nn.functional as F

# Set up logging for Optuna: one timestamped log file per run
log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# NOTE(review): the same file is also appended to by SaveMetricsCallback below, so
# it mixes formatted log lines with CSV rows despite the .csv extension.
log_file = os.path.join(log_dir, f"training_log_optuna_optim_whisper{timestamp}.csv")

# Root logger writes INFO and above to stdout
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()

# Add file handler to logger so that the messages also end up in log_file
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Redirect Optuna logging to the file as well
optuna_logger = logging.getLogger("optuna")
optuna_logger.addHandler(file_handler)

# Load the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class
class LocalAudioDataset(Dataset):
    """Dataset of labelled audio clips listed in a CSV file, limited to the most frequent speakers.

    The CSV must provide the columns ``path`` (audio file), ``label`` (speaker
    identifier) and ``subset`` (split name).

    Args:
        csv_file: Path of the CSV file.
        processor: Callable that converts a waveform into model input features.
        subset: Value of the ``subset`` column selecting the rows to serve.
        noise_factor: Stored on the instance; not used by this class itself.
        max_speakers: Maximum number of speakers to keep (those with the most files).

    Attributes:
        data: Rows of the requested subset, with ``label`` replaced by its class index.
        speaker_ids: Mapping from the original speaker label to the class index.
    """

    def __init__(self, csv_file, processor, subset, noise_factor=0.0, max_speakers=50):
        self.processor = processor
        all_rows = pd.read_csv(csv_file)

        # Limit the number of speakers to max_speakers. The selection and the
        # label mapping are derived from the complete CSV rather than from the
        # requested subset: done per subset, different splits could keep
        # different speakers and assign different indices to the same speaker.
        speaker_counts = all_rows['label'].value_counts()
        top_speakers = speaker_counts.nlargest(max_speakers).index
        kept_rows = all_rows[all_rows['label'].isin(top_speakers)]
        self.speaker_ids = {label: idx for idx, label in enumerate(kept_rows['label'].unique())}

        # .copy() makes the subset an independent frame so that the label
        # column can be replaced without writing into a slice of another frame.
        self.data = kept_rows[kept_rows['subset'] == subset].copy()
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor

        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the features and label of item ``idx``.

        If the file cannot be loaded or processed, the following items are
        tried in turn (wrapping around at the end of the dataset).

        Raises:
            IndexError: If ``idx`` is outside the dataset.
            RuntimeError: If not a single item of the dataset could be loaded.
        """
        total = len(self)
        if not -total <= idx < total:
            # Must stay an IndexError: iterating over the dataset relies on it
            # to detect the end.
            raise IndexError(f"index {idx} is out of range for a dataset of size {total}")

        start = idx % total
        # Bounded retry instead of unbounded recursion: every item is tried at
        # most once, so a systematic failure ends with a clear error.
        for offset in range(total):
            row = self.data.iloc[(start + offset) % total]
            file_path = row['path']
            try:
                # librosa.load returns a mono signal by default, so no extra
                # down-mixing step is needed.
                audio, sr = librosa.load(file_path, sr=16000)
                # Use the processor to extract features
                inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
                input_values = inputs.input_features.squeeze(0)
                return {"input_values": input_values, "labels": row['label']}
            except Exception as e:
                print(f"Error loading {file_path}: {e}", file=sys.stderr)

        raise RuntimeError("None of the audio files in the dataset could be loaded.")

# Path to the dataset CSV file (relative to the working directory of the notebook)
csv_file = 'dataset_large.csv'
# One dataset object per split, each limited to at most 111 speakers.
train_dataset = LocalAudioDataset(csv_file, processor, 'train', noise_factor=0, max_speakers=111)
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate', max_speakers=111)
test_dataset = LocalAudioDataset(csv_file, processor, 'test', max_speakers=111)

# Number of classes = size of the training set's speaker mapping.
num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

# NOTE(review): these two prints dump the complete label lists into the cell output.
print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def validate_labels(dataset):
    """Check that the label of every dataset item is a valid class index.

    A label is valid when it is at least 0 and smaller than the module-level
    ``num_speakers``. The whole dataset is iterated.

    Raises:
        ValueError: For the first invalid label; the item is printed first.
    """
    for entry in dataset:
        value = entry['labels']
        out_of_range = value >= num_speakers or value < 0
        if out_of_range:
            print(f"Invalid label {value} for item: {entry}")
            raise ValueError(f"Invalid label {value} found in dataset.")
    print("All labels are valid.")

batch_size = 2
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
# Log and evaluate five times per epoch. The lower bound of 1 keeps the step
# counts valid for small datasets, where the integer division would yield 0.
logging_steps = max(1, steps_per_epoch // 5)
eval_steps = max(1, steps_per_epoch // 5)

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute the classification accuracy for the Trainer.

    The accuracy is calculated directly with numpy, so the function does not
    depend on a metric object loaded through ``datasets.load_metric`` (an API
    that newer ``datasets`` releases no longer provide).

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has one row of class
            scores per example (or is a tuple whose first element has);
            ``labels`` holds the reference class indices.

    Returns:
        ``{"accuracy": value}`` with the fraction of correct predictions as a
        float; 0.0 if there are no examples.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Several model outputs were collected; the logits are the first one.
        logits = logits[0]

    predicted_classes = np.asarray(logits).argmax(axis=-1)
    references = np.asarray(labels)
    if references.size == 0:
        return {"accuracy": 0.0}
    return {"accuracy": float((predicted_classes == references).mean())}

class SaveMetricsCallback(TrainerCallback):
    """Trainer callback that appends one comma-separated row per log event to ``log_file``."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append timestamp, step, training loss, validation loss and accuracy.

        Entries that are absent from ``logs`` become empty fields; a ``logs``
        value of None is ignored.
        """
        if logs is None:
            return

        with open(log_file, "a") as out:
            now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            current_step = state.global_step
            loss_train = logs.get("loss", "")
            loss_eval = logs.get("eval_loss", "")
            acc_eval = logs.get("eval_accuracy", "")
            out.write(f"{now},{current_step},{loss_train},{loss_eval},{acc_eval}\n")

class EarlyStoppingCallback(TrainerCallback):
    """End training after too many evaluations without a better ``eval_loss``.

    Args:
        early_stopping_patience: How many evaluations in a row may pass
            without improvement before training is stopped.
        early_stopping_threshold: Amount by which ``eval_loss`` has to drop
            below the best value so far to count as an improvement.
    """

    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Track the best ``eval_loss`` and set the stop flag when patience is used up."""
        latest = kwargs.get("metrics", {}).get("eval_loss")
        if latest is None:
            return

        no_reference_yet = self.best_metric is None
        if no_reference_yet or latest < self.best_metric - self.early_stopping_threshold:
            self.best_metric, self.patience_counter = latest, 0
        else:
            self.patience_counter = self.patience_counter + 1

        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Custom classification head with mean pooling
class CustomWhisperForSequenceClassification(torch.nn.Module):
    """Whisper encoder followed by average pooling and a linear classification layer.

    The Whisper model is created from ``config`` only; no pretrained weights
    are loaded by this class.

    Args:
        config: Configuration providing ``d_model`` (classifier input width)
            and ``num_labels`` (number of classes).
    """

    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size, self.num_labels = config.d_model, config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        """Classify a batch of input features.

        Args:
            input_values: Batch of Whisper input features for the encoder.
            attention_mask: Accepted for interface compatibility; not used.
            labels: Optional class indices used to compute the loss.

        Returns:
            ``(loss, logits)`` if ``labels`` is given, else the 1-tuple ``(logits,)``.
        """
        frames = self.whisper.encoder(input_values).last_hidden_state

        # The pooling layer averages over the last axis, hence the transpose
        # that moves the time axis to the end.
        pooled = self.pooling(frames.transpose(1, 2)).squeeze(-1)
        if pooled.dim() == 1:
            # Restore a batch axis in case the pooled tensor came out one-dimensional.
            pooled = pooled.unsqueeze(0)

        logits = self.classifier(pooled)
        if labels is None:
            return (logits,)
        return (F.cross_entropy(logits, labels), logits)

# Custom data collator for Whisper
class DataCollatorForWhisper:
    """Collate dataset items into one batch for the Whisper classifier."""

    def __call__(self, features):
        """Stack the per-item tensors and gather the labels.

        Args:
            features: Sequence of dicts with the keys ``"input_values"``
                (tensor) and ``"labels"`` (class index).

        Returns:
            Dict with ``"input_values"`` (items stacked along a new first
            axis) and ``"labels"`` (``torch.long`` tensor).
        """
        batch_inputs = []
        batch_labels = []
        for sample in features:
            batch_inputs.append(sample["input_values"])
            batch_labels.append(sample["labels"])

        return {
            "input_values": torch.stack(batch_inputs),
            "labels": torch.tensor(batch_labels, dtype=torch.long),
        }

# Extend the Trainer class
class CustomTrainer(Trainer):
    """Trainer that additionally keeps the best model by loss and by accuracy.

    After every evaluation the current model is saved to
    ``best_loss_model_dir`` if the loss reached a new minimum and to
    ``best_accuracy_model_dir`` if the accuracy reached a new maximum. The
    best values are tracked per trainer instance.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.best_loss_model_dir = "./results/best_model_loss_2layer_versuch2"
        self.best_accuracy_model_dir = "./results/best_model_accuracy_versuch2"
        os.makedirs(self.best_loss_model_dir, exist_ok=True)
        os.makedirs(self.best_accuracy_model_dir, exist_ok=True)
        self.best_eval_loss = float("inf")
        self.best_eval_accuracy = 0.0

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Run the regular evaluation and save the model when a metric improved.

        Returns:
            The metrics dictionary produced by ``Trainer.evaluate``.
        """
        eval_metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)

        # The metric names follow the prefix passed by the caller, and a metric
        # may be absent altogether (e.g. no labels), so look them up defensively
        # instead of indexing fixed "eval_*" keys.
        current_eval_loss = eval_metrics.get(f"{metric_key_prefix}_loss")
        current_eval_accuracy = eval_metrics.get(f"{metric_key_prefix}_accuracy")

        if current_eval_loss is not None and current_eval_loss < self.best_eval_loss:
            self.best_eval_loss = current_eval_loss
            self.save_model(self.best_loss_model_dir)
            print(f"Saved best model according to eval_loss: {self.best_eval_loss}")

        if current_eval_accuracy is not None and current_eval_accuracy > self.best_eval_accuracy:
            self.best_eval_accuracy = current_eval_accuracy
            self.save_model(self.best_accuracy_model_dir)
            print(f"Saved best model according to eval_accuracy: {self.best_eval_accuracy}")

        return eval_metrics

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Forward ``input_values`` and ``labels`` to the model and return its loss.

        Args:
            model: The model being trained.
            inputs: Batch dict with the keys ``"input_values"`` and ``"labels"``.
            return_outputs: If True, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted because newer Trainer versions pass it
                as a keyword argument; not used here.
        """
        input_values = inputs.get("input_values")
        labels = inputs.get("labels")
        outputs = model(input_values=input_values, labels=labels)
        # The model returns (loss, logits) whenever labels are given.
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

def objective(trial):
    """Optuna objective: fine-tune a truncated Whisper encoder and return its validation loss.

    Args:
        trial: Optuna trial used to sample ``num_layers`` (1..24), the number
            of encoder layers that are kept.

    Returns:
        The ``eval_loss`` on ``validate_dataset`` after training.
    """
    # Suggest the number of layers
    num_layers = trial.suggest_int('num_layers', 1, 24)

    config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
    model = CustomWhisperForSequenceClassification(config)

    # The wrapper builds its Whisper model from the configuration alone, i.e.
    # with randomly initialised weights. Replace it with the pretrained model
    # so that the trial really fine-tunes Whisper instead of training from scratch.
    model.whisper = WhisperModel.from_pretrained("openai/whisper-large")

    # Keep only the first num_layers encoder layers. The truncation is done on
    # the module list itself, which is what determines the layers that are run.
    model.whisper.encoder.layers = torch.nn.ModuleList(model.whisper.encoder.layers[:num_layers])

    # Transfer the model to the correct device
    model = model.to(device)

    training_args = TrainingArguments(
        output_dir="./results",
        group_by_length=False,
        per_device_train_batch_size=batch_size,
        evaluation_strategy="steps",
        num_train_epochs=3,
        save_steps=logging_steps,
        eval_steps=eval_steps,
        logging_steps=logging_steps,
        learning_rate=1e-5,
        save_total_limit=2,
        no_cuda=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # lower eval_loss is better
        save_strategy="steps"  # or "epoch" if you prefer to save every epoch
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validate_dataset,
        data_collator=DataCollatorForWhisper(),
        tokenizer=processor,
        compute_metrics=compute_metrics,
        callbacks=[SaveMetricsCallback(), EarlyStoppingCallback(early_stopping_patience=50)]
    )

    trainer.train()

    # Evaluate the model; the validation loss is the value Optuna minimises
    metrics = trainer.evaluate(validate_dataset)
    return metrics['eval_loss']

# Run the search: the objective returns the validation loss, which is minimised.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=11)

# Keep the best trial under its own name. The loop over all trials below
# rebinds `trial`, so reusing that name for the final printout would report
# the last trial instead of the best one.
best_trial = study.best_trial

result_file = os.path.join(log_dir, "OptunaResult.txt")
with open(result_file, "w") as f:
    f.write("Best trial:\n")
    f.write(f"  Value: {best_trial.value}\n")
    f.write("  Params:\n")
    for key, value in best_trial.params.items():
        f.write(f"    {key}: {value}\n")

    f.write("\nAll trials:\n")
    for i, trial in enumerate(study.trials):
        f.write(f"Trial {i}:\n")
        f.write(f"  Value: {trial.value}\n")
        f.write("  Params:\n")
        for key, value in trial.params.items():
            f.write(f"    {key}: {value}\n")
        f.write("\n")

    f.write("Operation finished.\n")

print("Best trial:")
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


Now we extract the hidden states from the fine-tuned model.

In [None]:
import os
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperConfig, WhisperModel
from safetensors.torch import load_file as safe_load
from tqdm import tqdm
import librosa

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the custom classification head with mean pooling for Whisper
class CustomWhisperForSequenceClassification(torch.nn.Module):
    """Whisper encoder followed by average pooling and a linear classification layer.

    The Whisper model is created from ``config`` only; weights are expected to
    be loaded into the instance afterwards.

    Args:
        config: Configuration providing ``d_model`` (classifier input width)
            and ``num_labels`` (number of classes).
    """

    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size = config.d_model
        self.num_labels = config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        """Classify a batch of input features.

        Args:
            input_values: Batch of Whisper input features for the encoder.
            attention_mask: Accepted for interface compatibility; not used.
            labels: Optional class indices used to compute the loss.

        Returns:
            ``(loss, logits)`` if ``labels`` is given, else the 1-tuple ``(logits,)``.
        """
        # Pass input through Whisper encoder
        encoder_outputs = self.whisper.encoder(input_values)
        hidden_states = encoder_outputs.last_hidden_state

        # The pooling layer averages over the last axis, hence the transpose
        # that moves the time axis to the end.
        pooled_output = self.pooling(hidden_states.transpose(1, 2)).squeeze(-1)

        # Ensure the pooled output has a batch axis
        if pooled_output.dim() == 1:
            pooled_output = pooled_output.unsqueeze(0)

        # Pass through classifier
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Referenced through the torch package: this cell does not import
            # the alias `F`, so using it here would fail on a fresh kernel.
            loss = torch.nn.functional.cross_entropy(logits, labels)

        return (loss, logits) if loss is not None else (logits,)

# Path to the fine-tuned model weights file
model_path = "/home/rag/experimental_trial/results/best_model_loss_whisper_110/model.safetensors"

# Load the processor of the pre-trained Whisper large model
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Load the model configuration
config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=110)  # Adjust num_labels as needed

# Initialize the custom model with the configuration (weights are random at this point)
model = CustomWhisperForSequenceClassification(config)

# Load the fine-tuned weights from the safetensors file; the keys have to
# match the architecture built above.
state_dict = safe_load(model_path)
model.load_state_dict(state_dict)
model.to(device)
# A module created through its constructor starts in training mode. The model
# is only used for feature extraction here, so switch it to evaluation mode.
model.eval()

def check_directories_exist(directory, layer_indices):
    """Tell whether ``directory`` already holds a ``layer_<index>`` entry for each index.

    Args:
        directory: Base directory expected to contain the layer folders.
        layer_indices: Iterable of layer indices to look for.

    Returns:
        False at the first missing ``layer_<index>`` path, True otherwise
        (including for an empty ``layer_indices``).
    """
    for layer in layer_indices:
        if not os.path.exists(os.path.join(directory, f"layer_{layer}")):
            return False
    return True

def load_audio_files(input_directory, output_directory, layer_indices=(-1,)):
    """Extract mean-pooled encoder hidden states of the fine-tuned model for every MP3 file.

    Each ``*.mp3`` file in ``input_directory`` is loaded at 16 kHz, converted
    to Whisper input features with the module-level ``processor`` and passed
    through ``model.whisper.encoder``. For every requested layer the hidden
    state is averaged over dimension 1 (the time axis) and written to
    ``<output_directory>/layer_<index>/<file stem>_layer_<index>.npy``.

    Args:
        input_directory: Directory that is scanned (non-recursively) for MP3 files.
        output_directory: Directory that receives one ``layer_<index>`` folder
            per requested layer.
        layer_indices: Iterable of indices into ``outputs.hidden_states``.
            Defaults to the last entry only.

    Returns:
        None. The results are written to disk.
    """
    # Materialise once: a one-shot iterable (e.g. a generator) would otherwise
    # be exhausted after the first file and all later files would be skipped.
    layer_indices = list(layer_indices)
    # Layer folders that were already created during this call, so that
    # os.makedirs is not repeated for every single file.
    prepared_dirs = {}

    # Sorted so that the processing order is reproducible between runs.
    for filename in tqdm(sorted(os.listdir(input_directory))):
        if not filename.endswith(".mp3"):
            continue

        file_path = os.path.join(input_directory, filename)
        audio, sr = librosa.load(file_path, sr=16000)
        inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
        input_values = inputs["input_features"].to(device)
        stem = os.path.splitext(filename)[0]

        with torch.no_grad():
            outputs = model.whisper.encoder(input_values, output_hidden_states=True)
            for index in layer_indices:
                # Mean pooling over the time dimension.
                pooled = outputs.hidden_states[index].mean(dim=1)

                layer_dir = prepared_dirs.get(index)
                if layer_dir is None:
                    # Folders are created lazily, i.e. only once there is
                    # something to store in them.
                    layer_dir = os.path.join(output_directory, f"layer_{index}")
                    os.makedirs(layer_dir, exist_ok=True)
                    prepared_dirs[index] = layer_dir

                save_path = os.path.join(layer_dir, f"{stem}_layer_{index}.npy")
                np.save(save_path, pooled.cpu().numpy())

def process_audio_directory(input_base_directory, output_base_directory, layer_indices=range(25)):
    """Extract representations for all sub-directories that still lack output.

    Each directory found directly inside ``input_base_directory`` gets a
    directory of the same name under ``output_base_directory``. It is handed
    to ``load_audio_files`` unless ``check_directories_exist`` finds all
    ``layer_<index>`` folders in that output directory already.

    Args:
        input_base_directory: Directory whose sub-directories contain the audio files.
        output_base_directory: Directory that receives one sub-directory per input one.
        layer_indices: Layer indices forwarded to the existence check and the extraction.
    """
    subdirectory_names = [
        name
        for name in os.listdir(input_base_directory)
        if os.path.isdir(os.path.join(input_base_directory, name))
    ]
    for name in subdirectory_names:
        destination = os.path.join(output_base_directory, name)
        if not check_directories_exist(destination, layer_indices):
            load_audio_files(os.path.join(input_base_directory, name), destination, layer_indices)

# Same source tree as in the first extraction cell; the results of the
# fine-tuned model go to a separate output tree.
# NOTE(review): both paths are absolute and contain no "~", so os.path.expanduser
# has nothing to expand here.
input_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_backup")
output_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_whisper_finetuned2")
# Called with the default layer_indices=range(25), i.e. hidden-state indices 0..24.
process_audio_directory(input_directory_path, output_directory_path)