In [2]:
import os
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperModel
from tqdm import tqdm
import librosa

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the processor and model for Whisper
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperModel.from_pretrained("openai/whisper-large", output_hidden_states=True)
model.to(device)

def check_directories_exist(directory, layer_indices):
    """Prüft, ob die benötigten Verzeichnisse für jede Schicht bereits existieren."""
    all_exist = True
    for index in layer_indices:
        layer_dir = os.path.join(directory, f"layer_{index}")
        if not os.path.exists(layer_dir):
            all_exist = False
            break
    return all_exist

def load_audio_files(input_directory, output_directory, layer_indices=[-1]):
    """Lädt alle MP3-Dateien im angegebenen Verzeichnis und extrahiert die Repräsentationen aus den spezifizierten Schichten."""
    for filename in tqdm(os.listdir(input_directory)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(input_directory, filename)
            audio, sr = librosa.load(file_path, sr=16000)
            inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs["input_features"].to(device)
            
            with torch.no_grad():
                outputs = model.encoder(input_values)
                for index in layer_indices:
                    hidden_states = outputs.hidden_states[index]
                    mean_pooled_hidden_states = hidden_states.mean(dim=1)  # Mean Pooling über die Zeitdimension
                    # creating sub directory for each layer in output directory
                    layer_dir = os.path.join(output_directory, f"layer_{index}")
                    os.makedirs(layer_dir, exist_ok=True)
                    save_path = os.path.join(layer_dir, f"{os.path.splitext(filename)[0]}_layer_{index}.npy")
                    np.save(save_path, mean_pooled_hidden_states.cpu().numpy())

def process_audio_directory(input_base_directory, output_base_directory, layer_indices=range(25)):
    """Verarbeitet Audio-Dateien in den angegebenen Verzeichnissen und speichert die Ergebnisse im Zielverzeichnis."""
    for d in os.listdir(input_base_directory):
        input_dir_path = os.path.join(input_base_directory, d)
        output_dir_path = os.path.join(output_base_directory, d)
        if os.path.isdir(input_dir_path) and not check_directories_exist(output_dir_path, layer_indices):
            load_audio_files(input_dir_path, output_dir_path, layer_indices)

input_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_backup")
output_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_whisper")
process_audio_directory(input_directory_path, output_directory_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import os
import sys
import torch
import torch.nn as nn
import librosa
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from transformers import WhisperProcessor, WhisperModel, Trainer, TrainingArguments, TrainerCallback, WhisperConfig
import math
from datasets import load_metric
from datetime import datetime

class WhisperForSequenceClassification(nn.Module):
    def __init__(self, config):
        super(WhisperForSequenceClassification, self).__init__()
        self.whisper = WhisperModel.from_pretrained("openai/whisper-large")
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_values, labels=None):
        outputs = self.whisper(input_values).last_hidden_state
        pooled_output = outputs.mean(dim=1)  # Mean Pooling über die Zeitdimension
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        
        return (loss, logits) if loss is not None else logits

# Initialize the processor and model for Whisper
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class using pandas
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        
        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            audio = self._pad_or_truncate(audio, max_length=16000)
            input_values = self.processor(audio, sampling_rate=16000, return_tensors="pt").input_values.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            print(f"Error loading {file_path}: {e}", file=sys.stderr)
            return self.__getitem__((idx + 1) % len(self))

    def _pad_or_truncate(self, audio, max_length):
        if len(audio) < max_length:
            pad_size = max_length - len(audio)
            audio = np.pad(audio, (0, pad_size), 'constant', constant_values=(0, 0))
        else:
            audio = audio[:max_length]
        return audio

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train')
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate')
test_dataset = LocalAudioDataset(csv_file, processor, 'test')

num_speakers = len(train_dataset.speaker_ids)
config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
model = WhisperForSequenceClassification(config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

validate_labels(train_dataset)
validate_labels(validate_dataset)
validate_labels(test_dataset)

batch_size = 8
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_100_epochs_5_layer{timestamp}.csv")
with open(log_file, "w") as f:
    f.write("Timestamp,Step,Training Loss,Validation Loss,Accuracy\n")

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

training_args = TrainingArguments(
    output_dir="./results",
    group_by_length=True,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="steps",
    num_train_epochs=100,
    save_steps=logging_steps,
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    learning_rate=5e-6,
    save_total_limit=2,
    no_cuda=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_strategy="steps"  # or "epoch" if you prefer to save every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
    callbacks=[SaveMetricsCallback(), EarlyStoppingCallback()]
)

trainer.train()

metrics = trainer.evaluate(test_dataset)

print(f"Test set evaluation metrics: {metrics}")
print("Training and evaluation completed successfully!")

best_model_dir = "./results/best_model_100_epochs_5_layer"
os.makedirs(best_model_dir, exist_ok=True)

trainer.save_model(best_model_dir)
processor.save_pretrained(best_model_dir)

print(f"Best model saved to {best_model_dir}")

ERROR! Session/line number was not unique in database. History logging moved to new session 287


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded 111 speakers: {'speaker_6': 0, 'speaker_156': 1, 'speaker_22': 2, 'speaker_19': 3, 'speaker_91': 4, 'speaker_27': 5, 'speaker_94': 6, 'speaker_34': 7, 'speaker_97': 8, 'speaker_100': 9, 'speaker_36': 10, 'speaker_128': 11, 'speaker_134': 12, 'speaker_68': 13, 'speaker_9': 14, 'speaker_17': 15, 'speaker_73': 16, 'speaker_42': 17, 'speaker_52': 18, 'speaker_151': 19, 'speaker_150': 20, 'speaker_141': 21, 'speaker_82': 22, 'speaker_130': 23, 'speaker_75': 24, 'speaker_58': 25, 'speaker_74': 26, 'speaker_104': 27, 'speaker_47': 28, 'speaker_135': 29, 'speaker_71': 30, 'speaker_83': 31, 'speaker_116': 32, 'speaker_99': 33, 'speaker_108': 34, 'speaker_31': 35, 'speaker_106': 36, 'speaker_28': 37, 'speaker_65': 38, 'speaker_48': 39, 'speaker_49': 40, 'speaker_53': 41, 'speaker_3': 42, 'speaker_63': 43, 'speaker_138': 44, 'speaker_98': 45, 'speaker_92': 46, 'speaker_123': 47, 'speaker_32': 48, 'speaker_10': 49, 'speaker_155': 50, 'speaker_153': 51, 'speaker_23': 52, 'speaker_59': 53, 's



Using device: cuda


Error loading /home/rag/experimental_trial/data/finetuning_dataset_large/speaker_6/train/8dec8422ff907ed644c5af9c3f985dc9bc1a6d3e4530d6d6a8e235ed6e925df3ed4953c758fc83fc43686c8b25c88961eed6e938476a53810c1f605fc8364a5b.mp3: 
Error loading /home/rag/experimental_trial/data/finetuning_dataset_large/speaker_6/train/87ca082c21113b4a0a6929e51905d4656de77b9a5f5e109c4dd0611aee245f7043a806b6937edbc0017e0e51be91e3ff42aac1726c841dc681a7703d7105d365.mp3: 
Error loading /home/rag/experimental_trial/data/finetuning_dataset_large/speaker_6/train/ba90490e7cfd09c03c40a882dbe9872644db603990895296a0a6ac8caeb3bd7999e45eeb76357364279531345d69f29144bb01b1926251d9945a8b708129b763.mp3: 
Error loading /home/rag/experimental_trial/data/finetuning_dataset_large/speaker_6/train/8d098ff5e6d3bc51b9d3c66730225f6ce0690b033f972af7aa0a96caf43952632522875b5fc20adc6b7c65bd8be4194e01ce2e3059ebe56390744eaaaed271a2.mp3: 
Error loading /home/rag/experimental_trial/data/finetuning_dataset_large/speaker_6/train/102bd870ea99826

# hyperparam tuning for whisper

In [1]:
import optuna
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import Trainer, TrainingArguments, TrainerCallback, WhisperConfig, WhisperModel, WhisperProcessor
import pandas as pd
import librosa
import numpy as np
import os
import sys
import math
from datasets import load_metric
from datetime import datetime
import logging
import torch.nn.functional as F

# Set up logging for Optuna
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()

# Load the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset, noise_factor=0.0, max_speakers=50):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        
        # Limit the number of speakers to max_speakers
        speaker_counts = self.data['label'].value_counts()
        top_speakers = speaker_counts.nlargest(max_speakers).index
        self.data = self.data[self.data['label'].isin(top_speakers)]
        
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor
        
        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            # Use the processor to extract features
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs.input_features.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            print(f"Error loading {file_path}: {e}", file=sys.stderr)
            return self.__getitem__((idx + 1) % len(self))

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train', noise_factor=0, max_speakers=111)
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate', max_speakers=111)
test_dataset = LocalAudioDataset(csv_file, processor, 'test', max_speakers=111)

num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

batch_size = 2
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_versuch2_2layer{timestamp}.csv")
with open(log_file, "w") as f:
    f.write("Timestamp,Step,Training Loss,Validation Loss,Accuracy\n")

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Custom classification head with mean pooling
class CustomWhisperForSequenceClassification(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size = config.d_model
        self.num_labels = config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        # Pass input through Whisper encoder
        encoder_outputs = self.whisper.encoder(input_values)
        hidden_states = encoder_outputs.last_hidden_state
        
        # Apply pooling
        pooled_output = self.pooling(hidden_states.transpose(1, 2)).squeeze(-1)
        
        # Ensure the pooled output has the correct shape
        if pooled_output.dim() == 1:
            pooled_output = pooled_output.unsqueeze(0)
        
        # Pass through classifier
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return (loss, logits) if loss is not None else (logits,)

# Custom data collator for Whisper
class DataCollatorForWhisper:
    def __call__(self, features):
        input_values = torch.stack([f["input_values"] for f in features])
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
        return {"input_values": input_values, "labels": labels}

# Extend the Trainer class
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.best_loss_model_dir = "./results/best_model_loss_2layer_versuch2"
        self.best_accuracy_model_dir = "./results/best_model_accuracy_versuch2"
        os.makedirs(self.best_loss_model_dir, exist_ok=True)
        os.makedirs(self.best_accuracy_model_dir, exist_ok=True)
        self.best_eval_loss = float("inf")
        self.best_eval_accuracy = 0.0

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        
        current_eval_loss = eval_metrics["eval_loss"]
        current_eval_accuracy = eval_metrics["eval_accuracy"]
        
        if current_eval_loss < self.best_eval_loss:
            self.best_eval_loss = current_eval_loss
            self.save_model(self.best_loss_model_dir)
            print(f"Saved best model according to eval_loss: {self.best_eval_loss}")

        if current_eval_accuracy > self.best_eval_accuracy:
            self.best_eval_accuracy = current_eval_accuracy
            self.save_model(self.best_accuracy_model_dir)
            print(f"Saved best model according to eval_accuracy: {self.best_eval_accuracy}")

        return eval_metrics
    
    def compute_loss(self, model, inputs, return_outputs=False):
        input_values = inputs.get("input_values")
        labels = inputs.get("labels")
        outputs = model(input_values=input_values, labels=labels)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

def objective(trial):
    # Suggest the number of layers
    num_layers = trial.suggest_int('num_layers', 1, 24)
    
    # Load the model configuration with the suggested number of layers
    config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
    config.num_hidden_layers = num_layers
    model = CustomWhisperForSequenceClassification(config)
    
    # Apply the number of hidden layers correctly
    model.whisper.encoder.layers = torch.nn.ModuleList(model.whisper.encoder.layers[:num_layers])
    
    # Transfer the model to the correct device
    model = model.to(device)
    
    training_args = TrainingArguments(
        output_dir="./results",
        group_by_length=False,
        per_device_train_batch_size=batch_size,
        evaluation_strategy="steps",
        num_train_epochs=3,
        save_steps=logging_steps,
        eval_steps=eval_steps,
        logging_steps=logging_steps,
        learning_rate=1e-5,
        save_total_limit=2,
        no_cuda=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # lower eval_loss is better
        save_strategy="steps"  # or "epoch" if you prefer to save every epoch
    )
    
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validate_dataset,
        data_collator=DataCollatorForWhisper(),
        tokenizer=processor,
        compute_metrics=compute_metrics,
        callbacks=[SaveMetricsCallback(), EarlyStoppingCallback(early_stopping_patience=50)]
    )
    
    trainer.train()
    
    # Evaluate the model
    metrics = trainer.evaluate(validate_dataset)
    return metrics['eval_loss']

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=12)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
    
nicht dieses model sondern das undere laufen lassen



SyntaxError: invalid syntax (2902070393.py, line 269)

dieses optuna optimier skript ist das richtige


In [2]:
import optuna
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import Trainer, TrainingArguments, TrainerCallback, WhisperConfig, WhisperModel, WhisperProcessor
import pandas as pd
import librosa
import numpy as np
import os
import sys
import math
from datasets import load_metric
from datetime import datetime
import logging
import torch.nn.functional as F

# Set up logging for Optuna
log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_optuna_optim_whisper{timestamp}.csv")

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()

# Add file handler to logger
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Redirect Optuna logging to the file
optuna_logger = logging.getLogger("optuna")
optuna_logger.addHandler(file_handler)

# Load the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset, noise_factor=0.0, max_speakers=50):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        
        # Limit the number of speakers to max_speakers
        speaker_counts = self.data['label'].value_counts()
        top_speakers = speaker_counts.nlargest(max_speakers).index
        self.data = self.data[self.data['label'].isin(top_speakers)]
        
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor
        
        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            # Use the processor to extract features
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs.input_features.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            print(f"Error loading {file_path}: {e}", file=sys.stderr)
            return self.__getitem__((idx + 1) % len(self))

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train', noise_factor=0, max_speakers=111)
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate', max_speakers=111)
test_dataset = LocalAudioDataset(csv_file, processor, 'test', max_speakers=111)

num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

batch_size = 2
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Custom classification head with mean pooling
class CustomWhisperForSequenceClassification(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size = config.d_model
        self.num_labels = config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        # Pass input through Whisper encoder
        encoder_outputs = self.whisper.encoder(input_values)
        hidden_states = encoder_outputs.last_hidden_state
        
        # Apply pooling
        pooled_output = self.pooling(hidden_states.transpose(1, 2)).squeeze(-1)
        
        # Ensure the pooled output has the correct shape
        if pooled_output.dim() == 1:
            pooled_output = pooled_output.unsqueeze(0)
        
        # Pass through classifier
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return (loss, logits) if loss is not None else (logits,)

# Custom data collator for Whisper
class DataCollatorForWhisper:
    def __call__(self, features):
        input_values = torch.stack([f["input_values"] for f in features])
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
        return {"input_values": input_values, "labels": labels}

# Extend the Trainer class
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.best_loss_model_dir = "./results/best_model_loss_2layer_versuch2"
        self.best_accuracy_model_dir = "./results/best_model_accuracy_versuch2"
        os.makedirs(self.best_loss_model_dir, exist_ok=True)
        os.makedirs(self.best_accuracy_model_dir, exist_ok=True)
        self.best_eval_loss = float("inf")
        self.best_eval_accuracy = 0.0

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        
        current_eval_loss = eval_metrics["eval_loss"]
        current_eval_accuracy = eval_metrics["eval_accuracy"]
        
        if current_eval_loss < self.best_eval_loss:
            self.best_eval_loss = current_eval_loss
            self.save_model(self.best_loss_model_dir)
            print(f"Saved best model according to eval_loss: {self.best_eval_loss}")

        if current_eval_accuracy > self.best_eval_accuracy:
            self.best_eval_accuracy = current_eval_accuracy
            self.save_model(self.best_accuracy_model_dir)
            print(f"Saved best model according to eval_accuracy: {self.best_eval_accuracy}")

        return eval_metrics
    
    def compute_loss(self, model, inputs, return_outputs=False):
        input_values = inputs.get("input_values")
        labels = inputs.get("labels")
        outputs = model(input_values=input_values, labels=labels)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

def objective(trial):
    # Suggest the number of layers
    num_layers = trial.suggest_int('num_layers', 1, 24)
    
    # Load the model configuration with the suggested number of layers
    config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
    config.num_hidden_layers = num_layers
    model = CustomWhisperForSequenceClassification(config)
    
    # Apply the number of hidden layers correctly
    model.whisper.encoder.layers = torch.nn.ModuleList(model.whisper.encoder.layers[:num_layers])
    
    # Transfer the model to the correct device
    model = model.to(device)
    
    training_args = TrainingArguments(
        output_dir="./results",
        group_by_length=False,
        per_device_train_batch_size=batch_size,
        evaluation_strategy="steps",
        num_train_epochs=3,
        save_steps=logging_steps,
        eval_steps=eval_steps,
        logging_steps=logging_steps,
        learning_rate=1e-5,
        save_total_limit=2,
        no_cuda=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # lower eval_loss is better
        save_strategy="steps"  # or "epoch" if you prefer to save every epoch
    )
    
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validate_dataset,
        data_collator=DataCollatorForWhisper(),
        tokenizer=processor,
        compute_metrics=compute_metrics,
        callbacks=[SaveMetricsCallback(), EarlyStoppingCallback(early_stopping_patience=50)]
    )
    
    trainer.train()
    
    # Evaluate the model
    metrics = trainer.evaluate(validate_dataset)
    return metrics['eval_loss']

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=11)

result_file = os.path.join(log_dir, "OptunaResult.txt")
with open(result_file, "w") as f:
    f.write("Best trial:\n")
    trial = study.best_trial
    f.write(f"  Value: {trial.value}\n")
    f.write("  Params:\n")
    for key, value in trial.params.items():
        f.write(f"    {key}: {value}\n")
    
    f.write("\nAll trials:\n")
    for i, trial in enumerate(study.trials):
        f.write(f"Trial {i}:\n")
        f.write(f"  Value: {trial.value}\n")
        f.write("  Params:\n")
        for key, value in trial.params.items():
            f.write(f"    {key}: {value}\n")
        f.write("\n")
    
    f.write("Operation finished.\n")

print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded 111 speakers: {'speaker_6': 0, 'speaker_156': 1, 'speaker_22': 2, 'speaker_19': 3, 'speaker_91': 4, 'speaker_27': 5, 'speaker_94': 6, 'speaker_34': 7, 'speaker_97': 8, 'speaker_100': 9, 'speaker_36': 10, 'speaker_128': 11, 'speaker_134': 12, 'speaker_68': 13, 'speaker_9': 14, 'speaker_17': 15, 'speaker_73': 16, 'speaker_42': 17, 'speaker_52': 18, 'speaker_151': 19, 'speaker_150': 20, 'speaker_141': 21, 'speaker_82': 22, 'speaker_130': 23, 'speaker_75': 24, 'speaker_58': 25, 'speaker_74': 26, 'speaker_104': 27, 'speaker_47': 28, 'speaker_135': 29, 'speaker_71': 30, 'speaker_83': 31, 'speaker_116': 32, 'speaker_99': 33, 'speaker_108': 34, 'speaker_31': 35, 'speaker_106': 36, 'speaker_28': 37, 'speaker_65': 38, 'speaker_48': 39, 'speaker_49': 40, 'speaker_53': 41, 'speaker_3': 42, 'speaker_63': 43, 'speaker_138': 44, 'speaker_98': 45, 'speaker_92': 46, 'speaker_123': 47, 'speaker_32': 48, 'speaker_10': 49, 'speaker_155': 50, 'speaker_153': 51, 'speaker_23': 52, 'speaker_59': 53, 's

  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[I 2024-06-20 22:42:21,885] A new study created in memory with name: no-name-cced385d-d227-4b3f-8daa-6538206ca835


Step,Training Loss,Validation Loss,Accuracy
888,4.9215,4.795257,0.017117
1776,4.5711,4.397316,0.024324
2664,4.3163,4.281329,0.030631
3552,4.169,4.050462,0.03964
4440,3.9001,3.75689,0.058559
5328,3.6513,3.500886,0.111712
6216,3.4518,3.270682,0.132432
7104,3.2813,3.143292,0.157658
7992,3.1554,2.969024,0.181081
8880,3.0101,2.893345,0.206306


Saved best model according to eval_loss: 4.795257091522217
Saved best model according to eval_accuracy: 0.017117117117117116
Saved best model according to eval_loss: 4.397315979003906
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.281329154968262
Saved best model according to eval_accuracy: 0.03063063063063063
Saved best model according to eval_loss: 4.050461769104004
Saved best model according to eval_accuracy: 0.03963963963963964
Saved best model according to eval_loss: 3.756890058517456
Saved best model according to eval_accuracy: 0.05855855855855856
Saved best model according to eval_loss: 3.5008862018585205
Saved best model according to eval_accuracy: 0.11171171171171171
Saved best model according to eval_loss: 3.2706823348999023
Saved best model according to eval_accuracy: 0.13243243243243244
Saved best model according to eval_loss: 3.143291711807251
Saved best model according to eval_accuracy: 0.15765765765765766
Save

[I 2024-06-21 02:57:07,878] Trial 0 finished with value: 2.5828609466552734 and parameters: {'num_layers': 16}. Best is trial 0 with value: 2.5828609466552734.


Step,Training Loss,Validation Loss,Accuracy
888,4.9261,4.800497,0.013514
1776,4.5748,4.387306,0.024324
2664,4.2983,4.243195,0.028829
3552,4.1159,3.99158,0.051351
4440,3.8305,3.685561,0.087387
5328,3.5531,3.389331,0.114414
6216,3.3323,3.185437,0.146847
7104,3.1497,3.015546,0.163063
7992,3.046,2.885855,0.213514
8880,2.9214,2.809806,0.233333


Saved best model according to eval_loss: 4.800497055053711
Saved best model according to eval_accuracy: 0.013513513513513514
Saved best model according to eval_loss: 4.387305736541748
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.243194580078125
Saved best model according to eval_accuracy: 0.02882882882882883
Saved best model according to eval_loss: 3.991579532623291
Saved best model according to eval_accuracy: 0.051351351351351354
Saved best model according to eval_loss: 3.685560941696167
Saved best model according to eval_accuracy: 0.08738738738738738
Saved best model according to eval_loss: 3.389331340789795
Saved best model according to eval_accuracy: 0.11441441441441441
Saved best model according to eval_loss: 3.185436964035034
Saved best model according to eval_accuracy: 0.14684684684684685
Saved best model according to eval_loss: 3.0155463218688965
Saved best model according to eval_accuracy: 0.16306306306306306
Save

[I 2024-06-21 08:36:40,952] Trial 1 finished with value: 2.534303665161133 and parameters: {'num_layers': 22}. Best is trial 1 with value: 2.534303665161133.


Step,Training Loss,Validation Loss,Accuracy
888,4.9213,4.799598,0.009009
1776,4.5894,4.391996,0.021622
2664,4.3045,4.28029,0.025225
3552,4.1382,4.032524,0.043243
4440,3.9195,3.789524,0.056757
5328,3.6747,3.469192,0.110811
6216,3.4355,3.277332,0.133333
7104,3.2409,3.085491,0.165766
7992,3.1034,2.933263,0.213514
8880,2.9645,2.83448,0.222523


Saved best model according to eval_loss: 4.79959774017334
Saved best model according to eval_accuracy: 0.009009009009009009
Saved best model according to eval_loss: 4.391996383666992
Saved best model according to eval_accuracy: 0.021621621621621623
Saved best model according to eval_loss: 4.280289649963379
Saved best model according to eval_accuracy: 0.025225225225225224
Saved best model according to eval_loss: 4.032524108886719
Saved best model according to eval_accuracy: 0.043243243243243246
Saved best model according to eval_loss: 3.7895243167877197
Saved best model according to eval_accuracy: 0.05675675675675676
Saved best model according to eval_loss: 3.4691920280456543
Saved best model according to eval_accuracy: 0.11081081081081082
Saved best model according to eval_loss: 3.277332305908203
Saved best model according to eval_accuracy: 0.13333333333333333
Saved best model according to eval_loss: 3.085491418838501
Saved best model according to eval_accuracy: 0.16576576576576577
Sav

[I 2024-06-21 13:32:02,344] Trial 2 finished with value: 2.538898229598999 and parameters: {'num_layers': 19}. Best is trial 1 with value: 2.534303665161133.


Step,Training Loss,Validation Loss,Accuracy
888,4.86,4.581317,0.021622
1776,4.4344,4.344041,0.025225
2664,4.287,4.241272,0.034234
3552,4.153,4.057652,0.040541
4440,3.8047,3.586735,0.091892
5328,3.4403,3.244352,0.163063
6216,3.1788,3.035362,0.177477
7104,2.9916,2.838237,0.241441
7992,2.855,2.72471,0.272072
8880,2.7145,2.609525,0.303604


Saved best model according to eval_loss: 4.581316947937012
Saved best model according to eval_accuracy: 0.021621621621621623
Saved best model according to eval_loss: 4.344041347503662
Saved best model according to eval_accuracy: 0.025225225225225224
Saved best model according to eval_loss: 4.24127197265625
Saved best model according to eval_accuracy: 0.03423423423423423
Saved best model according to eval_loss: 4.057651996612549
Saved best model according to eval_accuracy: 0.04054054054054054
Saved best model according to eval_loss: 3.5867345333099365
Saved best model according to eval_accuracy: 0.0918918918918919
Saved best model according to eval_loss: 3.244352102279663
Saved best model according to eval_accuracy: 0.16306306306306306
Saved best model according to eval_loss: 3.035362482070923
Saved best model according to eval_accuracy: 0.17747747747747747
Saved best model according to eval_loss: 2.8382370471954346
Saved best model according to eval_accuracy: 0.24144144144144145
Saved 

[I 2024-06-21 14:48:57,848] Trial 3 finished with value: 2.32712721824646 and parameters: {'num_layers': 4}. Best is trial 3 with value: 2.32712721824646.


Step,Training Loss,Validation Loss,Accuracy
888,4.923,4.772852,0.013514
1776,4.5406,4.373076,0.024324
2664,4.3054,4.26358,0.032432
3552,4.1526,4.04679,0.042342
4440,3.9263,3.792199,0.057658
5328,3.6591,3.46039,0.126126
6216,3.3905,3.278624,0.127027
7104,3.1767,3.043748,0.172973
7992,3.0583,2.885319,0.223423
8880,2.9282,2.819576,0.225225


Saved best model according to eval_loss: 4.772852420806885
Saved best model according to eval_accuracy: 0.013513513513513514
Saved best model according to eval_loss: 4.37307596206665
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.263580322265625
Saved best model according to eval_accuracy: 0.032432432432432434
Saved best model according to eval_loss: 4.04679012298584
Saved best model according to eval_accuracy: 0.04234234234234234
Saved best model according to eval_loss: 3.792198896408081
Saved best model according to eval_accuracy: 0.05765765765765766
Saved best model according to eval_loss: 3.460390329360962
Saved best model according to eval_accuracy: 0.12612612612612611
Saved best model according to eval_loss: 3.2786242961883545
Saved best model according to eval_accuracy: 0.12702702702702703
Saved best model according to eval_loss: 3.043747901916504
Saved best model according to eval_accuracy: 0.17297297297297298
Saved 

[I 2024-06-21 18:59:01,174] Trial 4 finished with value: 2.526242733001709 and parameters: {'num_layers': 16}. Best is trial 3 with value: 2.32712721824646.


Best trial:
  Value: 2.526242733001709
  Params: 
    num_layers: 16


eval zweiter versuch anne

In [3]:
import optuna
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import Trainer, TrainingArguments, TrainerCallback, WhisperConfig, WhisperModel, WhisperProcessor
import pandas as pd
import librosa
import numpy as np
import os
import sys
import math
from datasets import load_metric
from datetime import datetime
import logging
import torch.nn.functional as F

# Set up logging for Optuna
log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_optuna_optim_whisper{timestamp}.csv")

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()

# Add file handler to logger
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Redirect Optuna logging to the file
optuna_logger = logging.getLogger("optuna")
optuna_logger.addHandler(file_handler)

# Load the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset, noise_factor=0.0, max_speakers=50):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        
        # Limit the number of speakers to max_speakers
        speaker_counts = self.data['label'].value_counts()
        top_speakers = speaker_counts.nlargest(max_speakers).index
        self.data = self.data[self.data['label'].isin(top_speakers)]
        
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor
        
        print(f"Loaded {len(self.speaker_ids)} speakers: {self.speaker_ids}")
        print(f"Total files in {subset}: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            # Use the processor to extract features
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs.input_features.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            print(f"Error loading {file_path}: {e}", file=sys.stderr)
            return self.__getitem__((idx + 1) % len(self))

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train', noise_factor=0, max_speakers=111)
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate', max_speakers=111)
test_dataset = LocalAudioDataset(csv_file, processor, 'test', max_speakers=111)

num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

batch_size = 2
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Custom classification head with mean pooling
class CustomWhisperForSequenceClassification(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size = config.d_model
        self.num_labels = config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        # Pass input through Whisper encoder
        encoder_outputs = self.whisper.encoder(input_values)
        hidden_states = encoder_outputs.last_hidden_state
        
        # Apply pooling
        pooled_output = self.pooling(hidden_states.transpose(1, 2)).squeeze(-1)
        
        # Ensure the pooled output has the correct shape
        if pooled_output.dim() == 1:
            pooled_output = pooled_output.unsqueeze(0)
        
        # Pass through classifier
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return (loss, logits) if loss is not None else (logits,)

# Custom data collator for Whisper
class DataCollatorForWhisper:
    def __call__(self, features):
        input_values = torch.stack([f["input_values"] for f in features])
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
        return {"input_values": input_values, "labels": labels}

# Extend the Trainer class
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.best_loss_model_dir = "./results/best_model_loss_2layer_versuch2"
        self.best_accuracy_model_dir = "./results/best_model_accuracy_versuch2"
        os.makedirs(self.best_loss_model_dir, exist_ok=True)
        os.makedirs(self.best_accuracy_model_dir, exist_ok=True)
        self.best_eval_loss = float("inf")
        self.best_eval_accuracy = 0.0

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        
        current_eval_loss = eval_metrics["eval_loss"]
        current_eval_accuracy = eval_metrics["eval_accuracy"]
        
        if current_eval_loss < self.best_eval_loss:
            self.best_eval_loss = current_eval_loss
            self.save_model(self.best_loss_model_dir)
            print(f"Saved best model according to eval_loss: {self.best_eval_loss}")

        if current_eval_accuracy > self.best_eval_accuracy:
            self.best_eval_accuracy = current_eval_accuracy
            self.save_model(self.best_accuracy_model_dir)
            print(f"Saved best model according to eval_accuracy: {self.best_eval_accuracy}")

        return eval_metrics
    
    def compute_loss(self, model, inputs, return_outputs=False):
        input_values = inputs.get("input_values")
        labels = inputs.get("labels")
        outputs = model(input_values=input_values, labels=labels)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

def objective(trial):
    # Suggest the number of layers
    num_layers = trial.suggest_int('num_layers', 1, 24)
    
    # Load the model configuration with the suggested number of layers
    config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
    config.num_hidden_layers = num_layers
    model = CustomWhisperForSequenceClassification(config)
    
    # Apply the number of hidden layers correctly
    model.whisper.encoder.layers = torch.nn.ModuleList(model.whisper.encoder.layers[:num_layers])
    
    # Transfer the model to the correct device
    model = model.to(device)
    
    training_args = TrainingArguments(
        output_dir="./results",
        group_by_length=False,
        per_device_train_batch_size=batch_size,
        evaluation_strategy="steps",
        num_train_epochs=3,
        save_steps=logging_steps,
        eval_steps=eval_steps,
        logging_steps=logging_steps,
        learning_rate=1e-5,
        save_total_limit=2,
        no_cuda=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # lower eval_loss is better
        save_strategy="steps"  # or "epoch" if you prefer to save every epoch
    )
    
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validate_dataset,
        data_collator=DataCollatorForWhisper(),
        tokenizer=processor,
        compute_metrics=compute_metrics,
        callbacks=[SaveMetricsCallback(), EarlyStoppingCallback(early_stopping_patience=50)]
    )
    
    trainer.train()
    
    # Evaluate the model
    metrics = trainer.evaluate(validate_dataset)
    return metrics['eval_loss']

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=12)

result_file = os.path.join(log_dir, "OptunaResult.txt")
with open(result_file, "w") as f:
    f.write("Best trial:\n")
    trial = study.best_trial
    f.write(f"  Value: {trial.value}\n")
    f.write("  Params:\n")
    for key, value in trial.params.items():
        f.write(f"    {key}: {value}\n")
    
    f.write("\nAll trials:\n")
    for i, trial in enumerate(study.trials):
        f.write(f"Trial {i}:\n")
        f.write(f"  Value: {trial.value}\n")
        f.write("  Params:\n")
        for key, value in trial.params.items():
            f.write(f"    {key}: {value}\n")
        f.write("\n")
    
    f.write("Operation finished.\n")

print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded 111 speakers: {'speaker_6': 0, 'speaker_156': 1, 'speaker_22': 2, 'speaker_19': 3, 'speaker_91': 4, 'speaker_27': 5, 'speaker_94': 6, 'speaker_34': 7, 'speaker_97': 8, 'speaker_100': 9, 'speaker_36': 10, 'speaker_128': 11, 'speaker_134': 12, 'speaker_68': 13, 'speaker_9': 14, 'speaker_17': 15, 'speaker_73': 16, 'speaker_42': 17, 'speaker_52': 18, 'speaker_151': 19, 'speaker_150': 20, 'speaker_141': 21, 'speaker_82': 22, 'speaker_130': 23, 'speaker_75': 24, 'speaker_58': 25, 'speaker_74': 26, 'speaker_104': 27, 'speaker_47': 28, 'speaker_135': 29, 'speaker_71': 30, 'speaker_83': 31, 'speaker_116': 32, 'speaker_99': 33, 'speaker_108': 34, 'speaker_31': 35, 'speaker_106': 36, 'speaker_28': 37, 'speaker_65': 38, 'speaker_48': 39, 'speaker_49': 40, 'speaker_53': 41, 'speaker_3': 42, 'speaker_63': 43, 'speaker_138': 44, 'speaker_98': 45, 'speaker_92': 46, 'speaker_123': 47, 'speaker_32': 48, 'speaker_10': 49, 'speaker_155': 50, 'speaker_153': 51, 'speaker_23': 52, 'speaker_59': 53, 's

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[I 2024-06-22 09:19:34,521] A new study created in memory with name: no-name-18d4997e-cb0a-4b93-8ab9-23f4cbc2f49f


Step,Training Loss,Validation Loss,Accuracy
888,4.8938,4.672912,0.023423
1776,4.4691,4.340931,0.024324
2664,4.2772,4.216887,0.036937
3552,4.0738,3.885266,0.04955
4440,3.6529,3.464424,0.11982
5328,3.3229,3.143207,0.178378
6216,3.0987,2.957637,0.200901
7104,2.9408,2.77439,0.244144
7992,2.821,2.657195,0.281081
8880,2.6723,2.572147,0.288288


Saved best model according to eval_loss: 4.672911643981934
Saved best model according to eval_accuracy: 0.023423423423423424
Saved best model according to eval_loss: 4.340930938720703
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.216886520385742
Saved best model according to eval_accuracy: 0.036936936936936934
Saved best model according to eval_loss: 3.8852663040161133
Saved best model according to eval_accuracy: 0.04954954954954955
Saved best model according to eval_loss: 3.4644243717193604
Saved best model according to eval_accuracy: 0.11981981981981982
Saved best model according to eval_loss: 3.143206834793091
Saved best model according to eval_accuracy: 0.1783783783783784
Saved best model according to eval_loss: 2.957637071609497
Saved best model according to eval_accuracy: 0.2009009009009009
Saved best model according to eval_loss: 2.7743895053863525
Saved best model according to eval_accuracy: 0.24414414414414415
Save

[I 2024-06-22 11:04:32,442] Trial 0 finished with value: 2.2446231842041016 and parameters: {'num_layers': 6}. Best is trial 0 with value: 2.2446231842041016.


Step,Training Loss,Validation Loss,Accuracy
888,4.833,4.535194,0.025225
1776,4.4151,4.32624,0.023423
2664,4.2736,4.210543,0.033333
3552,4.0934,3.889349,0.047748
4440,3.6672,3.439522,0.11982
5328,3.3025,3.103096,0.198198
6216,3.0429,2.905648,0.21982
7104,2.8644,2.750836,0.259459
7992,2.7705,2.628991,0.297297
8880,2.6365,2.545605,0.304505


Saved best model according to eval_loss: 4.535193920135498
Saved best model according to eval_accuracy: 0.025225225225225224
Saved best model according to eval_loss: 4.326239585876465
Saved best model according to eval_loss: 4.210543155670166
Saved best model according to eval_accuracy: 0.03333333333333333
Saved best model according to eval_loss: 3.8893485069274902
Saved best model according to eval_accuracy: 0.047747747747747746
Saved best model according to eval_loss: 3.4395222663879395
Saved best model according to eval_accuracy: 0.11981981981981982
Saved best model according to eval_loss: 3.1030964851379395
Saved best model according to eval_accuracy: 0.1981981981981982
Saved best model according to eval_loss: 2.9056482315063477
Saved best model according to eval_accuracy: 0.21981981981981982
Saved best model according to eval_loss: 2.75083589553833
Saved best model according to eval_accuracy: 0.2594594594594595
Saved best model according to eval_loss: 2.62899112701416
Saved best m

[I 2024-06-22 12:05:45,995] Trial 1 finished with value: 2.2891428470611572 and parameters: {'num_layers': 3}. Best is trial 0 with value: 2.2446231842041016.


Step,Training Loss,Validation Loss,Accuracy
888,4.9249,4.785599,0.015315
1776,4.5338,4.365416,0.025225
2664,4.2994,4.282217,0.02973
3552,4.1706,4.086433,0.042342
4440,3.9797,3.861633,0.045946
5328,3.7599,3.556464,0.101802
6216,3.5094,3.355268,0.135135
7104,3.3148,3.191609,0.155856
7992,3.1952,3.050605,0.178378
8880,3.0508,2.948289,0.186486


Saved best model according to eval_loss: 4.7855987548828125
Saved best model according to eval_accuracy: 0.015315315315315315
Saved best model according to eval_loss: 4.365415573120117
Saved best model according to eval_accuracy: 0.025225225225225224
Saved best model according to eval_loss: 4.282217025756836
Saved best model according to eval_accuracy: 0.02972972972972973
Saved best model according to eval_loss: 4.086432933807373
Saved best model according to eval_accuracy: 0.04234234234234234
Saved best model according to eval_loss: 3.86163330078125
Saved best model according to eval_accuracy: 0.04594594594594595
Saved best model according to eval_loss: 3.5564637184143066
Saved best model according to eval_accuracy: 0.1018018018018018
Saved best model according to eval_loss: 3.3552680015563965
Saved best model according to eval_accuracy: 0.13513513513513514
Saved best model according to eval_loss: 3.1916091442108154
Saved best model according to eval_accuracy: 0.15585585585585585
Save

[I 2024-06-22 17:28:07,354] Trial 2 finished with value: 2.596937417984009 and parameters: {'num_layers': 21}. Best is trial 0 with value: 2.2446231842041016.


Step,Training Loss,Validation Loss,Accuracy
888,4.9201,4.769555,0.013514
1776,4.5097,4.347812,0.025225
2664,4.2844,4.283791,0.027928
3552,4.1448,4.072552,0.040541
4440,3.9218,3.785553,0.055856
5328,3.6885,3.507492,0.101802
6216,3.4721,3.318033,0.124324
7104,3.268,3.120628,0.142342
7992,3.1453,2.970693,0.184685
8880,3.0053,2.887051,0.199099


Saved best model according to eval_loss: 4.769554615020752
Saved best model according to eval_accuracy: 0.013513513513513514
Saved best model according to eval_loss: 4.347812175750732
Saved best model according to eval_accuracy: 0.025225225225225224
Saved best model according to eval_loss: 4.283790588378906
Saved best model according to eval_accuracy: 0.027927927927927927
Saved best model according to eval_loss: 4.07255220413208
Saved best model according to eval_accuracy: 0.04054054054054054
Saved best model according to eval_loss: 3.785552978515625
Saved best model according to eval_accuracy: 0.055855855855855854
Saved best model according to eval_loss: 3.5074915885925293
Saved best model according to eval_accuracy: 0.1018018018018018
Saved best model according to eval_loss: 3.318033456802368
Saved best model according to eval_accuracy: 0.12432432432432433
Saved best model according to eval_loss: 3.1206278800964355
Saved best model according to eval_accuracy: 0.14234234234234233
Save

[I 2024-06-22 22:20:45,927] Trial 3 finished with value: 2.579988479614258 and parameters: {'num_layers': 19}. Best is trial 0 with value: 2.2446231842041016.


Step,Training Loss,Validation Loss,Accuracy
888,4.9156,4.777668,0.012613
1776,4.5322,4.373528,0.024324
2664,4.2969,4.268996,0.033333
3552,4.1511,4.041599,0.043243
4440,3.9091,3.712463,0.079279
5328,3.5886,3.382426,0.126126
6216,3.3519,3.219248,0.12973
7104,3.1378,3.003518,0.19009
7992,3.0099,2.843326,0.237838
8880,2.8698,2.747936,0.246847


Saved best model according to eval_loss: 4.777667999267578
Saved best model according to eval_accuracy: 0.012612612612612612
Saved best model according to eval_loss: 4.373528480529785
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.268996238708496
Saved best model according to eval_accuracy: 0.03333333333333333
Saved best model according to eval_loss: 4.041599273681641
Saved best model according to eval_accuracy: 0.043243243243243246
Saved best model according to eval_loss: 3.7124626636505127
Saved best model according to eval_accuracy: 0.07927927927927927
Saved best model according to eval_loss: 3.3824262619018555
Saved best model according to eval_accuracy: 0.12612612612612611
Saved best model according to eval_loss: 3.2192482948303223
Saved best model according to eval_accuracy: 0.12972972972972974
Saved best model according to eval_loss: 3.0035183429718018
Saved best model according to eval_accuracy: 0.1900900900900901
Sa

[I 2024-06-23 02:01:25,734] Trial 4 finished with value: 2.447228193283081 and parameters: {'num_layers': 14}. Best is trial 0 with value: 2.2446231842041016.


Step,Training Loss,Validation Loss,Accuracy
888,4.8954,4.6525,0.024324
1776,4.446,4.330514,0.023423
2664,4.2775,4.234533,0.034234
3552,4.1253,4.059233,0.043243
4440,3.866,3.718048,0.081982
5328,3.5967,3.427899,0.117117
6216,3.3453,3.184917,0.145045
7104,3.1607,3.035941,0.173874
7992,3.031,2.887138,0.220721
8880,2.8966,2.776479,0.245946


Saved best model according to eval_loss: 4.652500152587891
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.330514430999756
Saved best model according to eval_loss: 4.234533309936523
Saved best model according to eval_accuracy: 0.03423423423423423
Saved best model according to eval_loss: 4.05923318862915
Saved best model according to eval_accuracy: 0.043243243243243246
Saved best model according to eval_loss: 3.718048334121704
Saved best model according to eval_accuracy: 0.08198198198198198
Saved best model according to eval_loss: 3.4278993606567383
Saved best model according to eval_accuracy: 0.11711711711711711
Saved best model according to eval_loss: 3.1849169731140137
Saved best model according to eval_accuracy: 0.14504504504504503
Saved best model according to eval_loss: 3.0359413623809814
Saved best model according to eval_accuracy: 0.17387387387387387
Saved best model according to eval_loss: 2.8871378898620605
Saved bes

[I 2024-06-23 04:29:18,171] Trial 5 finished with value: 2.4431569576263428 and parameters: {'num_layers': 9}. Best is trial 0 with value: 2.2446231842041016.


Step,Training Loss,Validation Loss,Accuracy
888,4.916,4.783528,0.009009
1776,4.5469,4.377087,0.021622
2664,4.2999,4.254467,0.035135
3552,4.155,4.073232,0.042342
4440,3.9166,3.731267,0.067568
5328,3.5328,3.32986,0.151351
6216,3.2478,3.137992,0.163964
7104,3.0386,2.876686,0.214414
7992,2.9098,2.753243,0.275676
8880,2.767,2.628896,0.28018


Saved best model according to eval_loss: 4.783527851104736
Saved best model according to eval_accuracy: 0.009009009009009009
Saved best model according to eval_loss: 4.377086639404297
Saved best model according to eval_accuracy: 0.021621621621621623
Saved best model according to eval_loss: 4.254466533660889
Saved best model according to eval_accuracy: 0.03513513513513514
Saved best model according to eval_loss: 4.073232173919678
Saved best model according to eval_accuracy: 0.04234234234234234
Saved best model according to eval_loss: 3.731267213821411
Saved best model according to eval_accuracy: 0.06756756756756757
Saved best model according to eval_loss: 3.3298604488372803
Saved best model according to eval_accuracy: 0.15135135135135136
Saved best model according to eval_loss: 3.1379916667938232
Saved best model according to eval_accuracy: 0.16396396396396395
Saved best model according to eval_loss: 2.876685619354248
Saved best model according to eval_accuracy: 0.21441441441441442
Save

[I 2024-06-23 07:12:08,542] Trial 6 finished with value: 2.2187047004699707 and parameters: {'num_layers': 10}. Best is trial 6 with value: 2.2187047004699707.


Step,Training Loss,Validation Loss,Accuracy
888,4.9023,4.728735,0.024324
1776,4.4861,4.362293,0.024324
2664,4.2858,4.219041,0.035135
3552,4.1245,3.981441,0.044144
4440,3.8213,3.631139,0.083784
5328,3.462,3.266313,0.135135
6216,3.2254,3.091312,0.177477
7104,3.0615,2.908785,0.210811
7992,2.9353,2.77369,0.256757
8880,2.7955,2.661446,0.261261


Saved best model according to eval_loss: 4.728735446929932
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.362293243408203
Saved best model according to eval_loss: 4.219041347503662
Saved best model according to eval_accuracy: 0.03513513513513514
Saved best model according to eval_loss: 3.981440544128418
Saved best model according to eval_accuracy: 0.044144144144144144
Saved best model according to eval_loss: 3.631138801574707
Saved best model according to eval_accuracy: 0.08378378378378379
Saved best model according to eval_loss: 3.266313314437866
Saved best model according to eval_accuracy: 0.13513513513513514
Saved best model according to eval_loss: 3.0913116931915283
Saved best model according to eval_accuracy: 0.17747747747747747
Saved best model according to eval_loss: 2.9087846279144287
Saved best model according to eval_accuracy: 0.21081081081081082
Saved best model according to eval_loss: 2.7736897468566895
Saved bes

[I 2024-06-23 09:11:07,046] Trial 7 finished with value: 2.3103833198547363 and parameters: {'num_layers': 7}. Best is trial 6 with value: 2.2187047004699707.


Step,Training Loss,Validation Loss,Accuracy
888,4.9185,4.793799,0.00991
1776,4.5489,4.368474,0.024324
2664,4.2943,4.249649,0.028829
3552,4.1239,4.010673,0.051351
4440,3.8314,3.66796,0.075676
5328,3.5331,3.375092,0.12973
6216,3.3137,3.174528,0.144144
7104,3.1432,3.009641,0.172072
7992,3.0501,2.903434,0.21982
8880,2.9194,2.802516,0.230631


Saved best model according to eval_loss: 4.793798923492432
Saved best model according to eval_accuracy: 0.00990990990990991
Saved best model according to eval_loss: 4.36847448348999
Saved best model according to eval_accuracy: 0.024324324324324326
Saved best model according to eval_loss: 4.249648571014404
Saved best model according to eval_accuracy: 0.02882882882882883
Saved best model according to eval_loss: 4.010672569274902
Saved best model according to eval_accuracy: 0.051351351351351354
Saved best model according to eval_loss: 3.6679604053497314
Saved best model according to eval_accuracy: 0.07567567567567568
Saved best model according to eval_loss: 3.3750922679901123
Saved best model according to eval_accuracy: 0.12972972972972974
Saved best model according to eval_loss: 3.174527883529663
Saved best model according to eval_accuracy: 0.14414414414414414
Saved best model according to eval_loss: 3.009641170501709
Saved best model according to eval_accuracy: 0.17207207207207206
Saved

[I 2024-06-23 13:06:47,220] Trial 8 finished with value: 2.47918438911438 and parameters: {'num_layers': 15}. Best is trial 6 with value: 2.2187047004699707.


Step,Training Loss,Validation Loss,Accuracy
888,4.9135,4.770761,0.016216
1776,4.527,4.370752,0.023423
2664,4.3008,4.270798,0.033333
3552,4.1642,4.071908,0.041441
4440,3.9059,3.735035,0.073874
5328,3.6211,3.407912,0.117117
6216,3.3412,3.184769,0.154054
7104,3.1423,2.96679,0.193694
7992,3.0142,2.862101,0.227027
8880,2.8774,2.76071,0.247748


Saved best model according to eval_loss: 4.770760536193848
Saved best model according to eval_accuracy: 0.016216216216216217
Saved best model according to eval_loss: 4.370752334594727
Saved best model according to eval_accuracy: 0.023423423423423424
Saved best model according to eval_loss: 4.270798206329346
Saved best model according to eval_accuracy: 0.03333333333333333
Saved best model according to eval_loss: 4.0719075202941895
Saved best model according to eval_accuracy: 0.04144144144144144
Saved best model according to eval_loss: 3.735034942626953
Saved best model according to eval_accuracy: 0.07387387387387387
Saved best model according to eval_loss: 3.407912015914917
Saved best model according to eval_accuracy: 0.11711711711711711
Saved best model according to eval_loss: 3.1847691535949707
Saved best model according to eval_accuracy: 0.15405405405405406
Saved best model according to eval_loss: 2.966789722442627
Saved best model according to eval_accuracy: 0.19369369369369369
Save

[I 2024-06-23 15:35:25,693] Trial 9 finished with value: 2.3889431953430176 and parameters: {'num_layers': 9}. Best is trial 6 with value: 2.2187047004699707.


Step,Training Loss,Validation Loss,Accuracy
888,4.9285,4.804889,0.009009
1776,4.5777,4.395104,0.022523
2664,4.3195,4.300702,0.027928
3552,4.173,4.101006,0.042342
4440,3.989,3.895713,0.058559
5328,3.7807,3.605378,0.10991
6216,3.5075,3.368894,0.124324
7104,3.3225,3.19327,0.143243
7992,3.1993,3.027599,0.187387
8880,3.0796,2.948883,0.181081


Saved best model according to eval_loss: 4.80488920211792
Saved best model according to eval_accuracy: 0.009009009009009009
Saved best model according to eval_loss: 4.395103931427002
Saved best model according to eval_accuracy: 0.02252252252252252
Saved best model according to eval_loss: 4.300702095031738
Saved best model according to eval_accuracy: 0.027927927927927927
Saved best model according to eval_loss: 4.101006031036377
Saved best model according to eval_accuracy: 0.04234234234234234
Saved best model according to eval_loss: 3.8957128524780273
Saved best model according to eval_accuracy: 0.05855855855855856
Saved best model according to eval_loss: 3.605377674102783
Saved best model according to eval_accuracy: 0.10990990990990991
Saved best model according to eval_loss: 3.368894338607788
Saved best model according to eval_accuracy: 0.12432432432432433
Saved best model according to eval_loss: 3.193270444869995
Saved best model according to eval_accuracy: 0.14324324324324325
Saved 

[I 2024-06-23 21:40:46,945] Trial 10 finished with value: 2.6530556678771973 and parameters: {'num_layers': 24}. Best is trial 6 with value: 2.2187047004699707.


Step,Training Loss,Validation Loss,Accuracy
888,4.7818,4.66407,0.026126
1776,4.5222,4.403031,0.023423
2664,4.3318,4.276883,0.027928
3552,4.2345,4.16423,0.040541
4440,3.9347,3.726496,0.090991
5328,3.5615,3.41509,0.145045
6216,3.3243,3.215016,0.174775
7104,3.1311,3.058399,0.235135
7992,3.0182,2.93799,0.247748
8880,2.9115,2.838849,0.273874


Saved best model according to eval_loss: 4.664069652557373
Saved best model according to eval_accuracy: 0.026126126126126126
Saved best model according to eval_loss: 4.403030872344971
Saved best model according to eval_loss: 4.276882648468018
Saved best model according to eval_accuracy: 0.027927927927927927
Saved best model according to eval_loss: 4.164229869842529
Saved best model according to eval_accuracy: 0.04054054054054054
Saved best model according to eval_loss: 3.7264957427978516
Saved best model according to eval_accuracy: 0.09099099099099099
Saved best model according to eval_loss: 3.4150896072387695
Saved best model according to eval_accuracy: 0.14504504504504503
Saved best model according to eval_loss: 3.2150156497955322
Saved best model according to eval_accuracy: 0.17477477477477477
Saved best model according to eval_loss: 3.058399200439453
Saved best model according to eval_accuracy: 0.23513513513513515
Saved best model according to eval_loss: 2.9379897117614746
Saved be

[I 2024-06-23 22:14:46,593] Trial 11 finished with value: 2.5737287998199463 and parameters: {'num_layers': 1}. Best is trial 6 with value: 2.2187047004699707.


Best trial:
  Value: 2.5737287998199463
  Params: 
    num_layers: 1


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import Trainer, TrainingArguments, TrainerCallback, WhisperConfig, WhisperModel, WhisperProcessor
import pandas as pd
import librosa
import numpy as np
import os
import sys
import math
from datasets import load_metric
from datetime import datetime
import logging
import torch.nn.functional as F

# Set up logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger()

# Load the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Define the custom dataset class
class LocalAudioDataset(Dataset):
    def __init__(self, csv_file, processor, subset, noise_factor=0.0, max_speakers=50):
        self.processor = processor
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['subset'] == subset]
        
        # Limit the number of speakers to max_speakers
        speaker_counts = self.data['label'].value_counts()
        top_speakers = speaker_counts.nlargest(max_speakers).index
        self.data = self.data[self.data['label'].isin(top_speakers)]
        
        self.speaker_ids = {label: idx for idx, label in enumerate(self.data['label'].unique())}
        self.data['label'] = self.data['label'].map(self.speaker_ids)
        self.noise_factor = noise_factor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path = self.data.iloc[idx]['path']
        label = self.data.iloc[idx]['label']
        
        try:
            audio, sr = librosa.load(file_path, sr=16000)
            audio = librosa.to_mono(audio)
            # Use the processor to extract features
            inputs = self.processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs.input_features.squeeze(0)
            return {"input_values": input_values, "labels": label}
        except Exception as e:
            print(f"Error loading {file_path}: {e}", file=sys.stderr)
            return self.__getitem__((idx + 1) % len(self))

# Paths to dataset CSV file
csv_file = 'dataset_large.csv'
train_dataset = LocalAudioDataset(csv_file, processor, 'train', noise_factor=0, max_speakers=50)
validate_dataset = LocalAudioDataset(csv_file, processor, 'validate', max_speakers=50)
test_dataset = LocalAudioDataset(csv_file, processor, 'test', max_speakers=50)

num_speakers = len(train_dataset.speaker_ids)
print(f"Number of unique speakers: {num_speakers}")

print(f"Labels in train dataset: {train_dataset.data['label'].tolist()}")
print(f"Labels in test dataset: {test_dataset.data['label'].tolist()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def validate_labels(dataset):
    for item in dataset:
        label = item['labels']
        if label >= num_speakers or label < 0:
            print(f"Invalid label {label} for item: {item}")
            raise ValueError(f"Invalid label {label} found in dataset.")
    print("All labels are valid.")

batch_size = 2
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
logging_steps = steps_per_epoch // 5
eval_steps = steps_per_epoch // 5

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

log_dir = "/home/rag/experimental_trial/results/training_logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"training_log_versuch2_2layer{timestamp}.csv")
with open(log_file, "w") as f:
    f.write("Timestamp,Step,Training Loss,Validation Loss,Accuracy\n")

class SaveMetricsCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            with open(log_file, "a") as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                step = state.global_step
                training_loss = logs.get("loss", "")
                validation_loss = logs.get("eval_loss", "")
                accuracy = logs.get("eval_accuracy", "")
                f.write(f"{timestamp},{step},{training_loss},{validation_loss},{accuracy}\n")

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=100, early_stopping_threshold=0.0):
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metric = kwargs.get("metrics", {}).get("eval_loss")
        if metric is None:
            return
        
        if self.best_metric is None or metric < self.best_metric - self.early_stopping_threshold:
            self.best_metric = metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
        
        if self.patience_counter >= self.early_stopping_patience:
            print(f"Early stopping at step {state.global_step}")
            control.should_training_stop = True

# Custom classification head with mean pooling
class CustomWhisperForSequenceClassification(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size = config.d_model
        self.num_labels = config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        # Pass input through Whisper encoder
        encoder_outputs = self.whisper.encoder(input_values)
        hidden_states = encoder_outputs.last_hidden_state
        
        # Apply pooling
        pooled_output = self.pooling(hidden_states.transpose(1, 2)).squeeze(-1)
        
        # Ensure the pooled output has the correct shape
        if pooled_output.dim() == 1:
            pooled_output = pooled_output.unsqueeze(0)
        
        # Pass through classifier
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return (loss, logits) if loss is not None else (logits,)

# Custom data collator for Whisper
class DataCollatorForWhisper:
    def __call__(self, features):
        input_values = torch.stack([f["input_values"] for f in features])
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
        return {"input_values": input_values, "labels": labels}

# Extend the Trainer class
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.best_loss_model_dir = "./results/best_model_loss_2layer_versuch2"
        self.best_accuracy_model_dir = "./results/best_model_accuracy_versuch2"
        os.makedirs(self.best_loss_model_dir, exist_ok=True)
        os.makedirs(self.best_accuracy_model_dir, exist_ok=True)
        self.best_eval_loss = float("inf")
        self.best_eval_accuracy = 0.0

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        
        current_eval_loss = eval_metrics["eval_loss"]
        current_eval_accuracy = eval_metrics["eval_accuracy"]
        
        if current_eval_loss < self.best_eval_loss:
            self.best_eval_loss = current_eval_loss
            self.save_model(self.best_loss_model_dir)
            print(f"Saved best model according to eval_loss: {self.best_eval_loss}")

        if current_eval_accuracy > self.best_eval_accuracy:
            self.best_eval_accuracy = current_eval_accuracy
            self.save_model(self.best_accuracy_model_dir)
            print(f"Saved best model according to eval_accuracy: {self.best_eval_accuracy}")

        return eval_metrics
    
    def compute_loss(self, model, inputs, return_outputs=False):
        input_values = inputs.get("input_values")
        labels = inputs.get("labels")
        outputs = model(input_values=input_values, labels=labels)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

# Load the model configuration with all layers
config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=num_speakers)
model = CustomWhisperForSequenceClassification(config)

# Transfer the model to the correct device
model = model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    group_by_length=False,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="steps",
    num_train_epochs=20,
    save_steps=logging_steps,
    eval_steps=eval_steps,
    logging_steps=logging_steps,
    learning_rate=1e-5,
    save_total_limit=2,
    no_cuda=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_strategy="steps"  # or "epoch" if you prefer to save every epoch
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    data_collator=DataCollatorForWhisper(),
    tokenizer=processor,
    compute_metrics=compute_metrics,
    callbacks=[SaveMetricsCallback(), EarlyStoppingCallback(early_stopping_patience=50)]
)

trainer.train()

# Evaluate the model
metrics = trainer.evaluate(validate_dataset)
print(metrics)

In [2]:
import os
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperConfig, WhisperModel
from safetensors.torch import load_file as safe_load
from tqdm import tqdm
import librosa

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the custom classification head with mean pooling for Whisper
class CustomWhisperForSequenceClassification(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.whisper = WhisperModel(config)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1)
        self.hidden_size = config.d_model
        self.num_labels = config.num_labels
        self.classifier = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_values, attention_mask=None, labels=None):
        # Pass input through Whisper encoder
        encoder_outputs = self.whisper.encoder(input_values)
        hidden_states = encoder_outputs.last_hidden_state
        
        # Apply pooling
        pooled_output = self.pooling(hidden_states.transpose(1, 2)).squeeze(-1)
        
        # Ensure the pooled output has the correct shape
        if pooled_output.dim() == 1:
            pooled_output = pooled_output.unsqueeze(0)
        
        # Pass through classifier
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return (loss, logits) if loss is not None else (logits,)

# Path to the fine-tuned model weights file
model_path = "/home/rag/experimental_trial/results/best_model_loss_whisper_110/model.safetensors"

# Load the pre-trained Whisper large model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# Load the model configuration
config = WhisperConfig.from_pretrained("openai/whisper-large", num_labels=110)  # Adjust num_labels as needed

# Initialize the custom model with the configuration
model = CustomWhisperForSequenceClassification(config)

# Load the model weights from safetensors file
state_dict = safe_load(model_path)
model.load_state_dict(state_dict)
model.to(device)

def check_directories_exist(directory, layer_indices):
    """Prüft, ob die benötigten Verzeichnisse für jede Schicht bereits existieren."""
    all_exist = True
    for index in layer_indices:
        layer_dir = os.path.join(directory, f"layer_{index}")
        if not os.path.exists(layer_dir):
            all_exist = False
            break
    return all_exist

def load_audio_files(input_directory, output_directory, layer_indices=[-1]):
    """Lädt alle MP3-Dateien im angegebenen Verzeichnis und extrahiert die Repräsentationen aus den spezifizierten Schichten."""
    for filename in tqdm(os.listdir(input_directory)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(input_directory, filename)
            audio, sr = librosa.load(file_path, sr=16000)
            inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
            input_values = inputs["input_features"].to(device)
            
            with torch.no_grad():
                outputs = model.whisper.encoder(input_values, output_hidden_states=True)
                for index in layer_indices:
                    hidden_states = outputs.hidden_states[index]
                    mean_pooled_hidden_states = hidden_states.mean(dim=1)  # Mean Pooling über die Zeitdimension
                    # creating sub directory for each layer in output directory
                    layer_dir = os.path.join(output_directory, f"layer_{index}")
                    os.makedirs(layer_dir, exist_ok=True)
                    save_path = os.path.join(layer_dir, f"{os.path.splitext(filename)[0]}_layer_{index}.npy")
                    np.save(save_path, mean_pooled_hidden_states.cpu().numpy())

def process_audio_directory(input_base_directory, output_base_directory, layer_indices=range(25)):
    """Verarbeitet Audio-Dateien in den angegebenen Verzeichnissen und speichert die Ergebnisse im Zielverzeichnis."""
    for d in os.listdir(input_base_directory):
        input_dir_path = os.path.join(input_base_directory, d)
        output_dir_path = os.path.join(output_base_directory, d)
        if os.path.isdir(input_dir_path) and not check_directories_exist(output_dir_path, layer_indices):
            load_audio_files(input_dir_path, output_dir_path, layer_indices)

input_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_backup")
output_directory_path = os.path.expanduser("/home/rag/experimental_trial/data/all_speakers_whisper_finetuned2")
process_audio_directory(input_directory_path, output_directory_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 50/50 [00:19<00:00,  2.52it/s]
100%|██████████| 50/50 [00:16<00:00,  3.04it/s]
100%|██████████| 50/50 [00:16<00:00,  3.00it/s]
100%|██████████| 50/50 [00:16<00:00,  2.96it/s]
100%|██████████| 50/50 [00:16<00:00,  2.96it/s]
100%|██████████| 50/50 [00:16<00:00,  3.02it/s]
100%|██████████| 50/50 [00:16<00:00,  2.98it/s]
100%|██████████| 50/50 [00:16<00:00,  2.99it/s]
100%|██████████| 50/50 [00:16<00:00,  3.01it/s]
100%|██████████| 50/50 [00:16<00:00,  3.01it/s]
100%|██████████| 50/50 [00:16<00:00,  3.03it/s]
100%|██████████| 50/50 [00:16<00:00,  3.01it/s]
100%|██████████| 50/50 [00:16<00:00,  3.02it/s]
100%|██████████| 50/50 [00:16<00:00,  3.03it/s]
100%|██████████| 50/50 [00:16<00:00,  2.97it/s]
100%|██████████| 50/50 [00:16<00:00,  3.02it/s]
100%|██████████| 50/50 [00:16<00:00,  3.02it/s]
100%|██████████| 50/50 [00:16<00:00,  2.99it/s]
100%|██████████| 5