In [1]:
# Cell 1: Import necessary libraries and set device
import re
import torch
import torchaudio
from datasets import load_dataset, Dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
from torch.utils.data import DataLoader
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Run on the GPU whenever CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")



Using device: cuda


In [2]:
from datasets import load_from_disk # Load the datasets
# Read the validation subset from the local "small_validation_set" path
# (relative to the notebook's working directory). Every evaluation cell
# below scores its checkpoint on this same dataset.
small_validation_set = load_from_disk("small_validation_set")

In [3]:
# Cell 7: Define data collator without moving tensors to device
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate raw audio/text examples into a padded Whisper batch.

    Every feature must provide ``feature["audio"]["array"]`` (the waveform)
    and ``feature["text"]`` (the reference transcript). The returned tensors
    are not moved to any device; that is left to the caller.
    """

    # Processor providing the feature extractor (invoked by calling the
    # processor on a waveform) and the tokenizer (``processor.tokenizer``).
    # Quoted so the class can be defined before transformers is imported.
    processor: "WhisperProcessor"
    # Not read by __call__; kept so existing constructor calls keep working.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of dataset examples.

        Args:
            features: Examples with an ``"audio"`` dict (holding ``"array"``)
                and a ``"text"`` string.

        Returns:
            A dict with ``"input_features"`` (the per-example features stacked
            along a new leading batch axis) and ``"labels"`` (token ids padded
            on the right with -100).
        """
        # Extract features one example at a time, then stack them.
        # Waveforms are declared as 16 kHz; presumably the dataset was
        # resampled upstream - TODO confirm.
        input_features = torch.stack([
            self.processor(
                feature["audio"]["array"], sampling_rate=16000, return_tensors="pt"
            ).input_features[0]
            for feature in features
        ])

        # Tokenize the transcripts; the explicit dtype keeps an empty
        # transcript from producing a float tensor.
        label_tensors = [
            torch.tensor(self.processor.tokenizer(feature["text"]).input_ids, dtype=torch.long)
            for feature in features
        ]

        # Pad directly with -100 (the index the loss ignores). Padding with
        # pad_token_id and then replacing every pad_token_id would also wipe
        # out the genuine end-of-text token, because Whisper's pad token is
        # the same id as its EOS token.
        labels = torch.nn.utils.rnn.pad_sequence(
            label_tensors,
            batch_first=True,
            padding_value=-100,
        )

        batch = {
            "input_features": input_features,  # Do not move to device here
            "labels": labels,                  # Do not move to device here
        }
        return batch


In [4]:
# Processor for the "openai/whisper-tiny" checkpoint. The same instance is
# handed to the data collator, read by compute_metrics, and used by the
# transcription helper further down in this notebook.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

In [5]:
# Cell 8: Build the data collator that is passed to each Seq2SeqTrainer below.
# The manual DataLoaders that used to live here are disabled (kept only as
# commented-out reference); nothing in this notebook reads them.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=data_collator, pin_memory=True)
# val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=data_collator, pin_memory=True)


In [6]:
import evaluate
import numpy as np

# Word-error-rate and character-error-rate metrics from the `evaluate`
# library; compute_metrics below reads both as module-level globals.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")


def compute_metrics(pred):
    """Compute WER and CER for a batch of generated predictions.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions). Either may be a ``torch.Tensor`` or anything
            ``np.array`` accepts.

    Returns:
        ``{"wer": ..., "cer": ...}`` as produced by the module-level
        ``wer_metric`` / ``cer_metric`` objects.

    Neither ``pred.predictions`` nor ``pred.label_ids`` is modified.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    def _to_decodable_ids(ids):
        # Defensive: if a tuple is handed over, assume its first entry holds
        # the token ids - TODO confirm against the trainer in use.
        if isinstance(ids, tuple):
            ids = ids[0]
        if isinstance(ids, torch.Tensor):
            ids = ids.detach().cpu().numpy()
        else:
            ids = np.array(ids)
        # np.where builds a fresh array, so the caller's data stays intact
        # (a CPU tensor's .numpy() view shares memory with the tensor).
        # -100 is not a real token id and must not reach the tokenizer.
        return np.where(ids == -100, pad_token_id, ids)

    pred_ids = _to_decodable_ids(pred.predictions)
    label_ids = _to_decodable_ids(pred.label_ids)

    # Decode predictions and labels; special tokens (padding included) are dropped.
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER and CER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}



In [7]:
# Import necessary libraries
import torch
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm.auto import tqdm
small_val_dataset = small_validation_set
# Step 1: Set device and clear GPU cache
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 2: Load the fine-tuned Whisper model onto the selected device
model1 = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-dev-clean7/checkpoint-22832").to(device) #change

# Step 3: Define evaluation training arguments
# No evaluation-strategy argument is passed: "no" is the default, and the old
# `evaluation_strategy` keyword is deprecated in recent transformers releases.
eval_training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-dev-clean-eval7",
    per_device_eval_batch_size=1,      # Keep batch size low to manage GPU memory
    dataloader_num_workers=0,          # Set to 0 to simplify I/O operations
    remove_unused_columns=False,       # The collator reads the raw "audio"/"text" columns
    predict_with_generate=True,        # Metrics are computed on generated token ids
    fp16=False,                        # Disable fp16 for stability during evaluation
    disable_tqdm=False,
    logging_dir="./logs-eval",
    logging_steps=10,
    report_to="none"
)

# Step 4: Initialize Trainer
# `processing_class` replaces the deprecated `tokenizer` keyword; the
# deprecation messages recorded for this cell ask for exactly this rename.
trainer1 = Seq2SeqTrainer(
    model=model1,
    args=eval_training_args,
    eval_dataset=small_val_dataset,
    processing_class=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 5: Perform Evaluation with Debug Print Statements
torch.cuda.empty_cache()  # Clear GPU cache
print("Starting evaluation...")

try:
    eval_results = trainer1.evaluate()
except RuntimeError as e:
    print(f"Runtime error during evaluation: {e}")
    torch.cuda.empty_cache()
    # Re-raise: later cells need `eval_results`, and swallowing the error here
    # would only resurface as a confusing NameError further down.
    raise
else:
    print(f"Final WER: {eval_results['eval_wer']}")
    print(f"Final CER: {eval_results['eval_cer']}")


Using device: cuda


  trainer1 = Seq2SeqTrainer(


Starting evaluation...


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/100 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Final WER: 0.20552519732847602
Final CER: 0.057348686822848716


In [None]:
import mlflow
import mlflow.pytorch
from mlflow.models.signature import infer_signature

# Step 1: Set MLflow Tracking URI and Experiment Name
# Assumes an MLflow tracking server is already listening on this local address.
mlflow.set_tracking_uri("http://127.0.0.1:5555")  
experiment_name = "Whisper Experiment"
# Make this the active experiment; experiment_name is also read later when
# the runs are searched for the best model.
mlflow.set_experiment(experiment_name)

In [9]:
def log_model_to_mlflow(
    model,
    processor,
    eval_results,
    eval_training_args,
    experiment_name,
    *,
    run_name="Whisper-Ft7 Evaluation Run",
    model_name="WhisperFineTunedModel7",
    checkpoint="./whisper-finetuned-dev-clean7/checkpoint-22832",
    model_dir="./whisper_model_mlflow",
    artifact_path="whisper_model",
):
    """Record one evaluated checkpoint in MLflow and register its model.

    Args:
        model: Model to save locally and to log/register through
            ``mlflow.pytorch.log_model``.
        processor: Processor saved next to the model in ``model_dir``.
        eval_results: Mapping containing ``"eval_wer"`` and ``"eval_cer"``.
        eval_training_args: Evaluation arguments. A small subset
            (``per_device_eval_batch_size``, ``predict_with_generate``,
            ``fp16``) is logged as run parameters when present; may be None.
        experiment_name: Experiment to log into; None leaves the currently
            active experiment unchanged.
        run_name: Display name of the MLflow run.
        model_name: Logged as the ``model_name`` parameter and used as the
            registered model name, so the two always agree.
        checkpoint: Logged as the ``checkpoint`` parameter.
        model_dir: Local directory for the ``save_pretrained`` export.
        artifact_path: Artifact path of the logged model inside the run.

    Raises:
        KeyError: If ``eval_results`` lacks ``"eval_wer"`` or ``"eval_cer"``.
    """
    # Honor the requested experiment instead of silently using whichever one
    # happens to be active.
    if experiment_name is not None:
        mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # Log model parameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("checkpoint", checkpoint)

        # Log the evaluation settings that affect the reported metrics, so the
        # success message below is accurate.
        logged_args = {
            arg_name: getattr(eval_training_args, arg_name)
            for arg_name in ("per_device_eval_batch_size", "predict_with_generate", "fp16")
            if hasattr(eval_training_args, arg_name)
        }
        if logged_args:
            mlflow.log_params(logged_args)

        # Log evaluation metrics
        mlflow.log_metric("eval_wer", eval_results["eval_wer"])
        mlflow.log_metric("eval_cer", eval_results["eval_cer"])  # Log CER

        # Local export of model + processor. This directory is not uploaded
        # by this function; only the model object below is logged to MLflow.
        model.save_pretrained(model_dir)
        processor.save_pretrained(model_dir)

        # Log the model to MLflow and register it under model_name
        mlflow.pytorch.log_model(
            pytorch_model=model,
            artifact_path=artifact_path,
            registered_model_name=model_name
        )
        print("Model and training arguments logged successfully to MLflow.")

# Log the checkpoint-22832 evaluation (model1 / eval_results) as its own MLflow run.
log_model_to_mlflow(model1, processor, eval_results, eval_training_args, experiment_name)


Successfully registered model 'WhisperFineTunedModel7'.
2024/12/08 19:30:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: WhisperFineTunedModel7, version 1


Model and training arguments logged successfully to MLflow.
🏃 View run Whisper-Ft7 Evaluation Run at: http://127.0.0.1:5555/#/experiments/1/runs/e5d2b17245f64192ac5a5ef25ea6b715
🧪 View experiment at: http://127.0.0.1:5555/#/experiments/1


Created version '1' of model 'WhisperFineTunedModel7'.


In [11]:


small_val_dataset = small_validation_set


# Step 1: Set device and clear GPU cache
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 2: Load the fine-tuned Whisper model onto the selected device
model2 = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-dev-clean4/checkpoint-8562").to(device) #change

# Step 3: Define evaluation training arguments
# No evaluation-strategy argument is passed: "no" is the default, and the old
# `evaluation_strategy` keyword is deprecated in recent transformers releases.
eval_training_args2 = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-dev-clean-eval4",
    per_device_eval_batch_size=1,      # Keep batch size low to manage GPU memory
    dataloader_num_workers=0,          # Set to 0 to simplify I/O operations
    remove_unused_columns=False,       # The collator reads the raw "audio"/"text" columns
    predict_with_generate=True,        # Metrics are computed on generated token ids
    fp16=False,                        # Disable fp16 for stability during evaluation
    disable_tqdm=False,
    logging_dir="./logs-eval",
    logging_steps=10,
    report_to="none"
)

# Step 4: Initialize Trainer
# `processing_class` replaces the deprecated `tokenizer` keyword; the
# deprecation messages recorded for this cell ask for exactly this rename.
trainer2 = Seq2SeqTrainer(    #change
    model=model2,  #change
    args=eval_training_args2,
    eval_dataset=small_val_dataset,
    processing_class=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 5: Perform Evaluation with Debug Print Statements
torch.cuda.empty_cache()  # Clear GPU cache
print("Starting evaluation...")

try:
    eval_results2 = trainer2.evaluate()   #change
except RuntimeError as e:
    print(f"Runtime error during evaluation: {e}")
    torch.cuda.empty_cache()
    # Re-raise: later cells need `eval_results2`, and swallowing the error
    # here would only resurface as a confusing NameError further down.
    raise
else:
    print(f"Final WER: {eval_results2['eval_wer']}")   #change
    print(f"Final CER: {eval_results2['eval_cer']}")


  trainer2 = Seq2SeqTrainer(    #change


Using device: cpu
Starting evaluation...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  0%|          | 0/100 [00:00<?, ?it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokeniz

Final WER: 0.23436551305403763
Final CER: 0.06534687163197005





In [12]:
def log_model_to_mlflow2(
    model,
    processor,
    eval_results2,
    eval_training_args2,
    experiment_name,
    *,
    run_name="Whisper-Ft4 Evaluation Run",
    model_name="WhisperFineTunedModel4",
    checkpoint="./whisper-finetuned-dev-clean4/checkpoint-8562",
    model_dir="./whisper_model_mlflow2",
    artifact_path="whisper_model4",
):
    """Record one evaluated checkpoint in MLflow and register its model.

    Args:
        model: Model to save locally and to log/register through
            ``mlflow.pytorch.log_model``.
        processor: Processor saved next to the model in ``model_dir``.
        eval_results2: Mapping containing ``"eval_wer"`` and ``"eval_cer"``.
        eval_training_args2: Evaluation arguments. A small subset
            (``per_device_eval_batch_size``, ``predict_with_generate``,
            ``fp16``) is logged as run parameters when present; may be None.
        experiment_name: Experiment to log into; None leaves the currently
            active experiment unchanged.
        run_name: Display name of the MLflow run.
        model_name: Logged as the ``model_name`` parameter and used as the
            registered model name, so the two always agree.
        checkpoint: Logged as the ``checkpoint`` parameter.
        model_dir: Local directory for the ``save_pretrained`` export.
        artifact_path: Artifact path of the logged model inside the run.

    Raises:
        KeyError: If ``eval_results2`` lacks ``"eval_wer"`` or ``"eval_cer"``.
    """
    # Honor the requested experiment instead of silently using whichever one
    # happens to be active.
    if experiment_name is not None:
        mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # Log model parameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("checkpoint", checkpoint)

        # Log the evaluation settings that affect the reported metrics, so the
        # success message below is accurate.
        logged_args = {
            arg_name: getattr(eval_training_args2, arg_name)
            for arg_name in ("per_device_eval_batch_size", "predict_with_generate", "fp16")
            if hasattr(eval_training_args2, arg_name)
        }
        if logged_args:
            mlflow.log_params(logged_args)

        # Log evaluation metrics
        mlflow.log_metric("eval_wer", eval_results2["eval_wer"])
        mlflow.log_metric("eval_cer", eval_results2["eval_cer"])  # Log CER

        # Local export of model + processor. This directory is not uploaded
        # by this function; only the model object below is logged to MLflow.
        model.save_pretrained(model_dir)
        processor.save_pretrained(model_dir)

        # Log the model to MLflow and register it under model_name
        mlflow.pytorch.log_model(
            pytorch_model=model,
            artifact_path=artifact_path,
            registered_model_name=model_name
        )
        print("Model and training arguments logged successfully to MLflow.")

# Log the checkpoint-8562 (clean4) evaluation (model2 / eval_results2) as its own MLflow run.
log_model_to_mlflow2(model2, processor, eval_results2, eval_training_args2, experiment_name)


Successfully registered model 'WhisperFineTunedModel4'.
2024/12/08 19:31:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: WhisperFineTunedModel4, version 1


Model and training arguments logged successfully to MLflow.
🏃 View run Whisper-Ft4 Evaluation Run at: http://127.0.0.1:5555/#/experiments/1/runs/86f007253e414a6d82bbf66468bfe896
🧪 View experiment at: http://127.0.0.1:5555/#/experiments/1


Created version '1' of model 'WhisperFineTunedModel4'.


In [13]:


small_val_dataset = small_validation_set


# Step 1: Set device and clear GPU cache
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 2: Load the fine-tuned Whisper model onto the selected device
model3 = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-dev-clean2/checkpoint-8562").to(device) #change

# Step 3: Define evaluation training arguments
# No evaluation-strategy argument is passed: "no" is the default, and the old
# `evaluation_strategy` keyword is deprecated in recent transformers releases.
eval_training_args3 = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-dev-clean-eval2",
    per_device_eval_batch_size=1,      # Keep batch size low to manage GPU memory
    dataloader_num_workers=0,          # Set to 0 to simplify I/O operations
    remove_unused_columns=False,       # The collator reads the raw "audio"/"text" columns
    predict_with_generate=True,        # Metrics are computed on generated token ids
    fp16=False,                        # Disable fp16 for stability during evaluation
    disable_tqdm=False,
    logging_dir="./logs-eval",
    logging_steps=10,
    report_to="none"
)

# Step 4: Initialize Trainer
# `processing_class` replaces the deprecated `tokenizer` keyword; the
# deprecation messages recorded for this cell ask for exactly this rename.
trainer3 = Seq2SeqTrainer(    #change
    model=model3,  #change
    args=eval_training_args3,
    eval_dataset=small_val_dataset,
    processing_class=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 5: Perform Evaluation with Debug Print Statements
torch.cuda.empty_cache()  # Clear GPU cache
print("Starting evaluation...")

try:
    eval_results3 = trainer3.evaluate()   #change
except RuntimeError as e:
    print(f"Runtime error during evaluation: {e}")
    torch.cuda.empty_cache()
    # Re-raise: later cells need `eval_results3`, and swallowing the error
    # here would only resurface as a confusing NameError further down.
    raise
else:
    print(f"Final WER: {eval_results3['eval_wer']}")   #change
    print(f"Final CER: {eval_results3['eval_cer']}")


  trainer3 = Seq2SeqTrainer(    #change


Using device: cpu
Starting evaluation...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  0%|          | 0/100 [00:00<?, ?it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokeniz

Final WER: 0.25531268973891924
Final CER: 0.06943105110896818





In [14]:
def log_model_to_mlflow3(
    model,
    processor,
    eval_results3,
    eval_training_args3,
    experiment_name,
    *,
    run_name="Whisper-Ft2 Evaluation Run",
    model_name="WhisperFineTunedModel2",
    checkpoint="./whisper-finetuned-dev-clean2/checkpoint-8562",
    model_dir="./whisper_model_mlflow3",
    artifact_path="whisper_model2",
):
    """Record one evaluated checkpoint in MLflow and register its model.

    Args:
        model: Model to save locally and to log/register through
            ``mlflow.pytorch.log_model``.
        processor: Processor saved next to the model in ``model_dir``.
        eval_results3: Mapping containing ``"eval_wer"`` and ``"eval_cer"``.
        eval_training_args3: Evaluation arguments. A small subset
            (``per_device_eval_batch_size``, ``predict_with_generate``,
            ``fp16``) is logged as run parameters when present; may be None.
        experiment_name: Experiment to log into; None leaves the currently
            active experiment unchanged.
        run_name: Display name of the MLflow run.
        model_name: Logged as the ``model_name`` parameter and used as the
            registered model name, so the two always agree.
        checkpoint: Logged as the ``checkpoint`` parameter.
        model_dir: Local directory for the ``save_pretrained`` export.
        artifact_path: Artifact path of the logged model inside the run.

    Raises:
        KeyError: If ``eval_results3`` lacks ``"eval_wer"`` or ``"eval_cer"``.
    """
    # Honor the requested experiment instead of silently using whichever one
    # happens to be active.
    if experiment_name is not None:
        mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # Log model parameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("checkpoint", checkpoint)

        # Log the evaluation settings that affect the reported metrics, so the
        # success message below is accurate.
        logged_args = {
            arg_name: getattr(eval_training_args3, arg_name)
            for arg_name in ("per_device_eval_batch_size", "predict_with_generate", "fp16")
            if hasattr(eval_training_args3, arg_name)
        }
        if logged_args:
            mlflow.log_params(logged_args)

        # Log evaluation metrics
        mlflow.log_metric("eval_wer", eval_results3["eval_wer"])
        mlflow.log_metric("eval_cer", eval_results3["eval_cer"])  # Log CER

        # Local export of model + processor. This directory is not uploaded
        # by this function; only the model object below is logged to MLflow.
        model.save_pretrained(model_dir)
        processor.save_pretrained(model_dir)

        # Log the model to MLflow and register it under model_name
        mlflow.pytorch.log_model(
            pytorch_model=model,
            artifact_path=artifact_path,
            registered_model_name=model_name
        )
        print("Model and training arguments logged successfully to MLflow.")

# Log the checkpoint-8562 (clean2) evaluation (model3 / eval_results3) as its own MLflow run.
log_model_to_mlflow3(model3, processor, eval_results3, eval_training_args3, experiment_name)

Successfully registered model 'WhisperFineTunedModel2'.
2024/12/08 19:32:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: WhisperFineTunedModel2, version 1


Model and training arguments logged successfully to MLflow.
🏃 View run Whisper-Ft2 Evaluation Run at: http://127.0.0.1:5555/#/experiments/1/runs/880374dfa9e5450c8c7e4c802ab3edb4
🧪 View experiment at: http://127.0.0.1:5555/#/experiments/1


Created version '1' of model 'WhisperFineTunedModel2'.


In [15]:
from mlflow.tracking.client import MlflowClient

# Initialize MlflowClient
# No tracking URI is passed, so the client presumably falls back to the
# globally configured one (set via mlflow.set_tracking_uri) - verify if the
# listing comes back empty.
client = MlflowClient()

# Retrieve all registered model names using the Model Registry API
registered_models = client.search_registered_models()
# Sanity check: the names printed here should match the `model_name` values
# logged with each evaluation run.
for model in registered_models:
    print(model.name)  # Print the name of each registered model


WhisperFineTunedModel2
WhisperFineTunedModel4
WhisperFineTunedModel7


In [16]:
from mlflow.tracking import MlflowClient
import mlflow.pytorch

# Initialize MlflowClient
client = MlflowClient()

# Retrieve all runs for the experiment
experiment = client.get_experiment_by_name(experiment_name)
if not experiment:
    raise ValueError(f"Experiment '{experiment_name}' not found.")

# Dictionary to store model names and their WER metrics
model_metrics = {}

# Iterate through runs newest-first. The ordering is requested explicitly so
# that, when a model was logged more than once, the first run seen for a name
# is its most recent evaluation.
for run in client.search_runs(
    [experiment.experiment_id],
    order_by=["attributes.start_time DESC"],
):
    model_name = run.data.params.get("model_name")
    eval_wer = run.data.metrics.get("eval_wer")

    print(model_name)
    if model_name and eval_wer is not None:
        # setdefault keeps the newest run's WER. Plain assignment would let
        # older runs overwrite it and report a stale score.
        model_metrics.setdefault(model_name, eval_wer)

# Fail with a clear message instead of min()'s "empty sequence" error.
if not model_metrics:
    raise ValueError(
        f"No runs with both a 'model_name' parameter and an 'eval_wer' metric "
        f"were found in experiment '{experiment_name}'."
    )

# Identify the best model with the lowest WER
best_model_name = min(model_metrics, key=model_metrics.get)
print(f"Best model is: {best_model_name} with WER: {model_metrics[best_model_name]}")


WhisperFineTunedModel2
WhisperFineTunedModel4
WhisperFineTunedModel7
Best model is: WhisperFineTunedModel7 with WER: 0.20552519732847602


In [21]:
# Display the WER recorded per model name (lower is better).
model_metrics

{'WhisperFineTunedModel2': 0.25531268973891924,
 'WhisperFineTunedModel4': 0.23436551305403763,
 'WhisperFineTunedModel7': 0.20552519732847602}

In [17]:

try:
    # latest_versions holds the newest registered version for each stage
    registered_versions = client.get_registered_model(name=best_model_name).latest_versions
    # print(registered_versions)
    # Find the latest version with stage "None" (or any other desired stage, e.g., "Production")
    best_model_version = next((v.version for v in registered_versions if v.current_stage == "None"), None)

    if best_model_version is None:
        raise ValueError(f"No version of {best_model_name} is available in stage 'None'.")

    # Load the best model from the Model Registry
    model_uri = f"models:/{best_model_name}/{best_model_version}"
    best_model = mlflow.pytorch.load_model(model_uri)

    # Make predictions using the best model
    print(f"Loaded the best model '{best_model_name}' successfully.")
except mlflow.exceptions.RestException as e:
    print(f"Error: {e}")
    print(f"The model '{best_model_name}' is not registered. Please verify the `registered_model_name`.")
    # Re-raise after the hint: the transcription cells need `best_model`, and
    # continuing without it would only fail later with a NameError.
    raise


Loaded the best model 'WhisperFineTunedModel7' successfully.


In [19]:
import soundfile as sf  # Make sure this is imported

def transcribe_audio(audio_file_path, processor, model):
    """Transcribe (at most) the first 30 seconds of an audio file.

    Args:
        audio_file_path: Path handed to ``soundfile.read``.
        processor: Object called on the waveform to build ``input_features``
            and whose ``decode`` turns generated ids into text.
        model: Model exposing ``parameters()`` and ``generate()``.

    Returns:
        The decoded transcription of the first generated sequence.

    Warns:
        UserWarning: If the audio is longer than 30 seconds; the remainder is
            dropped, not transcribed.
    """
    import warnings  # local import: only needed for the truncation notice

    target_rate = 16000             # sampling rate passed to the processor
    max_samples = 30 * target_rate  # Whisper expects a maximum of 30 seconds

    # Load the audio file
    audio, sample_rate = sf.read(audio_file_path)

    # Down-mix multi-channel audio to mono by averaging over axis 1
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Convert the audio to a float32 tensor
    audio = torch.tensor(audio, dtype=torch.float32)

    # Resample the audio to 16kHz if necessary
    if sample_rate != target_rate:
        resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        audio = resample_transform(audio)

    # Truncate or zero-pad to exactly 30 seconds
    num_samples = audio.shape[0]
    if num_samples > max_samples:
        # Make the data loss visible instead of silently dropping the tail.
        warnings.warn(
            f"Audio is {num_samples / target_rate:.1f}s long; only the first "
            f"30 seconds are transcribed.",
            stacklevel=2,
        )
        audio = audio[:max_samples]
    else:
        audio = torch.nn.functional.pad(audio, (0, max_samples - num_samples))

    # Process the audio input
    inputs = processor(audio.numpy(), sampling_rate=target_rate, return_tensors="pt", padding=True)

    # Move inputs to the model's device and, for floating-point models, its
    # dtype, so a half-precision model does not receive float32 features.
    reference_param = next(model.parameters())
    input_features = inputs.input_features.to(reference_param.device)
    if reference_param.dtype.is_floating_point:
        input_features = input_features.to(reference_param.dtype)

    # Generate predictions using the model
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode the predictions to text
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription

# Example usage
audio_file_path = "inp9.wav"  # Replace with your audio file path

# Transcribe the audio file
# best_model is the model loaded from the MLflow registry earlier in the
# notebook; processor is the "openai/whisper-tiny" processor loaded near the top.
transcription = transcribe_audio(audio_file_path, processor, best_model)
print(f"Transcription: {transcription}")


Transcription:  Honestly speaking I really can't wait to get back to India


In [20]:
# Example usage2
audio_file_path = "inp.wav"  # Replace with your audio file path

# Transcribe the audio file
# Reuses the same processor and registry-loaded best_model as the first
# example; only the input file differs.
transcription = transcribe_audio(audio_file_path, processor, best_model)
print(f"Transcription: {transcription}")

Transcription:  Hello, I hope this project works
