In [321]:
import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'


In [None]:
import torch
import gc

# Assume 'model' is your model loaded on the GPU
# Release the model from GPU memory

# Clear CUDA cache
torch.cuda.empty_cache()

# Collect garbage
gc.collect()

# Optionally, verify GPU memory usage
print(torch.cuda.memory_summary(device=None, abbreviated=False))


In [323]:
# Cell 1: Import necessary libraries and set device
import re
import torch
import torchaudio
from datasets import load_dataset, Dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
from torch.utils.data import DataLoader
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [324]:
# Cell 2: Load the LibriSpeech dev-clean dataset
librispeech_dataset = load_dataset("librispeech_asr", "clean", split="train.100")


In [325]:
# Cell 3: Preprocess the text
def preprocess_text(batch):
    batch["text"] = batch["text"].lower()
    batch["text"] = re.sub(r"[^\w\s']", '', batch["text"])
    return batch

librispeech_dataset = librispeech_dataset.map(preprocess_text)


In [326]:
# Cell 4: Resample audio to 16 kHz
def resample(batch):
    audio = batch["audio"]["array"]
    sampling_rate = batch["audio"]["sampling_rate"]
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
    batch["audio"]["array"] = resampler(torch.tensor(audio)).numpy()
    batch["audio"]["sampling_rate"] = 16000
    return batch

librispeech_dataset = librispeech_dataset.map(resample)


In [327]:
# Cell 5: Split the dataset into training and validation sets
from sklearn.model_selection import train_test_split
import pandas as pd

# Cell 5: Split the dataset into training and validation sets using the Dataset's built-in method
train_test_split = librispeech_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']



In [328]:
# Cell 6: Load Whisper processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")


In [329]:
# Cell 7: Define data collator without moving tensors to device
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Process audio
        input_features = [self.processor(feature["audio"]["array"], sampling_rate=16000, return_tensors="pt").input_features[0] for feature in features]
        # Process labels
        labels = [self.processor.tokenizer(feature["text"]).input_ids for feature in features]

        # Pad inputs and labels
        input_features = torch.stack(input_features)
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(l) for l in labels],
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id
        )

        # Replace padding token id's of the labels by -100 so they are ignored in the loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        batch = {
            "input_features": input_features,  # Do not move to device here
            "labels": labels,                  # Do not move to device here
        }
        return batch


In [330]:
# Cell 8: Create DataLoaders with pin_memory=True
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=data_collator, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=data_collator, pin_memory=True)


In [331]:
# Cell 9: Load the pre-trained Whisper model and move it to GPU
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").to(device)
print(f"Model device: {next(model.parameters()).device}")



Model device: cuda:0


In [332]:
# Cell 10: Set the model to training mode
model.train()

# # Optionally freeze the encoder to save memory
# for param in model.model.encoder.parameters():
#     param.requires_grad = False
# N = 3  # Number of decoder layers to freeze (adjust as needed)
# for idx in range(N):
#     for param in model.model.decoder.layers[idx].parameters():
#         param.requires_grad = False
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [333]:
# Cell 11: Define training arguments for training phase
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-dev-clean7",
    per_device_train_batch_size=2,  # Start with 2, adjust as possible
    gradient_accumulation_steps=4,  # Adjust to maintain effective batch size
    evaluation_strategy="no",       # Disable evaluation during training
    num_train_epochs=8,
    fp16=True,
    save_steps=1000,
    logging_steps=200,
    learning_rate=9e-9,
    save_total_limit=2,
    remove_unused_columns=False,
    gradient_checkpointing=True,
)


In [334]:
import evaluate
import numpy as np

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Convert to numpy arrays and move to CPU
    if isinstance(pred_ids, torch.Tensor):
        pred_ids = pred_ids.cpu().numpy()
    else:
        pred_ids = np.array(pred_ids)

    if isinstance(label_ids, torch.Tensor):
        label_ids = label_ids.cpu().numpy()
    else:
        label_ids = np.array(label_ids)

    # Replace -100 with the pad token ID
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [335]:
# Cell 13: Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=processor.tokenizer
)


  trainer = Seq2SeqTrainer(


In [336]:
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Training dataset size: 22831
Validation dataset size: 5708


In [337]:
print(train_dataset[0])
print(val_dataset[0])


{'file': 'C:\\Users\\srinivas\\.cache\\huggingface\\datasets\\downloads\\extracted\\01aa88dd34f3973baea0bd887e54b2dda028e31fb50aafaaa13aaf5302dca0f0\\87-121553-0082.flac', 'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.00222778, 0.00418091,
       0.00350952]), 'sampling_rate': 16000}, 'text': "i followed in his train against that law's iniquity whose people doth usurp your just possession through your pastor's fault there by that execrable race was i released from bonds of the fallacious world", 'speaker_id': 87, 'chapter_id': 121553, 'id': '87-121553-0082'}
{'file': 'C:\\Users\\srinivas\\.cache\\huggingface\\datasets\\downloads\\extracted\\01aa88dd34f3973baea0bd887e54b2dda028e31fb50aafaaa13aaf5302dca0f0\\5703-47198-0059.flac', 'audio': {'path': None, 'array': array([-1.52587891e-03, -2.41088867e-03, -2.53295898e-03, ...,
        2.74658203e-04,  1.83105469e-04,  9.15527344e-05]), 'sampling_rate': 16000}, 'text': 'before she started to keep her appo

In [338]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")


Total parameters: 37760640
Trainable parameters: 37184640


In [339]:
# # Cell 14: Start training
torch.cuda.empty_cache()
trainer.train()


  0%|          | 0/22832 [00:00<?, ?it/s]

{'loss': 2.6284, 'grad_norm': 264.8693542480469, 'learning_rate': 8.923134197617379e-09, 'epoch': 0.07}
{'loss': 2.5683, 'grad_norm': 277.5022277832031, 'learning_rate': 8.844297477224947e-09, 'epoch': 0.14}
{'loss': 2.5357, 'grad_norm': 252.46693420410156, 'learning_rate': 8.765460756832515e-09, 'epoch': 0.21}
{'loss': 2.4697, 'grad_norm': 203.41180419921875, 'learning_rate': 8.686624036440085e-09, 'epoch': 0.28}
{'loss': 2.4038, 'grad_norm': 149.03846740722656, 'learning_rate': 8.608181499649615e-09, 'epoch': 0.35}




{'loss': 2.3743, 'grad_norm': 166.1772003173828, 'learning_rate': 8.529344779257183e-09, 'epoch': 0.42}
{'loss': 2.3333, 'grad_norm': 193.91989135742188, 'learning_rate': 8.450508058864751e-09, 'epoch': 0.49}
{'loss': 2.2996, 'grad_norm': 221.02835083007812, 'learning_rate': 8.371671338472319e-09, 'epoch': 0.56}
{'loss': 2.264, 'grad_norm': 120.60548400878906, 'learning_rate': 8.292834618079887e-09, 'epoch': 0.63}
{'loss': 2.2766, 'grad_norm': 122.80549621582031, 'learning_rate': 8.213997897687455e-09, 'epoch': 0.7}
{'loss': 2.2041, 'grad_norm': 107.83150482177734, 'learning_rate': 8.135161177295024e-09, 'epoch': 0.77}
{'loss': 2.1656, 'grad_norm': 146.72447204589844, 'learning_rate': 8.056324456902592e-09, 'epoch': 0.84}
{'loss': 2.1516, 'grad_norm': 113.12567138671875, 'learning_rate': 7.977487736510162e-09, 'epoch': 0.91}
{'loss': 2.1099, 'grad_norm': 115.53015899658203, 'learning_rate': 7.898651016117728e-09, 'epoch': 0.98}
{'loss': 2.0778, 'grad_norm': 98.9894027709961, 'learning_

TrainOutput(global_step=22832, training_loss=1.6778973211503647, metrics={'train_runtime': 17738.8323, 'train_samples_per_second': 10.297, 'train_steps_per_second': 1.287, 'total_flos': 4.49659012939776e+18, 'train_loss': 1.6778973211503647, 'epoch': 8.0})

In [340]:
import os

checkpoint_dir = "./whisper-finetuned-dev-clean7/checkpoint-22832"
print(os.listdir(checkpoint_dir))


['added_tokens.json', 'config.json', 'generation_config.json', 'merges.txt', 'model.safetensors', 'normalizer.json', 'optimizer.pt', 'rng_state.pth', 'scheduler.pt', 'special_tokens_map.json', 'tokenizer_config.json', 'trainer_state.json', 'training_args.bin', 'vocab.json']


In [341]:
# Import necessary libraries
import torch
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm.auto import tqdm
small_val_dataset = val_dataset.select(range(100))
# Step 1: Set device and clear GPU cache
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 2: Load the fine-tuned Whisper model on GPU
model1 = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-dev-clean7/checkpoint-22832").to(device)

# Step 3: Define evaluation training arguments
eval_training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-dev-clean-eval",
    per_device_eval_batch_size=1,      # Keep batch size low to manage GPU memory
    dataloader_num_workers=0,          # Set to 0 to simplify I/O operations
    remove_unused_columns=False,
    predict_with_generate=True,
    fp16=False,                        # Disable fp16 for stability during evaluation
    evaluation_strategy="no",
    disable_tqdm=False,
    logging_dir="./logs-eval",
    logging_steps=10,
    report_to="none"
)

# Step 4: Initialize Trainer
trainer1 = Seq2SeqTrainer(
    model=model1,
    args=eval_training_args,
    eval_dataset=small_val_dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 5: Perform Evaluation with Debug Print Statements
torch.cuda.empty_cache()  # Clear GPU cache
print("Starting evaluation...")

try:
    eval_results = trainer1.evaluate()
    print(f"Final WER: {eval_results['eval_wer']}")
except RuntimeError as e:
    print(f"Runtime error during evaluation: {e}")
    torch.cuda.empty_cache()


Using device: cuda


  trainer1 = Seq2SeqTrainer(


Starting evaluation...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/100 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Final WER: 0.20643594414086217


In [342]:
# Cell 16: Save the fine-tuned model and processor
#trainer.save_model("./whisper-finetuned-dev-clean")
processor.save_pretrained("./whisper-finetuned-dev-clean7")


[]

In [343]:
# # Cell 17: Load the fine-tuned model and processor
# processor = WhisperProcessor.from_pretrained("./whisper-finetuned-dev-clean")
# model = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-dev-clean").to(device)


In [344]:
# Cell 18: Test the fine-tuned model on new audio
# Load an audio file
# speech_array, sampling_rate = torchaudio.load("inp.wav")
# speech_array = torchaudio.functional.resample(speech_array, orig_freq=sampling_rate, new_freq=16000)

# # Prepare the input features and move them to the GPU
# input_features = processor(speech_array.squeeze(), sampling_rate=16000, return_tensors="pt").input_features.to(device)

# # Generate transcription
# model.eval()  # Set the model to evaluation mode
# with torch.no_grad():
#     predicted_ids = model.generate(input_features)

# # Decode the transcription
# transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)

# print(f"Transcription: {transcription}")


In [345]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the base Whisper model and processor
processor_base = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model_base = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")


In [346]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_base.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [347]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Process audio
        input_features = [self.processor(
            feature["audio"]["array"], sampling_rate=16000, return_tensors="pt"
        ).input_features[0] for feature in features]

        # Process labels
        labels = [self.processor.tokenizer(feature["text"]).input_ids for feature in features]

        # Pad inputs and labels
        input_features = torch.stack(input_features)
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(l) for l in labels],
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id
        )

        # Replace padding token id's of the labels by -100 so they are ignored in the loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        batch = {
            "input_features": input_features,
            "labels": labels,
        }
        return batch


In [348]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor_base)


In [349]:
import evaluate

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Convert to numpy arrays and move to CPU
    if isinstance(pred_ids, torch.Tensor):
        pred_ids = pred_ids.cpu().numpy()
    if isinstance(label_ids, torch.Tensor):
        label_ids = label_ids.cpu().numpy()

    # Replace -100 with the pad token ID
    label_ids[label_ids == -100] = processor_base.tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = processor_base.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor_base.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [350]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

eval_training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-base-eval",
    per_device_eval_batch_size=1,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    predict_with_generate=True,
    fp16=False,  # Set to True if using GPU with enough memory
    evaluation_strategy="no",
)


In [351]:
trainer_base = Seq2SeqTrainer(
    model=model_base,
    args=eval_training_args,
    eval_dataset=small_val_dataset,
    tokenizer=processor_base.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer_base = Seq2SeqTrainer(


In [352]:
import torch

torch.cuda.empty_cache()


In [353]:
eval_results_base = trainer_base.evaluate()
print(f"Base Model WER: {eval_results_base['eval_wer']}")


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/100 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Base Model WER: 0.2568306010928962
