### prepare environment

In [3]:
!pip install --upgrade -q pip
!pip install --upgrade -q datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
!pip install --upgrade -q librosa

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m105.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m139.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.
tensorflow 2.17.1 requires tensorboard<2.18,>=2.17, but you have tensorboard 2.18.0 which is incompatible.[0m[31m
[0m

### login notebook

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### import libraries

In [4]:
import os
import json
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split

In [5]:
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/whisper_fine_tuning/')

Mounted at /content/drive


## Load dataset

In [None]:
def load_audios(path):
  """Load every file found in `path` with librosa at a 16 kHz sampling rate.

  Files are visited in sorted filename order.

  Returns:
    A list with one dict per file, holding the decoded samples under
    'audio', the sampling rate reported by librosa under 'sampling_rate'
    and the bare filename under 'file_name'.
  """
  def _read(file_name):
    # librosa returns the waveform together with the rate it was loaded at
    samples, rate = librosa.load(path=os.path.join(path, file_name), sr=16_000)
    return {'audio': samples, 'sampling_rate': rate, 'file_name': file_name}

  return [_read(file_name) for file_name in sorted(os.listdir(path))]

def load_labels(path):
  """Read the manual transcription out of every JSON file in `path`.

  Files are visited in sorted filename order. Each file is expected to
  contain a JSON object with the transcript at
  ['manual_transcription']['text'].

  Returns:
    A list with one {'text': <transcript>} dict per file.

  Raises:
    KeyError: if a file lacks the 'manual_transcription' / 'text' keys.
    json.JSONDecodeError: if a file is not valid JSON.
  """
  labels = []
  for file_name in sorted(os.listdir(path)):
    # The transcripts contain Urdu text, so decode explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(os.path.join(path, file_name), 'r', encoding='utf-8') as file:
      record = json.load(file)
    labels.append({'text': record['manual_transcription']['text']})
  return labels


In [None]:
# Root folder of the project on the mounted Drive (the same folder the
# notebook changed into with os.chdir earlier).
path_prefix = '/content/drive/MyDrive/whisper_fine_tuning'
# `dataset` holds two lists: decoded audio records and transcript records.
# NOTE(review): both loaders iterate their folder in sorted filename order, so
# audio i and label i are presumably meant to belong together — confirm that
# the two folders contain matching, identically ordered files.
dataset = {}
dataset['audios'] = load_audios(os.path.join(path_prefix, 'audios_small'))
dataset['labels'] = load_labels(os.path.join(path_prefix, 'labels_small'))

In [None]:
dataset['audios'][0]

{'audio': array([0.        , 0.        , 0.        , ..., 0.00302124, 0.0100708 ,
        0.01977539], dtype=float32),
 'sampling_rate': 16000,
 'file_name': 'audio_000000-00.wav'}

### train-test split

In [None]:
# Split audios and labels together (test_size=0.2, fixed random_state) and
# store the resulting pieces under the 'audios' / 'labels' keys of `train`
# and `test`.
train = {}
test = {}
train['audios'], test['audios'], train['labels'], test['labels'] = train_test_split(
              dataset['audios'], dataset['labels'], test_size=0.2, random_state=42)

In [None]:
print(f'num_trainset:\t{len(train["audios"])}')
print(f'num_testset:\t{len(test["audios"])}')

num_trainset:	100
num_testset:	25


## Prepare Dataset

### prepare whisper stuff

In [6]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# All three objects come from the same pretrained checkpoint. The tokenizer and
# the processor are configured for Urdu transcription (language="Urdu",
# task="transcribe").
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v3-turbo")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v3-turbo", language="Urdu", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo", language="Urdu", task="transcribe")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

### verify tokenizer functionality

In [None]:
# Sanity check: encode the first training transcript and decode it again, once
# keeping the special tokens and once dropping them, then compare the
# special-token-free decoding with the original string.
input_str = train["labels"][0]['text']
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 دیا۔ So even things like Muhammad bin Qasim کو hero بنانا اور یہ سب یہ Zia کے time میں in trend زیادہ ہوا یہ۔ بھئی آپ Muhammad bin Qasim کے بارے میں سچ تو بولتے نہیں۔ وہ ساری کہانی جو ہے اس کے بھی تو تین پہلو ہیں۔ آپ تو ایک ہی پہلو کا ذکر کرتے ہیں۔ What about the other two? اس کے آنے سے پہلے کیا ہوا اور اس کے جانے کے بعد کیا ہوا۔
Decoded w/ special:    <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>دیا۔ So even things like Muhammad bin Qasim کو hero بنانا اور یہ سب یہ Zia کے time میں in trend زیادہ ہوا یہ۔ بھئی آپ Muhammad bin Qasim کے بارے میں سچ تو بولتے نہیں۔ وہ ساری کہانی جو ہے اس کے بھی تو تین پہلو ہیں۔ آپ تو ایک ہی پہلو کا ذکر کرتے ہیں۔ What about the other two? اس کے آنے سے پہلے کیا ہوا اور اس کے جانے کے بعد کیا ہوا۔<|endoftext|>
Decoded w/out special: دیا۔ So even things like Muhammad bin Qasim کو hero بنانا اور یہ سب یہ Zia کے time میں in trend زیادہ ہوا یہ۔ بھئی آپ Muhammad bin Qasim کے بارے میں سچ تو بولتے نہیں۔ وہ ساری کہانی جو ہے اس کے بھی 

### extract features

In [5]:
def prepare_dataset(data):
    """Turn one raw record into a training example.

    Args:
        data: dict with an 'audios' entry (a dict holding 'audio' samples and
            their 'sampling_rate') and a 'labels' entry (a dict holding the
            transcript under 'text').

    Returns:
        A new dict with the same keys as `data` plus:
        'input_features' - first element of the feature extractor's
            `input_features` for the audio,
        'sentence'       - the raw transcript text,
        'labels'         - the tokenizer's `input_ids` for the transcript
            (replaces the original 'labels' dict).

    The input record is not modified. Previously 'labels' was overwritten in
    place, so calling the function twice on the same record failed because
    'labels' was no longer a dict.
    """
    audio = data["audios"]
    text = data["labels"]["text"]

    # shallow copy: keeps the original keys/order but leaves the caller's dict intact
    prepared = dict(data)
    # compute the model input features from the raw audio array
    prepared["input_features"] = feature_extractor(audio["audio"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # keep the plain text next to its token ids
    prepared["sentence"] = text
    prepared["labels"] = tokenizer(text).input_ids
    return prepared


In [None]:
# pd.DataFrame(...).to_dict(orient="records") turns the {'audios': [...],
# 'labels': [...]} dict of parallel lists into one {'audios': ..., 'labels': ...}
# record per example; each record is then passed through prepare_dataset.
prepared_train = list(map(prepare_dataset, pd.DataFrame(train).to_dict(orient="records")))
prepared_test = list(map(prepare_dataset, pd.DataFrame(test).to_dict(orient="records")))

In [None]:
prepared_train[0]

{'audios': {'audio': array([ 0.03942871,  0.03869629,  0.0378418 , ..., -0.00015259,
         -0.00018311, -0.00012207], dtype=float32),
  'sampling_rate': 16000,
  'file_name': 'audio_000180-02.wav'},
 'labels': [50258,
  50290,
  50360,
  50364,
  3215,
  38849,
  24621,
  407,
  754,
  721,
  411,
  19360,
  5171,
  1249,
  296,
  332,
  31561,
  5316,
  44945,
  7649,
  995,
  32930,
  35324,
  8608,
  3555,
  35324,
  1176,
  654,
  24049,
  565,
  27875,
  294,
  6028,
  30767,
  4135,
  18513,
  6354,
  12138,
  14407,
  35324,
  24621,
  4724,
  14268,
  19986,
  4135,
  46201,
  19360,
  5171,
  1249,
  296,
  332,
  24049,
  4724,
  9640,
  7369,
  27875,
  8608,
  24061,
  33427,
  4724,
  12610,
  2655,
  7369,
  50194,
  24621,
  44291,
  8608,
  9640,
  4135,
  33491,
  7649,
  4135,
  10874,
  2407,
  23905,
  24525,
  24049,
  4724,
  36078,
  33427,
  6055,
  32151,
  21453,
  6354,
  1211,
  2407,
  38904,
  24621,
  46201,
  33427,
  1975,
  29325,
  12138,
  4135,
 

### checking if any datapoint has more than max tokens


In [None]:
# Whisper's decoder has 448 target positions (`max_target_positions` in the
# model config), so a tokenized label longer than that cannot be trained on.
# The previous threshold of 480 let such examples slip through unnoticed.
MAX_LABEL_TOKENS = 448

# indices of the training examples whose label sequence is too long
lens=[i for i, p_t in enumerate(prepared_train) if len(p_t['labels'])>MAX_LABEL_TOKENS]
print(lens)

[]


In [None]:
model = 0

## Fine-tune model

### load pre-trained checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained checkpoint that will be fine-tuned and configure
# generation for Urdu transcription.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3-turbo")
model.generation_config.language = "Urdu"
model.generation_config.task = "transcribe"

# Language and task are set through generation_config above, so any forced
# decoder ids stored with the checkpoint are cleared.
model.generation_config.forced_decoder_ids = None


config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

### define data collator

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into one padded batch.

    Audio features and label token ids are padded separately because they
    have different lengths and need different padding methods.

    Attributes:
        processor: object exposing `feature_extractor.pad` and `tokenizer.pad`.
        decoder_start_token_id: token id that is stripped from the front of
            the labels when every row of the batch starts with it.
    """
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from `features`.

        Args:
            features: examples providing "input_features" and "labels".

        Returns:
            The padded feature-extractor batch with an added "labels" tensor
            in which padded positions are set to -100.
        """
        # audio side: let the feature extractor assemble the tensors
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # text side: pad the token id sequences to a common length
        token_inputs = [{"input_ids": example["labels"]} for example in features]
        padded_tokens = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # padded positions get -100 so that they are ignored by the loss
        is_padding = padded_tokens.attention_mask != 1
        target_ids = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # when tokenization already put the start token in front of every row,
        # drop it here because it is added again later on
        starts_with_bos = (target_ids[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if starts_with_bos:
            target_ids = target_ids[:, 1:]

        batch["labels"] = target_ids
        return batch


In [None]:
# Collator instance handed to the trainer; the start token id comes from the
# loaded model's config.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


### define evaluation metrics - wer

In [None]:
import evaluate

metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 at masked positions).

    Returns:
        {"wer": <100 * metric result>}

    Note: `pred.label_ids` is modified in place (-100 -> pad token id).
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 cannot be decoded, so put the pad token id back at those positions
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # decode both sides to plain text without special tokens
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}

### define training arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # checkpoints and TensorBoard logs are written here; change to a folder / repo name of your choice
    output_dir="/content/drive/MyDrive/whisper_fine_tuning/whisper-large-v3-turbo-ur-6",
    # NOTE: Trainer does not act on this field by itself. To actually resume,
    # pass a checkpoint to `trainer.train(resume_from_checkpoint=...)`.
    resume_from_checkpoint="/content/drive/MyDrive/whisper_fine_tuning/whisper-large-v3-turbo-ur-6",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=5e-7,
    warmup_steps=200,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` argument
    eval_strategy="steps",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=225,
    # with load_best_model_at_end, save_steps must be a round multiple of eval_steps
    save_steps=100,
    eval_steps=20,
    logging_steps=20,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
)




In [None]:
from transformers import TrainerCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from torch.utils.data import IterableDataset

class ShuffleCallback(TrainerCallback):
    """Advance the epoch of a streamed training dataset when an epoch begins.

    Only plain `IterableDataset` instances are touched; for an
    `IterableDatasetShard` the Trainer handles set_epoch() itself, and any
    other dataset type is left alone.
    """

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            # set_epoch() is handled by the Trainer
            return
        if isinstance(dataset, IterableDataset):
            dataset.set_epoch(dataset._epoch + 1)

In [None]:
# del trainer

In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data, collator and metric together for training.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # otherwise logs "Trainer.tokenizer is now deprecated" over and over
    processing_class=processor.feature_extractor,
    callbacks=[ShuffleCallback()],
)

  trainer = Seq2SeqTrainer(


In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

### start training



In [36]:
# Trainer ignores `TrainingArguments.resume_from_checkpoint`; resuming only
# happens when a checkpoint is passed to `train()`. Pick up the newest
# checkpoint in the output folder if there is one, otherwise start fresh
# (resume_from_checkpoint=None).
from transformers.trainer_utils import get_last_checkpoint

# get_last_checkpoint lists the folder, so guard against it not existing yet
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
20,1.3351,0.691259,78.804348
40,1.2778,0.665551,67.391304
60,1.1713,0.60134,55.298913
80,1.0111,0.543332,48.143116
100,0.8928,0.495622,38.632246
120,0.747,0.462127,32.065217
140,0.6541,0.439742,24.456522
160,0.5605,0.417551,21.240942
180,0.4401,0.401028,21.376812
200,0.3475,0.398159,21.150362


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instea

Step,Training Loss,Validation Loss,Wer
20,1.3351,0.691259,78.804348
40,1.2778,0.665551,67.391304
60,1.1713,0.60134,55.298913
80,1.0111,0.543332,48.143116
100,0.8928,0.495622,38.632246
120,0.747,0.462127,32.065217
140,0.6541,0.439742,24.456522
160,0.5605,0.417551,21.240942
180,0.4401,0.401028,21.376812
200,0.3475,0.398159,21.150362


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

KeyboardInterrupt: 

AttributeError: 'Seq2SeqTrainer' object has no attribute 'best_model_checkpoint'

In [43]:
!ls /root/empty

ls: cannot access '/root/empty_cache/': No such file or directory


In [44]:
trainer.model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

## Test Model

In [7]:
from transformers import WhisperForConditionalGeneration

# Load a fine-tuned checkpoint from Drive for inference.
# NOTE(review): this path points at run "whisper-large-v3-turbo-ur-2", while the
# training arguments in this notebook write to "whisper-large-v3-turbo-ur-6" —
# confirm this is the checkpoint you intend to test.
model = WhisperForConditionalGeneration.from_pretrained("/content/drive/MyDrive/whisper_fine_tuning/whisper-large-v3-turbo-ur-2/checkpoint-80")
model.generation_config.language = "Urdu"
model.generation_config.task = "transcribe"

# Language and task are set through generation_config above, so any forced
# decoder ids stored with the checkpoint are cleared.
model.generation_config.forced_decoder_ids = None


In [8]:
from transformers import pipeline
# import gradio as gr

# Speech-recognition pipeline built from the model object loaded above together
# with the tokenizer, feature extractor and processor created earlier.
pipe = pipeline(task="automatic-speech-recognition", model=model,
                tokenizer=tokenizer, feature_extractor=feature_extractor,
                processor=processor)  # `model` is the in-memory model object, not a Hub repo id

def transcribe(audio):
    """Run `audio` through the ASR pipeline `pipe` and return the "text" entry of its result."""
    return pipe(audio)["text"]

# iface = gr.Interface(
#     fn=transcribe,
#     inputs=gr.Audio(source="microphone", type="filepath"),
#     outputs="text",
#     title="Whisper Large Urdu",
#     description="Realtime demo for Urdu speech recognition using a fine-tuned Whisper large model.",
# )

# iface.launch()


Device set to use cuda:0


In [29]:
# a, sr = librosa.load('/content/drive/MyDrive/whisper_fine_tuning/audios_test/audio_004156.wav', sr=16_000)
# l = int(len(a)/9)  # need segment < 30sec
# l

320000

In [15]:
from IPython.display import Audio
test_input_path_prefix = '/content/drive/MyDrive/whisper_fine_tuning/audios_test'
test_output_path_prefix = '/content/drive/MyDrive/whisper_fine_tuning/output_test'

# Whisper consumes at most 30 s of audio at a time, so every recording is cut
# into windows of that length. The previous fixed split into 9 parts produced
# segments longer than 30 s for recordings above 4.5 minutes and silently
# dropped the last `len(a) % 9` samples.
MAX_SEGMENT_SECONDS = 30

# List the finished outputs once instead of re-reading the folder per file.
already_transcribed = set(os.listdir(test_output_path_prefix))

ls = os.listdir(test_input_path_prefix)
for l in ls:
  output_name = l.removesuffix('.wav')+'_transcribed_output.txt'
  if output_name in already_transcribed:
    continue  # transcript exists from an earlier run
  print(f'transcribing {l}')
  a, sr = librosa.load(os.path.join(test_input_path_prefix, l), sr=16_000)
  seg_len = MAX_SEGMENT_SECONDS * sr  # samples per segment
  # explicit UTF-8: the transcripts contain Urdu text
  with open(os.path.join(test_output_path_prefix, output_name), 'w', encoding='utf-8') as file:
    # step through the whole signal; the final segment may be shorter
    for start in range(0, len(a), seg_len):
      file.write(transcribe(a[start:start+seg_len])+' ')

transcribing audio_000122.wav


