In [1]:
import pandas as pd
import librosa
from tqdm import tqdm
tqdm.pandas()
from datasets import Dataset, DatasetDict
from transformers import AutoProcessor
from transformers import WhisperForConditionalGeneration, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_excel("data/ASRdataset.xlsx")
df.head()

Unnamed: 0,path,text
0,data/audio/Recording_1.wav,ยานี้ชื่อไอบูโปรเพนความแรง400มิลลิกรัมจำนวน10เม็ด
1,data/audio/Recording_2.wav,ยานี้ชื่ออะมอกซีซิลินความแรง500มิลลิกรัมจำนวน3...
2,data/audio/Recording_3.wav,ยานี้ชื่อเด็กซ์ออฟจำนวน1ขวดใช้สำหรับหยอดหู
3,data/audio/Recording_4.wav,สวัสดีครับผมเป็นเภสัชกร
4,data/audio/Recording_5.wav,หนูเป็นเภสัชกรค่ะ


In [14]:
def get_array(file_path):
    audio_input, samplerate = librosa.load(file_path, sr=16000)
    return audio_input

In [15]:
df['array_audio'] = df['path'].progress_apply(get_array)

100%|██████████| 24/24 [00:00<00:00, 980.63it/s]


In [16]:
df.to_excel("data/ASRdatasetA.xlsx", index=False)  # Save without index

In [133]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [134]:
# Split into training and validation sets
dataset = dataset.train_test_split(test_size=0.1)

In [124]:
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'text', 'array_audio'],
        num_rows: 21
    })
    test: Dataset({
        features: ['path', 'text', 'array_audio'],
        num_rows: 3
    })
})

In [None]:
# Load processor (Whisper tokenizer + feature extractor)
processor = AutoProcessor.from_pretrained("scb10x/monsoon-whisper-medium-gigaspeech2")

In [137]:
# Function to process audio & text
def preprocess_function(batch):
    # Extract audio features using the feature_extractor
    inputs = processor.feature_extractor(batch["array_audio"], sampling_rate=16000, padding=True, truncation=True, return_tensors="pt")
    
    # Tokenize the transcription labels with proper padding and truncation
    labels = processor.tokenizer(batch["text"], padding="max_length", truncation=True, max_length=63, return_tensors="pt").input_ids

    return {
        "input_values": inputs['input_features'].squeeze(0),  # Audio features
        "labels": labels.squeeze(0)  # Tokenized text
    }

# Apply preprocessing
dataset = dataset.map(preprocess_function,remove_columns=["path","array_audio", "text"])

Map: 100%|██████████| 21/21 [00:00<00:00, 46.49 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 41.66 examples/s]


In [138]:
dataset["train"][0]

{'input_values': [[-0.4501652717590332,
   -1.0264601707458496,
   -1.0264601707458496,
   -1.0186576843261719,
   -1.0000979900360107,
   -0.9855144023895264,
   -0.8715577125549316,
   -0.6948832273483276,
   -0.6944653987884521,
   -0.6081035137176514,
   -0.6148099899291992,
   -0.9883877038955688,
   -0.6988160610198975,
   -0.5806037187576294,
   -0.7163089513778687,
   -0.7485007047653198,
   -0.5610278844833374,
   -0.5365934371948242,
   -0.3863520622253418,
   -0.5109004974365234,
   -0.5683385133743286,
   -0.7730655670166016,
   -1.0264601707458496,
   -0.7149299383163452,
   -0.5597474575042725,
   -0.7669703960418701,
   -0.7867625951766968,
   -0.8718956708908081,
   -1.0264601707458496,
   -0.8947041034698486,
   -0.8853510618209839,
   -1.005678653717041,
   -1.0264601707458496,
   -1.0264601707458496,
   -1.0264601707458496,
   -1.0264601707458496,
   -0.8707360029220581,
   -0.3442995548248291,
   -0.31979286670684814,
   -0.21941494941711426,
   -0.5323359966278076,

In [147]:
# Load ASR model
model = WhisperForConditionalGeneration.from_pretrained("scb10x/monsoon-whisper-medium-gigaspeech2")

In [148]:
# Replace 'evaluation_strategy' with 'eval_strategy'
training_args = TrainingArguments(
    output_dir="./asr_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="epoch",  # Update to 'eval_strategy'
    save_strategy="epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=5,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,  # Enable mixed precision training for speedup
    push_to_hub=False
)

In [149]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [150]:
# Start fine-tuning
trainer.train()

AttributeError: 'NoneType' object has no attribute 'shape'

# tut

In [86]:
from huggingface_hub import login

login("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")

In [57]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [87]:
from huggingface_hub import whoami

print(whoami())


{'type': 'user', 'id': '6669207f1a032fb6eeb443f8', 'name': 'sittikornnn', 'fullname': 'chaloemkittichai', 'isPro': False, 'avatarUrl': '/avatars/4c227b8be20566af76c59109bbaaa0cf.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'fine', 'role': 'fineGrained', 'createdAt': '2025-02-13T02:40:26.244Z', 'fineGrained': {'canReadGatedRepos': False, 'global': [], 'scoped': [{'entity': {'_id': '67a8413b80b1fde76d3344e1', 'type': 'space', 'name': 'sittikornnn/trail1'}, 'permissions': ['repo.content.read', 'discussion.write', 'repo.write']}, {'entity': {'_id': '6669207f1a032fb6eeb443f8', 'type': 'user', 'name': 'sittikornnn'}, 'permissions': ['collection.read', 'repo.content.read', 'collection.write']}]}}}}


In [60]:
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="train+validation")
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="test")

print(common_voice)

n_shards.json:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


hi_train_0.tar:   0%|          | 0.00/114M [00:00<?, ?B/s]

hi_dev_0.tar:   0%|          | 0.00/61.9M [00:00<?, ?B/s]

hi_test_0.tar:   0%|          | 0.00/92.2M [00:00<?, ?B/s]

hi_other_0.tar:   0%|          | 0.00/113M [00:00<?, ?B/s]

hi_invalidated_0.tar:   0%|          | 0.00/23.4M [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/627k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/824k [00:00<?, ?B/s]

other.tsv:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

invalidated.tsv:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Reading metadata...: 4361it [00:00, 229542.95it/s]


Generating validation split: 0 examples [00:00, ? examples/s]

Reading metadata...: 2179it [00:00, 241859.54it/s]


Generating test split: 0 examples [00:00, ? examples/s]

Reading metadata...: 2894it [00:00, 137810.12it/s]


Generating other split: 0 examples [00:00, ? examples/s]

Reading metadata...: 3328it [00:00, 237739.61it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]

Reading metadata...: 680it [00:00, 92670.72it/s]


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 6540
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 2894
    })
})


In [61]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 6540
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2894
    })
})


In [63]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [64]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")

In [65]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")

In [66]:
print(common_voice["train"][0])

{'audio': {'path': 'C:\\Users\\LENOVO\\.cache\\huggingface\\datasets\\downloads\\extracted\\0fca57e4965e9a19758e6c01a96995aa512ec125031b5cd20b56887dd2bf1bd7\\hi_train_0/common_voice_hi_26008353.mp3', 'array': array([ 6.46234854e-26, -1.35709319e-25, -8.07793567e-26, ...,
        1.06425944e-07,  4.46417090e-08,  2.61451660e-09]), 'sampling_rate': 48000}, 'sentence': 'हमने उसका जन्मदिन मनाया।'}


In [67]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [68]:
print(common_voice["train"][0])

{'audio': {'path': 'C:\\Users\\LENOVO\\.cache\\huggingface\\datasets\\downloads\\extracted\\0fca57e4965e9a19758e6c01a96995aa512ec125031b5cd20b56887dd2bf1bd7\\hi_train_0/common_voice_hi_26008353.mp3', 'array': array([ 5.98479599e-17,  3.12250226e-17, -1.04083409e-17, ...,
       -1.31181878e-07,  2.62807589e-07,  4.76284185e-08]), 'sampling_rate': 16000}, 'sentence': 'हमने उसका जन्मदिन मनाया।'}


In [71]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [73]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"])

Map:   0%|          | 0/6540 [00:00<?, ? examples/s]

Map:   0%|          | 0/2894 [00:00<?, ? examples/s]

In [74]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [75]:
model.generation_config.language = "hindi"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [77]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [78]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [79]:
import evaluate

metric = evaluate.load("wer")

In [80]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [81]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-hi",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



In [89]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [90]:
processor.save_pretrained(training_args.output_dir)

[]

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss
