In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import librosa

In [2]:
df = pd.read_excel("data/ASRdataset.xlsx")
df

Unnamed: 0,path,text
0,data/audio/Recording_1.wav,ยานี้ชื่อไอบูโปรเพนความแรง400มิลลิกรัมจำนวน10เม็ด
1,data/audio/Recording_2.wav,ยานี้ชื่ออะมอกซีซิลินความแรง500มิลลิกรัมจำนวน3...
2,data/audio/Recording_3.wav,ยานี้ชื่อเด็กซ์ออฟจำนวน1ขวดใช้สำหรับหยอดหู
3,data/audio/Recording_4.wav,สวัสดีครับผมเป็นเภสัชกร
4,data/audio/Recording_5.wav,หนูเป็นเภสัชกรค่ะ
5,data/audio/Recording_6.wav,มีอาการปวดหูที่ข้างซ้ายใช่ไหมคะ
6,data/audio/Recording_7.wav,คนไข้ตั้งครรภ์ไหมคะ
7,data/audio/Recording_8.wav,ความดันคนไข้สูงนะคะ
8,data/audio/Recording_9.wav,ยานี้ควรรับประทานติดต่อกันทุกวันจนหมด
9,data/audio/Recording_10.wav,ล้างมือให้สะอาด


In [3]:
from datasets import Dataset, DatasetDict

In [4]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['path', 'text'],
    num_rows: 24
})

In [5]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'text'],
        num_rows: 21
    })
    test: Dataset({
        features: ['path', 'text'],
        num_rows: 3
    })
})

In [6]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("scb10x/monsoon-whisper-medium-gigaspeech2")

In [7]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("scb10x/monsoon-whisper-medium-gigaspeech2", language="th", task="transcribe")

In [8]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("scb10x/monsoon-whisper-medium-gigaspeech2", language="th", task="transcribe")

In [9]:
def get_array(file_path):
    audio_input, samplerate = librosa.load(file_path, sr=16000)
    return audio_input

In [10]:
def prepare_dataset(batch):
    print(batch['path'])
    arr = get_array(f'{batch['path']}')
    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(arr, sampling_rate=16000).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch['text']).input_ids
    return batch

In [11]:
dataset = dataset.map(prepare_dataset)

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

data/audio/Recording_1.wav
data/audio/Recording_10.wav
data/audio/Recording_18.wav
data/audio/Recording_24.wav
data/audio/Recording_13.wav
data/audio/Recording_7.wav
data/audio/Recording_3.wav
data/audio/Recording_8.wav
data/audio/Recording_11.wav
data/audio/Recording_2.wav
data/audio/Recording_16.wav
data/audio/Recording_4.wav
data/audio/Recording_14.wav
data/audio/Recording_23.wav
data/audio/Recording_6.wav
data/audio/Recording_21.wav
data/audio/Recording_22.wav
data/audio/Recording_5.wav
data/audio/Recording_15.wav
data/audio/Recording_19.wav
data/audio/Recording_20.wav


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

data/audio/Recording_17.wav
data/audio/Recording_9.wav
data/audio/Recording_12.wav


In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'text', 'input_features', 'labels'],
        num_rows: 21
    })
    test: Dataset({
        features: ['path', 'text', 'input_features', 'labels'],
        num_rows: 3
    })
})

In [13]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("scb10x/monsoon-whisper-medium-gigaspeech2")

In [15]:
model.generation_config.language = "th"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

In [16]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [17]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [18]:
import evaluate

metric = evaluate.load("wer")

In [20]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-monsoon-t1",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=3.0,
    learning_rate=5e-5,
    gradient_checkpointing=True,
    fp16=True,
    bf16=False,
    optim="adamw_torch_fused", # adamw_torch_fused, adamw_8bit, adamw_torch, sgd
    eval_strategy="epoch",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=256,
    save_strategy="epoch",
    save_total_limit=5,
    logging_steps=50,
    report_to=None,
    push_to_hub=False,
)

In [24]:
# from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     output_dir="./whisper-monsoon-t1",  # change to a repo name of your choice
#     per_device_train_batch_size=16,
#     gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
#     learning_rate=1e-5,
#     warmup_steps=500,
#     max_steps=4000,
#     gradient_checkpointing=True,
#     fp16=True,
#     evaluation_strategy="steps",
#     per_device_eval_batch_size=8,
#     predict_with_generate=True,
#     generation_max_length=225,
#     save_steps=1000,
#     eval_steps=1000,
#     logging_steps=25,
#     report_to=["tensorboard"],
#     load_best_model_at_end=True,
#     metric_for_best_model="wer",
#     greater_is_better=False,
#     push_to_hub=True,
# )

In [25]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [26]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model.to(device),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [27]:
processor.save_pretrained(training_args.output_dir)

[]

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Wer
1,No log,0.162984,33.333333
2,No log,0.161941,33.333333
3,No log,0.118921,33.333333


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50289], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TrainOutput(global_step=18, training_loss=0.841770225101047, metrics={'train_runtime': 1499.1443, 'train_samples_per_second': 0.042, 'train_steps_per_second': 0.012, 'total_flos': 6.429810917376e+16, 'train_loss': 0.841770225101047, 'epoch': 3.0})

In [36]:
trainer.save_model("./whisper-monsoon-t1")

In [31]:
import json
import matplotlib.pyplot as plt

# Load training logs
log_file = "./whisper-monsoon-t1/trainer_state.json"
with open(log_file, "r") as f:
    logs = json.load(f)

# Extract loss values
steps = [x["step"] for x in logs["log_history"] if "loss" in x]
losses = [x["loss"] for x in logs["log_history"] if "loss" in x]

# Plot training loss
plt.figure(figsize=(8,5))
plt.plot(steps, losses, marker="o", linestyle="-")
plt.xlabel("Training Steps")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.grid()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: './whisper-monsoon-t1/trainer_state.json'

# try to use it

In [34]:
ls

 Volume in drive C is Windows-SSD
 Volume Serial Number is 0EC5-F6D5

 Directory of c:\Users\LENOVO\Desktop\DemoP\flask

02/19/2025  09:43 AM    <DIR>          .
02/07/2025  08:43 PM    <DIR>          ..
02/07/2025  08:38 PM               768 ASR.py
02/12/2025  09:10 PM    <DIR>          asr_finetuned
02/07/2025  08:28 PM    <DIR>          audio
02/18/2025  05:40 PM    <DIR>          checkpoints
02/07/2025  08:31 PM         1,564,844 converted.wav
02/18/2025  08:23 PM    <DIR>          data
02/18/2025  08:05 PM            20,784 evalanotherdata.ipynb
02/13/2025  10:09 AM         1,150,346 findTuningASR.ipynb
02/19/2025  12:53 PM            30,423 findTuningSentT.ipynb
02/21/2025  07:37 AM            32,815 fineTourASR.ipynb
02/18/2025  06:35 PM         1,423,433 flow.ipynb
02/05/2025  02:41 PM    <DIR>          mms
02/18/2025  06:34 PM            38,567 recording_results.xlsx
02/07/2025  08:52 PM             7,133 server.py
02/06/2025  08:33 PM            69,690 techno_output.wav
02/09

In [18]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch

# Define model path
model_path = "./whisper-monsoon-t1"

# Load processor (tokenizer + feature extractor)
processor = WhisperProcessor.from_pretrained(model_path)

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)

# Set to evaluation mode
model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [19]:
def split_audio(audio, sr, max_length_sec=30):
    max_samples = sr * max_length_sec
    return [audio[i:i + max_samples] for i in range(0, len(audio), max_samples)]

In [23]:
def ASR(file_path):
    sr=16000
    # Load audio file
    audio_input, _ = librosa.load(file_path, sr=sr)  # Ensure 16kHz sample rate

    # Split audio into chunks
    audio_chunks = split_audio(audio_input, sr)
    
    transcriptions = []
    
    for chunk in audio_chunks:
        # Process the chunk to match model input requirements, 
        # include any generation parameters as needed (e.g., language)
        input_features = processor(chunk, sampling_rate=sr, return_tensors="pt").input_features.to(device)
        
        # Generate token IDs using the model
        with torch.no_grad():
            predicted_ids = model.generate(input_features)
        
        # Decode token IDs to text, skipping any special tokens
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        transcriptions.append(transcription)
    
    # Combine transcriptions from all chunks (add a separator if needed)
    return "".join(transcriptions)

In [24]:
print(ASR(r'data\test\recording.wav'))

  audio_input, _ = librosa.load(file_path, sr=sr)  # Ensure 16kHz sample rate


วันนี้มารับยา3ตัวนะครับยานี้ชื่อไอบูโปรเพนความแรง400มิลลิกรัมจำนวนเม็ดใช้เพื่อบรรเทาอาการปวดรับประทานครั้งละยานี้ควรรับประทานติดต่อกันทุกวันจนหมดและยานี้ชื่อเด็กซ์ออฟจำนวน1ขวดใช้สำหรับหยอดหูเพื่อรักษาอาการติดเชื้อที่หูหยอดสามหยอดที่หูข้างซ้ายวันละสามครั้งหลังอาหารเช้ากลางวันเย็น 
