In [43]:
#installed required library
!pip install transformers datasets jiwer torchaudio



In [44]:
#imports
import os
import torch
import torchaudio
from datasets import load_dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)


In [45]:
# Load the training split from the JSON-lines transcript file
train_file = "data/transcripts/train.jsonl"
train_dataset = load_dataset(
    "json",
    split="train",
    data_files=train_file,
)
print(train_dataset)

Dataset({
    features: ['audio', 'sentence', 'language'],
    num_rows: 10
})


In [46]:
# Load the pre-trained Whisper checkpoint together with its processor
model_name = "openai/whisper-small"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name, task="transcribe", language="en")

# Reset generation defaults on the loaded model:
# an empty suppressed-token list and no forced decoder ids.
model.generation_config.suppress_tokens = []
model.generation_config.forced_decoder_ids = None



In [47]:
# Preprocess audio files: downmix to mono, resample to 16 kHz, extract features, tokenize
def preprocess(batch):
    """Turn one dataset row into Whisper training inputs.

    Args:
        batch: A single example with an "audio" entry (a path handed to
            ``torchaudio.load``) and a "sentence" entry (the reference text).

    Returns:
        The same mapping with two added entries: "input_features" (first item
        of the feature extractor output) and "labels" (tokenizer input ids for
        the sentence).
    """
    target_sr = 16000

    audio_path = batch["audio"]
    waveform, sr = torchaudio.load(audio_path)

    # Average the channels so multi-channel recordings become a single mono
    # track. Without this, `squeeze()` leaves a 2-D array for stereo files.
    # NOTE(review): WhisperFeatureExtractor appears to treat 2-D numpy input as
    # a batch, so `.input_features[0]` would then cover one channel only -
    # confirm against the installed transformers version.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr)
        waveform = resampler(waveform)
        sr = target_sr

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-sample clip to a 0-d array.
    array = waveform.squeeze(0).numpy()

    batch["input_features"] = processor.feature_extractor(
        array, sampling_rate=sr
    ).input_features[0]

    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

# Replace the raw columns with the model-ready features and labels.
train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


In [48]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed examples into one padded batch.

    Audio features are padded through ``processor.feature_extractor.pad`` and
    label ids through ``processor.tokenizer.pad``. Label positions whose
    attention mask is not 1 are replaced with -100.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id compared against the first label column of the batch.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry."""
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        token_items = []
        for example in features:
            token_items.append({"input_ids": example["labels"]})
        padded = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions that are padding (mask != 1) are overwritten with -100.
        is_padding = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading column only when every row begins with the
        # decoder start token.
        first_column = labels[:, 0]
        every_row_has_start = bool((first_column == self.decoder_start_token_id).all())
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Build the collator, taking the start-token id from the loaded model's config
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)


In [49]:
# 5. Training parameters

training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint limit
    output_dir="./finetuned-small",
    save_total_limit=1,
    # Schedule: 2 samples per device x 4 accumulation steps per update
    num_train_epochs=10,
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Half precision is requested only when a CUDA device is available
    fp16=torch.cuda.is_available(),
    # Logging and reporting
    logging_steps=10,
    report_to=[],
    push_to_hub=False,
)


In [50]:
# 6. Training

# `processing_class` replaces the deprecated `tokenizer` argument of the
# Trainer constructor (the old name emits a deprecation warning on recent
# transformers releases).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    processing_class=processor,
)

trainer.train()

  trainer = Seq2SeqTrainer(
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,3.5112
20,1.1394




TrainOutput(global_step=20, training_loss=2.325255012512207, metrics={'train_runtime': 151.9703, 'train_samples_per_second': 0.658, 'train_steps_per_second': 0.132, 'total_flos': 2.8858540032e+16, 'train_loss': 2.325255012512207, 'epoch': 10.0})

In [51]:
# Save the fine-tuned model and the processor into the same local directory
for artifact in (model, processor):
    artifact.save_pretrained("finetuned-small")
print("Fine-tuned Whisper model saved locally at ./finetuned-small")

Fine-tuned Whisper model saved locally at ./finetuned-small


In [52]:
import json
os.makedirs("results", exist_ok=True)

# Two ASR pipelines with identical generation settings: the original
# checkpoint and the locally saved fine-tuned one.
baseline_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    generate_kwargs={"language": "en", "task": "transcribe"}
)

ft_pipe = pipeline(
    "automatic-speech-recognition",
    model="./finetuned-small",
    generate_kwargs={"language": "en", "task": "transcribe"}
)

test_file = "data/transcripts/test.jsonl"
out_file = "results/comparison.txt"

print("\n Baseline vs Fine-tuned comparison")

# Explicit UTF-8 so reading the transcripts and writing the report do not
# depend on the platform's default encoding.
with open(test_file, "r", encoding="utf-8") as f, open(out_file, "w", encoding="utf-8") as outf:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue

        entry = json.loads(line)
        audio_file = entry["audio"]
        target_text = entry["sentence"]

        baseline_pred = baseline_pipe(audio_file)["text"]
        ft_pred = ft_pipe(audio_file)["text"]

        # Format once; the same text goes to the notebook output and the file.
        report = (
            f"\nAudio: {audio_file}\n"
            f"Target    : {target_text}\n"
            f"Baseline  : {baseline_pred}\n"
            f"Finetuned : {ft_pred}\n"
        )
        print(report, end="")
        # The file keeps one extra blank line after each entry.
        outf.write(report + "\n")

print(f"\n Results Stored in {out_file}")

Device set to use cuda:0
Device set to use cuda:0



 Baseline vs Fine-tuned comparison

Audio: data/audio/test/test.mp3
Target    : Using mvn and maven I synced git to github and used a portkey to call an openai chatgpt llm powered by groq and Grok.
Baseline  :  using mvn and maven I sync git and github and use a port key to call and open ai chat gpt llm powered by groke and groke
Finetuned :  Using mvn and Maven, I sync Git and GitHub and use a Portkey to call an openAI chatGPT LLM powered by Groq and Groq.

 Results Stored in results/comparison.txt


In [None]:
#imports
import os
import torch
import torchaudio
from datasets import load_dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)

# ========= Load Dataset =========
# Training split read from the JSON-lines transcript file
train_file = "data/transcripts/train.jsonl"
train_dataset = load_dataset(
    "json",
    split="train",
    data_files=train_file,
)
print(train_dataset)

# ========= Load Pretrained Whisper =========
model_name = "openai/whisper-small"   # <- for resource reasons, still small
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name, task="transcribe", language="en")

# Disable task forcing: empty suppressed-token list and no forced decoder ids
model.generation_config.suppress_tokens = []
model.generation_config.forced_decoder_ids = None

# ========= Preprocessing =========
def preprocess(batch):
    """Turn one dataset row into Whisper training inputs.

    Args:
        batch: A single example with an "audio" entry (a path handed to
            ``torchaudio.load``) and a "sentence" entry (the reference text).

    Returns:
        The same mapping with two added entries: "input_features" (first item
        of the feature extractor output) and "labels" (tokenizer input ids for
        the sentence).
    """
    target_sr = 16000

    audio_path = batch["audio"]
    waveform, sr = torchaudio.load(audio_path)

    # Average the channels so multi-channel recordings become a single mono
    # track. Without this, `squeeze()` leaves a 2-D array for stereo files.
    # NOTE(review): WhisperFeatureExtractor appears to treat 2-D numpy input as
    # a batch, so `.input_features[0]` would then cover one channel only -
    # confirm against the installed transformers version.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr)
        waveform = resampler(waveform)
        sr = target_sr

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-sample clip to a 0-d array.
    array = waveform.squeeze(0).numpy()

    # Extract input features
    batch["input_features"] = processor.feature_extractor(
        array, sampling_rate=sr
    ).input_features[0]

    # Tokenize sentence
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

# Replace the raw columns with the model-ready features and labels.
train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed examples into one padded batch.

    Audio features are padded through ``processor.feature_extractor.pad`` and
    label ids through ``processor.tokenizer.pad``. Label positions whose
    attention mask is not 1 are replaced with -100.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id compared against the first label column of the batch.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry."""
        # pad input_features
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # pad labels
        token_items = []
        for example in features:
            token_items.append({"input_ids": example["labels"]})
        padded = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions that are padding (mask != 1) are overwritten with -100.
        is_padding = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading column only when every row begins with the
        # decoder start token.
        first_column = labels[:, 0]
        every_row_has_start = bool((first_column == self.decoder_start_token_id).all())
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Build the collator, taking the start-token id from the loaded model's config
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)

# ========= Training =========
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint limit
    output_dir="./finetuned-small",
    save_total_limit=1,
    # Schedule: 2 samples per device x 4 accumulation steps per update
    num_train_epochs=10,
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Half precision is requested only when a CUDA device is available
    fp16=torch.cuda.is_available(),
    # Logging and reporting
    logging_steps=10,
    report_to=[],
    push_to_hub=False,
)

# `processing_class` replaces the deprecated `tokenizer` argument of the
# Trainer constructor (the old name emits a deprecation warning on recent
# transformers releases).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    processing_class=processor,
)

trainer.train()


# Save the fine-tuned model and the processor into the same local directory
for artifact in (model, processor):
    artifact.save_pretrained("finetuned-small")
print("Fine-tuned Whisper model saved locally at ./finetuned-small")


import json
os.makedirs("results", exist_ok=True)

# Two ASR pipelines with identical generation settings: the original
# checkpoint and the locally saved fine-tuned one.
baseline_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    generate_kwargs={"language": "en", "task": "transcribe"}
)

ft_pipe = pipeline(
    "automatic-speech-recognition",
    model="./finetuned-small",
    generate_kwargs={"language": "en", "task": "transcribe"}
)

test_file = "data/transcripts/test.jsonl"
out_file = "results/comparison.txt"

print("\n Baseline vs Fine-tuned comparison")

# Explicit UTF-8 so reading the transcripts and writing the report do not
# depend on the platform's default encoding.
with open(test_file, "r", encoding="utf-8") as f, open(out_file, "w", encoding="utf-8") as outf:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue

        entry = json.loads(line)
        audio_file = entry["audio"]
        target_text = entry["sentence"]

        baseline_pred = baseline_pipe(audio_file)["text"]
        ft_pred = ft_pipe(audio_file)["text"]

        # Format once; the same text goes to the notebook output and the file.
        report = (
            f"\nAudio: {audio_file}\n"
            f"Target    : {target_text}\n"
            f"Baseline  : {baseline_pred}\n"
            f"Finetuned : {ft_pred}\n"
        )
        print(report, end="")
        # The file keeps one extra blank line after each entry.
        outf.write(report + "\n")

print(f"\n Results Stored in {out_file}")

Dataset({
    features: ['audio', 'sentence', 'language'],
    num_rows: 10
})


  trainer = Seq2SeqTrainer(
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,3.5108
20,1.1845




Fine-tuned Whisper model saved locally at ./finetuned-small


Device set to use cuda:0
Device set to use cuda:0



 Baseline vs Fine-tuned comparison

Audio: data/audio/test/test.mp3
Target    : Using mvn and maven I synced git to github and used a portkey to call an openai chatgpt llm powered by groq and Grok.
Baseline  :  using mvn and maven I sync git and github and use a port key to call and open ai chat gpt llm powered by groke and groke
Finetuned :  Using mvn and Maven, I sync Git and GitHub and use a Portkey to call an openAI chatGPT LLM powered by Groq and Groq.

 Results Stored in results/comparison.txt
