# **method one**

In [None]:
# === Install (run once) ===
!pip install -q peft transformers datasets accelerate jiwer evaluate soundfile

# === Imports ===
import os, json, random, shutil
from pathlib import Path
import torch
from datasets import load_dataset, Audio, Dataset
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List

# === USER CONFIG - edit these paths & hyperparams ===
# --- Input data and model locations ---
INPUT_JSON = "all_data.json"          # your combined JSON manifest (audio_filepath + text)
AUDIO_ROOT = "./audio_files"          # root where audio files live (if paths in JSON are relative)
BASE_MODEL = "openai/whisper-large-v3-turbo"
SAVED_ADAPTER_DIR = "turbo_adapter"   # your existing adapter dir (if you want to resume from it)
SAVED_PROCESSOR_DIR = "turbo_whisper" # processor dir you saved earlier
# --- Output locations ---
OUTPUT_ADAPTER_DIR = "turbo_adapter_updated"   # where to save updated adapter after training
MERGED_MODEL_DIR = "turbo_whisper_merged"      # final merged model for direct inference / download
ZIP_OUTPUT = MERGED_MODEL_DIR + ".zip"         # archive path reported after the merged model dir is zipped

# --- Split fractions applied to the shuffled manifest ---
# NOTE: the split code in section 1 sizes train and validation from
# TRAIN_RATIO / VALID_RATIO and gives the test split whatever remains;
# TEST_RATIO is not read there.
TRAIN_RATIO = 0.8
VALID_RATIO = 0.1
TEST_RATIO  = 0.1

# Training hyperparams (tune for your GPU)
# The first four are passed to Seq2SeqTrainingArguments in the training-setup section.
PER_DEVICE_BATCH_SIZE = 4
GRAD_ACCUM_STEPS = 8
LEARNING_RATE = 3e-5
NUM_EPOCHS = 3
# Used both for the datasets Audio cast and for the feature-extractor call.
SAMPLING_RATE = 16000   # set to the rate you used previously

# === 1) Split the JSON into train/valid/test ===
# Read the manifest once, then work out whether it is a JSON list or JSONL.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    manifest_text = f.read()

try:
    data = json.loads(manifest_text)
except json.JSONDecodeError:
    # Not a single JSON document; the JSONL fallback below handles it.
    data = None

if not isinstance(data, list):
    # Either whole-file parsing failed or the document is not a list (for
    # example a one-line JSONL file parses as a single dict). Try JSONL.
    # Split on "\n" only: str.splitlines() also breaks on characters such as
    # U+2028, which may appear unescaped inside JSON strings.
    try:
        data = [json.loads(line) for line in manifest_text.split("\n") if line.strip()]
    except json.JSONDecodeError as err:
        raise ValueError(
            f"{INPUT_JSON} is neither a JSON list of examples nor JSONL "
            "(one JSON object per line)"
        ) from err

if not data:
    raise ValueError(f"No examples found in {INPUT_JSON}")

# Shuffle with a fixed seed so re-running this cell reproduces the same
# train/valid/test membership (otherwise earlier test examples can end up in
# the training split of a later run).
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

n = len(data)
n_train = int(n * TRAIN_RATIO)
n_valid = int(n * VALID_RATIO)
train_data = data[:n_train]
valid_data = data[n_train:n_train + n_valid]
# The test split is whatever remains after train and validation.
test_data  = data[n_train + n_valid:]

print(f"Total examples: {n} → train={len(train_data)}, valid={len(valid_data)}, test={len(test_data)}")

# Persist each split as a JSONL file under ./splits (read back later by datasets.load_dataset)
os.makedirs("splits", exist_ok=True)
split_records = {"train": train_data, "validation": valid_data, "test": test_data}
for split_name, records in split_records.items():
    out_path = os.path.join("splits", f"{split_name}.jsonl")
    with open(out_path, "w", encoding="utf-8") as fo:
        for record in records:
            # Non-absolute audio paths are prefixed with AUDIO_ROOT; the record is updated in place
            is_relative = "audio_filepath" in record and not os.path.isabs(record["audio_filepath"])
            if is_relative:
                record["audio_filepath"] = os.path.join(AUDIO_ROOT, record["audio_filepath"])
            print(json.dumps(record, ensure_ascii=False), file=fo)
    print("Wrote", out_path)

# === 2) Load processor (tokenizer + feature_extractor) ===
processor = WhisperProcessor.from_pretrained(SAVED_PROCESSOR_DIR)

# === 3) Load base model and apply LoRA (PEFT) ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

base_model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL)
base_model.to(device)

# LoRA settings used only when a NEW adapter is created (no saved adapter found).
# task_type is deliberately left unset: TaskType.SEQ_2_SEQ_LM selects a PEFT
# wrapper whose forward() passes `input_ids` to the wrapped model, which
# Whisper's forward() does not accept (it takes `input_features`).
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],   # attention projections in Whisper's encoder/decoder layers
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model exactly once: either resume the saved adapter or start a
# fresh one. Creating a fresh adapter and then loading the saved one on top
# would nest one PeftModel inside another.
if os.path.isdir(SAVED_ADAPTER_DIR):
    print("Loading existing adapter from", SAVED_ADAPTER_DIR)
    # The adapter's own saved config governs its LoRA settings here;
    # lora_config above is not applied in this branch.
    model = PeftModel.from_pretrained(base_model, SAVED_ADAPTER_DIR, is_trainable=True)
else:
    model = get_peft_model(base_model, lora_config)

model.to(device)

# === 4) Load datasets via Hugging Face datasets and preprocess ===
# One JSONL file per split, located under ./splits
data_files = {
    split_name: f"splits/{split_name}.jsonl"
    for split_name in ("train", "validation", "test")
}
raw_dsets = load_dataset("json", data_files=data_files)

# Expose the manifest's "audio_filepath" value under an "audio" key so that
# column can later be cast to the datasets Audio feature.
def load_audio_example(ex):
    """Return a dict with the example's audio path stored under "audio".

    The "audio" entry comes first and holds ex["audio_filepath"]; every other
    field of ``ex`` except "audio_filepath" is copied over unchanged (a
    pre-existing "audio" field in ``ex`` therefore wins over the path).
    """
    remapped = {"audio": ex["audio_filepath"]}
    for key, value in ex.items():
        if key != "audio_filepath":
            remapped[key] = value
    return remapped

# Add the "audio" column to every split, then cast it to the Audio feature at SAMPLING_RATE
for split_name, split_ds in raw_dsets.items():
    raw_dsets[split_name] = split_ds.map(load_audio_example)

audio_feature = Audio(sampling_rate=SAMPLING_RATE)
raw_dsets = raw_dsets.cast_column("audio", audio_feature)

# Preprocessing: extract input_features and tokenize targets
def preprocess_function(batch):
    """Turn one dataset example into model inputs.

    Despite the parameter name, this is applied per example (the map call
    below is not batched).

    Args:
        batch: a single example; reads ``batch["audio"]["array"]`` (the decoded
            waveform) and ``batch["text"]`` (the transcript).

    Returns:
        dict with "input_features" (feature-extractor output for this clip,
        batch dimension removed) and "labels" (token ids of the transcript).
    """
    audio = batch["audio"]["array"]
    # feature extraction
    inputs = processor.feature_extractor(audio, sampling_rate=SAMPLING_RATE, return_tensors="pt")
    input_features = inputs.input_features.squeeze(0).numpy()
    # Tokenize the transcript with the tokenizer directly. The deprecated
    # processor.as_target_processor() context is not needed: it only changes
    # what calling the processor itself dispatches to.
    labels = processor.tokenizer(batch["text"]).input_ids
    return {"input_features": input_features, "labels": labels}

# Map preprocess; do not keep original audio to save memory
for split in raw_dsets:
    raw_dsets[split] = raw_dsets[split].map(preprocess_function, remove_columns=["audio","audio_filepath"], num_proc=1)

# === 5) Data collator ===
@dataclass
class DataCollatorSpeechSeq2Seq:
    """Collate preprocessed examples into a padded training batch.

    Each feature dict must provide "input_features" (array-like of floats) and
    "labels" (list of token ids). Other keys are ignored.

    Returns a dict with:
        "input_features": float32 tensor, examples stacked/padded with 0.0
        "labels": int64 tensor, right-padded with -100
    """

    # Quoted so the annotation is not evaluated when the class is created.
    # The processor is kept as a field for callers; collation does not need it.
    processor: "WhisperProcessor"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        input_feats = [torch.tensor(f["input_features"], dtype=torch.float32) for f in features]
        input_feats = torch.nn.utils.rnn.pad_sequence(input_feats, batch_first=True, padding_value=0.0)
        label_ids = [torch.tensor(f["labels"], dtype=torch.long) for f in features]
        # Pad labels with -100, the index the cross-entropy loss ignores.
        # Padding with the tokenizer's pad token id would train the model on
        # padding positions.
        label_ids = torch.nn.utils.rnn.pad_sequence(label_ids, batch_first=True, padding_value=-100)
        return {"input_features": input_feats, "labels": label_ids}

data_collator = DataCollatorSpeechSeq2Seq(processor=processor)

# === 6) Training setup ===
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_ADAPTER_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=3,
    # Both settings below are needed because the PEFT wrapper's forward() does
    # not expose the wrapped model's signature: keep all dataset columns for
    # the collator, and name the label column explicitly.
    remove_unused_columns=False,
    label_names=["labels"],
    predict_with_generate=False,
)

# NOTE: no evaluation strategy is set above, so eval_dataset is only consumed
# if trainer.evaluate() is called explicitly.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=raw_dsets["train"],
    eval_dataset=raw_dsets["validation"],
    data_collator=data_collator,
    tokenizer=processor.tokenizer
)

# === 7) Train ===
trainer.train()

# === 8) Save updated adapter only (this keeps base model separate) ===
os.makedirs(OUTPUT_ADAPTER_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_ADAPTER_DIR)
processor.save_pretrained(SAVED_PROCESSOR_DIR)
print("✅ Adapter saved to", OUTPUT_ADAPTER_DIR)

# === 9) Evaluate on the test split (compute WER) ===
# Preferred path: merge the adapter into the base model and evaluate the plain
# model. Fallback: evaluate with the adapter still attached.
print("Merging adapter into base model for evaluation...")
try:
    merged = model.merge_and_unload()  # if supported by PeFT version
    # merged is a normal nn.Module without PEFT hooks
    merged.save_pretrained(MERGED_MODEL_DIR)
    processor.save_pretrained(MERGED_MODEL_DIR)
    # Evaluate the in-memory merged model directly; reloading it from disk
    # would keep a second full copy of the weights on the device.
    eval_model = merged
    print("Merged model saved to", MERGED_MODEL_DIR)
except Exception as e:
    # Best effort: any failure in merging or saving falls back to the adapter model.
    print("merge_and_unload not available or failed:", str(e))
    # fallback: use PeftModel instance for generation
    eval_model = model
    print("Will evaluate using PeftModel (adapter attached).")

eval_model.to(device)
# Training leaves the model in train mode; switch to eval mode so dropout
# (including LoRA dropout in the fallback path) is disabled during generation.
eval_model.eval()

# Load WER metric
wer_metric = evaluate.load("wer")

# Generate one example at a time (simple loop to avoid memory spikes)
def generate_transcript(example):
    """Transcribe a single preprocessed example with ``eval_model``.

    Args:
        example: dataset row; reads ``example["input_features"]``.

    Returns:
        The decoded prediction as a string, with special tokens removed.
    """
    # Add a leading batch dimension of size 1 and move to the model's device.
    input_features = torch.tensor(example["input_features"], dtype=torch.float32).unsqueeze(0).to(device)
    # Whisper's generate() takes the audio features via `input_features`
    # (`inputs` is not its parameter name).
    outputs = eval_model.generate(input_features=input_features, max_new_tokens=256)
    pred_text = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return pred_text

# Transcribe every test example, decode its reference labels, then score WER
# (one example at a time, so this may be slow on a large test split)
refs = []
preds = []
test_split = raw_dsets["test"]
total_examples = len(test_split)
for done, ex in enumerate(test_split, start=1):
    preds.append(generate_transcript(ex))
    # Stored labels are token ids; decode them back to the reference string
    refs.append(processor.tokenizer.decode(ex["labels"], skip_special_tokens=True))
    if done % 50 == 0:
        print(f"Processed {done}/{total_examples}")

wer_score = wer_metric.compute(predictions=preds, references=refs)
print(f"Test WER: {wer_score:.4f}")

# === 10) Package the result as a zip for download ===
# Zip the merged model directory when it exists; otherwise zip the adapter directory.
if not os.path.isdir(MERGED_MODEL_DIR):
    print("Merged model directory not found; adapter-only saved at", OUTPUT_ADAPTER_DIR)
    adapter_zip = OUTPUT_ADAPTER_DIR + ".zip"
    shutil.make_archive(OUTPUT_ADAPTER_DIR, 'zip', OUTPUT_ADAPTER_DIR)
    print("Zipped adapter to", adapter_zip)
    print("Download path:", os.path.abspath(adapter_zip))
else:
    shutil.make_archive(MERGED_MODEL_DIR, 'zip', MERGED_MODEL_DIR)
    print("Zipped merged model to", ZIP_OUTPUT)
    # The printed absolute path is where notebook environments can pick up the archive
    print("Download path:", os.path.abspath(ZIP_OUTPUT))

print("All done. ✅")


# **method 2**

In [None]:
# === Install required packages (run once) ===
!pip install -q peft transformers datasets accelerate jiwer evaluate soundfile

# === Imports ===
import os
import torch
from datasets import load_dataset, Audio
from transformers import (WhisperProcessor,
                          WhisperForConditionalGeneration,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)
from peft import PeftModel, PeftConfig
import json
from dataclasses import dataclass
from typing import Any, Dict, List

# === Paths: change these to your environment ===
# Inputs: base checkpoint, previously saved adapter/processor, and the new manifest.
BASE_MODEL_NAME = "openai/whisper-large-v3-turbo"   # or the base model you originally used
PEFT_ADAPTER_DIR = "turbo_adapter"                  # where you saved adapter
PROCESSOR_DIR = "turbo_whisper"                     # where you saved processor
NEW_JSON = "new_audio_dataset.json"                 # your new JSON manifest (audio path + text)
AUDIO_ROOT = "./data_audio"                         # root dir of the 2000 new audio files
# Output: also used as the trainer's output_dir further down.
OUTPUT_DIR = "turbo_adapter_updated"                # where to save updated adapter

# === Device ===
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# === Processor: tokenizer + feature extractor saved earlier ===
processor = WhisperProcessor.from_pretrained(PROCESSOR_DIR)

# === Base model, placed on the target device before the adapter is attached ===
base_model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL_NAME).to(device)

# Attach the saved PEFT adapter weights to the base model and keep them trainable,
# then make sure the wrapped model sits on the target device as well.
model = PeftModel.from_pretrained(
    base_model,
    PEFT_ADAPTER_DIR,
    is_trainable=True,
).to(device)

# === Dataset: JSON manifest whose records carry at least "audio_filepath" and "text" ===
# e.g. {"audio_filepath": "path/to/file.wav", "text": "transcript text", "id": "001"}
ds = load_dataset("json", data_files=NEW_JSON, split="train")

# Resolve relative audio paths from the manifest against AUDIO_ROOT
def fix_paths(example):
    """Make ``example["audio_filepath"]`` rooted at AUDIO_ROOT when it is not absolute.

    The example is modified in place and also returned; absolute paths are
    left untouched.
    """
    audio_path = example["audio_filepath"]
    if os.path.isabs(audio_path):
        return example
    example["audio_filepath"] = os.path.join(AUDIO_ROOT, audio_path)
    return example

ds = ds.map(fix_paths)

# Cast the path column to the datasets Audio feature at 16 kHz
ds = ds.cast_column("audio_filepath", Audio(sampling_rate=16000))  # Use appropriate sample rate

# Preprocess: convert audio to input_features and text to labels
def preprocess_function(batch):
    """Turn one dataset example into model inputs.

    Despite the parameter name, this is applied per example (the map call
    below is not batched).

    Args:
        batch: a single example; reads ``batch["audio_filepath"]["array"]``
            (the decoded waveform) and ``batch["text"]`` (the transcript).

    Returns:
        dict with "input_features" (feature-extractor output for this clip,
        batch dimension removed) and "labels" (token ids of the transcript).
    """
    # After the cast above, the "audio_filepath" column holds the decoded audio.
    audio = batch["audio_filepath"]["array"]
    # extract input features for the model
    inputs = processor.feature_extractor(audio, sampling_rate=16000, return_tensors="pt")
    input_features = inputs.input_features.squeeze(0).numpy()

    # Tokenize the transcript with the tokenizer directly. The deprecated
    # processor.as_target_processor() context is not needed: it only changes
    # what calling the processor itself dispatches to.
    labels = processor.tokenizer(batch["text"]).input_ids

    return {"input_features": input_features, "labels": labels}

# Drop the raw audio once features exist. Only remove columns that are actually
# present: this pipeline has no "audio" column, and asking Dataset.map to
# remove a missing column raises an error.
columns_to_drop = [col for col in ("audio_filepath", "audio") if col in ds.column_names]
ds = ds.map(preprocess_function, remove_columns=columns_to_drop, num_proc=4)

# Convert arrays to appropriate torch tensors on the fly by a data collator (see below)

# === Data collator ===
@dataclass
class DataCollatorSpeechSeq2Seq:
    """Collate preprocessed examples into a padded training batch.

    Each feature dict must provide "input_features" (array-like of floats) and
    "labels" (list of token ids). Other keys are ignored.

    Returns a dict with:
        "input_features": float32 tensor, examples stacked/padded with 0.0
        "labels": int64 tensor, right-padded with -100
    """

    # Quoted so the annotation is not evaluated when the class is created.
    # The processor is kept as a field for callers; collation does not need it.
    processor: "WhisperProcessor"
    # Accepted for interface compatibility; not used by __call__.
    pad_to_multiple_of: int = None

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_features = [torch.tensor(f["input_features"], dtype=torch.float32) for f in features]
        input_features = torch.nn.utils.rnn.pad_sequence(input_features, batch_first=True, padding_value=0.0)
        label_ids = [torch.tensor(f["labels"], dtype=torch.long) for f in features]
        # Pad labels with -100, the index the cross-entropy loss ignores.
        # Padding with the tokenizer's pad token id would train the model on
        # padding positions.
        label_ids = torch.nn.utils.rnn.pad_sequence(label_ids, batch_first=True, padding_value=-100)

        batch = {
            "input_features": input_features,
            "labels": label_ids
        }
        return batch

data_collator = DataCollatorSpeechSeq2Seq(processor=processor)

# === TrainingArguments ===
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,     # reduce if OOM
    gradient_accumulation_steps=8,     # simulate larger batch
    learning_rate=3e-5,                # try small LR for PEFT
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),    # mixed precision only when a GPU is present
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=3,
    # Both settings below are needed because the PEFT wrapper's forward() does
    # not expose the wrapped model's signature: keep all dataset columns for
    # the collator (otherwise "input_features"/"labels" may be dropped), and
    # name the label column explicitly.
    remove_unused_columns=False,
    label_names=["labels"],
    predict_with_generate=False,       # set to True if you want validation generation
)

# === Trainer ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,   # saved alongside checkpoints by the trainer
)

# === Train ===
trainer.train()

# === Persist results: adapter weights to OUTPUT_DIR, processor back to PROCESSOR_DIR ===
# Saving the PEFT model writes the adapter only; the base model is not overwritten.
for artifact, target_dir in ((model, OUTPUT_DIR), (processor, PROCESSOR_DIR)):
    artifact.save_pretrained(target_dir)

print("✅ Updated adapter saved to", OUTPUT_DIR)


<div class="markdown-google-sans">

<a name="machine-learning-examples"></a>

### Featured examples

</div>

- [Retraining an Image Classifier](https://tensorflow.org/hub/tutorials/tf2_image_retraining): Build a Keras model on top of a pre-trained image classifier to distinguish flowers.
- [Text Classification](https://tensorflow.org/hub/tutorials/tf2_text_classification): Classify IMDB movie reviews as either *positive* or *negative*.
- [Style Transfer](https://tensorflow.org/hub/tutorials/tf2_arbitrary_image_stylization): Use deep learning to transfer style between images.
- [Multilingual Universal Sentence Encoder Q&A](https://tensorflow.org/hub/tutorials/retrieval_with_tf_hub_universal_encoder_qa): Use a machine learning model to answer questions from the SQuAD dataset.
- [Video Interpolation](https://tensorflow.org/hub/tutorials/tweening_conv3d): Predict what happened in a video between the first and the last frame.
