In [1]:
%pip install bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
from datasets import load_dataset, Dataset, Audio
from tqdm import tqdm
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset as TorchDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
manifest_path = "manifest_silero_v2.jsonl"  # <== меняем путь
entries = []

with open(manifest_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        item["audio_filepath"] = item["audio_filepath"].replace("\\", "/")
        entries.append({
            "audio": item["audio_filepath"],
            "text": item["text"]
        })

raw_dataset = Dataset.from_list(entries)
print(f"[✓] Loaded {len(raw_dataset)} samples from {manifest_path}")

[✓] Loaded 452584 samples from manifest_silero_v2.jsonl


In [4]:
raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [5]:
model_name = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(model_name, language="russian", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")


In [6]:
class MyAudioDataset(TorchDataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "audio": item["audio"]["array"],
            "text": item["text"]
        }

In [7]:
class DataCollatorWhisper:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        audios = [f["audio"] for f in features]
        texts = [f["text"] for f in features]

        inputs = self.processor.feature_extractor(audios, sampling_rate=16000, return_tensors="pt")
        labels = self.processor.tokenizer(texts, padding="longest", return_tensors="pt").input_ids
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        return {
            "input_features": inputs.input_features,
            "labels": labels
        }

In [8]:
training_args = TrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_steps=100,
    max_steps=2000,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="no",             
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
    save_safetensors=False          
)

In [9]:
from bitsandbytes.optim import Adam8bit

In [10]:
train_dataset = MyAudioDataset(raw_dataset)
data_collator = DataCollatorWhisper(processor)


optimizer = Adam8bit(model.parameters(), lr=2e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.optimizer = optimizer
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,4.5183
100,2.9141
150,2.7371
200,2.6095
250,2.6995
300,2.6612
350,2.5049
400,2.7474
450,2.577
500,2.5812


TrainOutput(global_step=2000, training_loss=2.4983707809448243, metrics={'train_runtime': 48303.0021, 'train_samples_per_second': 0.083, 'train_steps_per_second': 0.041, 'total_flos': 1.358999322624e+19, 'train_loss': 2.4983707809448243, 'epoch': 0.008838138334541213})

In [12]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()
model.save_pretrained("whisper-finetuned", safe_serialization=False)

In [13]:
processor.feature_extractor.save_pretrained("whisper-finetuned")
processor.tokenizer.save_pretrained("whisper-finetuned")

('whisper-finetuned\\tokenizer_config.json',
 'whisper-finetuned\\special_tokens_map.json',
 'whisper-finetuned\\vocab.json',
 'whisper-finetuned\\merges.txt',
 'whisper-finetuned\\normalizer.json',
 'whisper-finetuned\\added_tokens.json')