In [38]:
import json
import os
import random
from datasets import Dataset, DatasetDict, Audio
import pandas as pd
import torch
import torchaudio
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, Audio
from transformers import (
     WhisperFeatureExtractor,
     WhisperTokenizer,
     WhisperProcessor,
     WhisperForConditionalGeneration,
     Seq2SeqTrainingArguments,
     Seq2SeqTrainer)

from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
import os
import librosa


In [39]:
MODEL_NAME = "openai/whisper-small" 
AUDIO_DIR = "./Audios_ES"
METADATA_FILE = "./final_metadata.csv"
OUTPUT_DIR = "./whisper-finetuned-transcipt-es"
LANGUAGE = "spanish"
TASK = "transcribe"
SUPPORTED_FORMATS = ['.mp3', '.wav', '.flac', '.ogg', '.m4a', '.opus', '.wma']
TARGET_SAMPLE_RATE = 16000 

In [40]:
def validate_audio_files(audio_dir, metadata_df):
    """
    Valida que los archivos de audio existan y sean accesibles
    """
    valid_files = []
    invalid_files = []
    
    for idx, row in metadata_df.iterrows():
        filename = row['file_name']
        
        found = False
        for ext in SUPPORTED_FORMATS:
            if filename.lower().endswith(ext):
                audio_path = os.path.join(audio_dir, filename)
            else:
                audio_path = os.path.join(audio_dir, f"{filename}{ext}")
            
            if os.path.exists(audio_path):
                valid_files.append({
                    'filename': filename,
                    'audio_path': audio_path,
                    'transcription': row['transcription']
                })
                found = True
                break
        
        if not found:
            invalid_files.append(filename)
    
    if invalid_files:
        print(f"\n‚ö†Ô∏è  Advertencia: {len(invalid_files)} archivos no encontrados:")
        for f in invalid_files[:5]:
            print(f"   - {f}")
        if len(invalid_files) > 5:
            print(f"   ... y {len(invalid_files) - 5} m√°s")
    
    print(f"\n‚úÖ Archivos v√°lidos: {len(valid_files)}/{len(metadata_df)}")
    return valid_files

In [41]:


USED_TRACKER_FILE = "used_audio_log.json"

def load_used_audio():
    if os.path.exists(USED_TRACKER_FILE):
        with open(USED_TRACKER_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    return set()

def save_used_audio(used_files):
    with open(USED_TRACKER_FILE, "w", encoding="utf-8") as f:
        json.dump(list(used_files), f, ensure_ascii=False, indent=2)


In [42]:
def prepare_incremental_dataset(batch_size=100, random_seed=42):
    """
    Carga el dataset tomando un lote nuevo de audios no usados.
    """
    df = pd.read_csv(METADATA_FILE, encoding='latin-1')
    valid_files = validate_audio_files(AUDIO_DIR, df)
    if not valid_files:
        raise ValueError("No se encontraron archivos v√°lidos!")

    # Cargar audios ya usados
    used = load_used_audio()
    available = [f for f in valid_files if f['audio_path'] not in used]

    if not available:
        print("‚úÖ Todos los audios ya han sido usados para entrenamiento.")
        return None

    # Tomar hasta `batch_size` nuevos audios
    random.seed(random_seed)
    selected = random.sample(available, min(batch_size, len(available)))

    # Guardar los nuevos como usados
    new_used = used.union({item['audio_path'] for item in selected})
    save_used_audio(new_used)

    print(f"üéß Usando {len(selected)} nuevos audios (total usados: {len(new_used)})")

    # Crear dataset
    data = {
        'audio': [item['audio_path'] for item in selected],
        'transcription': [item['transcription'] for item in selected]
    }
    dataset = Dataset.from_dict(data)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))

    # Split simple (90% train / 10% valid)
    train_test = dataset.train_test_split(test_size=0.1, seed=random_seed)

    dataset = DatasetDict({
        'train': train_test['train'],
        'validation': train_test['test']
    })

    return dataset


In [43]:
def prepare_dataset_for_training(batch):
    """
    Preprocesa los audios y transcripciones en BATCHES
    Maneja autom√°ticamente diferentes formatos
    """
    audio_arrays = [audio["array"] for audio in batch["audio"]]
    sampling_rates = [audio["sampling_rate"] for audio in batch["audio"]]
    
    input_features = []
    for audio_array, sr in zip(audio_arrays, sampling_rates):
        features = feature_extractor(
            audio_array, 
            sampling_rate=sr
        ).input_features[0]
        input_features.append(features)
    
    batch["input_features"] = input_features
    
    batch["labels"] = [tokenizer(transcription).input_ids for transcription in batch["transcription"]]
    
    return batch
print("\nüîÑ Preprocesando dataset por batches...")


üîÑ Preprocesando dataset por batches...


In [44]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")


        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100)


        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [45]:
metric = evaluate.load("wer") 

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    label_ids = np.asarray(label_ids)
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [46]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

def incremental_training_loop(model, processor, max_rounds=10):
    for round_num in range(1, max_rounds + 1):
        print(f"\nüîÅ Entrenamiento incremental #{round_num}")
        dataset = prepare_incremental_dataset(batch_size=100)

        if dataset is None:
            print("üéâ No hay m√°s audios disponibles. Entrenamiento completo.")
            break

        dataset = dataset.map(
            prepare_dataset_for_training,
            remove_columns=dataset.column_names["train"],
            batched=True,
            batch_size=8,
            num_proc=1,
            desc=f"Procesando audios - ronda {round_num}"
        )

        training_args = Seq2SeqTrainingArguments(
            output_dir=f"{OUTPUT_DIR}/round_{round_num}",
            per_device_train_batch_size=8,
            gradient_accumulation_steps=2,
            learning_rate=5e-6,  # m√°s bajo para fine-tuning incremental
            warmup_steps=10,
            num_train_epochs=2,  # puedes ajustarlo seg√∫n tama√±o
            eval_strategy="epoch",
            fp16=True,
            save_strategy="epoch",
            logging_strategy="steps",
            logging_steps=25,
            report_to=["tensorboard"],
            load_best_model_at_end=True,
            metric_for_best_model="wer",
            greater_is_better=False,
            push_to_hub=False,
        )

        trainer = Seq2SeqTrainer(
            args=training_args,
            model=model,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            tokenizer=processor.feature_extractor,
        )

        trainer.train()
        print(f"‚úÖ Ronda {round_num} completada.")
        # Guardar el modelo actualizado
        model.save_pretrained(f"{OUTPUT_DIR}/round_{round_num}/checkpoint")
        processor.save_pretrained(f"{OUTPUT_DIR}/round_{round_num}/checkpoint")


In [47]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

Usando dispositivo: cuda


In [48]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [49]:
incremental_training_loop(model, processor, max_rounds=10)



üîÅ Entrenamiento incremental #1

‚úÖ Archivos v√°lidos: 23521/23521
üéß Usando 100 nuevos audios (total usados: 400)


Procesando audios - ronda 1 (num_proc=1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90/90 [00:01<00:00, 48.41 examples/s]
Procesando audios - ronda 1 (num_proc=1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 22.85 examples/s]
  trainer = Seq2SeqTrainer(
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 