### **Librerías**

In [79]:
import os
import random
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import IPython
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchaudio
from datasets import Dataset, DatasetDict, Audio
from transformers import (
     WhisperFeatureExtractor,
     WhisperTokenizer,
     WhisperProcessor,
     WhisperForConditionalGeneration,
     Seq2SeqTrainingArguments,
     Seq2SeqTrainer,
     pipeline)

# Print the installed torch / torchaudio versions.
print(torch.__version__)
print(torchaudio.__version__)
# Seed PyTorch's default random number generator with a fixed value.
torch.random.manual_seed(0)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

2025-10-11 10:06:21.847527: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-11 10:06:21.949738: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-11 10:06:23.391513: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


2.7.1+cu128
2.7.1+cu128
cuda


### **Parámetros del pipeline**

In [80]:
# Checkpoint identifier passed to the Whisper `from_pretrained` loaders.
MODEL_NAME = "openai/whisper-small" 
# Directory searched for the audio files listed in the metadata CSV.
AUDIO_DIR = "./Audios_ES"
# CSV read by prepare_dataset; its `file_name` and `transcription` columns are used.
METADATA_FILE = "./final_metadata.csv"
# Output directory for training and default path load_whisper_model reads from.
# NOTE(review): "transcipt" looks like a typo for "transcript"; renaming it changes
# where the model is written and loaded, so rename deliberately.
OUTPUT_DIR = "./whisper-finetuned-transcipt-es"
# Language and task given to the tokenizer, the processor and the generation config.
LANGUAGE = "spanish"
TASK = "transcribe"
# Extensions tried by validate_audio_files when resolving a metadata entry to a file.
SUPPORTED_FORMATS = ['.mp3', '.wav', '.flac', '.ogg', '.m4a', '.opus', '.wma']
# Sample rate (Hz) that loaded audio is resampled to.
TARGET_SAMPLE_RATE = 16000 

### **Funciones**

In [None]:
def load_audio_multiformat(audio_path, target_sr=TARGET_SAMPLE_RATE):
    """
    Load an audio file as a mono waveform resampled to ``target_sr``.

    ``.wav`` and ``.flac`` files are decoded with torchaudio and down-mixed by
    averaging their channels; every other extension is decoded with librosa.
    If that first attempt raises, the file is retried once with librosa at its
    native sample rate.

    Args:
        audio_path: Path to the audio file (str or path-like).
        target_sr: Target sample rate in Hz (16000 for Whisper).

    Returns:
        A ``(waveform, sample_rate)`` tuple where ``waveform`` is a NumPy array
        and ``sample_rate`` equals ``target_sr``, or ``(None, None)`` when
        neither loader could decode the file.
    """
    audio_path = str(audio_path)
    # os.path.splitext is used here because `os` is imported by the notebook,
    # whereas `pathlib.Path` is not.
    file_ext = os.path.splitext(audio_path)[1].lower()

    try:
        if file_ext in ('.wav', '.flac'):
            waveform, sample_rate = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            # Drop only the channel axis; a bare squeeze() would also collapse
            # the time axis of a one-frame clip.
            waveform = waveform.squeeze(0).numpy()
        else:
            waveform, sample_rate = librosa.load(
                audio_path,
                sr=target_sr,
                mono=True
            )
    except Exception as e:
        print(f"Error cargando {audio_path}: {e}")
        # Second chance: let librosa decode the file at its native rate.
        try:
            waveform, sample_rate = librosa.load(audio_path, sr=None, mono=True)
        except Exception as e2:
            print(f"Error cr√≠tico cargando {audio_path}: {e2}")
            return None, None

    if sample_rate != target_sr:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=target_sr
        )
        sample_rate = target_sr

    return waveform, sample_rate



def _resolve_audio_path(audio_dir, filename):
    """Return the first existing file that matches ``filename``, or None."""
    # An empty CSV cell arrives as NaN (a float), not as a string.
    if not isinstance(filename, str) or not filename.strip():
        return None

    candidates = []
    # A name that already carries a supported extension is tried as-is first,
    # so "a.wav" is never resolved to an unrelated "a.wav.mp3".
    if filename.lower().endswith(tuple(SUPPORTED_FORMATS)):
        candidates.append(os.path.join(audio_dir, filename))
    candidates.extend(
        os.path.join(audio_dir, f"{filename}{ext}") for ext in SUPPORTED_FORMATS
    )

    for candidate in candidates:
        # isfile (not exists) so that a directory is never taken for audio.
        if os.path.isfile(candidate):
            return candidate
    return None


def validate_audio_files(audio_dir, metadata_df):
    """
    Resolve every metadata row to an audio file on disk.

    Args:
        audio_dir: Directory that contains the audio files.
        metadata_df: DataFrame with ``file_name`` and ``transcription`` columns.
            ``file_name`` may include one of ``SUPPORTED_FORMATS`` or omit the
            extension, in which case each supported extension is tried in order.

    Returns:
        List of dicts with the keys ``filename``, ``audio_path`` and
        ``transcription``, one per row whose audio file was found. Rows without
        a usable file name or without a matching file are reported and skipped.
    """
    valid_files = []
    invalid_files = []

    for filename, transcription in zip(
        metadata_df['file_name'], metadata_df['transcription']
    ):
        audio_path = _resolve_audio_path(audio_dir, filename)
        if audio_path is None:
            invalid_files.append(filename)
            continue
        valid_files.append({
            'filename': filename,
            'audio_path': audio_path,
            'transcription': transcription
        })

    if invalid_files:
        print(f"Advertencia: {len(invalid_files)} archivos no encontrados:")
        # Only the first few names are listed to keep the output short.
        for f in invalid_files[:5]:
            print(f"   - {f}")
        if len(invalid_files) > 5:
            print(f"   ... y {len(invalid_files) - 5} m√°s")

    print(f"Archivos v√°lidos: {len(valid_files)}/{len(metadata_df)}")
    return valid_files




def prepare_dataset(sample_fraction=0.3, random_seed=42):
    """
    Build the train / validation / test splits from the metadata CSV.

    Reads ``METADATA_FILE``, keeps the rows whose audio file is found under
    ``AUDIO_DIR``, draws a random sample of them and splits the sample
    70% / 15% / 15%.

    Args:
        sample_fraction: Fraction of the valid files to keep, in ``(0, 1]``.
        random_seed: Seed used for the sampling and for both splits.

    Returns:
        DatasetDict with ``train``, ``validation`` and ``test`` splits. The
        ``audio`` column is cast to ``Audio`` at ``TARGET_SAMPLE_RATE``.

    Raises:
        ValueError: If ``sample_fraction`` is outside ``(0, 1]``, if no audio
            file is found, or if the requested fraction selects no file.
    """
    if not 0 < sample_fraction <= 1:
        raise ValueError(
            f"sample_fraction debe estar en el rango (0, 1]; valor recibido: {sample_fraction}"
        )

    # NOTE(review): latin-1 can decode any byte sequence, so a UTF-8 CSV would
    # load without error but with garbled accents - confirm the file encoding.
    df = pd.read_csv(METADATA_FILE, encoding='latin-1')
    print(f"Total de registros en CSV: {len(df)}")
    print(f"Buscando archivos en: {AUDIO_DIR}")
    valid_files = validate_audio_files(AUDIO_DIR, df)
    if not valid_files:
        raise ValueError("No se encontraron archivos de audio v√°lidos!")

    print(f"Total de archivos v√°lidos encontrados: {len(valid_files)}")

    sample_size = int(len(valid_files) * sample_fraction)
    if sample_size == 0:
        # Fail here with a clear message instead of inside train_test_split.
        raise ValueError(
            f"Muestra sin archivos: sample_fraction={sample_fraction} "
            f"sobre {len(valid_files)} archivos"
        )
    # A private generator yields the same sample as seeding the module-level
    # one, without resetting the global `random` state for the whole notebook.
    sampled_files = random.Random(random_seed).sample(valid_files, sample_size)
    print(f"Usando una muestra aleatoria del {sample_fraction*100:.0f}% "
          f"({sample_size} archivos)")

    data = {
        'audio': [item['audio_path'] for item in sampled_files],
        'transcription': [item['transcription'] for item in sampled_files]
    }
    dataset = Dataset.from_dict(data)

    # Paths become Audio features at the target sample rate.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLE_RATE))

    # 70% train, then the remaining 30% is halved into validation and test.
    train_test = dataset.train_test_split(test_size=0.3, seed=random_seed)
    test_valid = train_test['test'].train_test_split(test_size=0.5, seed=random_seed)

    dataset = DatasetDict({
        'train': train_test['train'],
        'validation': test_valid['train'],
        'test': test_valid['test']
    })

    print(f" Divisi√≥n del dataset:")
    print(f"Entrenamiento: {len(dataset['train'])} muestras")
    print(f"Validaci√≥n:    {len(dataset['validation'])} muestras")
    print(f"Prueba:        {len(dataset['test'])} muestras")

    return dataset
# Build the train / validation / test splits from a 10% random sample of the valid audio files.
dataset = prepare_dataset(sample_fraction=0.10)



def prepare_dataset_for_training(batch):
    """
    Turn a batch of decoded audio and transcriptions into model inputs.

    Each audio entry's ``array`` is passed to the global ``feature_extractor``
    together with its own ``sampling_rate``, and the first item of the returned
    ``input_features`` is kept. Each transcription is tokenized with the global
    ``tokenizer`` and its ``input_ids`` become the labels.

    Args:
        batch: Mapping with an ``audio`` list (dicts holding ``array`` and
            ``sampling_rate``) and a ``transcription`` list.

    Returns:
        The same ``batch`` object with ``input_features`` and ``labels`` added.
    """
    batch["input_features"] = [
        feature_extractor(
            clip["array"],
            sampling_rate=clip["sampling_rate"]
        ).input_features[0]
        for clip in batch["audio"]
    ]

    label_ids = []
    for text in batch["transcription"]:
        label_ids.append(tokenizer(text).input_ids)
    batch["labels"] = label_ids

    return batch
# NOTE(review): this message is printed when the cell that defines the helper
# functions runs, not when the dataset is preprocessed (`dataset.map` is called in a later cell).
print("Preprocesando dataset por batches")


def compute_metrics(pred):
    """
    Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (token ids, or a tuple whose
            first item holds them) and ``label_ids``.

    Returns:
        Dict with a single ``"wer"`` entry: 100 times the value returned by
        the global ``metric``.
    """
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks padded label positions and is not a real token id, so it has
    # to be swapped for the pad token before decoding. Predictions may carry
    # -100 as well when sequences of different lengths are padded together.
    # np.where builds new arrays, leaving the caller's arrays untouched.
    pred_ids = np.asarray(pred_ids)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # NOTE(review): `metric` is not defined anywhere in the visible notebook;
    # it must exist as a global exposing compute(predictions=, references=)
    # before training starts, otherwise evaluation raises NameError.
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


def transcribe_audio(audio_path, processor=None, model=None, device=None):
    """
    Transcribe one audio file with a Whisper model.

    Args:
        audio_path: Path to the audio file.
        processor: Whisper processor. When ``processor`` or ``model`` is None,
            processor, model and device are all obtained from
            ``load_whisper_model()``.
        model: Whisper model used for generation.
        device: Device the input features are moved to. When omitted and a
            model was supplied, the model's own device is used.

    Returns:
        The decoded transcription of the file, as a string.
    """
    if processor is None or model is None:
        processor, model, device = load_whisper_model()
    elif device is None:
        # Inputs must sit on the same device as the model weights.
        device = model.device

    print(f"üéß Procesando audio: {audio_path}")

    # librosa decodes the file and resamples it to the requested rate.
    audio, _ = librosa.load(audio_path, sr=TARGET_SAMPLE_RATE)

    input_features = processor(
        audio,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt").input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True)[0]

    return transcription

def load_whisper_model(model_path=OUTPUT_DIR):
    """
    Load a Whisper processor and model from ``model_path``.

    Args:
        model_path: Directory (or identifier) given to ``from_pretrained``.

    Returns:
        Tuple ``(processor, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"`` and the model has already been moved to it.
    """
    print(f"üì• Cargando modelo Whisper desde {model_path}...")

    whisper_processor = WhisperProcessor.from_pretrained(model_path)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(model_path)

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    whisper_model = whisper_model.to(device)
    print(f"‚úÖ Modelo cargado en {device}")

    return whisper_processor, whisper_model, device

# Pipelines that were already built, keyed by (task, model name, extra kwargs).
_PIPELINE_CACHE = {}


def _get_cached_pipeline(task, model_name, **kwargs):
    """Build the pipeline for this task/model once and reuse it afterwards."""
    key = (task, model_name, tuple(sorted(kwargs.items())))
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model_name, **kwargs)
    return _PIPELINE_CACHE[key]


def analyze_sentiment_multilang(text):
    """
    Classify a text's emotions and sentiment with language-specific models.

    The language comes from ``detect_language``. Emotions always come from
    ``SamLowe/roberta-base-go_emotions`` (top 5 labels); the second classifier
    is chosen by language.

    Args:
        text: Text to analyse.

    Returns:
        Dict with the keys ``language``, ``text``, ``emotions`` (first element
        of the emotion classifier's output), ``sentiment`` (first element of
        the second classifier's output) and ``alert_level``.
    """
    lang = detect_language(text)
    # Pipelines are cached: building one loads a full model, which the
    # function would otherwise repeat on every call.
    emotion_classifier = _get_cached_pipeline(
        "text-classification",
        "SamLowe/roberta-base-go_emotions",
        top_k=5
    )

    # NOTE(review): judging by their names, the es/en models detect hate or
    # offensive speech rather than sentiment - confirm this is intended.
    if lang == 'es':
        sentiment_model = "Hate-speech-CNERG/dehatebert-mono-spanish"
    elif lang == 'en':
        sentiment_model = "cardiffnlp/twitter-roberta-base-offensive"
    else:
        sentiment_model = "nlptown/bert-base-multilingual-uncased-sentiment"

    sentiment_classifier = _get_cached_pipeline(
        "sentiment-analysis",
        sentiment_model
    )

    emotions = emotion_classifier(text)[0]
    sentiment = sentiment_classifier(text)[0]

    return {
        "language": lang,
        "text": text,
        "emotions": emotions,
        "sentiment": sentiment,
        # NOTE(review): calculate_alert_level is not defined in the visible
        # part of the notebook; it must be available before this is called.
        "alert_level": calculate_alert_level(emotions)
    }
def detect_language(text):
    """
    Detect the language of ``text``.

    Args:
        text: Text whose language should be identified.

    Returns:
        The value returned by ``detect`` for the text, or ``'en'`` when
        detection fails.

    Raises:
        NameError: If ``detect`` itself is not available. This is surfaced on
            purpose: silently answering ``'en'`` would hide a missing import.
    """
    try:
        return detect(text)
    except NameError:
        # NOTE(review): `detect` is not imported in the visible notebook.
        raise
    except Exception:
        # Detection failed for this text; fall back to English.
        return 'en'

### **Carga del modelo**

In [None]:
# Feature extractor and tokenizer used as globals by prepare_dataset_for_training
# and compute_metrics.
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)
# Processor loaded from the same checkpoint; it is handed to the data collator
# and saved next to the trained model.
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

# Pretrained model to fine-tune, moved to the selected device.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)

Sample Rate: 16000
Labels: ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')


In [None]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# Set the language and task on the generation config.
model.generation_config.language = LANGUAGE
model.generation_config.task = TASK

In [None]:
# Number of examples handed to prepare_dataset_for_training per call.
BATCH_SIZE = 16
try:
    # Replaces the raw columns with the `input_features` / `labels` columns
    # produced by prepare_dataset_for_training.
    dataset = dataset.map(
        prepare_dataset_for_training,
        remove_columns=dataset.column_names["train"],
        batched=True,
        batch_size=BATCH_SIZE,
        num_proc=1,
        desc="Procesando audios"
    )
    print("‚úÖ Preprocesamiento completado")

except Exception as e:
    print(f"Error durante preprocesamiento: {e}")
    # Re-raise: the training cells need the processed columns, so continuing
    # with the unprocessed dataset would only fail later and less clearly.
    raise

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /home/santenana/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 360M/360M [00:03<00:00, 103MB/s]  


<class 'torchaudio.models.wav2vec2.model.Wav2Vec2Model'>


### **Entrenar modelo**

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Pad a list of preprocessed examples into one training batch.

    Audio features and label ids are padded separately, padded label
    positions are replaced with -100, and a leading decoder start token is
    removed from the labels when every sequence in the batch begins with it.
    """

    processor: Any
    # Token id compared against the first label position. When None, the
    # tokenizer's bos_token_id is used instead.
    # NOTE(review): the Hugging Face Whisper fine-tuning guide compares against
    # the model's decoder_start_token_id, which the model prepends again during
    # training - confirm against the guide.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate ``features`` into padded tensors.

        Args:
            features: Examples holding ``input_features`` and ``labels``.

        Returns:
            The padded feature batch with a ``labels`` tensor added.
        """
        # Inputs and labels have different lengths, so each is padded with its own method.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Padded positions become -100.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100)

        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # Drop the start token only when every sequence in the batch has it.
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
# Training configuration. Evaluation and checkpointing share the same step
# interval, and the best checkpoint (lowest WER) is reloaded at the end.
training_args = Seq2SeqTrainingArguments(output_dir=OUTPUT_DIR,
                                         per_device_train_batch_size=8,
                                         gradient_accumulation_steps=2,
                                         learning_rate=1e-5,
                                         warmup_steps=50,
                                         max_steps=500,
                                         gradient_checkpointing=False,
                                         # Mixed precision only when a GPU is
                                         # present, so the CPU fallback chosen
                                         # for `device` still works.
                                         fp16=torch.cuda.is_available(),
                                         eval_strategy="steps",
                                         per_device_eval_batch_size=8,
                                         predict_with_generate=True,
                                         generation_max_length=225,
                                         save_steps=100,
                                         eval_steps=100,
                                         logging_steps=30,
                                         report_to=["tensorboard"],
                                         load_best_model_at_end=True,
                                         metric_for_best_model="wer",
                                         greater_is_better=False,
                                         push_to_hub=False,)

In [None]:
# Evaluate on the validation split during training. The training arguments
# pick the best checkpoint by WER on this set, so the test split is kept
# untouched for a final, unbiased evaluation.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor)
print("Iniciando entrenamiento...")
trainer.train()

In [None]:
# Write the trained model and the processor to OUTPUT_DIR, the directory
# load_whisper_model reads both from by default.
trainer.save_model(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)
print(f"Modelo guardado en {OUTPUT_DIR}")


In [None]:
# Reload the fine-tuned model from the directory it was saved to. Reusing
# OUTPUT_DIR keeps this cell in sync with the save location. Note that this
# rebinds the global `processor`, `model` and `device` names.
model_path = OUTPUT_DIR
processor, model, device = load_whisper_model(model_path)

# NOTE(review): absolute path to a file on one machine; point it at an audio
# file that exists in your environment before running.
audio_path = '/home/santenana/Proyectos_ML_DC/03_Audio_to_Speech/test_5.wav'

text = transcribe_audio(audio_path, processor, model, device)
text