In [9]:
!pip install --upgrade pip
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
PYTORCH_ENABLE_MPS_FALLBACK=1
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

zsh:1: no matches found: datasets[audio]


In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
from datasets import load_dataset, DatasetDict

minds_de_ds = load_dataset('csv', data_files='./MInDS-14/text/de-DE.csv')

train_testvalid = minds_de_ds['train'].train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent'],
        num_rows: 488
    })
    test: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent'],
        num_rows: 62
    })
    valid: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent'],
        num_rows: 61
    })
})


In [12]:
import os
import librosa

def load_audio_data(batch, audio_base_path):
    audio_files = [os.path.join(audio_base_path, filepath) for filepath in batch['filepath']]
    audio_data = [librosa.load(file_path, sr=None) for file_path in audio_files]
    
    # Separate audio data and sample rates
    audio_signals = [data[0] for data in audio_data]
    sample_rates = [data[1] for data in audio_data]
    
    batch['audio'] = [{'path': file_path, 'array': audio, 'sampling_rate': sr} for file_path, audio, sr in zip(audio_files, audio_signals, sample_rates)]
    return batch

In [13]:
# Apply the function to load audio data and add it to the dataset
audio_base_path = "./MInDS-14/audio"
ds = ds.map(load_audio_data, fn_kwargs={'audio_base_path': audio_base_path}, batched=True)
print(ds)

Map: 100%|██████████| 488/488 [00:00<00:00, 699.56 examples/s]
Map: 100%|██████████| 62/62 [00:00<00:00, 297.53 examples/s]
Map: 100%|██████████| 61/61 [00:00<00:00, 640.87 examples/s]

DatasetDict({
    train: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent', 'audio'],
        num_rows: 488
    })
    test: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent', 'audio'],
        num_rows: 62
    })
    valid: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent', 'audio'],
        num_rows: 61
    })
})





In [14]:
# Remove the specified columns
ds = ds.remove_columns(["filepath","text_translated", "intent"])

In [41]:
from datasets import Audio

ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
ds2 = ds
ds['train']['audio']

[{'path': None,
  'array': array([ 3.02155968e-07,  1.34381844e-05, -4.49617801e-07, ...,
         -6.04857951e-02, -4.89357859e-02, -2.37017609e-02]),
  'sampling_rate': 16000},
 {'path': None,
  'array': array([-7.48488947e-06,  1.53290428e-04,  2.53665028e-04, ...,
         -6.82607875e-04, -5.01901493e-04, -2.07384408e-04]),
  'sampling_rate': 16000},
 {'path': None,
  'array': array([-8.79698200e-06, -1.31255510e-05,  8.75718251e-06, ...,
         -7.14519992e-05, -2.77650142e-05,  4.37170165e-05]),
  'sampling_rate': 16000},
 {'path': None,
  'array': array([ 2.21654365e-04,  9.35820135e-05,  2.20698676e-05, ...,
          1.61691132e-04,  1.24481285e-05, -5.63955691e-05]),
  'sampling_rate': 16000},
 {'path': None,
  'array': array([2.67376570e-04, 2.54679151e-04, 2.18419766e-04, ...,
         3.94027826e-04, 2.53349106e-04, 4.98622067e-05]),
  'sampling_rate': 16000},
 {'path': None,
  'array': array([-1.21923404e-05, -2.82285328e-05,  1.19969491e-05, ...,
         -7.47691374e

In [42]:
from transformers import WhisperTokenizer, WhisperFeatureExtractor
MODEL_NAME = "openai/whisper-large-v3"
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language="german", task="transcribe")
feature_extractor = WhisperTokenizer.from_pretrained(MODEL_NAME, language="german", task="transcribe")

In [43]:
import numpy as np

def time_stretch(audio, rate=1.0):
    return librosa.effects.time_stretch(audio, rate=rate)

def pitch_shift(audio, sr, n_steps):
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def add_noise(audio, noise_factor=0.005):
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    return np.clip(augmented_audio, -1, 1)

In [44]:
import librosa
import numpy as np

def preprocess_audio(audio, sr):
    # Trim silence
    audio, _ = librosa.effects.trim(audio, top_db=20)
    
    # Apply pre-emphasis filter
    audio = librosa.effects.preemphasis(audio)
    
    # Normalize audio
    audio = librosa.util.normalize(audio)
    
    # Add noise for robustness (optional)
    noise_factor = 0.005
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    
    # Ensure the audio is in the correct range
    augmented_audio = np.clip(augmented_audio, -1, 1)
    
    return augmented_audio

def apply_augmentation(audio, sr):
    augmented_audio = audio
    
    # Randomly apply augmentations
    if np.random.rand() < 0.5:
        augmented_audio = time_stretch(augmented_audio, rate=np.random.uniform(0.8, 1.2))
    if np.random.rand() < 0.5:
        augmented_audio = pitch_shift(augmented_audio, sr, n_steps=np.random.uniform(-2, 2))
    if np.random.rand() < 0.5:
        augmented_audio = add_noise(augmented_audio, noise_factor=np.random.uniform(0.001, 0.01))
    
    return augmented_audio

In [45]:
import numpy as np
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def prepare_dataset(batch):
    try:
        # Extract audio data and sampling rate
        audio_data = batch["audio"]["array"]
        sampling_rate = batch["audio"]["sampling_rate"]
        
        # Apply preprocessing and augmentation
        preprocessed_audio = preprocess_audio(audio_data, sampling_rate)
        augmented_audio = apply_augmentation(preprocessed_audio, sampling_rate)
        
        # Compute log-Mel input features from augmented audio array
        batch["input_features"] = feature_extractor(augmented_audio, sampling_rate=sampling_rate).input_features[0]
        
        # Ensure 'text_asr' is a string
        if isinstance(batch["text_asr"], (list, np.ndarray)):
            text = " ".join(map(str, batch["text_asr"]))
        elif not isinstance(batch["text_asr"], str):
            text = str(batch["text_asr"])
        else:
            text = batch["text_asr"]
        
        # Encode target text to label ids
        batch["labels"] = tokenizer(text).input_ids
        return batch
    except Exception as e:
        logger.error(f"Error processing batch: {e}")
        logger.error(f"Problematic batch: {batch}")
        raise

# Apply the updated prepare_dataset function
ds = ds.map(prepare_dataset, remove_columns=ds.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/488 [00:00<?, ? examples/s]ERROR:__main__:Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
ERROR:__main__:Problematic batch: {'text_asr': 'ja hallo guten Tag ich wollte dir ihre App benutzen und die App schließt sich aber die ganze Zeit wenn ich mich anmelden möchte und da wollte ich mal fragen was war los ist', 'audio': {'path': None, 'array': array([0.00030293, 0.00046724, 0.00018382, ..., 0.00030342, 0.00027104,
       0.00010502]), 'sampling_rate': 16000}}
Map (num_proc=4):   0%|          | 0/488 [00:16<?, ? examples/s]ERROR:__main__:Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
ERROR:__main__:Problematic batch: {'text_asr': 'ja schönen guten Tag ich habe ein problem mit meiner Karte

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [38]:
ds = ds.map(prepare_dataset, remove_columns=ds.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/488 [00:15<?, ? examples/s]


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# mps config
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
torch_dtype = torch.float16 if torch.backends.mps.is_available() else torch.float32

In [None]:
from transformers import WhisperFeatureExtractor, WhisperProcessor
MODEL_NAME = "openai/whisper-large-v3"
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language="german", task="transcribe")

In [None]:
print(ds["train"][0])

In [None]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["text_asr"]).input_ids
    return batch

In [None]:
ds = ds.map(prepare_dataset, remove_columns=ds.column_names["train"], num_proc=4)

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
# model = torch.nn.DataParallel(model)
# model = torch.compile(model)

In [None]:
model.generation_config.language = "german"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None
# torch._dynamo.config.suppress_errors = True


# ds = ds.with_format("torch", device="mps")

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
import evaluate

metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-large-ger-lr1.5",  
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=4,
    max_steps=61,
    gradient_checkpointing=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    save_steps=8,
    eval_steps=8,
    logging_steps=8,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
processor.save_pretrained(training_args.output_dir)

In [None]:
trainer.train()

In [None]:
import pandas as pd
trainer_history = pd.DataFrame(trainer.state.log_history)
trainer_history.groupby('step').first().reset_index()
trainer_history

In [None]:
import matplotlib.pyplot as plt

# Plot WER over training steps
plt.figure(figsize=(12, 8))

# Plot WER
plt.subplot(2, 1, 1)
plt.plot(trainer_history['step'], trainer_history['eval_wer'], marker='o', label='WER')
plt.title("WER over Training Steps")
plt.xlabel("Steps")
plt.ylabel("WER Evaluation")
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
import time
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Define the path to the model checkpoint
model_path = "/Users/shabiras/Developer/Learn/AI/NLP/INDONESIA AI/Project_3/whisper-small-ger-lr3.5/checkpoint-61"

# Load the best fine-tuned model
try:
    model = WhisperForConditionalGeneration.from_pretrained(model_path)
    processor = WhisperProcessor.from_pretrained(model_path)
    print("Model and processor loaded successfully.")
except OSError as e:
    print(f"Error loading model or processor: {e}")

# Inference function
def transcribe(audio):
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

In [None]:
# Run inference on three samples
for i in range(3):
    sample = ds2["test"][i]
    
    start_time = time.time()
    transcription = transcribe(sample["audio"])
    end_time = time.time()
    
    inference_time = end_time - start_time
    
    print(f"Sample {i+1}:")
    print(f"Reference: {sample['text_asr']}")
    print(f"Prediction: {transcription}")
    print(f"Inference time: {inference_time:.4f} seconds")
    print()

# Calculate overall WER for these three samples
wer = metric.compute(predictions=[transcribe(ds["test"][i]["audio"]) for i in range(3)],
                     references=[ds2["test"][i]["text_asr"] for i in range(3)])
print(f"WER for 3 samples: {wer}")

In [None]:
!pip install sounddevice

In [None]:
import sounddevice as sd
import numpy as np

# Function to record audio from the microphone
def record_audio(duration, sample_rate=16000):
    print("Recording...")
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # Wait until the recording is finished
    print("Recording finished.")
    audio = np.squeeze(audio)  # Remove single-dimensional entries
    return {"array": audio, "sampling_rate": sample_rate}

# Record audio from the microphone
duration = 5  # Record for 5 seconds
audio = record_audio(duration)

# Transcribe the recorded audio
transcription = transcribe(audio)
print("Transcription:", transcription)