In [None]:

from tqdm import tqdm
from datasets import load_dataset, Audio, Dataset
import pandas as pd
from transformers import WhisperProcessor
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import pipeline
import gc


In [None]:
def prepare_dataset(batch, feature_extractor, tokenizer):
    """Turn one raw example into model inputs and labels.

    Parameters
    ----------
    batch : mutable mapping
        A single example. Must contain ``"audio"`` (a mapping with
        ``"array"`` and ``"sampling_rate"`` keys) and ``"sentence"``.
    feature_extractor : callable
        Invoked as ``feature_extractor(array, sampling_rate=...)``; the first
        element of the result's ``input_features`` is stored on the example.
    tokenizer : callable
        Invoked as ``tokenizer(sentence)``; the result's ``input_ids`` are
        stored as the labels.

    Returns
    -------
    The same ``batch`` object, modified in place: ``"input_features"`` and
    ``"labels"`` are added, ``"audio"`` and ``"sentence"`` are removed. Any
    other keys are left untouched.
    """
    audio = batch["audio"]
    # The extractor result holds a list of feature entries; only the first
    # one is kept because a single clip is passed in.
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids

    # Drop the raw columns. No explicit gc.collect() here: forcing a full
    # collection for every example adds per-row overhead, and the deleted
    # entries are released by reference counting as soon as they go away.
    del batch["audio"]
    del batch["sentence"]
    return batch

In [None]:
# Read the CSV into a DataFrame, wrap it as a Dataset, then mark the
# "audio" column as an Audio feature.
df = pd.read_csv("audio_df_umlauts.csv")
audio_dataset = Dataset.from_pandas(df)
audio_dataset = audio_dataset.cast_column("audio", Audio())

In [None]:
# 80/10/10 split: hold out 20 % of the data, then cut that hold-out in half
# to obtain the validation and test sets (fixed seed for reproducibility).
audio_dataset = audio_dataset.train_test_split(seed=42, test_size=0.2)
temp_ds = audio_dataset["test"].train_test_split(seed=42, test_size=0.5)
audio_dataset["valid"], audio_dataset["test"] = temp_ds["train"], temp_ds["test"]

In [None]:
# Checkpoint identifier: passed to every from_pretrained() call in this
# notebook and appended to the output path when the processed data is saved.
model_name = "openai/whisper-small"

In [None]:
# Language/task settings shared by the tokenizer and the processor.
whisper_text_config = {"language": "German", "task": "transcribe"}

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, **whisper_text_config)
processor = WhisperProcessor.from_pretrained(model_name, **whisper_text_config)


In [None]:
# Re-declare the audio column with a fixed sampling rate.
# NOTE(review): 16 kHz is presumably chosen to match the Whisper feature
# extractor's expected input rate — confirm against the model documentation.
TARGET_SAMPLING_RATE = 16000
audio_dataset = audio_dataset.cast_column(
    "audio", Audio(sampling_rate=TARGET_SAMPLING_RATE)
)

In [11]:

# Run prepare_dataset over every split, dropping all original columns so only
# the values it adds remain. Runs in a single process (num_proc=4 was tried
# and left disabled).
audio_dataset_mapped = audio_dataset.map(
    prepare_dataset,
    fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer},
    remove_columns=audio_dataset.column_names["train"],
)

Map:   0%|          | 0/60984 [00:00<?, ? examples/s]

Map:   0%|          | 0/7623 [00:00<?, ? examples/s]

Map:   0%|          | 0/7623 [00:00<?, ? examples/s]

In [13]:
# Persist the processed splits under a path derived from the checkpoint name.
# NOTE(review): if model_name contains a "/" (as "openai/whisper-small" does),
# the last path component becomes a nested directory — confirm this is the
# layout downstream loaders expect.
audio_dataset_mapped.save_to_disk(f"processed_data/processed_audio_dataset_{model_name}")

Saving the dataset (0/118 shards):   0%|          | 0/60984 [00:00<?, ? examples/s]

Saving the dataset (0/15 shards):   0%|          | 0/7623 [00:00<?, ? examples/s]

Saving the dataset (0/15 shards):   0%|          | 0/7623 [00:00<?, ? examples/s]

In [16]:
# Sanity check: number of examples in (train, valid, test).
tuple(len(audio_dataset_mapped[split_name]) for split_name in ("train", "valid", "test"))

(60984, 7623, 7623)