In [None]:
# Imports
from google.colab import drive
from datasets import load_dataset, DatasetDict
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from datasets import Audio

# Install image package
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

# pip installs
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio

In [None]:
# Mount Google Drive into the Colab filesystem
drive.mount(mountpoint='/content/drive')

In [None]:
# Load dataset
common_voice = DatasetDict()

# I like how the sentences don't make much sense, the labels are wrong, and even I can't understand some of the voices
# The dataset is too large, so only load part of it (load_percentage % of each split)
load_percentage = 15

# Build the split specs with f-strings: concatenating the int directly onto a str
# raises TypeError, and each split needs the "name[:N%]" slice syntax.
train_split = f"train[:{load_percentage}%]+validation[:{load_percentage}%]"
test_split = f"test[:{load_percentage}%]"

# Training data is the leading slice of train plus the leading slice of validation
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "nl", split=train_split, use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "nl", split=test_split, use_auth_token=True)

# Remove unneeded columns
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

print(common_voice)

In [None]:
# Load model
# A larger checkpoint could improve performance, but would require a lot more
# time and resources to train
checkpoint = "openai/whisper-tiny"
decoding_options = {"language": "Dutch", "task": "transcribe"}

# The feature extractor takes no language/task options; tokenizer and processor do
feature_extractor = WhisperFeatureExtractor.from_pretrained(checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(checkpoint, **decoding_options)
processor = WhisperProcessor.from_pretrained(checkpoint, **decoding_options)

In [None]:
# Pre-process data
# pip install "torchaudio<0.12"
# Show one raw training example before the audio column is re-cast
first_example = common_voice["train"][0]
print(first_example)

# Re-cast the "audio" column with a 16 kHz sampling rate
target_audio = Audio(sampling_rate=16000)
common_voice = common_voice.cast_column("audio", target_audio)

def prepare_dataset(batch):
    """Add model inputs and labels to a single dataset example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``, then stores two new entries on ``batch``:

    * ``"input_features"`` - first element of the ``input_features`` produced
      by the module-level ``feature_extractor`` for the audio array.
    * ``"labels"`` - ``input_ids`` produced by the module-level ``tokenizer``
      for the sentence.

    The same ``batch`` object is mutated and returned.
    """
    clip = batch["audio"]
    waveform, rate = clip["array"], clip["sampling_rate"]

    # Log-Mel features for the clip; the extractor output is indexed at 0 to
    # get the single example's features
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Target transcription encoded as token ids
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

# Apply prepare_dataset to the dataset and drop the original columns (as listed
# for the train split); this takes *a lot* of time
original_columns = common_voice.column_names["train"]
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=original_columns,
    num_proc=2,
)

In [None]:
# Save the processed dataset so it is not lost after waiting 2 hours for it
# This required more storage than Google allows for, so it cost me $3 (for Google Drive storage)
processed_path = "/content/drive/MyDrive/Scalable/lab2/common_voice_processed.hf"
print(type(common_voice))
common_voice.save_to_disk(processed_path)