In [3]:
# Mount Google Drive at /content/drive so that the preprocessed dataset can be
# written to a Drive folder later in the notebook.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Install dependencies

In [None]:
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg
!pip install huggingface_hub
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install huggingface
!pip install transformers

from IPython.display import clear_output
clear_output()


**Hugging Face login**

In [4]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; the dataset download in the next code cell
# passes use_auth_token=True and therefore relies on these credentials.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

**Download the Swedish Common Voice dataset and split it into train and test sets**

The original train split has about 7.3k rows and the validation split about 5.05k rows. Both are combined into our "train" set (12,360 rows in total); the original test split is kept as our "test" set.

In [5]:
from datasets import load_dataset, DatasetDict

# Our split name -> split expression requested from the Hub dataset.
# The upstream "train" and "validation" splits are merged into one training set.
split_expressions = {
    "train": "train+validation",
    "test": "test",
}

common_voice = DatasetDict()
for target_split, source_split in split_expressions.items():
    # Swedish ("sv-SE") configuration of Common Voice 11; needs the Hub login token.
    common_voice[target_split] = load_dataset(
        "mozilla-foundation/common_voice_11_0",
        "sv-SE",
        split=source_split,
        use_auth_token=True,
    )




Downloading builder script:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/197M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/139M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 7308it [00:00, 75524.82it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5052it [00:00, 86018.49it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5069it [00:00, 98638.45it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5699it [00:00, 77400.67it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 1346it [00:00, 43479.69it/s]


**Remove redundant columns**

In [6]:
# Metadata columns that are not needed here; after this cell only "audio" and
# "sentence" remain.
columns_to_drop = ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]

# Only drop columns that are still present, so that re-running this cell (after
# the columns have already been removed) is a no-op instead of raising an error.
present_columns = set(common_voice.column_names["train"])
columns_to_drop = [name for name in columns_to_drop if name in present_columns]
if columns_to_drop:
    common_voice = common_voice.remove_columns(columns_to_drop)

In [7]:
# Display the splits with their remaining columns and row counts.
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 12360
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5069
    })
})

**Audio is the audio sample, Sentence is the transcribed text**

ASR Pipeline

Step 1. Feature extractor that pre-processes the audio-inputs. Does two things:

  -Pads/truncates every sample to 30s.

  -Converts the audio to log-Mel spectrograms, which are a visual representation of the audio. This is the form that the Whisper model expects as input.


Step 2. The model, which performs the sequence-to-sequence mapping (audio features to text tokens) by encoding and decoding. Model: whisper-small. Its output is a sequence of token ids, not yet readable text.

Step 3. A tokenizer which post-processes the model outputs to text format

In [8]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# Both pre-/post-processing components are loaded from the same pretrained checkpoint.
whisper_checkpoint = "openai/whisper-small"

# Step 1: turns raw audio into the input features the model consumes.
feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_checkpoint)

# Step 3: maps between text and token ids. It is also used during preprocessing
# to encode the target sentences into label ids.
tokenizer = WhisperTokenizer.from_pretrained(whisper_checkpoint, language="Swedish", task="transcribe")


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downsample the audio input from 48 kHz to 16 kHz, since the model expects 16 kHz input. The resampling is not done in place: casting the column only records the target sampling rate, and each audio sample is resampled on the fly the first time it is loaded.

In [9]:
from datasets import Audio

# Audio feature describing the 16 kHz sampling rate the model expects.
audio_16khz = Audio(sampling_rate=16000)

# Re-type the "audio" column so that samples are delivered at that rate.
common_voice = common_voice.cast_column("audio", audio_16khz)

In [10]:
def prepare_dataset(batch):
    """Add model inputs and labels to one dataset example.

    Reads ``batch["audio"]`` (a mapping with ``"array"`` and ``"sampling_rate"``)
    and ``batch["sentence"]``, then stores two new entries on the same object:

    * ``"input_features"`` - first element of the feature extractor's
      ``input_features`` for the audio array.
    * ``"labels"`` - the tokenizer's ``input_ids`` for the sentence.

    Relies on the notebook-level ``feature_extractor`` and ``tokenizer``.
    Returns the same ``batch`` object that was passed in.
    """
    # Accessing the audio column is what loads (and resamples) the clip,
    # so it is read exactly once.
    clip = batch["audio"]

    # Log-Mel input features; the extractor output is indexed with [0] to
    # take the entry for this single clip.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target transcription encoded as label ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

Run the prepare_dataset function on every sample

In [11]:
# Columns present before preprocessing; they are dropped from the mapped result
# so that only the entries added by prepare_dataset remain.
source_columns = common_voice.column_names["train"]

# Apply prepare_dataset to every example of every split (single process).
common_voice = common_voice.map(prepare_dataset, remove_columns=source_columns, num_proc=1)


Map:   0%|          | 0/12360 [00:00<?, ? examples/s]

Map:   0%|          | 0/5069 [00:00<?, ? examples/s]

In [12]:
# Build a reduced copy of the preprocessed dataset that keeps only the trailing
# part of each split; the leading `drop_fraction` of the rows is discarded.
new_common_voice = DatasetDict()

# Fraction of leading rows removed from each split.
drop_fraction = 0.5

# Row counts are read from the datasets themselves instead of being hard-coded,
# so the cell stays correct if the splits ever change size.
num_rows_tr = common_voice["train"].num_rows
num_rows_test = common_voice["test"].num_rows
num_rows_to_remove_tr = int(drop_fraction * num_rows_tr)
num_rows_to_remove_test = int(drop_fraction * num_rows_test)
indices_to_remove_tr = range(num_rows_to_remove_tr)
indices_to_remove_test = range(num_rows_to_remove_test)

# Keep rows [num_rows_to_remove, num_rows): the same rows the original
# "every index not in the removed range" filter selected, expressed directly.
new_common_voice_train = common_voice["train"].select(range(num_rows_to_remove_tr, num_rows_tr))
new_common_voice_test = common_voice["test"].select(range(num_rows_to_remove_test, num_rows_test))

new_common_voice["train"] = new_common_voice_train
new_common_voice["test"] = new_common_voice_test
new_common_voice

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 6180
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2535
    })
})

In [13]:
# Relative path: it is resolved against the current working directory, so it
# only lands in the mounted Drive when the notebook runs from /content.
preprocessed_dataset_path = "drive/MyDrive/Colab_Notebooks/lab2/CommonVoice/common_voice_swedish_preprocessed.hf"

# Persist the reduced, preprocessed dataset so it can be reloaded later.
new_common_voice.save_to_disk(preprocessed_dataset_path)

Saving the dataset (0/12 shards):   0%|          | 0/6180 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/2535 [00:00<?, ? examples/s]

In [None]:
# Display the full preprocessed dataset; the reduced copy lives in new_common_voice.
common_voice

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 12360
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5069
    })
})