## Setup

In [1]:
%%capture
!pip install datasets==2.14.1
!pip install transformers==4.4.0
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install evaluate 
!pip install wandb

In [2]:
from huggingface_hub import login

## Hugging Face

In [3]:
from kaggle_secrets import UserSecretsClient

# Fetch the Hugging Face access token stored in Kaggle's secrets under the
# label "hf", so the token itself never appears in the notebook.
secret_label = "hf"
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)

In [4]:
# Authenticate with the Hugging Face Hub using the token read from Kaggle secrets.
login(token=secret_value)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
import os

# Checkpoint id passed to the from_pretrained() calls further down.
model_name_or_path = "openai/whisper-small"
# Language and task handed to the Whisper tokenizer and processor.
language = "Swahili"
# Short language code; not referenced in the cells visible here.
language_abbr = "sw"
task = "transcribe"

In [6]:
import pandas as pd
import numpy as np
from datasets import Dataset, Audio, Value, Features, ClassLabel

## Dataset

The training metadata is loaded with `train_df = pd.read_csv("/kaggle/input/details/train.tsv", sep='\t')`.

In [9]:
# Load the training metadata (tab-separated) inside this cell so the cell is
# self-contained: it runs on a fresh kernel and can be re-run without
# prefixing the audio paths twice.
train_df = pd.read_csv("/kaggle/input/details/train.tsv", sep="\t")

# Clips that are excluded because their audio files are missing.
missing_audio = [
    'common_voice_sw_30318282.mp3',
    'common_voice_sw_30035155.mp3',
    'common_voice_sw_30377114.mp3',
]

# Keep rows whose clip is present and that have fewer than two down-votes.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original frame.
keep_mask = ~train_df["path"].isin(missing_audio) & (train_df["down_votes"] < 2)
train_df = train_df[keep_mask].copy()

# Save the filtered metadata while "path" still holds the bare file names.
train_df.to_csv("train", index=False)

# Turn the bare file names into absolute paths inside the Kaggle input dataset.
train_df["path"] = train_df["path"].apply(lambda x: f"/kaggle/input/commonvoice/train/{x}")

In [10]:
# Build a two-column dataset (audio file path + transcript) from the metadata
# frame, then cast the path column to the Audio feature type.
audio_columns = {"audio": train_df["path"], "sentence": train_df["sentence"]}
audio_dataset = Dataset.from_dict(audio_columns)
audio_dataset = audio_dataset.cast_column("audio", Audio())

In [11]:
# Show the dataset summary (feature names and row count).
audio_dataset

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 78222
})

In [12]:
from datasets import Audio

# Re-cast the audio column with an explicit sampling rate of 16000 Hz.
# NOTE(review): presumably chosen to match the rate the Whisper feature extractor expects — confirm.
audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [13]:
# Split once and take both halves from the same result. Calling
# train_test_split() twice draws two independent random shuffles, so rows in
# the "train" half of one call can also appear in the "test" half of the other.
# The fixed seed makes the split reproducible across kernel restarts.
common_voice_splits = audio_dataset.train_test_split(test_size=0.3, seed=42)
common_voice_train = common_voice_splits["train"]
common_voice_test = common_voice_splits["test"]

In [14]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# The feature extractor only needs the checkpoint; the tokenizer and the
# combined processor additionally receive the target language and task.
decoding_options = {"language": language, "task": task}

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, **decoding_options)
processor = WhisperProcessor.from_pretrained(model_name_or_path, **decoding_options)

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

### Preprocessing

In [15]:
# Show the held-out split summary (feature names and row count).
common_voice_test

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 23467
})

In [23]:
# Split the held-out portion into two equally sized batches so each one can be
# preprocessed and uploaded on its own. The fixed seed keeps batch membership
# stable for a given common_voice_test, so re-running the cell (for example to
# process batch B in a later session) does not reshuffle rows between batches.
test_batches = common_voice_test.train_test_split(test_size=0.5, seed=42)

test_batch_A = test_batches["train"]
test_batch_B = test_batches["test"]

# Disabled: the training portion was batched the same way (two halves, each
# halved again into train_batch_1 .. train_batch_4).
# train_batches = common_voice_train.train_test_split(test_size=0.5)
# train_batch_A = train_batches["train"]
# train_batch_B = train_batches["test"]
# train_batch_A_split = train_batch_A.train_test_split(test_size=0.5)
# train_batch_B_split = train_batch_B.train_test_split(test_size=0.5)
# train_batch_1 = train_batch_A_split["train"]
# train_batch_2 = train_batch_A_split["test"]
# train_batch_3 = train_batch_B_split["train"]
# train_batch_4 = train_batch_B_split["test"]

In [24]:
def prepare_dataset(batch):
    """Convert one raw example into model inputs and label ids.

    Uses the module-level ``feature_extractor`` and ``tokenizer``.

    Args:
        batch: A single example holding an ``"audio"`` mapping (with
            ``"array"`` and ``"sampling_rate"`` entries) and a ``"sentence"``
            string.

    Returns:
        The same ``batch`` object, updated in place with ``input_length``
        (number of audio samples), ``input_features`` (first entry of the
        feature extractor output), ``labels`` (token ids of the sentence) and
        ``labels_length`` (number of label ids).
    """
    audio = batch["audio"]

    # Length of the waveform in samples. len(batch["audio"]) would count the
    # keys of the audio mapping instead and be identical for every example.
    batch["input_length"] = len(audio["array"])

    # Compute log-Mel input features from the raw audio array.
    extracted = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Encode the target text to label ids and record how many there are.
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    batch["labels_length"] = len(batch["labels"])
    return batch

In [25]:
# pre-process
# common_voice_train = common_voice_train.map(prepare_dataset)

In [27]:
# Pre-process test batch A: apply prepare_dataset to its examples to add the
# model input features and label ids.
Test_batch_A_preprocessed = test_batch_A.map(prepare_dataset)

Map:   0%|          | 0/11733 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
# Upload the preprocessed batch to the Hugging Face Hub.
# NOTE(review): the repo name says "train_batch_5" although the data comes from test_batch_A — confirm this is intended.
Test_batch_A_preprocessed.push_to_hub("Jayem-11/mozilla_commonvoice_hackathon_preprocessed_train_batch_5") 