In [1]:
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# This notebook is meant to be executed once per dataset (kyc and crm):
# point the path below at the dataset to process before each run.
raw_dataset_dir = 'dataset/crm_16khz-huggingface-dataset'
dataset = load_from_disk(raw_dataset_dir)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'client_id', 'locale'],
        num_rows: 780588
    })
    test: Dataset({
        features: ['audio', 'transcription', 'client_id', 'locale'],
        num_rows: 86732
    })
})

In [5]:
# prepare_dataset only reads the "audio" and "transcription" columns, so the
# remaining metadata columns are dropped here.
# Only columns that are still present are removed: this keeps the cell safe to
# re-run, whereas an unconditional remove_columns() fails once the columns
# have already been dropped from `dataset`.
unused_columns = ["locale", "client_id"]
present_unused = [col for col in unused_columns if col in dataset.column_names["train"]]
if present_unused:
    dataset = dataset.remove_columns(present_unused)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 780588
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 86732
    })
})


In [6]:
from transformers import WhisperFeatureExtractor

# Feature extractor of the "openai/whisper-medium" checkpoint; it is used by
# prepare_dataset to turn raw audio arrays into model input features.
# Files fetched by from_pretrained are cached under the local 'v3' directory.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-medium",
    cache_dir='v3',
)

In [7]:
from transformers import WhisperTokenizer

# Tokenizer of the same "openai/whisper-medium" checkpoint, configured for
# Persian transcription; prepare_dataset uses it to encode the target text.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-medium",
    language="Persian",
    task="transcribe",
    cache_dir='v3',
)

In [8]:
from transformers import WhisperProcessor

# Processor for "openai/whisper-medium" with the same language/task settings
# as the tokenizer above.
# NOTE(review): `processor` is not referenced by any cell visible in this
# notebook section — presumably it is used later (e.g. for training); confirm.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-medium",
    language="Persian",
    task="transcribe",
    cache_dir='v3',
)

In [9]:
from datasets import Audio

# Declare the "audio" column as audio sampled at 16 kHz, the rate that is
# later forwarded to the feature extractor via audio["sampling_rate"].
dataset = dataset.cast_column(
    "audio",
    Audio(sampling_rate=16000),
)

In [10]:
def prepare_dataset(batch):
    """Turn one raw example into Whisper training inputs.

    Despite the parameter name, this is applied per example (it is passed to
    ``dataset.map`` without ``batched=True``).

    Reads:
        batch["audio"]: mapping with ``"array"`` and ``"sampling_rate"`` keys.
        batch["transcription"]: the target text.

    Adds:
        batch["input_features"]: first entry of the log-Mel features computed
            by the notebook-level ``feature_extractor``.
        batch["labels"]: token ids produced by the notebook-level ``tokenizer``.

    Returns:
        The same ``batch`` mapping, with the two keys above added.
    """
    sample = batch["audio"]
    waveform, rate = sample["array"], sample["sampling_rate"]

    # The extractor returns a batch of features; only one waveform was passed
    # in, so keep the single entry at index 0.
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Encode the reference transcription into label ids.
    encoded_text = tokenizer(batch["transcription"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [11]:
# NOTE(review): this cell is an exact copy of the earlier cell that casts the
# "audio" column to Audio(sampling_rate=16000). Re-applying the same cast is
# redundant; consider deleting one of the two cells.
from datasets import Audio

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Apply prepare_dataset to every example using 10 worker processes. The
# original columns (taken from the train split) are dropped, so only the keys
# added by prepare_dataset remain.
# This is a long-running step (the recorded progress bar estimates many hours),
# and because `dataset` is overwritten the cell cannot simply be re-run: the
# "audio" column no longer exists afterwards.
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names["train"],
    num_proc=10,
)

Map (num_proc=10):  28%|██▊       | 221334/780588 [3:06:57<7:28:15, 20.79 examples/s] 

In [None]:
# Persist the processed splits. The output path is hard-coded to the crm
# dataset, so it must be changed together with the input path when this
# notebook is run for another dataset.
dataset.save_to_disk(
    'dataset/whisper_processed_data/crm_16khz-processed.hf'
)

In [None]:
# Show the DatasetDict summary (splits, features, row counts).
dataset

In [38]:
# Show the DatasetDict summary (splits, features, row counts).
# NOTE(review): the recorded output reports 780809 train / 86757 test rows,
# while the raw dataset loaded at the top of the notebook had 780588 / 86732.
# Together with the jump in execution counts, this suggests `dataset` was
# re-assigned by a cell that is not part of this notebook any more — confirm
# which dataset this output belongs to before relying on it.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 780809
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 86757
    })
})

In [45]:
# Sanity check on a 2,500-row slice of the test split (rows 5000-7499): keep
# only the examples whose `labels` sequence is empty.
# The predicate needs nothing but `labels`, so `input_columns` is used to pass
# just that column to it instead of the whole example; this avoids handing the
# large `input_features` arrays to Python for every row, which made the
# original whole-row filter very slow.
# NOTE(review): if the tokenizer adds special tokens to every sequence,
# `labels` may never have length 0 even for an empty transcription — confirm
# that this condition detects what is intended.
dataset_filtered = (
    dataset['test']
    .select(range(5000, 7500))
    .filter(lambda labels: len(labels) == 0, input_columns=['labels'])
)
dataset_filtered

Filter: 100%|██████████| 2500/2500 [04:16<00:00,  9.76 examples/s]


Dataset({
    features: ['input_features', 'labels'],
    num_rows: 0
})

In [46]:
# Re-display the filtered subset; the recorded output shows 0 rows, i.e. no
# example in the inspected slice matched the filter condition.
dataset_filtered

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 0
})