In [1]:
import librosa
from tqdm import tqdm
import pandas as pd
from datasets import Dataset, DatasetDict
import torchaudio
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from safetensors.torch import load_file

## Datasets

In [6]:
# Datasets
def load_audio_data(file_path):
    """Read one audio file from disk.

    Args:
        file_path: Location of the audio file to decode.

    Returns:
        The ``(audio_array, sampling_rate)`` pair produced by ``librosa.load``.
    """
    # sr=None asks librosa to keep the file's own sampling rate instead of
    # resampling to its default.
    return librosa.load(file_path, sr=None)


def create_dataset(csv_file, test_size=0.05, seed=None):
    """Build a train/test ``DatasetDict`` from a CSV manifest of audio files.

    The CSV must provide an ``audio_path`` column (file to decode) and a
    ``label`` column (its transcript). Every file is decoded eagerly with
    ``load_audio_data``.

    Args:
        csv_file: Path to the CSV manifest.
        test_size: Share (or absolute number) of rows held out for the
            ``'test'`` split; forwarded to ``Dataset.train_test_split``.
            Defaults to the previously hard-coded 0.05.
        seed: Optional seed for the shuffle performed by the split. Leave as
            ``None`` for the original unseeded behaviour; pass an int to get
            the same split on every run.

    Returns:
        A ``DatasetDict`` with ``'train'`` and ``'test'`` splits. Each row has
        an ``'audio'`` dict (``'path'``, ``'array'``, ``'sampling_rate'``) and
        a ``'sentence'`` string taken from the ``label`` column.
    """
    df = pd.read_csv(csv_file)

    processed_data = []
    # Only two columns are needed, so walk them directly rather than
    # materialising a Series per row with iterrows().
    manifest_rows = zip(df['audio_path'], df['label'])
    for audio_path, label in tqdm(manifest_rows, total=df.shape[0], desc="Processing audio files"):
        audio_array, sampling_rate = load_audio_data(audio_path)
        processed_data.append({
            'audio': {
                'path': audio_path,
                'array': audio_array,
                'sampling_rate': sampling_rate
            },
            'sentence': label
        })

    dataset = Dataset.from_pandas(pd.DataFrame(processed_data))
    splits = dataset.train_test_split(test_size=test_size, seed=seed)

    # Pick the splits by name instead of relying on the order of .values().
    return DatasetDict({
        'train': splits['train'],
        'test': splits['test']
    })


# Build the dataset from the CSV manifest of the testing data.

level = "word" # "word"|"phn"
csv_file = f'testingset/{level}.csv'

# Print the status line before the load starts, so it is accurate while the
# audio files are still being read.
print('loading dataset....')
dataset = create_dataset(csv_file)

print(dataset)

Processing audio files: 100%|██████████| 10/10 [00:00<00:00, 1824.40it/s]

loading dataset....
DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 9
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 1
    })
})





#### Preview

In [7]:
# Peek at the second training example: keep its audio for the following
# cells and show its transcript.
audio_sample, text = (dataset['train'][1][column] for column in ('audio', 'sentence'))
print(text)

It tells the story of a woman who has probably [INS] been abused


In [8]:
import numpy as np
import IPython.display as ipd

# np.array(...) guarantees the player receives an ndarray regardless of the
# container type the dataset returned the samples in.
audio = np.array(audio_sample['array'])

# Play at the rate stored with the sample. The files were loaded at their
# native rate (sr=None), so a hard-coded 16000 would play anything recorded
# at another rate at the wrong speed and pitch.
ipd.display(ipd.Audio(audio, rate=audio_sample['sampling_rate'], normalize=False))

In [5]:
from transformers import WhisperTokenizer

# NOTE(review): the processor and tokenizer come from "openai/whisper-large"
# while the model is "openai/whisper-small" — confirm this pairing is
# intentional.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large", language="en", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

level = "word" # "word"|"phn"

# Extra vocabulary entries for each annotation level.
new_tokens_word = ["[REP]", "[DEL]", "[PAU]", "[INS]"]
new_tokens_phn = ["[REP]", "[DEL]", "[PRO]", "[SUB]", "jh", "dh"]
new_tokens_by_level = {"word": new_tokens_word, "phn": new_tokens_phn}

# Select the token list by key. The previous list(f"new_tokens_{level}")
# split the literal text "new_tokens_word" into single characters and never
# touched the lists above. An unknown level now fails with a KeyError.
tokenizer.add_tokens(new_tokens_by_level[level])
# Grow the embedding matrix to cover the newly added token ids.
model.resize_token_embeddings(len(tokenizer))

print("loading model weights....")
state_dict = load_file(f'pretrained/TbDD_{level}.safetensors')
# strict=False tolerates keys absent from the checkpoint. The returned report
# stays as the cell's last expression so missing/unexpected keys are shown.
model.load_state_dict(state_dict, strict=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loading model weights....


_IncompatibleKeys(missing_keys=['proj_out.weight'], unexpected_keys=[])

In [6]:
# Convert the previewed waveform into the tensor the model takes as input.
input_features = processor(
    audio_sample["array"],
    sampling_rate=audio_sample["sampling_rate"],
    return_tensors="pt",
)["input_features"]

# Generate token ids for the clip, with the language set to English.
predicted_ids = model.generate(input_features, language='en')
print(predicted_ids)

# Decode the first sequence of the batch back to text, dropping the
# tokenizer's special tokens.
transcription = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
print(transcription)

tensor([[50258, 50259, 50359, 50363,  6462,   366,   220, 51865,   291,  4735,
         50257]])
How are [REP] you sir
