In [1]:
# !pip install -q transformers librosa jiwer torchaudio jsonlines datasets accelerate audiomentations # Audio Augmentation
# !pip install -q Cython

In [2]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
import jsonlines
import torchaudio
from torchaudio import transforms
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import Dataset
import torch
import librosa
import jiwer
import json
import re
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Report the CUDA environment. The device name is only queried when a GPU is
# present, because torch.cuda.get_device_name(0) raises on CPU-only machines
# and the model-loading cell further down explicitly supports a CPU fallback.
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("GPU Name:", "none (running on CPU)")

CUDA available: True
Number of GPUs: 1
GPU Name: Tesla T4


### Defining Directories

In [3]:
# Everything is derived from the notebook's working directory, so the layout
# stays relocatable: src_dir is its parent, til_dir sits two levels above
# src_dir, and home_dir is one level above til_dir.
cur_dir = os.getcwd()
src_dir = os.path.split(cur_dir)[0]
til_dir = os.path.split(os.path.split(src_dir)[0])[0]
home_dir = os.path.split(til_dir)[0]

# Input data: <home_dir>/novice, with the audio clips in its 'audio' subfolder.
test_dir = os.path.join(home_dir, 'novice')
audio_dir = os.path.join(test_dir, 'audio')

# Outputs written next to the notebook, and the locally saved model.
data_dir = os.path.join(cur_dir, 'data')
model_path = os.path.join(src_dir, "models", "whisper")

# paths for converting datasets to manifest files
train_manifest_path, val_manifest_path, test_manifest_path = (
    os.path.join(data_dir, manifest_name)
    for manifest_name in ('train.json', 'val.json', 'test.json')
)

test_dir

'/home/jupyter/novice'

### Get max length

In [4]:
import gc

def calculate_max_length(dataset, audio_dir):
    """Return the length, in samples at 16 kHz, of the longest clip in ``dataset``.

    Args:
        dataset: Iterable of mappings, each with an ``'audio'`` entry holding a
            file name relative to ``audio_dir``.
        audio_dir: Directory that contains the audio files.

    Returns:
        The largest number of samples (second tensor dimension) seen after
        resampling to 16 kHz. Returns 0 if the dataset is empty or no file
        could be processed.

    Files that fail to load or resample are reported with ``print`` and
    skipped; they do not abort the scan.
    """
    target_rate = 16000
    max_length = 0

    for example in dataset:
        audio_path = os.path.join(audio_dir, example['audio'])
        try:
            speech_tensor, sampling_rate = torchaudio.load(audio_path)

            # Resample to 16kHz if necessary so lengths are comparable.
            if sampling_rate != target_rate:
                resample_transform = torchaudio.transforms.Resample(
                    orig_freq=sampling_rate, new_freq=target_rate
                )
                speech_tensor = resample_transform(speech_tensor)

            # Only the length is needed; the waveform is deliberately not
            # retained, so memory use stays at one clip at a time.
            max_length = max(max_length, speech_tensor.shape[1])

        except Exception as e:
            print(f"Error processing {audio_path}: {e}")

    return max_length


# Running the two lines below on the test dataset gave the result noted inline:
# max_length = calculate_max_length(dataset, audio_dir) # Maximum length for padding: 219847
# print(f"Maximum length for padding: {max_length}")

### Preprocess Data

In [9]:
# Constants
# Punctuation removed from transcripts before tokenization. Written as a raw
# string so that `\-` reaches the regex engine as-is instead of being an
# invalid string escape (a SyntaxWarning on Python 3.12+); the pattern value
# is unchanged.
chars_to_ignore_regex = r'[,?.!\-;:"]'
max_length = 220000  # 13.75 seconds at 16kHz

# MelSpectrogram transformation
# Module-level transform used by preprocess_audio on the padded/truncated
# waveform. At sample_rate=16000 the 400-sample window spans 25 ms and the
# 160-sample hop spans 10 ms; the output has 80 mel bins per frame.
# NOTE(review): no log scaling is applied anywhere in this notebook — confirm
# that the consumer of the manifest expects a plain mel spectrogram.
mel_transform = transforms.MelSpectrogram(
    sample_rate=16000,
    n_fft=400,
    win_length=400,
    hop_length=160,
    n_mels=80
)

def preprocess_audio(example, processor, max_length):
    """Turn one dataset row into a manifest entry with mel features and labels.

    Args:
        example: Mapping with an ``'audio'`` file name (relative to the
            module-level ``audio_dir``) and a ``'transcript'`` string.
        processor: Object exposing a ``tokenizer`` callable; used to tokenize
            the cleaned transcript.
        max_length: Number of 16 kHz samples every clip is zero-padded or
            truncated to before the mel transform.

    Returns:
        A dict with keys ``'audio_filepath'``, ``'duration'`` (seconds of the
        original file, measured before resampling/padding/truncation),
        ``'text'`` (transcript with ``chars_to_ignore_regex`` matches removed),
        ``'labels'`` (token ids as a list) and ``'mel_spectrogram'`` (nested
        list from the module-level ``mel_transform``). Returns ``None`` if any
        step fails; the error is printed rather than raised.
    """
    audio_path = os.path.join(audio_dir, example['audio'])
    transcript = example['transcript']

    try:
        # Load and process the audio file
        speech_tensor, sampling_rate = torchaudio.load(audio_path)
        duration = speech_tensor.shape[1] / sampling_rate

        # Downmix multi-channel audio to a single channel. Without this the
        # squeeze(0) below is a no-op for stereo files and the manifest would
        # mix [n_mels, frames] and [channels, n_mels, frames] entries.
        if speech_tensor.shape[0] > 1:
            speech_tensor = speech_tensor.mean(dim=0, keepdim=True)

        # Resample to 16kHz if necessary
        if sampling_rate != 16000:
            resample_transform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_tensor = resample_transform(speech_tensor)

        # Pad (with trailing zeros) or truncate to exactly max_length samples
        num_samples = speech_tensor.shape[1]
        if num_samples < max_length:
            speech_tensor = torch.nn.functional.pad(speech_tensor, (0, max_length - num_samples))
        elif num_samples > max_length:
            speech_tensor = speech_tensor[:, :max_length]

        # Convert to Mel spectrogram
        mel_spectrogram = mel_transform(speech_tensor)

        # Process the transcript
        transcript = re.sub(chars_to_ignore_regex, '', transcript)
        labels = processor.tokenizer(transcript, return_tensors="pt").input_ids[0]

        return {
            'audio_filepath': audio_path,
            'duration': duration,
            'text': transcript,
            'labels': labels.tolist(),
            'mel_spectrogram': mel_spectrogram.squeeze(0).tolist()
        }
    except Exception as e:
        # Best effort: a bad file is reported and skipped by the caller.
        print(f"Error processing {audio_path}: {e}")
        return None

def preprocess_data(dataset, output_path, processor, max_length, batch_size=100):
    """Preprocess every row of ``dataset`` and write a JSON-lines manifest.

    Args:
        dataset: Object supporting ``dataset['key']``, ``dataset['audio']`` and
            ``dataset['transcript']``, each returning a sequence of equal length.
        output_path: Destination file; overwritten on every call. Its parent
            directory is created if it does not exist.
        processor: Passed through to ``preprocess_audio``.
        max_length: Passed through to ``preprocess_audio``.
        batch_size: Number of examples processed between flushes to disk.

    Rows for which ``preprocess_audio`` returns a falsy value are skipped.
    Entries that cannot be serialized or written are reported with ``print``
    and skipped.
    """
    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Split the dataset into individual examples
    examples = [
        {'key': key, 'audio': audio, 'transcript': transcript}
        for key, audio, transcript in zip(dataset['key'], dataset['audio'], dataset['transcript'])
    ]

    # Mode 'w' truncates any previous manifest; the file stays open for the
    # whole run instead of being reopened for every batch.
    with open(output_path, 'w') as f:
        for start in range(0, len(examples), batch_size):
            for example in examples[start:start + batch_size]:
                result = preprocess_audio(example, processor, max_length)
                if not result:
                    continue
                try:
                    # Serialize fully before writing so a failure cannot leave
                    # a truncated, newline-less record in the manifest.
                    line = json.dumps(result)
                    f.write(line + '\n')
                except Exception as e:
                    print(f"Error writing entry to file: {e}")

            # Persist progress after each batch.
            f.flush()

## Load Model

In [6]:
# Use the first GPU with half precision when CUDA is present; otherwise stay
# on the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# One-time bootstrap, kept for reference: download the checkpoint and save it
# under model_path so that later runs load from disk.
# model_id = "distil-whisper/distil-medium.en"
#
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )
# model.to(device)
# processor = AutoProcessor.from_pretrained(model_id)
#
# model.save_pretrained(model_path)
# processor.save_pretrained(model_path)

# Load the locally saved model and its processor.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_path,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# vocab = processor.tokenizer.get_vocab()
# print(vocab)

## Load and preprocess data - Ran once to create manifest files

In [10]:
# Every clip is padded/truncated to this many 16 kHz samples (13.75 s).
max_length = 220000

MAX_FILE_COUNT = None # Set if only want max files

# Column-oriented buffer; its keys define the fields expected in each record.
data = {'key': [], 'audio': [], 'transcript': []}
data_path = os.path.join(test_dir, "asr.jsonl")
with jsonlines.open(data_path) as reader:
    for obj in reader:
        if MAX_FILE_COUNT and len(data['key']) >= MAX_FILE_COUNT:
            break
        # Read exactly the expected columns. Extra fields in a record are
        # ignored; a missing field raises KeyError here, before anything is
        # appended, so the columns can never end up with different lengths.
        row = {column: obj[column] for column in data}
        for column, value in row.items():
            data[column].append(value)

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Shuffle the dataset (fixed seed so the split is reproducible)
dataset = dataset.shuffle(seed=42)

# Split the dataset into training (80%), validation (10%) and test (remainder) sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

# Write one JSON-lines manifest per split.
preprocess_data(train_dataset, train_manifest_path, processor, max_length)
preprocess_data(val_dataset, val_manifest_path, processor, max_length)
preprocess_data(test_dataset, test_manifest_path, processor, max_length)

Reason for Max Length = 220000

In [None]:
# max_length = calculate_max_length(dataset, audio_dir)
# print(f"Maximum length for padding: {max_length}")

Maximum length for padding: 219847
<br>
Use max_length = 220000 samples, which is 13.75 s of audio at 16,000 samples/s and leaves a small margin above the longest clip (219,847 samples).